#!/usr/bin/python
#-*- coding: utf-8 -*-
"""
Fully Convolutional SyncNet (FCN-SyncNet) - CLASSIFICATION VERSION
Key difference from regression version:
- Output: Probability distribution over discrete offset classes
- Loss: CrossEntropyLoss instead of MSE
- Avoids regression-to-mean problem
Offset classes: -15 to +15 frames (31 classes total)
Class 0 = -15 frames, Class 15 = 0 frames, Class 30 = +15 frames
Author: Enhanced version based on original SyncNet
Date: 2025-12-04
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import numpy as np
import cv2
import os
import subprocess
from scipy.io import wavfile
import python_speech_features
class TemporalCorrelation(nn.Module):
"""
Compute correlation between audio and video features across time.
"""
def __init__(self, max_displacement=15):
super(TemporalCorrelation, self).__init__()
self.max_displacement = max_displacement
def forward(self, feat1, feat2):
"""
Args:
feat1: [B, C, T] - visual features
feat2: [B, C, T] - audio features
Returns:
correlation: [B, 2*max_displacement+1, T] - correlation map
"""
B, C, T = feat1.shape
max_disp = self.max_displacement
# Normalize features
feat1 = F.normalize(feat1, dim=1)
feat2 = F.normalize(feat2, dim=1)
# Pad feat2 for shifting
feat2_padded = F.pad(feat2, (max_disp, max_disp), mode='replicate')
corr_list = []
for offset in range(-max_disp, max_disp + 1):
shifted_feat2 = feat2_padded[:, :, offset+max_disp:offset+max_disp+T]
corr = (feat1 * shifted_feat2).sum(dim=1, keepdim=True)
corr_list.append(corr)
correlation = torch.cat(corr_list, dim=1)
return correlation
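# Shape sketch for TemporalCorrelation (toy sizes assumed, not from the repo):
# two [B=2, C=512, T=100] feature maps with max_displacement=15 yield a
# [2, 31, 100] map — one cosine-similarity channel per temporal shift:
#
#   corr_layer = TemporalCorrelation(max_displacement=15)
#   v, a = torch.randn(2, 512, 100), torch.randn(2, 512, 100)
#   assert corr_layer(v, a).shape == (2, 31, 100)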
class ChannelAttention(nn.Module):
"""Squeeze-and-Excitation style channel attention."""
def __init__(self, channels, reduction=16):
super(ChannelAttention, self).__init__()
self.avg_pool = nn.AdaptiveAvgPool1d(1)
self.fc = nn.Sequential(
nn.Linear(channels, channels // reduction, bias=False),
nn.ReLU(inplace=True),
nn.Linear(channels // reduction, channels, bias=False),
nn.Sigmoid()
)
def forward(self, x):
b, c, t = x.size()
y = self.avg_pool(x).view(b, c)
y = self.fc(y).view(b, c, 1)
return x * y.expand_as(x)
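# ChannelAttention is shape-preserving: a [B, C, T] input is rescaled per
# channel by a sigmoid gate in (0, 1), so the output shape equals the input shape.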
class TemporalAttention(nn.Module):
"""Self-attention over temporal dimension."""
def __init__(self, channels):
super(TemporalAttention, self).__init__()
self.query_conv = nn.Conv1d(channels, channels // 8, 1)
self.key_conv = nn.Conv1d(channels, channels // 8, 1)
self.value_conv = nn.Conv1d(channels, channels, 1)
self.gamma = nn.Parameter(torch.zeros(1))
def forward(self, x):
B, C, T = x.size()
query = self.query_conv(x).permute(0, 2, 1)
key = self.key_conv(x)
value = self.value_conv(x)
attention = torch.bmm(query, key)
attention = F.softmax(attention, dim=-1)
out = torch.bmm(value, attention.permute(0, 2, 1))
out = self.gamma * out + x
return out
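# TemporalAttention is likewise shape-preserving ([B, C, T] in and out), but it
# builds a T x T attention matrix, so memory grows quadratically with sequence
# length. Because gamma is initialized to zero, the block starts as an identity
# mapping and learns how much attention to blend in during training.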
class FCN_AudioEncoder(nn.Module):
"""Fully convolutional audio encoder."""
def __init__(self, output_channels=512):
super(FCN_AudioEncoder, self).__init__()
self.conv_layers = nn.Sequential(
nn.Conv2d(1, 64, kernel_size=(3,3), stride=(1,1), padding=(1,1)),
nn.BatchNorm2d(64),
nn.ReLU(inplace=True),
nn.Conv2d(64, 192, kernel_size=(3,3), stride=(1,1), padding=(1,1)),
nn.BatchNorm2d(192),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=(3,3), stride=(1,2)),
nn.Conv2d(192, 384, kernel_size=(3,3), padding=(1,1)),
nn.BatchNorm2d(384),
nn.ReLU(inplace=True),
nn.Conv2d(384, 256, kernel_size=(3,3), padding=(1,1)),
nn.BatchNorm2d(256),
nn.ReLU(inplace=True),
nn.Conv2d(256, 256, kernel_size=(3,3), padding=(1,1)),
nn.BatchNorm2d(256),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=(3,3), stride=(2,2)),
nn.Conv2d(256, 512, kernel_size=(5,1), stride=(5,1), padding=(0,0)),
nn.BatchNorm2d(512),
nn.ReLU(inplace=True),
)
self.channel_conv = nn.Sequential(
nn.Conv1d(512, 512, kernel_size=1),
nn.BatchNorm1d(512),
nn.ReLU(inplace=True),
nn.Conv1d(512, output_channels, kernel_size=1),
nn.BatchNorm1d(output_channels),
)
self.channel_attn = ChannelAttention(output_channels)
def forward(self, x):
x = self.conv_layers(x)
        # Avoid naming the frequency dim "F": it would shadow torch.nn.functional.
        B, C, n_freq, T = x.size()
        x = x.view(B, C * n_freq, T)
x = self.channel_conv(x)
x = self.channel_attn(x)
return x
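# Rough shape walk-through (assuming 13-coefficient MFCCs at 100 fps, i.e. a
# 10 ms hop): the two pooling stages stride the time axis by 2 each, and the
# final (5,1)-kernel, stride-5 conv collapses the frequency axis to 1, so a
# [B, 1, 13, 100] input comes out as roughly [B, 512, 24] — about 4x temporal
# downsampling, which lands near the 25 fps video feature rate.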
class FCN_VideoEncoder(nn.Module):
"""Fully convolutional video encoder."""
def __init__(self, output_channels=512):
super(FCN_VideoEncoder, self).__init__()
self.conv_layers = nn.Sequential(
nn.Conv3d(3, 96, kernel_size=(5,7,7), stride=(1,2,2), padding=(2,3,3)),
nn.BatchNorm3d(96),
nn.ReLU(inplace=True),
nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2), padding=(0,1,1)),
nn.Conv3d(96, 256, kernel_size=(3,5,5), stride=(1,2,2), padding=(1,2,2)),
nn.BatchNorm3d(256),
nn.ReLU(inplace=True),
nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2), padding=(0,1,1)),
nn.Conv3d(256, 256, kernel_size=(3,3,3), padding=(1,1,1)),
nn.BatchNorm3d(256),
nn.ReLU(inplace=True),
nn.Conv3d(256, 256, kernel_size=(3,3,3), padding=(1,1,1)),
nn.BatchNorm3d(256),
nn.ReLU(inplace=True),
nn.Conv3d(256, 256, kernel_size=(3,3,3), padding=(1,1,1)),
nn.BatchNorm3d(256),
nn.ReLU(inplace=True),
nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2), padding=(0,1,1)),
nn.Conv3d(256, 512, kernel_size=(3,3,3), stride=(1,1,1), padding=(1,1,1)),
nn.BatchNorm3d(512),
nn.ReLU(inplace=True),
nn.AdaptiveAvgPool3d((None, 1, 1))
)
self.channel_conv = nn.Sequential(
nn.Conv1d(512, 512, kernel_size=1),
nn.BatchNorm1d(512),
nn.ReLU(inplace=True),
nn.Conv1d(512, output_channels, kernel_size=1),
nn.BatchNorm1d(output_channels),
)
self.channel_attn = ChannelAttention(output_channels)
def forward(self, x):
x = self.conv_layers(x)
B, C, T, H, W = x.size()
x = x.view(B, C, T)
x = self.channel_conv(x)
x = self.channel_attn(x)
return x
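# The video encoder keeps the frame axis intact (every temporal stride is 1)
# and collapses the spatial axes with AdaptiveAvgPool3d((None, 1, 1)), so
# [B, 3, T, 112, 112] maps to [B, 512, T] — one embedding per input frame.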
class SyncNetFCN_Classification(nn.Module):
"""
Fully Convolutional SyncNet with CLASSIFICATION output.
Treats offset detection as a multi-class classification problem:
- num_classes = 2 * max_offset + 1 (e.g., 251 classes for max_offset=125)
- Class index = offset + max_offset (e.g., offset -5 → class 120)
- Uses CrossEntropyLoss for training
- Default: ±125 frames = ±5 seconds at 25fps
This avoids the regression-to-mean problem encountered with MSE loss.
Architecture:
1. Audio encoder: MFCC → temporal features
2. Video encoder: frames → temporal features
3. Correlation layer: compute audio-video similarity over time
4. Classifier: predict offset class probabilities
"""
def __init__(self, embedding_dim=512, max_offset=125, dropout=0.3):
super(SyncNetFCN_Classification, self).__init__()
self.embedding_dim = embedding_dim
self.max_offset = max_offset
        self.num_classes = 2 * max_offset + 1  # e.g., 251 classes for the default max_offset=125
# Encoders
self.audio_encoder = FCN_AudioEncoder(output_channels=embedding_dim)
self.video_encoder = FCN_VideoEncoder(output_channels=embedding_dim)
# Temporal correlation
self.correlation = TemporalCorrelation(max_displacement=max_offset)
# Classifier head (replaces regressor)
self.classifier = nn.Sequential(
nn.Conv1d(self.num_classes, 128, kernel_size=3, padding=1),
nn.BatchNorm1d(128),
nn.ReLU(inplace=True),
nn.Dropout(dropout),
nn.Conv1d(128, 64, kernel_size=3, padding=1),
nn.BatchNorm1d(64),
nn.ReLU(inplace=True),
nn.Dropout(dropout),
# Output: class logits for each timestep
nn.Conv1d(64, self.num_classes, kernel_size=1),
)
# Global classifier (for single prediction from sequence)
self.global_classifier = nn.Sequential(
nn.AdaptiveAvgPool1d(1),
nn.Flatten(),
nn.Linear(self.num_classes, 128),
nn.ReLU(inplace=True),
nn.Dropout(dropout),
nn.Linear(128, self.num_classes),
)
def forward_audio(self, audio_mfcc):
"""Extract audio features."""
return self.audio_encoder(audio_mfcc)
def forward_video(self, video_frames):
"""Extract video features."""
return self.video_encoder(video_frames)
def forward(self, audio_mfcc, video_frames, return_temporal=False):
"""
Forward pass with audio-video offset classification.
Args:
audio_mfcc: [B, 1, F, T] - MFCC features
video_frames: [B, 3, T', H, W] - video frames
return_temporal: If True, also return per-timestep predictions
Returns:
class_logits: [B, num_classes] - global offset class logits
temporal_logits: [B, num_classes, T] - per-timestep logits (if return_temporal)
audio_features: [B, C, T_a] - audio embeddings
video_features: [B, C, T_v] - video embeddings
"""
# Extract features
if audio_mfcc.dim() == 3:
audio_mfcc = audio_mfcc.unsqueeze(1)
audio_features = self.audio_encoder(audio_mfcc)
video_features = self.video_encoder(video_frames)
# Align temporal dimensions
min_time = min(audio_features.size(2), video_features.size(2))
audio_features = audio_features[:, :, :min_time]
video_features = video_features[:, :, :min_time]
# Compute correlation
correlation = self.correlation(video_features, audio_features)
# Per-timestep classification
temporal_logits = self.classifier(correlation)
# Global classification (aggregate over time)
class_logits = self.global_classifier(temporal_logits)
if return_temporal:
return class_logits, temporal_logits, audio_features, video_features
return class_logits, audio_features, video_features
def predict_offset(self, class_logits):
"""
Convert class logits to offset prediction.
Args:
class_logits: [B, num_classes] - classification logits
Returns:
offsets: [B] - predicted offset in frames
confidences: [B] - prediction confidence (softmax probability)
"""
probs = F.softmax(class_logits, dim=1)
predicted_class = probs.argmax(dim=1)
offsets = predicted_class - self.max_offset # Convert class to offset
confidences = probs.max(dim=1).values
return offsets, confidences
def offset_to_class(self, offset):
"""Convert offset value to class index."""
return offset + self.max_offset
def class_to_offset(self, class_idx):
"""Convert class index to offset value."""
return class_idx - self.max_offset
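# Per-timestep sketch (an assumption: you want an offset track rather than one
# global value, e.g. to spot drift within a clip). temporal_logits from
# forward(..., return_temporal=True) has shape [B, num_classes, T], so:
#
#   _, temporal_logits, _, _ = model(mfcc, frames, return_temporal=True)
#   per_step_offset = temporal_logits.argmax(dim=1) - model.max_offset  # [B, T]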
class StreamSyncFCN_Classification(nn.Module):
"""
Streaming-capable FCN SyncNet with classification output.
Includes preprocessing, transfer learning, and inference utilities.
"""
def __init__(self, embedding_dim=512, max_offset=125,
window_size=25, stride=5, buffer_size=100,
pretrained_syncnet_path=None, auto_load_pretrained=True,
dropout=0.3):
super(StreamSyncFCN_Classification, self).__init__()
self.window_size = window_size
self.stride = stride
self.buffer_size = buffer_size
self.max_offset = max_offset
self.num_classes = 2 * max_offset + 1
# Initialize classification model
self.fcn_model = SyncNetFCN_Classification(
embedding_dim=embedding_dim,
max_offset=max_offset,
dropout=dropout
)
# Auto-load pretrained weights
if auto_load_pretrained and pretrained_syncnet_path:
self.load_pretrained_syncnet(pretrained_syncnet_path)
self.reset_buffers()
def reset_buffers(self):
"""Reset temporal buffers."""
self.logits_buffer = []
self.frame_count = 0
def load_pretrained_syncnet(self, syncnet_model_path, freeze_conv=True, verbose=True):
"""Load conv layers from original SyncNet."""
if verbose:
print(f"Loading pretrained SyncNet from: {syncnet_model_path}")
try:
pretrained = torch.load(syncnet_model_path, map_location='cpu')
if isinstance(pretrained, dict):
pretrained_dict = pretrained.get('model_state_dict', pretrained.get('state_dict', pretrained))
else:
pretrained_dict = pretrained.state_dict()
fcn_dict = self.fcn_model.state_dict()
loaded_count = 0
for key in list(pretrained_dict.keys()):
if key.startswith('netcnnaud.'):
idx = key.split('.')[1]
param = '.'.join(key.split('.')[2:])
new_key = f'audio_encoder.conv_layers.{idx}.{param}'
if new_key in fcn_dict and pretrained_dict[key].shape == fcn_dict[new_key].shape:
fcn_dict[new_key] = pretrained_dict[key]
loaded_count += 1
elif key.startswith('netcnnlip.'):
idx = key.split('.')[1]
param = '.'.join(key.split('.')[2:])
new_key = f'video_encoder.conv_layers.{idx}.{param}'
if new_key in fcn_dict and pretrained_dict[key].shape == fcn_dict[new_key].shape:
fcn_dict[new_key] = pretrained_dict[key]
loaded_count += 1
self.fcn_model.load_state_dict(fcn_dict, strict=False)
if verbose:
print(f"✓ Loaded {loaded_count} pretrained conv parameters")
if freeze_conv:
for name, param in self.fcn_model.named_parameters():
if 'conv_layers' in name:
param.requires_grad = False
if verbose:
print("✓ Froze pretrained conv layers")
except Exception as e:
if verbose:
print(f"⚠ Could not load pretrained weights: {e}")
def load_fcn_checkpoint(self, checkpoint_path, verbose=True):
"""Load FCN classification checkpoint."""
checkpoint = torch.load(checkpoint_path, map_location='cpu')
if 'model_state_dict' in checkpoint:
state_dict = checkpoint['model_state_dict']
else:
state_dict = checkpoint
# Try to load directly first
try:
self.fcn_model.load_state_dict(state_dict, strict=True)
if verbose:
print(f"✓ Loaded full checkpoint from {checkpoint_path}")
    except RuntimeError:
# Load only matching keys
model_dict = self.fcn_model.state_dict()
pretrained_dict = {k: v for k, v in state_dict.items()
if k in model_dict and v.shape == model_dict[k].shape}
model_dict.update(pretrained_dict)
self.fcn_model.load_state_dict(model_dict, strict=False)
if verbose:
print(f"✓ Loaded {len(pretrained_dict)}/{len(state_dict)} parameters from {checkpoint_path}")
return checkpoint.get('epoch', None)
def unfreeze_all_layers(self, verbose=True):
"""Unfreeze all layers for fine-tuning."""
for param in self.fcn_model.parameters():
param.requires_grad = True
if verbose:
print("✓ Unfrozen all layers for fine-tuning")
def forward(self, audio_mfcc, video_frames, return_temporal=False):
"""Forward pass through FCN model."""
return self.fcn_model(audio_mfcc, video_frames, return_temporal)
def extract_audio_mfcc(self, video_path, temp_dir='temp'):
"""Extract audio and compute MFCC."""
os.makedirs(temp_dir, exist_ok=True)
audio_path = os.path.join(temp_dir, 'temp_audio.wav')
cmd = ['ffmpeg', '-y', '-i', video_path, '-ac', '1', '-ar', '16000',
'-vn', '-acodec', 'pcm_s16le', audio_path]
subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
sample_rate, audio = wavfile.read(audio_path)
mfcc = python_speech_features.mfcc(audio, sample_rate, numcep=13).T
mfcc_tensor = torch.FloatTensor(mfcc).unsqueeze(0).unsqueeze(0)
if os.path.exists(audio_path):
os.remove(audio_path)
return mfcc_tensor
def extract_video_frames(self, video_path, target_size=(112, 112)):
"""Extract video frames as tensor."""
cap = cv2.VideoCapture(video_path)
frames = []
while True:
ret, frame = cap.read()
if not ret:
break
frame = cv2.resize(frame, target_size)
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
frames.append(frame.astype(np.float32) / 255.0)
cap.release()
if not frames:
raise ValueError(f"No frames extracted from {video_path}")
frames_array = np.stack(frames, axis=0)
video_tensor = torch.FloatTensor(frames_array).permute(3, 0, 1, 2).unsqueeze(0)
return video_tensor
def detect_offset(self, video_path, temp_dir='temp', verbose=True):
"""
Detect AV offset using classification approach.
Args:
video_path: Path to video file
temp_dir: Temporary directory for audio extraction
verbose: Print progress information
Returns:
offset: Predicted offset in frames (positive = audio ahead)
confidence: Classification confidence (0-1)
class_probs: Full probability distribution over offset classes
"""
if verbose:
print(f"Processing: {video_path}")
# Extract features
mfcc = self.extract_audio_mfcc(video_path, temp_dir)
video = self.extract_video_frames(video_path)
if verbose:
print(f" Audio MFCC: {mfcc.shape}, Video: {video.shape}")
# Run inference
self.fcn_model.eval()
with torch.no_grad():
class_logits, _, _ = self.fcn_model(mfcc, video)
offset, confidence = self.fcn_model.predict_offset(class_logits)
class_probs = F.softmax(class_logits, dim=1)
offset = offset.item()
confidence = confidence.item()
if verbose:
print(f" Detected offset: {offset:+d} frames")
print(f" Confidence: {confidence:.4f}")
        return offset, confidence, class_probs.squeeze(0).cpu().numpy()
def process_video_file(self, video_path, temp_dir='temp', verbose=True):
"""Alias for detect_offset for compatibility."""
offset, confidence, _ = self.detect_offset(video_path, temp_dir, verbose)
return offset, confidence
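# Minimal inference sketch (assumes a trained checkpoint at the hypothetical
# path 'checkpoints/fcn_cls.pth' and ffmpeg on PATH, since detect_offset
# shells out to it):
#
#   model = StreamSyncFCN_Classification(auto_load_pretrained=False)
#   model.load_fcn_checkpoint('checkpoints/fcn_cls.pth')
#   offset, confidence, probs = model.detect_offset('clip.mp4')
#   # offset > 0 means audio is ahead of video (in frames)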
def create_classification_criterion(max_offset=125, label_smoothing=0.1):
"""
Create loss function for classification training.
Args:
        max_offset: Maximum offset value (accepted for API symmetry; the loss itself does not use it)
label_smoothing: Label smoothing factor (0 = no smoothing)
Returns:
criterion: CrossEntropyLoss with optional label smoothing
"""
return nn.CrossEntropyLoss(label_smoothing=label_smoothing)
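# Note: nn.CrossEntropyLoss's label_smoothing argument requires PyTorch 1.10+.
# Smoothing spreads a little probability mass onto the non-target classes,
# which is a reasonable fit here since neighbouring offset classes are
# "almost right".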
def train_step_classification(model, audio, video, target_offset, criterion, optimizer, device):
"""
Single training step for classification model.
Args:
model: SyncNetFCN_Classification or StreamSyncFCN_Classification
audio: [B, 1, F, T] audio MFCC
video: [B, 3, T, H, W] video frames
target_offset: [B] target offset in frames (-max_offset to +max_offset)
criterion: CrossEntropyLoss
optimizer: Optimizer
device: torch device
Returns:
loss: Training loss value
accuracy: Classification accuracy
"""
model.train()
optimizer.zero_grad()
audio = audio.to(device)
video = video.to(device)
# Convert offset to class index
if hasattr(model, 'fcn_model'):
target_class = target_offset + model.fcn_model.max_offset
else:
target_class = target_offset + model.max_offset
target_class = target_class.long().to(device)
    # Forward pass (both model variants share the same call signature)
    class_logits, _, _ = model(audio, video)
# Compute loss
loss = criterion(class_logits, target_class)
# Backward pass
loss.backward()
optimizer.step()
# Compute accuracy
predicted_class = class_logits.argmax(dim=1)
accuracy = (predicted_class == target_class).float().mean().item()
return loss.item(), accuracy
def validate_classification(model, dataloader, criterion, device, max_offset=125):
"""
Validate classification model.
Returns:
avg_loss: Average validation loss
accuracy: Classification accuracy
mean_error: Mean absolute error in frames
"""
model.eval()
total_loss = 0
correct = 0
total = 0
total_error = 0
with torch.no_grad():
for audio, video, target_offset in dataloader:
audio = audio.to(device)
video = video.to(device)
target_class = (target_offset + max_offset).long().to(device)
            class_logits, _, _ = model(audio, video)
loss = criterion(class_logits, target_class)
total_loss += loss.item() * audio.size(0)
predicted_class = class_logits.argmax(dim=1)
correct += (predicted_class == target_class).sum().item()
total += audio.size(0)
# Mean absolute error
predicted_offset = predicted_class - max_offset
target_offset_dev = target_class - max_offset
total_error += (predicted_offset - target_offset_dev).abs().sum().item()
return total_loss / total, correct / total, total_error / total
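# Epoch-loop sketch tying the two helpers together (assumed names: train_loader
# and val_loader are DataLoaders yielding (audio, video, target_offset) batches,
# matching what validate_classification iterates over):
#
#   device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#   model = SyncNetFCN_Classification(max_offset=125).to(device)
#   criterion = create_classification_criterion(max_offset=125)
#   optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
#   for epoch in range(10):
#       for audio, video, target_offset in train_loader:
#           loss, acc = train_step_classification(
#               model, audio, video, target_offset, criterion, optimizer, device)
#       val_loss, val_acc, val_mae = validate_classification(
#           model, val_loader, criterion, device, max_offset=125)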
if __name__ == "__main__":
print("Testing SyncNetFCN_Classification...")
    # Test model creation with the default max_offset (251 offset classes)
model = SyncNetFCN_Classification(embedding_dim=512, max_offset=125)
print(f"Number of classes: {model.num_classes}")
# Test forward pass
audio_input = torch.randn(2, 1, 13, 100)
video_input = torch.randn(2, 3, 25, 112, 112)
class_logits, audio_feat, video_feat = model(audio_input, video_input)
print(f"Class logits: {class_logits.shape}")
print(f"Audio features: {audio_feat.shape}")
print(f"Video features: {video_feat.shape}")
# Test prediction
offsets, confidences = model.predict_offset(class_logits)
print(f"Predicted offsets: {offsets}")
print(f"Confidences: {confidences}")
# Test with temporal output
class_logits, temporal_logits, _, _ = model(audio_input, video_input, return_temporal=True)
print(f"Temporal logits: {temporal_logits.shape}")
# Test training step
print("\nTesting training step...")
criterion = create_classification_criterion(max_offset=125, label_smoothing=0.1)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
target_offset = torch.tensor([3, -5]) # Example target offsets
loss, acc = train_step_classification(
model, audio_input, video_input, target_offset,
criterion, optimizer, 'cpu'
)
print(f"Training loss: {loss:.4f}, Accuracy: {acc:.2%}")
# Count parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"\nTotal parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print("\nTesting StreamSyncFCN_Classification...")
stream_model = StreamSyncFCN_Classification(
embedding_dim=512, max_offset=125,
pretrained_syncnet_path=None, auto_load_pretrained=False
)
class_logits, _, _ = stream_model(audio_input, video_input)
print(f"Stream model class logits: {class_logits.shape}")
print("\n✓ All tests passed!")