#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Fully Convolutional SyncNet (FCN-SyncNet)
Key improvements:
1. Fully convolutional architecture (no FC layers)
2. Temporal feature maps instead of single embeddings
3. Correlation-based audio-video fusion
4. Dense sync probability predictions over time
5. Multi-scale feature extraction
6. Attention mechanisms
Author: Enhanced version based on original SyncNet
Date: 2025-11-22
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import numpy as np
import cv2
import os
import subprocess
from scipy.io import wavfile
import python_speech_features
from collections import OrderedDict


class TemporalCorrelation(nn.Module):
    """
    Compute correlation between audio and video features across time.
    Inspired by the FlowNet correlation layer.
    """
    def __init__(self, max_displacement=10):
        super(TemporalCorrelation, self).__init__()
        self.max_displacement = max_displacement

    def forward(self, feat1, feat2):
        """
        Args:
            feat1: [B, C, T] - visual features
            feat2: [B, C, T] - audio features
        Returns:
            correlation: [B, 2*max_displacement+1, T] - correlation map
        """
        B, C, T = feat1.shape
        max_disp = self.max_displacement
        # Normalize features
        feat1 = F.normalize(feat1, dim=1)
        feat2 = F.normalize(feat2, dim=1)
        # Pad feat2 for shifting
        feat2_padded = F.pad(feat2, (max_disp, max_disp), mode='replicate')
        corr_list = []
        for offset in range(-max_disp, max_disp + 1):
            # Shift audio features
            shifted_feat2 = feat2_padded[:, :, offset+max_disp:offset+max_disp+T]
            # Compute correlation (cosine similarity)
            corr = (feat1 * shifted_feat2).sum(dim=1, keepdim=True)  # [B, 1, T]
            corr_list.append(corr)
        # Stack all correlations
        correlation = torch.cat(corr_list, dim=1)  # [B, 2*max_disp+1, T]
        return correlation
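
# Illustrative shape check for TemporalCorrelation (values below are assumed, not from a real run):
#
#   corr_layer = TemporalCorrelation(max_displacement=3)
#   out = corr_layer(torch.randn(2, 64, 40), torch.randn(2, 64, 40))
#   # out.shape == torch.Size([2, 7, 40])  -> 2*3+1 displacement channels, one per temporal shift
#
# Channel k holds the cosine similarity between feat1 at time t and feat2 at time
# t + (k - max_displacement), so the channel with the strongest response indicates the offset.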
class ChannelAttention(nn.Module):
    """Squeeze-and-Excitation style channel attention."""
    def __init__(self, channels, reduction=16):
        super(ChannelAttention, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Sequential(
            nn.Linear(channels, channels // reduction, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(channels // reduction, channels, bias=False),
            nn.Sigmoid()
        )

    def forward(self, x):
        b, c, t = x.size()
        y = self.avg_pool(x).view(b, c)
        y = self.fc(y).view(b, c, 1)
        return x * y.expand_as(x)


class TemporalAttention(nn.Module):
    """Self-attention over the temporal dimension."""
    def __init__(self, channels):
        super(TemporalAttention, self).__init__()
        self.query_conv = nn.Conv1d(channels, channels // 8, 1)
        self.key_conv = nn.Conv1d(channels, channels // 8, 1)
        self.value_conv = nn.Conv1d(channels, channels, 1)
        self.gamma = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        """
        Args:
            x: [B, C, T]
        """
        B, C, T = x.size()
        # Generate query, key, value
        query = self.query_conv(x).permute(0, 2, 1)  # [B, T, C']
        key = self.key_conv(x)  # [B, C', T]
        value = self.value_conv(x)  # [B, C, T]
        # Attention weights
        attention = torch.bmm(query, key)  # [B, T, T]
        attention = F.softmax(attention, dim=-1)
        # Apply attention
        out = torch.bmm(value, attention.permute(0, 2, 1))  # [B, C, T]
        out = self.gamma * out + x
        return out
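
# Note on TemporalAttention: the [B, T, T] attention matrix is applied as a residual scaled by
# the learned `gamma` parameter. Since `gamma` is initialised to zero, the block starts out as an
# identity mapping and only gradually mixes in attended features as training progresses.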
class FCN_AudioEncoder(nn.Module):
    """
    Fully convolutional audio encoder.
    Input: MFCC or Mel spectrogram [B, 1, F, T]
    Output: Feature map [B, C, T']
    """
    def __init__(self, output_channels=512):
        super(FCN_AudioEncoder, self).__init__()
        # Convolutional layers (downsample time by ~4x: 100 Hz MFCC -> ~25 feature frames/s)
        self.conv_layers = nn.Sequential(
            # Layer 1
            nn.Conv2d(1, 64, kernel_size=(3,3), stride=(1,1), padding=(1,1)),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            # Layer 2
            nn.Conv2d(64, 192, kernel_size=(3,3), stride=(1,1), padding=(1,1)),
            nn.BatchNorm2d(192),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(3,3), stride=(1,2)),  # Halve time, slightly reduce frequency
            # Layer 3
            nn.Conv2d(192, 384, kernel_size=(3,3), padding=(1,1)),
            nn.BatchNorm2d(384),
            nn.ReLU(inplace=True),
            # Layer 4
            nn.Conv2d(384, 256, kernel_size=(3,3), padding=(1,1)),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            # Layer 5
            nn.Conv2d(256, 256, kernel_size=(3,3), padding=(1,1)),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(3,3), stride=(2,2)),  # Halve time again, reduce frequency
            # Layer 6 - Reduce frequency dimension to 1 (for 13-bin MFCC input)
            nn.Conv2d(256, 512, kernel_size=(5,1), stride=(5,1), padding=(0,0)),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
        )
        # 1×1 conv to adjust channels (replaces FC layer)
        self.channel_conv = nn.Sequential(
            nn.Conv1d(512, 512, kernel_size=1),
            nn.BatchNorm1d(512),
            nn.ReLU(inplace=True),
            nn.Conv1d(512, output_channels, kernel_size=1),
            nn.BatchNorm1d(output_channels),
        )
        # Channel attention
        self.channel_attn = ChannelAttention(output_channels)

    def forward(self, x):
        """
        Args:
            x: [B, 1, F, T] - MFCC features
        Returns:
            features: [B, C, T'] - temporal feature map
        """
        # Convolutional encoding
        x = self.conv_layers(x)  # [B, 512, F', T']
        # Collapse frequency dimension (F' == 1 for 13-bin MFCC input)
        B, C, Freq, T = x.size()
        x = x.view(B, C * Freq, T)  # Flatten remaining frequency bins into channels
        # Reduce to output_channels
        x = self.channel_conv(x)  # [B, output_channels, T']
        # Apply attention
        x = self.channel_attn(x)
        return x
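
# Rough shape walk-through for FCN_AudioEncoder with 13-dim MFCC input (sizes are approximate and
# assume the default python_speech_features hop of 10 ms, i.e. 100 MFCC frames per second):
#
#   x: [B, 1, 13, 400]                  # ~4 s of audio
#   conv_layers -> [B, 512, 1, ~100]    # two time-stride-2 pools give ~4x temporal downsampling
#   view        -> [B, 512, ~100]
#   channel_conv + attention -> [B, output_channels, ~100]   # ~25 feature frames per second,
#                                                            # matching 25 fps video features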
class FCN_VideoEncoder(nn.Module):
    """
    Fully convolutional video encoder.
    Input: Video clip [B, 3, T, H, W]
    Output: Feature map [B, C, T']
    """
    def __init__(self, output_channels=512):
        super(FCN_VideoEncoder, self).__init__()
        # 3D convolutional layers (temporal stride is 1 throughout, so T is preserved)
        self.conv_layers = nn.Sequential(
            # Layer 1
            nn.Conv3d(3, 96, kernel_size=(5,7,7), stride=(1,2,2), padding=(2,3,3)),
            nn.BatchNorm3d(96),
            nn.ReLU(inplace=True),
            nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2), padding=(0,1,1)),
            # Layer 2
            nn.Conv3d(96, 256, kernel_size=(3,5,5), stride=(1,2,2), padding=(1,2,2)),
            nn.BatchNorm3d(256),
            nn.ReLU(inplace=True),
            nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2), padding=(0,1,1)),
            # Layer 3
            nn.Conv3d(256, 256, kernel_size=(3,3,3), padding=(1,1,1)),
            nn.BatchNorm3d(256),
            nn.ReLU(inplace=True),
            # Layer 4
            nn.Conv3d(256, 256, kernel_size=(3,3,3), padding=(1,1,1)),
            nn.BatchNorm3d(256),
            nn.ReLU(inplace=True),
            # Layer 5
            nn.Conv3d(256, 256, kernel_size=(3,3,3), padding=(1,1,1)),
            nn.BatchNorm3d(256),
            nn.ReLU(inplace=True),
            nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2), padding=(0,1,1)),
            # Layer 6 - expand channels; spatial reduction happens in the adaptive pool below
            nn.Conv3d(256, 512, kernel_size=(3,3,3), stride=(1,1,1), padding=(1,1,1)),
            nn.BatchNorm3d(512),
            nn.ReLU(inplace=True),
            # Adaptive pooling to 1x1 spatial
            nn.AdaptiveAvgPool3d((None, 1, 1))  # Keep temporal, pool spatial to 1x1
        )
        # 1×1 conv to adjust channels (replaces FC layer)
        self.channel_conv = nn.Sequential(
            nn.Conv1d(512, 512, kernel_size=1),
            nn.BatchNorm1d(512),
            nn.ReLU(inplace=True),
            nn.Conv1d(512, output_channels, kernel_size=1),
            nn.BatchNorm1d(output_channels),
        )
        # Channel attention
        self.channel_attn = ChannelAttention(output_channels)

    def forward(self, x):
        """
        Args:
            x: [B, 3, T, H, W] - video frames
        Returns:
            features: [B, C, T'] - temporal feature map
        """
        # Convolutional encoding
        x = self.conv_layers(x)  # [B, 512, T', 1, 1]
        # Remove spatial dimensions
        B, C, T, H, W = x.size()
        x = x.view(B, C, T)  # [B, 512, T']
        # Reduce to output_channels
        x = self.channel_conv(x)  # [B, output_channels, T']
        # Apply attention
        x = self.channel_attn(x)
        return x
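
# Rough shape walk-through for FCN_VideoEncoder (112x112 crops, sizes approximate):
#
#   x: [B, 3, 25, 112, 112]             # 1 s of video at 25 fps
#   conv_layers -> [B, 512, 25, 1, 1]   # spatial dims pooled away, temporal dim preserved
#   view        -> [B, 512, 25]
#   channel_conv + attention -> [B, output_channels, 25]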
class SyncNetFCN(nn.Module):
    """
    Fully Convolutional SyncNet with temporal outputs (REGRESSION VERSION).
    Architecture:
    1. Audio encoder: MFCC → temporal features
    2. Video encoder: frames → temporal features
    3. Correlation layer: compute audio-video similarity over time
    4. Offset regressor: predict a continuous offset value for each frame
    Changes from the classification version:
    - Output: [B, 1, T] continuous offset values (not a probability distribution)
    - Default max_offset: 125 frames (±5 seconds at 25 fps) for streaming
    - Loss: L1/MSE instead of CrossEntropy
    """
    def __init__(self, embedding_dim=512, max_offset=125):
        super(SyncNetFCN, self).__init__()
        self.embedding_dim = embedding_dim
        self.max_offset = max_offset
        # Encoders
        self.audio_encoder = FCN_AudioEncoder(output_channels=embedding_dim)
        self.video_encoder = FCN_VideoEncoder(output_channels=embedding_dim)
        # Temporal correlation
        self.correlation = TemporalCorrelation(max_displacement=max_offset)
        # Offset regressor (processes the correlation map) - REGRESSION OUTPUT
        self.offset_regressor = nn.Sequential(
            nn.Conv1d(2*max_offset+1, 128, kernel_size=3, padding=1),
            nn.BatchNorm1d(128),
            nn.ReLU(inplace=True),
            nn.Conv1d(128, 64, kernel_size=3, padding=1),
            nn.BatchNorm1d(64),
            nn.ReLU(inplace=True),
            nn.Conv1d(64, 1, kernel_size=1),  # Output: single continuous offset value per timestep
        )
        # Optional: temporal smoothing with dilated convolutions
        self.temporal_smoother = nn.Sequential(
            nn.Conv1d(1, 32, kernel_size=3, dilation=2, padding=2),
            nn.BatchNorm1d(32),
            nn.ReLU(inplace=True),
            nn.Conv1d(32, 1, kernel_size=1),
        )

    def forward_audio(self, audio_mfcc):
        """Extract audio features."""
        return self.audio_encoder(audio_mfcc)

    def forward_video(self, video_frames):
        """Extract video features."""
        return self.video_encoder(video_frames)

    def forward(self, audio_mfcc, video_frames):
        """
        Forward pass with audio-video offset regression.
        Args:
            audio_mfcc: [B, 1, F, T] - MFCC features
            video_frames: [B, 3, T', H, W] - video frames
        Returns:
            predicted_offsets: [B, 1, T''] - predicted offset in frames for each timestep
            audio_features: [B, C, T_a] - audio embeddings
            video_features: [B, C, T_v] - video embeddings
        """
        # Extract features
        if audio_mfcc.dim() == 3:
            audio_mfcc = audio_mfcc.unsqueeze(1)  # [B, 1, F, T]
        audio_features = self.audio_encoder(audio_mfcc)  # [B, C, T_a]
        video_features = self.video_encoder(video_frames)  # [B, C, T_v]
        # Align temporal dimensions (if needed)
        min_time = min(audio_features.size(2), video_features.size(2))
        audio_features = audio_features[:, :, :min_time]
        video_features = video_features[:, :, :min_time]
        # Compute correlation
        correlation = self.correlation(video_features, audio_features)  # [B, 2*K+1, T]
        # Predict offset (regression)
        raw_offsets = self.offset_regressor(correlation)  # [B, 1, T]
        predicted_offsets = self.temporal_smoother(raw_offsets)  # Temporal smoothing
        # Clamp to valid range
        predicted_offsets = torch.clamp(predicted_offsets, -self.max_offset, self.max_offset)
        return predicted_offsets, audio_features, video_features

    def compute_offset(self, predicted_offsets):
        """
        Extract offset and confidence from regression predictions.
        Args:
            predicted_offsets: [B, 1, T] - predicted offsets
        Returns:
            offsets: [B, T] - predicted offset for each frame
            confidences: [B, T] - confidence scores (inverse of variance)
        """
        # Remove channel dimension
        offsets = predicted_offsets.squeeze(1)  # [B, T]
        # Confidence = inverse of temporal variance (stable predictions = high confidence)
        temporal_variance = torch.var(offsets, dim=1, keepdim=True) + 1e-6  # [B, 1]
        confidences = 1.0 / temporal_variance  # [B, 1]
        confidences = confidences.expand_as(offsets)  # [B, T]
        # Squash to [0, 1]; the shift keeps moderate inverse variances away from saturation
        confidences = torch.sigmoid(confidences - 5.0)
        return offsets, confidences
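
# Sketch of the L1/MSE regression objective mentioned in the SyncNetFCN docstring. This is an
# assumed training step, not part of this file: `model`, `mfcc`, `frames`, and the per-clip
# ground-truth `true_offset` tensor are hypothetical.
#
#   predicted_offsets, _, _ = model(mfcc, frames)                     # [B, 1, T]
#   target = true_offset.view(-1, 1, 1).expand_as(predicted_offsets)  # constant offset per clip
#   loss = F.l1_loss(predicted_offsets, target)                       # or F.mse_loss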
class SyncNetFCN_WithAttention(SyncNetFCN):
    """
    Enhanced version with cross-modal attention.
    Audio and video features attend to each other before correlation.
    """
    def __init__(self, embedding_dim=512, max_offset=15):
        super(SyncNetFCN_WithAttention, self).__init__(embedding_dim, max_offset)
        # Cross-modal attention
        self.audio_to_video_attn = nn.MultiheadAttention(
            embed_dim=embedding_dim,
            num_heads=8,
            batch_first=False
        )
        self.video_to_audio_attn = nn.MultiheadAttention(
            embed_dim=embedding_dim,
            num_heads=8,
            batch_first=False
        )
        # Self-attention for temporal modeling
        self.audio_self_attn = TemporalAttention(embedding_dim)
        self.video_self_attn = TemporalAttention(embedding_dim)

    def forward(self, audio_mfcc, video_frames):
        """
        Forward pass with attention mechanisms.
        """
        # Extract features
        if audio_mfcc.dim() == 3:
            audio_mfcc = audio_mfcc.unsqueeze(1)  # [B, 1, F, T]
        audio_features = self.audio_encoder(audio_mfcc)  # [B, C, T_a]
        video_features = self.video_encoder(video_frames)  # [B, C, T_v]
        # Self-attention
        audio_features = self.audio_self_attn(audio_features)
        video_features = self.video_self_attn(video_features)
        # Align temporal dimensions
        min_time = min(audio_features.size(2), video_features.size(2))
        audio_features = audio_features[:, :, :min_time]
        video_features = video_features[:, :, :min_time]
        # Cross-modal attention
        # Reshape for attention: [T, B, C] (batch_first=False)
        audio_t = audio_features.permute(2, 0, 1)
        video_t = video_features.permute(2, 0, 1)
        # Audio attends to video
        audio_attended, _ = self.audio_to_video_attn(
            query=audio_t, key=video_t, value=video_t
        )
        audio_features = audio_features + audio_attended.permute(1, 2, 0)
        # Video attends to audio
        video_attended, _ = self.video_to_audio_attn(
            query=video_t, key=audio_t, value=audio_t
        )
        video_features = video_features + video_attended.permute(1, 2, 0)
        # Compute correlation
        correlation = self.correlation(video_features, audio_features)
        # Predict offset (regression)
        raw_offsets = self.offset_regressor(correlation)
        predicted_offsets = self.temporal_smoother(raw_offsets)
        # Clamp to valid range
        predicted_offsets = torch.clamp(predicted_offsets, -self.max_offset, self.max_offset)
        return predicted_offsets, audio_features, video_features
class StreamSyncFCN(nn.Module):
    """
    StreamSync-style FCN with built-in preprocessing and transfer learning.
    Features:
    1. Sliding window processing for streams
    2. HLS stream support (.m3u8)
    3. Raw video file processing (MP4, AVI, etc.)
    4. Automatic transfer learning from SyncNetModel.py
    5. Temporal buffering and smoothing
    """
    def __init__(self, embedding_dim=512, max_offset=15,
                 window_size=25, stride=5, buffer_size=100,
                 use_attention=False, pretrained_syncnet_path=None,
                 auto_load_pretrained=True):
        """
        Args:
            embedding_dim: Feature dimension
            max_offset: Maximum temporal offset (frames)
            window_size: Frames per processing window
            stride: Window stride
            buffer_size: Temporal buffer size
            use_attention: Use the attention model
            pretrained_syncnet_path: Path to original SyncNet weights
            auto_load_pretrained: Auto-load pretrained weights if a path is provided
        """
        super(StreamSyncFCN, self).__init__()
        self.window_size = window_size
        self.stride = stride
        self.buffer_size = buffer_size
        self.max_offset = max_offset
        # Initialize FCN model
        if use_attention:
            self.fcn_model = SyncNetFCN_WithAttention(embedding_dim, max_offset)
        else:
            self.fcn_model = SyncNetFCN(embedding_dim, max_offset)
        # Auto-load pretrained weights
        if auto_load_pretrained and pretrained_syncnet_path:
            self.load_pretrained_syncnet(pretrained_syncnet_path)
        self.reset_buffers()

    def reset_buffers(self):
        """Reset temporal buffers."""
        self.offset_buffer = []
        self.confidence_buffer = []
        self.frame_count = 0

    def load_pretrained_syncnet(self, syncnet_model_path, freeze_conv=True, verbose=True):
        """
        Load conv layers from the original SyncNet (SyncNetModel.py).
        Maps: netcnnaud.* → audio_encoder.conv_layers.*
              netcnnlip.* → video_encoder.conv_layers.*
        """
        if verbose:
            print(f"Loading pretrained SyncNet from: {syncnet_model_path}")
        try:
            pretrained = torch.load(syncnet_model_path, map_location='cpu')
            if isinstance(pretrained, dict):
                pretrained_dict = pretrained.get('model_state_dict', pretrained.get('state_dict', pretrained))
            else:
                pretrained_dict = pretrained.state_dict()
            fcn_dict = self.fcn_model.state_dict()
            loaded_count = 0
            # Map audio conv layers
            for key in list(pretrained_dict.keys()):
                if key.startswith('netcnnaud.'):
                    idx = key.split('.')[1]
                    param = '.'.join(key.split('.')[2:])
                    new_key = f'audio_encoder.conv_layers.{idx}.{param}'
                    if new_key in fcn_dict and pretrained_dict[key].shape == fcn_dict[new_key].shape:
                        fcn_dict[new_key] = pretrained_dict[key]
                        loaded_count += 1
                # Map video conv layers
                elif key.startswith('netcnnlip.'):
                    idx = key.split('.')[1]
                    param = '.'.join(key.split('.')[2:])
                    new_key = f'video_encoder.conv_layers.{idx}.{param}'
                    if new_key in fcn_dict and pretrained_dict[key].shape == fcn_dict[new_key].shape:
                        fcn_dict[new_key] = pretrained_dict[key]
                        loaded_count += 1
            self.fcn_model.load_state_dict(fcn_dict, strict=False)
            if verbose:
                print(f"✓ Loaded {loaded_count} pretrained conv parameters")
            if freeze_conv:
                for name, param in self.fcn_model.named_parameters():
                    if 'conv_layers' in name:
                        param.requires_grad = False
                if verbose:
                    print("✓ Froze pretrained conv layers")
        except Exception as e:
            if verbose:
                print(f"⚠ Could not load pretrained weights: {e}")

    def unfreeze_all_layers(self, verbose=True):
        """Unfreeze all layers for fine-tuning."""
        for param in self.fcn_model.parameters():
            param.requires_grad = True
        if verbose:
            print("✓ Unfroze all layers for fine-tuning")
    def forward(self, audio_mfcc, video_frames):
        """Forward pass through the FCN model."""
        return self.fcn_model(audio_mfcc, video_frames)

    def process_window(self, audio_window, video_window):
        """Process a single window."""
        with torch.no_grad():
            predicted_offsets, _, _ = self.fcn_model(audio_window, video_window)
            offsets, confidences = self.fcn_model.compute_offset(predicted_offsets)
        return offsets[0].mean().item(), confidences[0].mean().item()

    def process_stream(self, audio_stream, video_stream, return_trace=False):
        """Process a full stream with sliding windows."""
        self.reset_buffers()
        video_frames = video_stream.shape[2]
        audio_frames = audio_stream.shape[3] // 4  # 100 Hz MFCC -> 4 audio frames per 25 fps video frame
        min_frames = min(video_frames, audio_frames)
        num_windows = max(1, (min_frames - self.window_size) // self.stride + 1)
        trace = {'offsets': [], 'confidences': [], 'timestamps': []}
        for win_idx in range(num_windows):
            start = win_idx * self.stride
            end = min(start + self.window_size, min_frames)
            video_win = video_stream[:, :, start:end, :, :]
            audio_win = audio_stream[:, :, :, start*4:end*4]
            offset, confidence = self.process_window(audio_win, video_win)
            self.offset_buffer.append(offset)
            self.confidence_buffer.append(confidence)
            if return_trace:
                trace['offsets'].append(offset)
                trace['confidences'].append(confidence)
                trace['timestamps'].append(start)
            if len(self.offset_buffer) > self.buffer_size:
                self.offset_buffer.pop(0)
                self.confidence_buffer.pop(0)
            self.frame_count = end
        final_offset, final_conf = self.get_smoothed_prediction()
        return (final_offset, final_conf, trace) if return_trace else (final_offset, final_conf)

    def get_smoothed_prediction(self, method='confidence_weighted'):
        """Compute the smoothed offset from the buffer."""
        if not self.offset_buffer:
            return 0.0, 0.0
        offsets = torch.tensor(self.offset_buffer)
        confs = torch.tensor(self.confidence_buffer)
        if method == 'confidence_weighted':
            weights = confs / (confs.sum() + 1e-8)
            offset = (offsets * weights).sum().item()
        elif method == 'median':
            offset = torch.median(offsets).item()
        else:
            offset = torch.mean(offsets).item()
        return offset, torch.mean(confs).item()
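
    # The default 'confidence_weighted' smoothing above computes (up to the 1e-8 stabiliser)
    #     offset = sum_i(conf_i * offset_i) / sum_i(conf_i),
    # so windows whose per-frame predictions were temporally stable (high confidence from
    # compute_offset) dominate the final estimate; 'median' is the more outlier-robust choice.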
    def extract_audio_mfcc(self, video_path, temp_dir='temp'):
        """Extract audio and compute MFCC."""
        os.makedirs(temp_dir, exist_ok=True)
        audio_path = os.path.join(temp_dir, 'temp_audio.wav')
        cmd = ['ffmpeg', '-y', '-i', video_path, '-ac', '1', '-ar', '16000',
               '-vn', '-acodec', 'pcm_s16le', audio_path]
        subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
        sample_rate, audio = wavfile.read(audio_path)
        mfcc = python_speech_features.mfcc(audio, sample_rate).T
        mfcc_tensor = torch.FloatTensor(mfcc).unsqueeze(0).unsqueeze(0)
        if os.path.exists(audio_path):
            os.remove(audio_path)
        return mfcc_tensor

    def extract_video_frames(self, video_path, target_size=(112, 112)):
        """Extract video frames as a tensor."""
        cap = cv2.VideoCapture(video_path)
        frames = []
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.resize(frame, target_size)
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(frame.astype(np.float32) / 255.0)
        cap.release()
        if not frames:
            raise ValueError(f"No frames extracted from {video_path}")
        frames_array = np.stack(frames, axis=0)
        video_tensor = torch.FloatTensor(frames_array).permute(3, 0, 1, 2).unsqueeze(0)
        return video_tensor

    def process_video_file(self, video_path, return_trace=False, temp_dir='temp',
                           target_size=(112, 112), verbose=True):
        """
        Process a raw video file (MP4, AVI, MOV, etc.).
        Args:
            video_path: Path to video file
            return_trace: Return per-window predictions
            temp_dir: Temporary directory
            target_size: Video frame size
            verbose: Print progress
        Returns:
            offset: Detected offset (frames)
            confidence: Detection confidence
            trace: (optional) Per-window data
        Example:
            >>> model = StreamSyncFCN(pretrained_syncnet_path='data/syncnet_v2.model')
            >>> offset, conf = model.process_video_file('video.mp4')
        """
        if verbose:
            print(f"Processing: {video_path}")
        mfcc = self.extract_audio_mfcc(video_path, temp_dir)
        video = self.extract_video_frames(video_path, target_size)
        if verbose:
            print(f" Audio: {mfcc.shape}, Video: {video.shape}")
        result = self.process_stream(mfcc, video, return_trace)
        if verbose:
            offset, conf = result[:2]
            print(f" Offset: {offset:.2f} frames, Confidence: {conf:.3f}")
        return result

    def detect_offset_correlation(self, video_path, calibration_offset=3, calibration_scale=-0.5,
                                  calibration_baseline=-15, temp_dir='temp', verbose=True):
        """
        Detect AV offset using a correlation-based method with calibration.
        This method uses the trained audio-video encoders to compute temporal
        correlation and find the best matching offset. A linear calibration
        is applied to correct for systematic bias in the model.
        Calibration formula: calibrated = calibration_offset + calibration_scale * (raw - calibration_baseline)
        Default values were determined empirically from test videos.
        Args:
            video_path: Path to video file
            calibration_offset: Baseline expected offset (default: 3)
            calibration_scale: Scale factor for raw offset (default: -0.5)
            calibration_baseline: Baseline raw offset (default: -15)
            temp_dir: Temporary directory for audio extraction
            verbose: Print progress information
        Returns:
            offset: Calibrated offset in frames (positive = audio ahead)
            confidence: Detection confidence (correlation strength)
            raw_offset: Uncalibrated raw offset from the correlation
        Example:
            >>> model = StreamSyncFCN(pretrained_syncnet_path='data/syncnet_v2.model')
            >>> offset, conf, raw = model.detect_offset_correlation('video.mp4')
            >>> print(f"Detected offset: {offset} frames")
        """
        if verbose:
            print(f"Processing: {video_path}")
        # Extract audio MFCC
        os.makedirs(temp_dir, exist_ok=True)
        audio_path = os.path.join(temp_dir, 'temp_audio.wav')
        cmd = ['ffmpeg', '-y', '-i', video_path, '-ac', '1', '-ar', '16000',
               '-vn', '-acodec', 'pcm_s16le', audio_path]
        subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
        sample_rate, audio = wavfile.read(audio_path)
        mfcc = python_speech_features.mfcc(audio, sample_rate, numcep=13)
        audio_tensor = torch.FloatTensor(mfcc.T).unsqueeze(0).unsqueeze(0)
        if os.path.exists(audio_path):
            os.remove(audio_path)
        # Extract video frames
        cap = cv2.VideoCapture(video_path)
        frames = []
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.resize(frame, (112, 112))
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(frame.astype(np.float32) / 255.0)
        cap.release()
        if not frames:
            raise ValueError(f"No frames extracted from {video_path}")
        video_tensor = torch.FloatTensor(np.stack(frames)).permute(3, 0, 1, 2).unsqueeze(0)
        if verbose:
            print(f" Audio MFCC: {audio_tensor.shape}, Video: {video_tensor.shape}")
        # Compute correlation-based offset
        with torch.no_grad():
            # Get features from the encoders
            audio_feat = self.fcn_model.audio_encoder(audio_tensor)
            video_feat = self.fcn_model.video_encoder(video_tensor)
            # Align temporal dimensions
            min_t = min(audio_feat.shape[2], video_feat.shape[2])
            audio_feat = audio_feat[:, :, :min_t]
            video_feat = video_feat[:, :, :min_t]
            # Compute correlation map
            correlation = self.fcn_model.correlation(video_feat, audio_feat)
            # Average over the time dimension
            corr_avg = correlation.mean(dim=2).squeeze(0)
            # Find the best offset (argmax of correlation)
            best_idx = corr_avg.argmax().item()
            raw_offset = best_idx - self.max_offset
            # Compute confidence as peak prominence
            corr_np = corr_avg.numpy()
            peak_val = corr_np[best_idx]
            median_val = np.median(corr_np)
            confidence = peak_val - median_val
        # Apply linear calibration: calibrated = offset + scale * (raw - baseline)
        calibrated_offset = int(round(calibration_offset + calibration_scale * (raw_offset - calibration_baseline)))
        if verbose:
            print(f" Raw offset: {raw_offset}, Calibrated: {calibrated_offset}")
            print(f" Confidence: {confidence:.4f}")
        return calibrated_offset, confidence, raw_offset
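
    # Worked example of the calibration above with the default parameters
    # (offset=3, scale=-0.5, baseline=-15):
    #   raw = -15  ->  3 + (-0.5) * (-15 - (-15)) = 3
    #   raw = -21  ->  3 + (-0.5) * (-21 - (-15)) = 6
    #   raw =  -9  ->  3 + (-0.5) * ( -9 - (-15)) = 0
    # The defaults are the empirically determined values from the docstring; recalibrate them
    # for your own data if the systematic bias differs.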
    def process_hls_stream(self, hls_url, segment_duration=10, return_trace=False,
                           temp_dir='temp_hls', verbose=True):
        """
        Process an HLS stream (.m3u8 playlist).
        Args:
            hls_url: URL to the .m3u8 playlist
            segment_duration: Seconds to capture
            return_trace: Return per-window predictions
            temp_dir: Temporary directory
            verbose: Print progress
        Returns:
            offset: Detected offset
            confidence: Detection confidence
            trace: (optional) Per-window data
        Example:
            >>> model = StreamSyncFCN(pretrained_syncnet_path='data/syncnet_v2.model')
            >>> offset, conf = model.process_hls_stream('http://example.com/stream.m3u8')
        """
        if verbose:
            print(f"Processing HLS: {hls_url}")
        os.makedirs(temp_dir, exist_ok=True)
        temp_video = os.path.join(temp_dir, 'hls_segment.mp4')
        try:
            cmd = ['ffmpeg', '-y', '-i', hls_url, '-t', str(segment_duration),
                   '-c', 'copy', temp_video]
            subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
                           check=True, timeout=segment_duration + 30)
            result = self.process_video_file(temp_video, return_trace, temp_dir, verbose=verbose)
            return result
        except Exception as e:
            raise RuntimeError(f"HLS processing failed: {e}")
        finally:
            if os.path.exists(temp_video):
                os.remove(temp_video)


# Utility functions
def save_model(model, filename):
    """Save model to file."""
    with open(filename, "wb") as f:
        torch.save(model.state_dict(), f)
    print(f"{filename} saved.")


def load_model(model, filename):
    """Load model from file."""
    state_dict = torch.load(filename, map_location='cpu')
    model.load_state_dict(state_dict)
    print(f"{filename} loaded.")
    return model


if __name__ == "__main__":
    # Test the models
    print("Testing FCN_AudioEncoder...")
    audio_encoder = FCN_AudioEncoder(output_channels=512)
    audio_input = torch.randn(2, 1, 13, 100)  # [B, 1, MFCC_dim, Time]
    audio_out = audio_encoder(audio_input)
    print(f"Audio input: {audio_input.shape} → Audio output: {audio_out.shape}")

    print("\nTesting FCN_VideoEncoder...")
    video_encoder = FCN_VideoEncoder(output_channels=512)
    video_input = torch.randn(2, 3, 25, 112, 112)  # [B, 3, T, H, W]
    video_out = video_encoder(video_input)
    print(f"Video input: {video_input.shape} → Video output: {video_out.shape}")

    print("\nTesting SyncNetFCN...")
    model = SyncNetFCN(embedding_dim=512, max_offset=15)
    predicted_offsets, audio_feat, video_feat = model(audio_input, video_input)
    print(f"Predicted offsets: {predicted_offsets.shape}")
    print(f"Audio features: {audio_feat.shape}")
    print(f"Video features: {video_feat.shape}")
    offsets, confidences = model.compute_offset(predicted_offsets)
    print(f"Offsets: {offsets.shape}")
    print(f"Confidences: {confidences.shape}")

    print("\nTesting SyncNetFCN_WithAttention...")
    model_attn = SyncNetFCN_WithAttention(embedding_dim=512, max_offset=15)
    predicted_offsets, audio_feat, video_feat = model_attn(audio_input, video_input)
    print(f"Predicted offsets (with attention): {predicted_offsets.shape}")

    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    total_params_attn = sum(p.numel() for p in model_attn.parameters())
    print(f"\nTotal parameters (FCN): {total_params:,}")
    print(f"Total parameters (FCN+Attention): {total_params_attn:,}")