#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Fully Convolutional SyncNet (FCN-SyncNet)

Key improvements:
1. Fully convolutional architecture (no FC layers)
2. Temporal feature maps instead of single embeddings
3. Correlation-based audio-video fusion
4. Dense sync probability predictions over time
5. Multi-scale feature extraction
6. Attention mechanisms

Author: Enhanced version based on original SyncNet
Date: 2025-11-22
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import numpy as np
import cv2
import os
import subprocess
from scipy.io import wavfile
import python_speech_features
from collections import OrderedDict


class TemporalCorrelation(nn.Module):
    """
    Compute correlation between audio and video features across time.
    Inspired by the FlowNet correlation layer.
    """

    def __init__(self, max_displacement=10):
        super(TemporalCorrelation, self).__init__()
        self.max_displacement = max_displacement

    def forward(self, feat1, feat2):
        """
        Args:
            feat1: [B, C, T] - visual features
            feat2: [B, C, T] - audio features
        Returns:
            correlation: [B, 2*max_displacement+1, T] - correlation map
        """
        B, C, T = feat1.shape
        max_disp = self.max_displacement

        # Normalize features
        feat1 = F.normalize(feat1, dim=1)
        feat2 = F.normalize(feat2, dim=1)

        # Pad feat2 for shifting
        feat2_padded = F.pad(feat2, (max_disp, max_disp), mode='replicate')

        corr_list = []
        for offset in range(-max_disp, max_disp + 1):
            # Shift audio features
            shifted_feat2 = feat2_padded[:, :, offset + max_disp:offset + max_disp + T]
            # Compute correlation (cosine similarity)
            corr = (feat1 * shifted_feat2).sum(dim=1, keepdim=True)  # [B, 1, T]
            corr_list.append(corr)

        # Stack all correlations
        correlation = torch.cat(corr_list, dim=1)  # [B, 2*max_disp+1, T]
        return correlation


class ChannelAttention(nn.Module):
    """Squeeze-and-Excitation style channel attention."""

    def __init__(self, channels, reduction=16):
        super(ChannelAttention, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Sequential(
            nn.Linear(channels, channels // reduction, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(channels // reduction, channels, bias=False),
            nn.Sigmoid()
        )

    def forward(self, x):
        b, c, t = x.size()
        y = self.avg_pool(x).view(b, c)
        y = self.fc(y).view(b, c, 1)
        return x * y.expand_as(x)


class TemporalAttention(nn.Module):
    """Self-attention over the temporal dimension."""

    def __init__(self, channels):
        super(TemporalAttention, self).__init__()
        self.query_conv = nn.Conv1d(channels, channels // 8, 1)
        self.key_conv = nn.Conv1d(channels, channels // 8, 1)
        self.value_conv = nn.Conv1d(channels, channels, 1)
        self.gamma = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        """
        Args:
            x: [B, C, T]
        """
        B, C, T = x.size()

        # Generate query, key, value
        query = self.query_conv(x).permute(0, 2, 1)  # [B, T, C']
        key = self.key_conv(x)                       # [B, C', T]
        value = self.value_conv(x)                   # [B, C, T]

        # Attention weights
        attention = torch.bmm(query, key)  # [B, T, T]
        attention = F.softmax(attention, dim=-1)

        # Apply attention
        out = torch.bmm(value, attention.permute(0, 2, 1))  # [B, C, T]
        out = self.gamma * out + x
        return out
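
# Minimal sanity-check sketch for TemporalCorrelation (not part of the original
# test suite; shapes below are illustrative assumptions). Because both inputs
# are L2-normalized, every output entry is a cosine similarity in [-1, 1], and
# correlating a signal with itself peaks at the zero-offset channel (index K).
def _demo_temporal_correlation():
    corr_layer = TemporalCorrelation(max_displacement=3)
    feat = torch.randn(2, 64, 50)                   # [B, C, T]
    corr = corr_layer(feat, feat)                   # correlate a signal with itself
    assert corr.shape == (2, 7, 50)                 # 2*max_disp + 1 = 7 channels
    assert torch.allclose(corr[:, 3], torch.ones(2, 50), atol=1e-5)  # peak at offset 0
    return corr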

class FCN_AudioEncoder(nn.Module):
    """
    Fully convolutional audio encoder.

    Input:  MFCC or Mel spectrogram [B, 1, F, T]
    Output: Feature map [B, C, T']
    """

    def __init__(self, output_channels=512):
        super(FCN_AudioEncoder, self).__init__()

        # Convolutional layers (preserve temporal dimension)
        self.conv_layers = nn.Sequential(
            # Layer 1
            nn.Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),

            # Layer 2
            nn.Conv2d(64, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.BatchNorm2d(192),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(3, 3), stride=(1, 2)),  # Shrink frequency slightly, halve time

            # Layer 3
            nn.Conv2d(192, 384, kernel_size=(3, 3), padding=(1, 1)),
            nn.BatchNorm2d(384),
            nn.ReLU(inplace=True),

            # Layer 4
            nn.Conv2d(384, 256, kernel_size=(3, 3), padding=(1, 1)),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),

            # Layer 5
            nn.Conv2d(256, 256, kernel_size=(3, 3), padding=(1, 1)),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(3, 3), stride=(2, 2)),

            # Layer 6 - Reduce frequency dimension to 1
            nn.Conv2d(256, 512, kernel_size=(5, 1), stride=(5, 1), padding=(0, 0)),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
        )

        # 1×1 conv to adjust channels (replaces FC layer)
        self.channel_conv = nn.Sequential(
            nn.Conv1d(512, 512, kernel_size=1),
            nn.BatchNorm1d(512),
            nn.ReLU(inplace=True),
            nn.Conv1d(512, output_channels, kernel_size=1),
            nn.BatchNorm1d(output_channels),
        )

        # Channel attention
        self.channel_attn = ChannelAttention(output_channels)

    def forward(self, x):
        """
        Args:
            x: [B, 1, F, T] - MFCC features
        Returns:
            features: [B, C, T'] - temporal feature map
        """
        # Convolutional encoding
        x = self.conv_layers(x)  # [B, 512, F', T']

        # Collapse frequency dimension ('Fq' avoids shadowing torch.nn.functional's F)
        B, C, Fq, T = x.size()
        x = x.view(B, C * Fq, T)  # Flatten frequency into channels (F' is 1 for 13-dim MFCC input)

        # Reduce to output_channels
        x = self.channel_conv(x)  # [B, output_channels, T']

        # Apply attention
        x = self.channel_attn(x)
        return x
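
# Shape walkthrough for the audio encoder (illustrative; assumes 13-dim MFCC at
# 100 Hz). The two pooling stages each halve time, so ~1 s of audio (T=100)
# yields T'=24 feature frames, and Layer 6 collapses frequency to 1.
def _demo_audio_encoder_shapes():
    enc = FCN_AudioEncoder(output_channels=512)
    mfcc = torch.randn(2, 1, 13, 100)   # [B, 1, F=13, T=100]
    feats = enc(mfcc)
    assert feats.shape == (2, 512, 24)  # frequency collapsed, time downsampled ~4x
    return feats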

class FCN_VideoEncoder(nn.Module):
    """
    Fully convolutional video encoder.

    Input:  Video clip [B, 3, T, H, W]
    Output: Feature map [B, C, T']
    """

    def __init__(self, output_channels=512):
        super(FCN_VideoEncoder, self).__init__()

        # 3D convolutional layers (stride 1 in time, so T is preserved)
        self.conv_layers = nn.Sequential(
            # Layer 1
            nn.Conv3d(3, 96, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3)),
            nn.BatchNorm3d(96),
            nn.ReLU(inplace=True),
            nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1)),

            # Layer 2
            nn.Conv3d(96, 256, kernel_size=(3, 5, 5), stride=(1, 2, 2), padding=(1, 2, 2)),
            nn.BatchNorm3d(256),
            nn.ReLU(inplace=True),
            nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1)),

            # Layer 3
            nn.Conv3d(256, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1)),
            nn.BatchNorm3d(256),
            nn.ReLU(inplace=True),

            # Layer 4
            nn.Conv3d(256, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1)),
            nn.BatchNorm3d(256),
            nn.ReLU(inplace=True),

            # Layer 5
            nn.Conv3d(256, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1)),
            nn.BatchNorm3d(256),
            nn.ReLU(inplace=True),
            nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1)),

            # Layer 6 - Expand channels before spatial pooling
            nn.Conv3d(256, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1)),
            nn.BatchNorm3d(512),
            nn.ReLU(inplace=True),

            # Keep temporal dimension, pool spatial dimensions to 1x1
            nn.AdaptiveAvgPool3d((None, 1, 1))
        )

        # 1×1 conv to adjust channels (replaces FC layer)
        self.channel_conv = nn.Sequential(
            nn.Conv1d(512, 512, kernel_size=1),
            nn.BatchNorm1d(512),
            nn.ReLU(inplace=True),
            nn.Conv1d(512, output_channels, kernel_size=1),
            nn.BatchNorm1d(output_channels),
        )

        # Channel attention
        self.channel_attn = ChannelAttention(output_channels)

    def forward(self, x):
        """
        Args:
            x: [B, 3, T, H, W] - video frames
        Returns:
            features: [B, C, T'] - temporal feature map
        """
        # Convolutional encoding
        x = self.conv_layers(x)  # [B, 512, T', 1, 1]

        # Remove spatial dimensions
        B, C, T, H, W = x.size()
        x = x.view(B, C, T)  # [B, 512, T']

        # Reduce to output_channels
        x = self.channel_conv(x)  # [B, output_channels, T']

        # Apply attention
        x = self.channel_attn(x)
        return x
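
# Shape walkthrough for the video encoder (illustrative). All temporal strides
# are 1, so T is preserved; spatial dims are driven to 1x1 by the adaptive pool.
def _demo_video_encoder_shapes():
    enc = FCN_VideoEncoder(output_channels=512)
    clip = torch.randn(2, 3, 25, 112, 112)  # [B, 3, T=25, H, W] ~ 1 s at 25 fps
    feats = enc(clip)
    assert feats.shape == (2, 512, 25)      # one feature vector per input frame
    return feats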

class SyncNetFCN(nn.Module):
    """
    Fully Convolutional SyncNet with temporal outputs (REGRESSION VERSION).

    Architecture:
    1. Audio encoder: MFCC → temporal features
    2. Video encoder: frames → temporal features
    3. Correlation layer: compute audio-video similarity over time
    4. Offset regressor: predict a continuous offset value for each frame

    Changes from the classification version:
    - Output: [B, 1, T] continuous offset values (not a probability distribution)
    - Default max_offset: 125 frames (±5 seconds at 25 fps) for streaming
    - Loss: L1/MSE instead of CrossEntropy
    """

    def __init__(self, embedding_dim=512, max_offset=125):
        super(SyncNetFCN, self).__init__()

        self.embedding_dim = embedding_dim
        self.max_offset = max_offset

        # Encoders
        self.audio_encoder = FCN_AudioEncoder(output_channels=embedding_dim)
        self.video_encoder = FCN_VideoEncoder(output_channels=embedding_dim)

        # Temporal correlation
        self.correlation = TemporalCorrelation(max_displacement=max_offset)

        # Offset regressor (processes correlation map) - REGRESSION OUTPUT
        self.offset_regressor = nn.Sequential(
            nn.Conv1d(2 * max_offset + 1, 128, kernel_size=3, padding=1),
            nn.BatchNorm1d(128),
            nn.ReLU(inplace=True),
            nn.Conv1d(128, 64, kernel_size=3, padding=1),
            nn.BatchNorm1d(64),
            nn.ReLU(inplace=True),
            nn.Conv1d(64, 1, kernel_size=1),  # Output: single continuous offset value
        )

        # Optional: temporal smoothing with dilated convolutions
        self.temporal_smoother = nn.Sequential(
            nn.Conv1d(1, 32, kernel_size=3, dilation=2, padding=2),
            nn.BatchNorm1d(32),
            nn.ReLU(inplace=True),
            nn.Conv1d(32, 1, kernel_size=1),
        )

    def forward_audio(self, audio_mfcc):
        """Extract audio features."""
        return self.audio_encoder(audio_mfcc)

    def forward_video(self, video_frames):
        """Extract video features."""
        return self.video_encoder(video_frames)

    def forward(self, audio_mfcc, video_frames):
        """
        Forward pass with audio-video offset regression.

        Args:
            audio_mfcc: [B, 1, F, T] - MFCC features
            video_frames: [B, 3, T', H, W] - video frames
        Returns:
            predicted_offsets: [B, 1, T''] - predicted offset in frames per timestep
            audio_features: [B, C, T_a] - audio embeddings
            video_features: [B, C, T_v] - video embeddings
        """
        # Extract features
        if audio_mfcc.dim() == 3:
            audio_mfcc = audio_mfcc.unsqueeze(1)  # [B, 1, F, T]

        audio_features = self.audio_encoder(audio_mfcc)    # [B, C, T_a]
        video_features = self.video_encoder(video_frames)  # [B, C, T_v]

        # Align temporal dimensions (if needed)
        min_time = min(audio_features.size(2), video_features.size(2))
        audio_features = audio_features[:, :, :min_time]
        video_features = video_features[:, :, :min_time]

        # Compute correlation
        correlation = self.correlation(video_features, audio_features)  # [B, 2*K+1, T]

        # Predict offset (regression), then smooth temporally
        offset_pred = self.offset_regressor(correlation)  # [B, 1, T]
        predicted_offsets = self.temporal_smoother(offset_pred)

        # Clamp to the valid range
        predicted_offsets = torch.clamp(predicted_offsets, -self.max_offset, self.max_offset)

        return predicted_offsets, audio_features, video_features

    def compute_offset(self, predicted_offsets):
        """
        Extract offset and confidence from regression predictions.

        Args:
            predicted_offsets: [B, 1, T] - predicted offsets
        Returns:
            offsets: [B, T] - predicted offset for each frame
            confidences: [B, T] - confidence scores (inverse of variance)
        """
        # Remove channel dimension
        offsets = predicted_offsets.squeeze(1)  # [B, T]

        # Confidence = inverse of temporal variance (stable predictions = high confidence)
        temporal_variance = torch.var(offsets, dim=1, keepdim=True) + 1e-6  # [B, 1]
        confidences = 1.0 / temporal_variance  # [B, 1]
        confidences = confidences.expand_as(offsets)  # [B, T]

        # Squash confidence into (0, 1); the -5.0 shift is a heuristic centering
        confidences = torch.sigmoid(confidences - 5.0)

        return offsets, confidences
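
# End-to-end sketch of the regression head (illustrative; uses a small
# max_offset so the correlation map stays cheap). The output is one continuous
# offset per aligned timestep, clamped to [-max_offset, max_offset].
def _demo_syncnet_fcn():
    model = SyncNetFCN(embedding_dim=512, max_offset=15)
    mfcc = torch.randn(2, 1, 13, 100)            # ~1 s of audio features
    clip = torch.randn(2, 3, 25, 112, 112)       # ~1 s of video at 25 fps
    pred, a_feat, v_feat = model(mfcc, clip)     # pred: [2, 1, 24] after alignment
    offsets, confs = model.compute_offset(pred)  # both [2, 24]
    assert pred.abs().max() <= 15                # guaranteed by the clamp
    return offsets, confs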

class SyncNetFCN_WithAttention(SyncNetFCN):
    """
    Enhanced version with cross-modal attention.
    Audio and video features attend to each other before correlation.
    """

    def __init__(self, embedding_dim=512, max_offset=15):
        super(SyncNetFCN_WithAttention, self).__init__(embedding_dim, max_offset)

        # Cross-modal attention
        self.audio_to_video_attn = nn.MultiheadAttention(
            embed_dim=embedding_dim, num_heads=8, batch_first=False
        )
        self.video_to_audio_attn = nn.MultiheadAttention(
            embed_dim=embedding_dim, num_heads=8, batch_first=False
        )

        # Self-attention for temporal modeling
        self.audio_self_attn = TemporalAttention(embedding_dim)
        self.video_self_attn = TemporalAttention(embedding_dim)

    def forward(self, audio_mfcc, video_frames):
        """Forward pass with attention mechanisms."""
        # Extract features
        if audio_mfcc.dim() == 3:
            audio_mfcc = audio_mfcc.unsqueeze(1)  # [B, 1, F, T]

        audio_features = self.audio_encoder(audio_mfcc)    # [B, C, T_a]
        video_features = self.video_encoder(video_frames)  # [B, C, T_v]

        # Self-attention
        audio_features = self.audio_self_attn(audio_features)
        video_features = self.video_self_attn(video_features)

        # Align temporal dimensions
        min_time = min(audio_features.size(2), video_features.size(2))
        audio_features = audio_features[:, :, :min_time]
        video_features = video_features[:, :, :min_time]

        # Cross-modal attention; reshape to [T, B, C] for nn.MultiheadAttention
        audio_t = audio_features.permute(2, 0, 1)
        video_t = video_features.permute(2, 0, 1)

        # Audio attends to video (residual connection)
        audio_attended, _ = self.audio_to_video_attn(
            query=audio_t, key=video_t, value=video_t
        )
        audio_features = audio_features + audio_attended.permute(1, 2, 0)

        # Video attends to audio (residual connection)
        video_attended, _ = self.video_to_audio_attn(
            query=video_t, key=audio_t, value=audio_t
        )
        video_features = video_features + video_attended.permute(1, 2, 0)

        # Compute correlation
        correlation = self.correlation(video_features, audio_features)

        # Predict offset (regression), then smooth temporally
        offset_pred = self.offset_regressor(correlation)
        predicted_offsets = self.temporal_smoother(offset_pred)

        # Clamp to the valid range
        predicted_offsets = torch.clamp(predicted_offsets, -self.max_offset, self.max_offset)

        return predicted_offsets, audio_features, video_features
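
# Quick shape sketch for the cross-modal attention round-trip (illustrative).
# nn.MultiheadAttention with batch_first=False expects [T, B, C], so the
# [B, C, T] feature maps are permuted in and permuted back for the residual add.
def _demo_cross_modal_shapes():
    attn = nn.MultiheadAttention(embed_dim=512, num_heads=8, batch_first=False)
    a = torch.randn(2, 512, 24).permute(2, 0, 1)  # [T, B, C]
    v = torch.randn(2, 512, 24).permute(2, 0, 1)
    out, _ = attn(query=a, key=v, value=v)        # audio attends to video
    assert out.permute(1, 2, 0).shape == (2, 512, 24)  # back to [B, C, T]
    return out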

class StreamSyncFCN(nn.Module):
    """
    StreamSync-style FCN with built-in preprocessing and transfer learning.

    Features:
    1. Sliding window processing for streams
    2. HLS stream support (.m3u8)
    3. Raw video file processing (MP4, AVI, etc.)
    4. Automatic transfer learning from SyncNetModel.py
    5. Temporal buffering and smoothing
    """

    def __init__(self, embedding_dim=512, max_offset=15, window_size=25,
                 stride=5, buffer_size=100, use_attention=False,
                 pretrained_syncnet_path=None, auto_load_pretrained=True):
        """
        Args:
            embedding_dim: Feature dimension
            max_offset: Maximum temporal offset (frames)
            window_size: Frames per processing window
            stride: Window stride
            buffer_size: Temporal buffer size
            use_attention: Use the attention model
            pretrained_syncnet_path: Path to original SyncNet weights
            auto_load_pretrained: Auto-load pretrained weights if a path is provided
        """
        super(StreamSyncFCN, self).__init__()

        self.window_size = window_size
        self.stride = stride
        self.buffer_size = buffer_size
        self.max_offset = max_offset

        # Initialize FCN model
        if use_attention:
            self.fcn_model = SyncNetFCN_WithAttention(embedding_dim, max_offset)
        else:
            self.fcn_model = SyncNetFCN(embedding_dim, max_offset)

        # Auto-load pretrained weights
        if auto_load_pretrained and pretrained_syncnet_path:
            self.load_pretrained_syncnet(pretrained_syncnet_path)

        self.reset_buffers()

    def reset_buffers(self):
        """Reset temporal buffers."""
        self.offset_buffer = []
        self.confidence_buffer = []
        self.frame_count = 0

    def load_pretrained_syncnet(self, syncnet_model_path, freeze_conv=True, verbose=True):
        """
        Load conv layers from the original SyncNet (SyncNetModel.py).

        Maps:
            netcnnaud.* → audio_encoder.conv_layers.*
            netcnnlip.* → video_encoder.conv_layers.*
        """
        if verbose:
            print(f"Loading pretrained SyncNet from: {syncnet_model_path}")

        try:
            pretrained = torch.load(syncnet_model_path, map_location='cpu')
            if isinstance(pretrained, dict):
                pretrained_dict = pretrained.get('model_state_dict',
                                                 pretrained.get('state_dict', pretrained))
            else:
                pretrained_dict = pretrained.state_dict()

            fcn_dict = self.fcn_model.state_dict()
            loaded_count = 0

            for key in list(pretrained_dict.keys()):
                # Map audio conv layers
                if key.startswith('netcnnaud.'):
                    idx = key.split('.')[1]
                    param = '.'.join(key.split('.')[2:])
                    new_key = f'audio_encoder.conv_layers.{idx}.{param}'
                    if new_key in fcn_dict and pretrained_dict[key].shape == fcn_dict[new_key].shape:
                        fcn_dict[new_key] = pretrained_dict[key]
                        loaded_count += 1
                # Map video conv layers
                elif key.startswith('netcnnlip.'):
                    idx = key.split('.')[1]
                    param = '.'.join(key.split('.')[2:])
                    new_key = f'video_encoder.conv_layers.{idx}.{param}'
                    if new_key in fcn_dict and pretrained_dict[key].shape == fcn_dict[new_key].shape:
                        fcn_dict[new_key] = pretrained_dict[key]
                        loaded_count += 1

            self.fcn_model.load_state_dict(fcn_dict, strict=False)
            if verbose:
                print(f"✓ Loaded {loaded_count} pretrained conv parameters")

            if freeze_conv:
                for name, param in self.fcn_model.named_parameters():
                    if 'conv_layers' in name:
                        param.requires_grad = False
                if verbose:
                    print("✓ Froze pretrained conv layers")
        except Exception as e:
            if verbose:
                print(f"⚠ Could not load pretrained weights: {e}")

    def unfreeze_all_layers(self, verbose=True):
        """Unfreeze all layers for fine-tuning."""
        for param in self.fcn_model.parameters():
            param.requires_grad = True
        if verbose:
            print("✓ Unfroze all layers for fine-tuning")

    def forward(self, audio_mfcc, video_frames):
        """Forward pass through the FCN model."""
        return self.fcn_model(audio_mfcc, video_frames)

    def process_window(self, audio_window, video_window):
        """Process a single window; returns its mean offset and confidence."""
        with torch.no_grad():
            predicted_offsets, _, _ = self.fcn_model(audio_window, video_window)
            offsets, confidences = self.fcn_model.compute_offset(predicted_offsets)
        return offsets[0].mean().item(), confidences[0].mean().item()
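
    # Key-mapping sketch for load_pretrained_syncnet (illustrative; assumes the
    # original checkpoint stores its conv stacks as nn.Sequential, so parameter
    # names carry the layer index):
    #     'netcnnaud.0.weight' → 'audio_encoder.conv_layers.0.weight'
    #     'netcnnlip.4.bias'   → 'video_encoder.conv_layers.4.bias'
    # A tensor is copied only when both the mapped name and the shape match, so
    # layout differences between the two architectures are skipped, not fatal.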

    def process_stream(self, audio_stream, video_stream, return_trace=False):
        """Process a full stream with sliding windows."""
        self.reset_buffers()

        video_frames = video_stream.shape[2]
        # MFCC runs at 100 Hz vs. 25 fps video, so 4 audio steps per video frame
        audio_frames = audio_stream.shape[3] // 4
        min_frames = min(video_frames, audio_frames)
        num_windows = max(1, (min_frames - self.window_size) // self.stride + 1)

        trace = {'offsets': [], 'confidences': [], 'timestamps': []}

        for win_idx in range(num_windows):
            start = win_idx * self.stride
            end = min(start + self.window_size, min_frames)

            video_win = video_stream[:, :, start:end, :, :]
            audio_win = audio_stream[:, :, :, start * 4:end * 4]

            offset, confidence = self.process_window(audio_win, video_win)

            self.offset_buffer.append(offset)
            self.confidence_buffer.append(confidence)

            if return_trace:
                trace['offsets'].append(offset)
                trace['confidences'].append(confidence)
                trace['timestamps'].append(start)

            if len(self.offset_buffer) > self.buffer_size:
                self.offset_buffer.pop(0)
                self.confidence_buffer.pop(0)

            self.frame_count = end

        final_offset, final_conf = self.get_smoothed_prediction()
        return (final_offset, final_conf, trace) if return_trace else (final_offset, final_conf)

    def get_smoothed_prediction(self, method='confidence_weighted'):
        """Compute a smoothed offset from the buffer."""
        if not self.offset_buffer:
            return 0.0, 0.0

        offsets = torch.tensor(self.offset_buffer)
        confs = torch.tensor(self.confidence_buffer)

        if method == 'confidence_weighted':
            weights = confs / (confs.sum() + 1e-8)
            offset = (offsets * weights).sum().item()
        elif method == 'median':
            offset = torch.median(offsets).item()
        else:
            offset = torch.mean(offsets).item()

        return offset, torch.mean(confs).item()

    def extract_audio_mfcc(self, video_path, temp_dir='temp'):
        """Extract audio with ffmpeg and compute MFCC features."""
        os.makedirs(temp_dir, exist_ok=True)
        audio_path = os.path.join(temp_dir, 'temp_audio.wav')

        cmd = ['ffmpeg', '-y', '-i', video_path, '-ac', '1', '-ar', '16000',
               '-vn', '-acodec', 'pcm_s16le', audio_path]
        subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)

        sample_rate, audio = wavfile.read(audio_path)
        mfcc = python_speech_features.mfcc(audio, sample_rate).T
        mfcc_tensor = torch.FloatTensor(mfcc).unsqueeze(0).unsqueeze(0)

        if os.path.exists(audio_path):
            os.remove(audio_path)
        return mfcc_tensor

    def extract_video_frames(self, video_path, target_size=(112, 112)):
        """Extract video frames as a tensor."""
        cap = cv2.VideoCapture(video_path)
        frames = []
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.resize(frame, target_size)
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(frame.astype(np.float32) / 255.0)
        cap.release()

        if not frames:
            raise ValueError(f"No frames extracted from {video_path}")

        frames_array = np.stack(frames, axis=0)
        video_tensor = torch.FloatTensor(frames_array).permute(3, 0, 1, 2).unsqueeze(0)
        return video_tensor
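
    # Windowing arithmetic sketch for process_stream (illustrative): with the
    # default window_size=25 and stride=5, a 10 s clip at 25 fps (250 frames)
    # produces
    #     (250 - 25) // 5 + 1 = 46 windows,
    # each covering 1 s of video and the matching 100 MFCC steps (25 * 4).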

    def process_video_file(self, video_path, return_trace=False, temp_dir='temp',
                           target_size=(112, 112), verbose=True):
        """
        Process a raw video file (MP4, AVI, MOV, etc.).

        Args:
            video_path: Path to video file
            return_trace: Return per-window predictions
            temp_dir: Temporary directory
            target_size: Video frame size
            verbose: Print progress
        Returns:
            offset: Detected offset (frames)
            confidence: Detection confidence
            trace: (optional) Per-window data

        Example:
            >>> model = StreamSyncFCN(pretrained_syncnet_path='data/syncnet_v2.model')
            >>> offset, conf = model.process_video_file('video.mp4')
        """
        if verbose:
            print(f"Processing: {video_path}")

        mfcc = self.extract_audio_mfcc(video_path, temp_dir)
        video = self.extract_video_frames(video_path, target_size)

        if verbose:
            print(f"  Audio: {mfcc.shape}, Video: {video.shape}")

        result = self.process_stream(mfcc, video, return_trace)

        if verbose:
            offset, conf = result[:2]
            print(f"  Offset: {offset:.2f} frames, Confidence: {conf:.3f}")

        return result
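
    # Worked example of the linear calibration in detect_offset_correlation
    # (constants taken from the defaults in the signature below):
    #     calibrated = 3 + (-0.5) * (raw - (-15))
    # so a raw offset of -15 maps to 3, and a raw offset of -19 maps to
    # 3 + (-0.5) * (-4) = 5. Per the method's docstring these constants were
    # fitted empirically and would likely need re-fitting for new data.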
    def detect_offset_correlation(self, video_path, calibration_offset=3,
                                  calibration_scale=-0.5, calibration_baseline=-15,
                                  temp_dir='temp', verbose=True):
        """
        Detect AV offset using a correlation-based method with calibration.

        This method uses the trained audio-video encoders to compute temporal
        correlation and find the best matching offset. A linear calibration is
        applied to correct for systematic bias in the model.

        Calibration formula:
            calibrated = calibration_offset + calibration_scale * (raw - calibration_baseline)

        Default values were determined empirically from test videos.

        Args:
            video_path: Path to video file
            calibration_offset: Baseline expected offset (default: 3)
            calibration_scale: Scale factor for the raw offset (default: -0.5)
            calibration_baseline: Baseline raw offset (default: -15)
            temp_dir: Temporary directory for audio extraction
            verbose: Print progress information
        Returns:
            offset: Calibrated offset in frames (positive = audio ahead)
            confidence: Detection confidence (correlation strength)
            raw_offset: Uncalibrated raw offset from the correlation

        Example:
            >>> model = StreamSyncFCN(pretrained_syncnet_path='data/syncnet_v2.model')
            >>> offset, conf, raw = model.detect_offset_correlation('video.mp4')
            >>> print(f"Detected offset: {offset} frames")
        """
        if verbose:
            print(f"Processing: {video_path}")

        # Extract audio MFCC
        os.makedirs(temp_dir, exist_ok=True)
        audio_path = os.path.join(temp_dir, 'temp_audio.wav')
        cmd = ['ffmpeg', '-y', '-i', video_path, '-ac', '1', '-ar', '16000',
               '-vn', '-acodec', 'pcm_s16le', audio_path]
        subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)

        sample_rate, audio = wavfile.read(audio_path)
        mfcc = python_speech_features.mfcc(audio, sample_rate, numcep=13)
        audio_tensor = torch.FloatTensor(mfcc.T).unsqueeze(0).unsqueeze(0)

        if os.path.exists(audio_path):
            os.remove(audio_path)

        # Extract video frames
        cap = cv2.VideoCapture(video_path)
        frames = []
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.resize(frame, (112, 112))
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(frame.astype(np.float32) / 255.0)
        cap.release()

        if not frames:
            raise ValueError(f"No frames extracted from {video_path}")

        video_tensor = torch.FloatTensor(np.stack(frames)).permute(3, 0, 1, 2).unsqueeze(0)

        if verbose:
            print(f"  Audio MFCC: {audio_tensor.shape}, Video: {video_tensor.shape}")

        # Compute correlation-based offset
        with torch.no_grad():
            # Get features from the encoders
            audio_feat = self.fcn_model.audio_encoder(audio_tensor)
            video_feat = self.fcn_model.video_encoder(video_tensor)

            # Align temporal dimensions
            min_t = min(audio_feat.shape[2], video_feat.shape[2])
            audio_feat = audio_feat[:, :, :min_t]
            video_feat = video_feat[:, :, :min_t]

            # Compute correlation map
            correlation = self.fcn_model.correlation(video_feat, audio_feat)

            # Average over the time dimension
            corr_avg = correlation.mean(dim=2).squeeze(0)

            # Find the best offset (argmax of correlation)
            best_idx = corr_avg.argmax().item()
            raw_offset = best_idx - self.max_offset

            # Compute confidence as peak prominence over the median
            corr_np = corr_avg.numpy()
            peak_val = corr_np[best_idx]
            median_val = np.median(corr_np)
            confidence = peak_val - median_val

        # Apply linear calibration: calibrated = offset + scale * (raw - baseline)
        calibrated_offset = int(round(calibration_offset +
                                      calibration_scale * (raw_offset - calibration_baseline)))

        if verbose:
            print(f"  Raw offset: {raw_offset}, Calibrated: {calibrated_offset}")
            print(f"  Confidence: {confidence:.4f}")

        return calibrated_offset, confidence, raw_offset

    def process_hls_stream(self, hls_url, segment_duration=10, return_trace=False,
                           temp_dir='temp_hls', verbose=True):
        """
        Process an HLS stream (.m3u8 playlist).

        Args:
            hls_url: URL to the .m3u8 playlist
            segment_duration: Seconds to capture
            return_trace: Return per-window predictions
            temp_dir: Temporary directory
            verbose: Print progress
        Returns:
            offset: Detected offset
            confidence: Detection confidence
            trace: (optional) Per-window data

        Example:
            >>> model = StreamSyncFCN(pretrained_syncnet_path='data/syncnet_v2.model')
            >>> offset, conf = model.process_hls_stream('http://example.com/stream.m3u8')
        """
        if verbose:
            print(f"Processing HLS: {hls_url}")

        os.makedirs(temp_dir, exist_ok=True)
        temp_video = os.path.join(temp_dir, 'hls_segment.mp4')

        try:
            cmd = ['ffmpeg', '-y', '-i', hls_url, '-t', str(segment_duration),
                   '-c', 'copy', temp_video]
            subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
                           check=True, timeout=segment_duration + 30)
            result = self.process_video_file(temp_video, return_trace, temp_dir, verbose=verbose)
            return result
        except Exception as e:
            raise RuntimeError(f"HLS processing failed: {e}")
        finally:
            if os.path.exists(temp_video):
                os.remove(temp_video)


# Utility functions

def save_model(model, filename):
    """Save model weights to a file."""
    with open(filename, "wb") as f:
        torch.save(model.state_dict(), f)
    print(f"{filename} saved.")


def load_model(model, filename):
    """Load model weights from a file."""
    state_dict = torch.load(filename, map_location='cpu')
    model.load_state_dict(state_dict)
    print(f"{filename} loaded.")
    return model
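
# End-to-end usage sketch (illustrative; 'data/syncnet_v2.model' comes from the
# docstring examples above and is assumed, not guaranteed, to exist locally):
#
#     model = StreamSyncFCN(pretrained_syncnet_path='data/syncnet_v2.model')
#     offset, conf = model.process_video_file('video.mp4')
#     save_model(model, 'fcn_syncnet.pth')
#     model = load_model(StreamSyncFCN(), 'fcn_syncnet.pth')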

if __name__ == "__main__":
    # Test the models
    print("Testing FCN_AudioEncoder...")
    audio_encoder = FCN_AudioEncoder(output_channels=512)
    audio_input = torch.randn(2, 1, 13, 100)  # [B, 1, MFCC_dim, Time]
    audio_out = audio_encoder(audio_input)
    print(f"Audio input: {audio_input.shape} → Audio output: {audio_out.shape}")

    print("\nTesting FCN_VideoEncoder...")
    video_encoder = FCN_VideoEncoder(output_channels=512)
    video_input = torch.randn(2, 3, 25, 112, 112)  # [B, 3, T, H, W]
    video_out = video_encoder(video_input)
    print(f"Video input: {video_input.shape} → Video output: {video_out.shape}")

    print("\nTesting SyncNetFCN...")
    model = SyncNetFCN(embedding_dim=512, max_offset=15)
    pred_offsets, audio_feat, video_feat = model(audio_input, video_input)
    print(f"Predicted offsets: {pred_offsets.shape}")
    print(f"Audio features: {audio_feat.shape}")
    print(f"Video features: {video_feat.shape}")

    offsets, confidences = model.compute_offset(pred_offsets)
    print(f"Offsets: {offsets.shape}")
    print(f"Confidences: {confidences.shape}")

    print("\nTesting SyncNetFCN_WithAttention...")
    model_attn = SyncNetFCN_WithAttention(embedding_dim=512, max_offset=15)
    pred_offsets, audio_feat, video_feat = model_attn(audio_input, video_input)
    print(f"Predicted offsets (with attention): {pred_offsets.shape}")

    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    total_params_attn = sum(p.numel() for p in model_attn.parameters())
    print(f"\nTotal parameters (FCN): {total_params:,}")
    print(f"Total parameters (FCN+Attention): {total_params_attn:,}")