#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Fully Convolutional SyncNet (FCN-SyncNet)
Key improvements:
1. Fully convolutional architecture (no FC layers)
2. Temporal feature maps instead of single embeddings
3. Correlation-based audio-video fusion
4. Dense sync probability predictions over time
5. Multi-scale feature extraction
6. Attention mechanisms
Author: Enhanced version based on original SyncNet
Date: 2025-11-22
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import numpy as np
import cv2
import os
import subprocess
from scipy.io import wavfile
import python_speech_features
from collections import OrderedDict


class TemporalCorrelation(nn.Module):
    """
    Compute correlation between audio and video features across time.
    Inspired by the FlowNet correlation layer.
    """
    def __init__(self, max_displacement=10):
        super(TemporalCorrelation, self).__init__()
        self.max_displacement = max_displacement

    def forward(self, feat1, feat2):
        """
        Args:
            feat1: [B, C, T] - visual features
            feat2: [B, C, T] - audio features
        Returns:
            correlation: [B, 2*max_displacement+1, T] - correlation map
        """
        B, C, T = feat1.shape
        max_disp = self.max_displacement
        # Normalize features
        feat1 = F.normalize(feat1, dim=1)
        feat2 = F.normalize(feat2, dim=1)
        # Pad feat2 for shifting
        feat2_padded = F.pad(feat2, (max_disp, max_disp), mode='replicate')
        corr_list = []
        for offset in range(-max_disp, max_disp + 1):
            # Shift audio features
            shifted_feat2 = feat2_padded[:, :, offset+max_disp:offset+max_disp+T]
            # Compute correlation (cosine similarity)
            corr = (feat1 * shifted_feat2).sum(dim=1, keepdim=True)  # [B, 1, T]
            corr_list.append(corr)
        # Stack all correlations
        correlation = torch.cat(corr_list, dim=1)  # [B, 2*max_disp+1, T]
        return correlation
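
# Illustrative shape check for TemporalCorrelation (values below are assumed, not from a real run):
#
#   corr_layer = TemporalCorrelation(max_displacement=3)
#   out = corr_layer(torch.randn(2, 64, 40), torch.randn(2, 64, 40))
#   # out.shape == torch.Size([2, 7, 40])  -> 2*3+1 displacement channels, one per temporal shift
#
# Channel k holds the cosine similarity between feat1 at time t and feat2 at time
# t + (k - max_displacement), so the channel with the strongest response indicates the offset.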
class ChannelAttention(nn.Module):
    """Squeeze-and-Excitation style channel attention."""
    def __init__(self, channels, reduction=16):
        super(ChannelAttention, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Sequential(
            nn.Linear(channels, channels // reduction, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(channels // reduction, channels, bias=False),
            nn.Sigmoid()
        )

    def forward(self, x):
        b, c, t = x.size()
        y = self.avg_pool(x).view(b, c)
        y = self.fc(y).view(b, c, 1)
        return x * y.expand_as(x)


class TemporalAttention(nn.Module):
    """Self-attention over the temporal dimension."""
    def __init__(self, channels):
        super(TemporalAttention, self).__init__()
        self.query_conv = nn.Conv1d(channels, channels // 8, 1)
        self.key_conv = nn.Conv1d(channels, channels // 8, 1)
        self.value_conv = nn.Conv1d(channels, channels, 1)
        self.gamma = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        """
        Args:
            x: [B, C, T]
        """
        B, C, T = x.size()
        # Generate query, key, value
        query = self.query_conv(x).permute(0, 2, 1)  # [B, T, C']
        key = self.key_conv(x)  # [B, C', T]
        value = self.value_conv(x)  # [B, C, T]
        # Attention weights
        attention = torch.bmm(query, key)  # [B, T, T]
        attention = F.softmax(attention, dim=-1)
        # Apply attention
        out = torch.bmm(value, attention.permute(0, 2, 1))  # [B, C, T]
        out = self.gamma * out + x
        return out
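
# Note on TemporalAttention: the [B, T, T] attention matrix is applied as a residual scaled by
# the learned `gamma` parameter. Since `gamma` is initialised to zero, the block starts out as an
# identity mapping and only gradually mixes in attended features as training progresses.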
class FCN_AudioEncoder(nn.Module):
    """
    Fully convolutional audio encoder.
    Input: MFCC or Mel spectrogram [B, 1, F, T]
    Output: Feature map [B, C, T']
    """
    def __init__(self, output_channels=512):
        super(FCN_AudioEncoder, self).__init__()
        # Convolutional layers (downsample time by ~4x: 100 Hz MFCC -> ~25 feature frames/s)
        self.conv_layers = nn.Sequential(
            # Layer 1
            nn.Conv2d(1, 64, kernel_size=(3,3), stride=(1,1), padding=(1,1)),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            # Layer 2
            nn.Conv2d(64, 192, kernel_size=(3,3), stride=(1,1), padding=(1,1)),
            nn.BatchNorm2d(192),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(3,3), stride=(1,2)),  # Halve time, slightly reduce frequency
            # Layer 3
            nn.Conv2d(192, 384, kernel_size=(3,3), padding=(1,1)),
            nn.BatchNorm2d(384),
            nn.ReLU(inplace=True),
            # Layer 4
            nn.Conv2d(384, 256, kernel_size=(3,3), padding=(1,1)),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            # Layer 5
            nn.Conv2d(256, 256, kernel_size=(3,3), padding=(1,1)),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(3,3), stride=(2,2)),  # Halve time again, reduce frequency
            # Layer 6 - Reduce frequency dimension to 1 (for 13-bin MFCC input)
            nn.Conv2d(256, 512, kernel_size=(5,1), stride=(5,1), padding=(0,0)),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
        )
        # 1×1 conv to adjust channels (replaces FC layer)
        self.channel_conv = nn.Sequential(
            nn.Conv1d(512, 512, kernel_size=1),
            nn.BatchNorm1d(512),
            nn.ReLU(inplace=True),
            nn.Conv1d(512, output_channels, kernel_size=1),
            nn.BatchNorm1d(output_channels),
        )
        # Channel attention
        self.channel_attn = ChannelAttention(output_channels)

    def forward(self, x):
        """
        Args:
            x: [B, 1, F, T] - MFCC features
        Returns:
            features: [B, C, T'] - temporal feature map
        """
        # Convolutional encoding
        x = self.conv_layers(x)  # [B, 512, F', T']
        # Collapse frequency dimension (F' == 1 for 13-bin MFCC input)
        B, C, Freq, T = x.size()
        x = x.view(B, C * Freq, T)  # Flatten remaining frequency bins into channels
        # Reduce to output_channels
        x = self.channel_conv(x)  # [B, output_channels, T']
        # Apply attention
        x = self.channel_attn(x)
        return x
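
# Rough shape walk-through for FCN_AudioEncoder with 13-dim MFCC input (sizes are approximate and
# assume the default python_speech_features hop of 10 ms, i.e. 100 MFCC frames per second):
#
#   x: [B, 1, 13, 400]                  # ~4 s of audio
#   conv_layers -> [B, 512, 1, ~100]    # two time-stride-2 pools give ~4x temporal downsampling
#   view        -> [B, 512, ~100]
#   channel_conv + attention -> [B, output_channels, ~100]   # ~25 feature frames per second,
#                                                            # matching 25 fps video features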
class FCN_VideoEncoder(nn.Module):
    """
    Fully convolutional video encoder.
    Input: Video clip [B, 3, T, H, W]
    Output: Feature map [B, C, T']
    """
    def __init__(self, output_channels=512):
        super(FCN_VideoEncoder, self).__init__()
        # 3D convolutional layers (temporal stride is 1 throughout, so T is preserved)
        self.conv_layers = nn.Sequential(
            # Layer 1
            nn.Conv3d(3, 96, kernel_size=(5,7,7), stride=(1,2,2), padding=(2,3,3)),
            nn.BatchNorm3d(96),
            nn.ReLU(inplace=True),
            nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2), padding=(0,1,1)),
            # Layer 2
            nn.Conv3d(96, 256, kernel_size=(3,5,5), stride=(1,2,2), padding=(1,2,2)),
            nn.BatchNorm3d(256),
            nn.ReLU(inplace=True),
            nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2), padding=(0,1,1)),
            # Layer 3
            nn.Conv3d(256, 256, kernel_size=(3,3,3), padding=(1,1,1)),
            nn.BatchNorm3d(256),
            nn.ReLU(inplace=True),
            # Layer 4
            nn.Conv3d(256, 256, kernel_size=(3,3,3), padding=(1,1,1)),
            nn.BatchNorm3d(256),
            nn.ReLU(inplace=True),
            # Layer 5
            nn.Conv3d(256, 256, kernel_size=(3,3,3), padding=(1,1,1)),
            nn.BatchNorm3d(256),
            nn.ReLU(inplace=True),
            nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2), padding=(0,1,1)),
            # Layer 6 - expand channels; spatial reduction happens in the adaptive pool below
            nn.Conv3d(256, 512, kernel_size=(3,3,3), stride=(1,1,1), padding=(1,1,1)),
            nn.BatchNorm3d(512),
            nn.ReLU(inplace=True),
            # Adaptive pooling to 1x1 spatial
            nn.AdaptiveAvgPool3d((None, 1, 1))  # Keep temporal, pool spatial to 1x1
        )
        # 1×1 conv to adjust channels (replaces FC layer)
        self.channel_conv = nn.Sequential(
            nn.Conv1d(512, 512, kernel_size=1),
            nn.BatchNorm1d(512),
            nn.ReLU(inplace=True),
            nn.Conv1d(512, output_channels, kernel_size=1),
            nn.BatchNorm1d(output_channels),
        )
        # Channel attention
        self.channel_attn = ChannelAttention(output_channels)

    def forward(self, x):
        """
        Args:
            x: [B, 3, T, H, W] - video frames
        Returns:
            features: [B, C, T'] - temporal feature map
        """
        # Convolutional encoding
        x = self.conv_layers(x)  # [B, 512, T', 1, 1]
        # Remove spatial dimensions
        B, C, T, H, W = x.size()
        x = x.view(B, C, T)  # [B, 512, T']
        # Reduce to output_channels
        x = self.channel_conv(x)  # [B, output_channels, T']
        # Apply attention
        x = self.channel_attn(x)
        return x
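
# Rough shape walk-through for FCN_VideoEncoder (112x112 crops, sizes approximate):
#
#   x: [B, 3, 25, 112, 112]             # 1 s of video at 25 fps
#   conv_layers -> [B, 512, 25, 1, 1]   # spatial dims pooled away, temporal dim preserved
#   view        -> [B, 512, 25]
#   channel_conv + attention -> [B, output_channels, 25]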
class SyncNetFCN(nn.Module):
    """
    Fully Convolutional SyncNet with temporal outputs (REGRESSION VERSION).
    Architecture:
    1. Audio encoder: MFCC → temporal features
    2. Video encoder: frames → temporal features
    3. Correlation layer: compute audio-video similarity over time
    4. Offset regressor: predict a continuous offset value for each frame
    Changes from the classification version:
    - Output: [B, 1, T] continuous offset values (not a probability distribution)
    - Default max_offset: 125 frames (±5 seconds at 25 fps) for streaming
    - Loss: L1/MSE instead of CrossEntropy
    """
    def __init__(self, embedding_dim=512, max_offset=125):
        super(SyncNetFCN, self).__init__()
        self.embedding_dim = embedding_dim
        self.max_offset = max_offset
        # Encoders
        self.audio_encoder = FCN_AudioEncoder(output_channels=embedding_dim)
        self.video_encoder = FCN_VideoEncoder(output_channels=embedding_dim)
        # Temporal correlation
        self.correlation = TemporalCorrelation(max_displacement=max_offset)
        # Offset regressor (processes the correlation map) - REGRESSION OUTPUT
        self.offset_regressor = nn.Sequential(
            nn.Conv1d(2*max_offset+1, 128, kernel_size=3, padding=1),
            nn.BatchNorm1d(128),
            nn.ReLU(inplace=True),
            nn.Conv1d(128, 64, kernel_size=3, padding=1),
            nn.BatchNorm1d(64),
            nn.ReLU(inplace=True),
            nn.Conv1d(64, 1, kernel_size=1),  # Output: single continuous offset value per timestep
        )
        # Optional: temporal smoothing with dilated convolutions
        self.temporal_smoother = nn.Sequential(
            nn.Conv1d(1, 32, kernel_size=3, dilation=2, padding=2),
            nn.BatchNorm1d(32),
            nn.ReLU(inplace=True),
            nn.Conv1d(32, 1, kernel_size=1),
        )

    def forward_audio(self, audio_mfcc):
        """Extract audio features."""
        return self.audio_encoder(audio_mfcc)

    def forward_video(self, video_frames):
        """Extract video features."""
        return self.video_encoder(video_frames)

    def forward(self, audio_mfcc, video_frames):
        """
        Forward pass with audio-video offset regression.
        Args:
            audio_mfcc: [B, 1, F, T] - MFCC features
            video_frames: [B, 3, T', H, W] - video frames
        Returns:
            predicted_offsets: [B, 1, T''] - predicted offset in frames for each timestep
            audio_features: [B, C, T_a] - audio embeddings
            video_features: [B, C, T_v] - video embeddings
        """
        # Extract features
        if audio_mfcc.dim() == 3:
            audio_mfcc = audio_mfcc.unsqueeze(1)  # [B, 1, F, T]
        audio_features = self.audio_encoder(audio_mfcc)  # [B, C, T_a]
        video_features = self.video_encoder(video_frames)  # [B, C, T_v]
        # Align temporal dimensions (if needed)
        min_time = min(audio_features.size(2), video_features.size(2))
        audio_features = audio_features[:, :, :min_time]
        video_features = video_features[:, :, :min_time]
        # Compute correlation
        correlation = self.correlation(video_features, audio_features)  # [B, 2*K+1, T]
        # Predict offset (regression)
        raw_offsets = self.offset_regressor(correlation)  # [B, 1, T]
        predicted_offsets = self.temporal_smoother(raw_offsets)  # Temporal smoothing
        # Clamp to valid range
        predicted_offsets = torch.clamp(predicted_offsets, -self.max_offset, self.max_offset)
        return predicted_offsets, audio_features, video_features

    def compute_offset(self, predicted_offsets):
        """
        Extract offset and confidence from regression predictions.
        Args:
            predicted_offsets: [B, 1, T] - predicted offsets
        Returns:
            offsets: [B, T] - predicted offset for each frame
            confidences: [B, T] - confidence scores (inverse of variance)
        """
        # Remove channel dimension
        offsets = predicted_offsets.squeeze(1)  # [B, T]
        # Confidence = inverse of temporal variance (stable predictions = high confidence)
        temporal_variance = torch.var(offsets, dim=1, keepdim=True) + 1e-6  # [B, 1]
        confidences = 1.0 / temporal_variance  # [B, 1]
        confidences = confidences.expand_as(offsets)  # [B, T]
        # Squash to [0, 1]; the shift keeps moderate inverse variances away from saturation
        confidences = torch.sigmoid(confidences - 5.0)
        return offsets, confidences
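
# Sketch of the L1/MSE regression objective mentioned in the SyncNetFCN docstring. This is an
# assumed training step, not part of this file: `model`, `mfcc`, `frames`, and the per-clip
# ground-truth `true_offset` tensor are hypothetical.
#
#   predicted_offsets, _, _ = model(mfcc, frames)                     # [B, 1, T]
#   target = true_offset.view(-1, 1, 1).expand_as(predicted_offsets)  # constant offset per clip
#   loss = F.l1_loss(predicted_offsets, target)                       # or F.mse_loss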
class SyncNetFCN_WithAttention(SyncNetFCN):
    """
    Enhanced version with cross-modal attention.
    Audio and video features attend to each other before correlation.
    """
    def __init__(self, embedding_dim=512, max_offset=15):
        super(SyncNetFCN_WithAttention, self).__init__(embedding_dim, max_offset)
        # Cross-modal attention
        self.audio_to_video_attn = nn.MultiheadAttention(
            embed_dim=embedding_dim,
            num_heads=8,
            batch_first=False
        )
        self.video_to_audio_attn = nn.MultiheadAttention(
            embed_dim=embedding_dim,
            num_heads=8,
            batch_first=False
        )
        # Self-attention for temporal modeling
        self.audio_self_attn = TemporalAttention(embedding_dim)
        self.video_self_attn = TemporalAttention(embedding_dim)

    def forward(self, audio_mfcc, video_frames):
        """
        Forward pass with attention mechanisms.
        """
        # Extract features
        if audio_mfcc.dim() == 3:
            audio_mfcc = audio_mfcc.unsqueeze(1)  # [B, 1, F, T]
        audio_features = self.audio_encoder(audio_mfcc)  # [B, C, T_a]
        video_features = self.video_encoder(video_frames)  # [B, C, T_v]
        # Self-attention
        audio_features = self.audio_self_attn(audio_features)
        video_features = self.video_self_attn(video_features)
        # Align temporal dimensions
        min_time = min(audio_features.size(2), video_features.size(2))
        audio_features = audio_features[:, :, :min_time]
        video_features = video_features[:, :, :min_time]
        # Cross-modal attention
        # Reshape for attention: [T, B, C] (batch_first=False)
        audio_t = audio_features.permute(2, 0, 1)
        video_t = video_features.permute(2, 0, 1)
        # Audio attends to video
        audio_attended, _ = self.audio_to_video_attn(
            query=audio_t, key=video_t, value=video_t
        )
        audio_features = audio_features + audio_attended.permute(1, 2, 0)
        # Video attends to audio
        video_attended, _ = self.video_to_audio_attn(
            query=video_t, key=audio_t, value=audio_t
        )
        video_features = video_features + video_attended.permute(1, 2, 0)
        # Compute correlation
        correlation = self.correlation(video_features, audio_features)
        # Predict offset (regression)
        raw_offsets = self.offset_regressor(correlation)
        predicted_offsets = self.temporal_smoother(raw_offsets)
        # Clamp to valid range
        predicted_offsets = torch.clamp(predicted_offsets, -self.max_offset, self.max_offset)
        return predicted_offsets, audio_features, video_features
class StreamSyncFCN(nn.Module):
    """
    StreamSync-style FCN with built-in preprocessing and transfer learning.
    Features:
    1. Sliding window processing for streams
    2. HLS stream support (.m3u8)
    3. Raw video file processing (MP4, AVI, etc.)
    4. Automatic transfer learning from SyncNetModel.py
    5. Temporal buffering and smoothing
    """
    def __init__(self, embedding_dim=512, max_offset=15,
                 window_size=25, stride=5, buffer_size=100,
                 use_attention=False, pretrained_syncnet_path=None,
                 auto_load_pretrained=True):
        """
        Args:
            embedding_dim: Feature dimension
            max_offset: Maximum temporal offset (frames)
            window_size: Frames per processing window
            stride: Window stride
            buffer_size: Temporal buffer size
            use_attention: Use the attention model
            pretrained_syncnet_path: Path to original SyncNet weights
            auto_load_pretrained: Auto-load pretrained weights if a path is provided
        """
        super(StreamSyncFCN, self).__init__()
        self.window_size = window_size
        self.stride = stride
        self.buffer_size = buffer_size
        self.max_offset = max_offset
        # Initialize FCN model
        if use_attention:
            self.fcn_model = SyncNetFCN_WithAttention(embedding_dim, max_offset)
        else:
            self.fcn_model = SyncNetFCN(embedding_dim, max_offset)
        # Auto-load pretrained weights
        if auto_load_pretrained and pretrained_syncnet_path:
            self.load_pretrained_syncnet(pretrained_syncnet_path)
        self.reset_buffers()

    def reset_buffers(self):
        """Reset temporal buffers."""
        self.offset_buffer = []
        self.confidence_buffer = []
        self.frame_count = 0

    def load_pretrained_syncnet(self, syncnet_model_path, freeze_conv=True, verbose=True):
        """
        Load conv layers from the original SyncNet (SyncNetModel.py).
        Maps: netcnnaud.* → audio_encoder.conv_layers.*
              netcnnlip.* → video_encoder.conv_layers.*
        """
        if verbose:
            print(f"Loading pretrained SyncNet from: {syncnet_model_path}")
        try:
            pretrained = torch.load(syncnet_model_path, map_location='cpu')
            if isinstance(pretrained, dict):
                pretrained_dict = pretrained.get('model_state_dict', pretrained.get('state_dict', pretrained))
            else:
                pretrained_dict = pretrained.state_dict()
            fcn_dict = self.fcn_model.state_dict()
            loaded_count = 0
            # Map audio conv layers
            for key in list(pretrained_dict.keys()):
                if key.startswith('netcnnaud.'):
                    idx = key.split('.')[1]
                    param = '.'.join(key.split('.')[2:])
                    new_key = f'audio_encoder.conv_layers.{idx}.{param}'
                    if new_key in fcn_dict and pretrained_dict[key].shape == fcn_dict[new_key].shape:
                        fcn_dict[new_key] = pretrained_dict[key]
                        loaded_count += 1
                # Map video conv layers
                elif key.startswith('netcnnlip.'):
                    idx = key.split('.')[1]
                    param = '.'.join(key.split('.')[2:])
                    new_key = f'video_encoder.conv_layers.{idx}.{param}'
                    if new_key in fcn_dict and pretrained_dict[key].shape == fcn_dict[new_key].shape:
                        fcn_dict[new_key] = pretrained_dict[key]
                        loaded_count += 1
            self.fcn_model.load_state_dict(fcn_dict, strict=False)
            if verbose:
                print(f"✓ Loaded {loaded_count} pretrained conv parameters")
            if freeze_conv:
                for name, param in self.fcn_model.named_parameters():
                    if 'conv_layers' in name:
                        param.requires_grad = False
                if verbose:
                    print("✓ Froze pretrained conv layers")
        except Exception as e:
            if verbose:
                print(f"⚠ Could not load pretrained weights: {e}")

    def unfreeze_all_layers(self, verbose=True):
        """Unfreeze all layers for fine-tuning."""
        for param in self.fcn_model.parameters():
            param.requires_grad = True
        if verbose:
            print("✓ Unfroze all layers for fine-tuning")
    def forward(self, audio_mfcc, video_frames):
        """Forward pass through the FCN model."""
        return self.fcn_model(audio_mfcc, video_frames)

    def process_window(self, audio_window, video_window):
        """Process a single window."""
        with torch.no_grad():
            predicted_offsets, _, _ = self.fcn_model(audio_window, video_window)
            offsets, confidences = self.fcn_model.compute_offset(predicted_offsets)
        return offsets[0].mean().item(), confidences[0].mean().item()

    def process_stream(self, audio_stream, video_stream, return_trace=False):
        """Process a full stream with sliding windows."""
        self.reset_buffers()
        video_frames = video_stream.shape[2]
        audio_frames = audio_stream.shape[3] // 4  # 100 Hz MFCC -> 4 audio frames per 25 fps video frame
        min_frames = min(video_frames, audio_frames)
        num_windows = max(1, (min_frames - self.window_size) // self.stride + 1)
        trace = {'offsets': [], 'confidences': [], 'timestamps': []}
        for win_idx in range(num_windows):
            start = win_idx * self.stride
            end = min(start + self.window_size, min_frames)
            video_win = video_stream[:, :, start:end, :, :]
            audio_win = audio_stream[:, :, :, start*4:end*4]
            offset, confidence = self.process_window(audio_win, video_win)
            self.offset_buffer.append(offset)
            self.confidence_buffer.append(confidence)
            if return_trace:
                trace['offsets'].append(offset)
                trace['confidences'].append(confidence)
                trace['timestamps'].append(start)
            if len(self.offset_buffer) > self.buffer_size:
                self.offset_buffer.pop(0)
                self.confidence_buffer.pop(0)
            self.frame_count = end
        final_offset, final_conf = self.get_smoothed_prediction()
        return (final_offset, final_conf, trace) if return_trace else (final_offset, final_conf)

    def get_smoothed_prediction(self, method='confidence_weighted'):
        """Compute the smoothed offset from the buffer."""
        if not self.offset_buffer:
            return 0.0, 0.0
        offsets = torch.tensor(self.offset_buffer)
        confs = torch.tensor(self.confidence_buffer)
        if method == 'confidence_weighted':
            weights = confs / (confs.sum() + 1e-8)
            offset = (offsets * weights).sum().item()
        elif method == 'median':
            offset = torch.median(offsets).item()
        else:
            offset = torch.mean(offsets).item()
        return offset, torch.mean(confs).item()
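
    # The default 'confidence_weighted' smoothing above computes (up to the 1e-8 stabiliser)
    #     offset = sum_i(conf_i * offset_i) / sum_i(conf_i),
    # so windows whose per-frame predictions were temporally stable (high confidence from
    # compute_offset) dominate the final estimate; 'median' is the more outlier-robust choice.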
    def extract_audio_mfcc(self, video_path, temp_dir='temp'):
        """Extract audio and compute MFCC."""
        os.makedirs(temp_dir, exist_ok=True)
        audio_path = os.path.join(temp_dir, 'temp_audio.wav')
        cmd = ['ffmpeg', '-y', '-i', video_path, '-ac', '1', '-ar', '16000',
               '-vn', '-acodec', 'pcm_s16le', audio_path]
        subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
        sample_rate, audio = wavfile.read(audio_path)
        mfcc = python_speech_features.mfcc(audio, sample_rate).T
        mfcc_tensor = torch.FloatTensor(mfcc).unsqueeze(0).unsqueeze(0)
        if os.path.exists(audio_path):
            os.remove(audio_path)
        return mfcc_tensor

    def extract_video_frames(self, video_path, target_size=(112, 112)):
        """Extract video frames as a tensor."""
        cap = cv2.VideoCapture(video_path)
        frames = []
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.resize(frame, target_size)
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(frame.astype(np.float32) / 255.0)
        cap.release()
        if not frames:
            raise ValueError(f"No frames extracted from {video_path}")
        frames_array = np.stack(frames, axis=0)
        video_tensor = torch.FloatTensor(frames_array).permute(3, 0, 1, 2).unsqueeze(0)
        return video_tensor

    def process_video_file(self, video_path, return_trace=False, temp_dir='temp',
                           target_size=(112, 112), verbose=True):
        """
        Process a raw video file (MP4, AVI, MOV, etc.).
        Args:
            video_path: Path to video file
            return_trace: Return per-window predictions
            temp_dir: Temporary directory
            target_size: Video frame size
            verbose: Print progress
        Returns:
            offset: Detected offset (frames)
            confidence: Detection confidence
            trace: (optional) Per-window data
        Example:
            >>> model = StreamSyncFCN(pretrained_syncnet_path='data/syncnet_v2.model')
            >>> offset, conf = model.process_video_file('video.mp4')
        """
        if verbose:
            print(f"Processing: {video_path}")
        mfcc = self.extract_audio_mfcc(video_path, temp_dir)
        video = self.extract_video_frames(video_path, target_size)
        if verbose:
            print(f" Audio: {mfcc.shape}, Video: {video.shape}")
        result = self.process_stream(mfcc, video, return_trace)
        if verbose:
            offset, conf = result[:2]
            print(f" Offset: {offset:.2f} frames, Confidence: {conf:.3f}")
        return result

    def detect_offset_correlation(self, video_path, calibration_offset=3, calibration_scale=-0.5,
                                  calibration_baseline=-15, temp_dir='temp', verbose=True):
        """
        Detect AV offset using a correlation-based method with calibration.
        This method uses the trained audio-video encoders to compute temporal
        correlation and find the best matching offset. A linear calibration
        is applied to correct for systematic bias in the model.
        Calibration formula: calibrated = calibration_offset + calibration_scale * (raw - calibration_baseline)
        Default values were determined empirically from test videos.
        Args:
            video_path: Path to video file
            calibration_offset: Baseline expected offset (default: 3)
            calibration_scale: Scale factor for raw offset (default: -0.5)
            calibration_baseline: Baseline raw offset (default: -15)
            temp_dir: Temporary directory for audio extraction
            verbose: Print progress information
        Returns:
            offset: Calibrated offset in frames (positive = audio ahead)
            confidence: Detection confidence (correlation strength)
            raw_offset: Uncalibrated raw offset from the correlation
        Example:
            >>> model = StreamSyncFCN(pretrained_syncnet_path='data/syncnet_v2.model')
            >>> offset, conf, raw = model.detect_offset_correlation('video.mp4')
            >>> print(f"Detected offset: {offset} frames")
        """
        if verbose:
            print(f"Processing: {video_path}")
        # Extract audio MFCC
        os.makedirs(temp_dir, exist_ok=True)
        audio_path = os.path.join(temp_dir, 'temp_audio.wav')
        cmd = ['ffmpeg', '-y', '-i', video_path, '-ac', '1', '-ar', '16000',
               '-vn', '-acodec', 'pcm_s16le', audio_path]
        subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
        sample_rate, audio = wavfile.read(audio_path)
        mfcc = python_speech_features.mfcc(audio, sample_rate, numcep=13)
        audio_tensor = torch.FloatTensor(mfcc.T).unsqueeze(0).unsqueeze(0)
        if os.path.exists(audio_path):
            os.remove(audio_path)
        # Extract video frames
        cap = cv2.VideoCapture(video_path)
        frames = []
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.resize(frame, (112, 112))
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(frame.astype(np.float32) / 255.0)
        cap.release()
        if not frames:
            raise ValueError(f"No frames extracted from {video_path}")
        video_tensor = torch.FloatTensor(np.stack(frames)).permute(3, 0, 1, 2).unsqueeze(0)
        if verbose:
            print(f" Audio MFCC: {audio_tensor.shape}, Video: {video_tensor.shape}")
        # Compute correlation-based offset
        with torch.no_grad():
            # Get features from the encoders
            audio_feat = self.fcn_model.audio_encoder(audio_tensor)
            video_feat = self.fcn_model.video_encoder(video_tensor)
            # Align temporal dimensions
            min_t = min(audio_feat.shape[2], video_feat.shape[2])
            audio_feat = audio_feat[:, :, :min_t]
            video_feat = video_feat[:, :, :min_t]
            # Compute correlation map
            correlation = self.fcn_model.correlation(video_feat, audio_feat)
            # Average over the time dimension
            corr_avg = correlation.mean(dim=2).squeeze(0)
            # Find the best offset (argmax of correlation)
            best_idx = corr_avg.argmax().item()
            raw_offset = best_idx - self.max_offset
            # Compute confidence as peak prominence
            corr_np = corr_avg.numpy()
            peak_val = corr_np[best_idx]
            median_val = np.median(corr_np)
            confidence = peak_val - median_val
        # Apply linear calibration: calibrated = offset + scale * (raw - baseline)
        calibrated_offset = int(round(calibration_offset + calibration_scale * (raw_offset - calibration_baseline)))
        if verbose:
            print(f" Raw offset: {raw_offset}, Calibrated: {calibrated_offset}")
            print(f" Confidence: {confidence:.4f}")
        return calibrated_offset, confidence, raw_offset
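
    # Worked example of the calibration above with the default parameters
    # (offset=3, scale=-0.5, baseline=-15):
    #   raw = -15  ->  3 + (-0.5) * (-15 - (-15)) = 3
    #   raw = -21  ->  3 + (-0.5) * (-21 - (-15)) = 6
    #   raw =  -9  ->  3 + (-0.5) * ( -9 - (-15)) = 0
    # The defaults are the empirically determined values from the docstring; recalibrate them
    # for your own data if the systematic bias differs.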
    def process_hls_stream(self, hls_url, segment_duration=10, return_trace=False,
                           temp_dir='temp_hls', verbose=True):
        """
        Process an HLS stream (.m3u8 playlist).
        Args:
            hls_url: URL to the .m3u8 playlist
            segment_duration: Seconds to capture
            return_trace: Return per-window predictions
            temp_dir: Temporary directory
            verbose: Print progress
        Returns:
            offset: Detected offset
            confidence: Detection confidence
            trace: (optional) Per-window data
        Example:
            >>> model = StreamSyncFCN(pretrained_syncnet_path='data/syncnet_v2.model')
            >>> offset, conf = model.process_hls_stream('http://example.com/stream.m3u8')
        """
        if verbose:
            print(f"Processing HLS: {hls_url}")
        os.makedirs(temp_dir, exist_ok=True)
        temp_video = os.path.join(temp_dir, 'hls_segment.mp4')
        try:
            cmd = ['ffmpeg', '-y', '-i', hls_url, '-t', str(segment_duration),
                   '-c', 'copy', temp_video]
            subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
                           check=True, timeout=segment_duration + 30)
            result = self.process_video_file(temp_video, return_trace, temp_dir, verbose=verbose)
            return result
        except Exception as e:
            raise RuntimeError(f"HLS processing failed: {e}")
        finally:
            if os.path.exists(temp_video):
                os.remove(temp_video)


# Utility functions
def save_model(model, filename):
    """Save model to file."""
    with open(filename, "wb") as f:
        torch.save(model.state_dict(), f)
    print(f"{filename} saved.")


def load_model(model, filename):
    """Load model from file."""
    state_dict = torch.load(filename, map_location='cpu')
    model.load_state_dict(state_dict)
    print(f"{filename} loaded.")
    return model


if __name__ == "__main__":
    # Test the models
    print("Testing FCN_AudioEncoder...")
    audio_encoder = FCN_AudioEncoder(output_channels=512)
    audio_input = torch.randn(2, 1, 13, 100)  # [B, 1, MFCC_dim, Time]
    audio_out = audio_encoder(audio_input)
    print(f"Audio input: {audio_input.shape} → Audio output: {audio_out.shape}")

    print("\nTesting FCN_VideoEncoder...")
    video_encoder = FCN_VideoEncoder(output_channels=512)
    video_input = torch.randn(2, 3, 25, 112, 112)  # [B, 3, T, H, W]
    video_out = video_encoder(video_input)
    print(f"Video input: {video_input.shape} → Video output: {video_out.shape}")

    print("\nTesting SyncNetFCN...")
    model = SyncNetFCN(embedding_dim=512, max_offset=15)
    predicted_offsets, audio_feat, video_feat = model(audio_input, video_input)
    print(f"Predicted offsets: {predicted_offsets.shape}")
    print(f"Audio features: {audio_feat.shape}")
    print(f"Video features: {video_feat.shape}")
    offsets, confidences = model.compute_offset(predicted_offsets)
    print(f"Offsets: {offsets.shape}")
    print(f"Confidences: {confidences.shape}")

    print("\nTesting SyncNetFCN_WithAttention...")
    model_attn = SyncNetFCN_WithAttention(embedding_dim=512, max_offset=15)
    predicted_offsets, audio_feat, video_feat = model_attn(audio_input, video_input)
    print(f"Predicted offsets (with attention): {predicted_offsets.shape}")

    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    total_params_attn = sum(p.numel() for p in model_attn.parameters())
    print(f"\nTotal parameters (FCN): {total_params:,}")
    print(f"Total parameters (FCN+Attention): {total_params_attn:,}")