# Syncnet_FCN / detect_sync.py
# Author: Shubham — "Deploy clean version" (commit 579f772)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
FCN-SyncNet CLI Tool - Audio-Video Sync Detection
Detects audio-video synchronization offset in video files using
a Fully Convolutional Neural Network with transfer learning.
Usage:
python detect_sync.py video.mp4
python detect_sync.py video.mp4 --verbose
python detect_sync.py video.mp4 --output results.json
Author: R-V-Abhishek
"""
import argparse
import json
import os
import sys
import time
import torch
def load_model(checkpoint_path='checkpoints/syncnet_fcn_epoch2.pth', max_offset=15):
    """Load the FCN-SyncNet model with trained weights.

    Restores the audio/video encoder weights from *checkpoint_path* when
    that file exists; otherwise falls back to the project's pretrained
    SyncNet weights. The returned model is always in eval mode.
    """
    from SyncNetModel_FCN import StreamSyncFCN

    model = StreamSyncFCN(
        max_offset=max_offset,
        pretrained_syncnet_path=None,
        auto_load_pretrained=False,
    )
    if os.path.exists(checkpoint_path):
        state = torch.load(checkpoint_path, map_location='cpu')
        # Only the encoder parameters are restored from the checkpoint;
        # strict=False leaves the remaining layers at their init values.
        encoder_state = {
            name: tensor
            for name, tensor in state['model_state_dict'].items()
            if 'audio_encoder' in name or 'video_encoder' in name
        }
        model.load_state_dict(encoder_state, strict=False)
        epoch = state.get('epoch', 'unknown')
        print(f"✓ Loaded model from {checkpoint_path} (epoch {epoch})")
    else:
        # No fine-tuned checkpoint on disk — rebuild the model so it
        # auto-loads the original pretrained SyncNet weights instead.
        print(f"! Checkpoint not found: {checkpoint_path}")
        print(" Loading pretrained SyncNet weights...")
        model = StreamSyncFCN(
            max_offset=max_offset,
            pretrained_syncnet_path='data/syncnet_v2.model',
            auto_load_pretrained=True,
        )
    model.eval()
    return model
def detect_offset(model, video_path, verbose=False):
    """
    Detect AV offset in a video file.
    Returns:
        dict with offset, confidence, raw_offset, and processing time
    """
    t0 = time.time()
    offset, confidence, raw_offset = model.detect_offset_correlation(
        video_path,
        calibration_offset=3,
        calibration_scale=-0.5,
        calibration_baseline=-15,
        verbose=verbose,
    )
    elapsed = time.time() - t0

    report = {'video': video_path}
    report['offset_frames'] = int(offset)
    # Frame-to-seconds conversion assumes 25 fps material.
    report['offset_seconds'] = round(offset / 25.0, 3)
    report['confidence'] = round(float(confidence), 6)
    report['raw_offset'] = int(raw_offset)
    report['processing_time'] = round(elapsed, 2)
    return report
def print_result(result, verbose=False):
    """Print detection result in a nice format."""
    rule = "=" * 50
    print()
    print(rule)
    print(" FCN-SyncNet Detection Result")
    print(rule)
    print(f" Video: {os.path.basename(result['video'])}")
    print(f" Offset: {result['offset_frames']:+d} frames ({result['offset_seconds']:+.3f}s)")
    print(f" Confidence: {result['confidence']:.6f}")
    print(f" Time: {result['processing_time']:.2f}s")
    print(rule)
    # Human-readable interpretation: the sign of the offset tells which
    # stream is lagging, and the magnitude how far to shift the audio.
    frames = result['offset_frames']
    seconds = abs(result['offset_seconds'])
    if abs(frames) <= 1:
        print(" ✓ Audio and video are IN SYNC")
    elif frames > 0:
        print(f" ! Audio is {abs(frames)} frames BEHIND video")
        print(f" (delay audio by {seconds:.3f}s to fix)")
    else:
        print(f" ! Audio is {abs(frames)} frames AHEAD of video")
        print(f" (advance audio by {seconds:.3f}s to fix)")
    print()
def main():
    """CLI entry point: parse args, run detection, emit the result.

    Returns the detected offset in frames, which the ``__main__`` guard
    passes to ``sys.exit`` — so the process exit code mirrors the offset
    (0 means in sync).
    """
    parser = argparse.ArgumentParser(
        description='FCN-SyncNet: Detect audio-video sync offset',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python detect_sync.py video.mp4
python detect_sync.py video.mp4 --verbose
python detect_sync.py video.mp4 --output result.json
python detect_sync.py video.mp4 --model checkpoints/custom.pth
Output:
Positive offset = audio behind video (delay audio to fix)
Negative offset = audio ahead of video (advance audio to fix)
"""
    )
    parser.add_argument('video', help='Path to video file (MP4, AVI, MOV, etc.)')
    parser.add_argument('--model', '-m', default='checkpoints/syncnet_fcn_epoch2.pth',
                        help='Path to model checkpoint (default: checkpoints/syncnet_fcn_epoch2.pth)')
    parser.add_argument('--output', '-o', help='Save result to JSON file')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Show detailed processing info')
    parser.add_argument('--json', '-j', action='store_true',
                        help='Output only JSON (for scripting)')
    args = parser.parse_args()

    # Validate input. Errors go to stderr so --json consumers reading
    # stdout never see non-JSON text.
    if not os.path.exists(args.video):
        print(f"Error: Video file not found: {args.video}", file=sys.stderr)
        sys.exit(1)

    # Load model
    if not args.json:
        print()
        print("FCN-SyncNet Audio-Video Sync Detector")
        print("-" * 40)
    try:
        model = load_model(args.model)
    except Exception as e:
        print(f"Error loading model: {e}", file=sys.stderr)
        sys.exit(1)

    # Detect offset
    try:
        result = detect_offset(model, args.video, verbose=args.verbose)
    except Exception as e:
        print(f"Error processing video: {e}", file=sys.stderr)
        sys.exit(1)

    # Output result
    if args.json:
        print(json.dumps(result, indent=2))
    else:
        print_result(result, verbose=args.verbose)

    # Save to file if requested
    if args.output:
        with open(args.output, 'w') as f:
            json.dump(result, f, indent=2)
        if not args.json:
            print(f"Result saved to: {args.output}")
    return result['offset_frames']
if __name__ == '__main__':
sys.exit(main())