Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python | |
| # -*- coding: utf-8 -*- | |
| """ | |
| FCN-SyncNet CLI Tool - Audio-Video Sync Detection | |
| Detects audio-video synchronization offset in video files using | |
| a Fully Convolutional Neural Network with transfer learning. | |
| Usage: | |
| python detect_sync.py video.mp4 | |
| python detect_sync.py video.mp4 --verbose | |
| python detect_sync.py video.mp4 --output results.json | |
| Author: R-V-Abhishek | |
| """ | |
| import argparse | |
| import json | |
| import os | |
| import sys | |
| import time | |
| import torch | |
def load_model(checkpoint_path='checkpoints/syncnet_fcn_epoch2.pth', max_offset=15):
    """Load the FCN-SyncNet model with trained weights.

    Args:
        checkpoint_path: path to a fine-tuned checkpoint; if it does not
            exist, falls back to the pretrained SyncNet weights bundled
            at data/syncnet_v2.model.
        max_offset: maximum offset (in frames) the model searches over.

    Returns:
        The model in eval mode.
    """
    # Local import keeps the CLI importable even when the model package
    # is only available at runtime.
    from SyncNetModel_FCN import StreamSyncFCN

    if os.path.exists(checkpoint_path):
        # Build an empty model, then splice in only the trained encoder
        # weights from the checkpoint (strict=False tolerates the rest).
        model = StreamSyncFCN(
            max_offset=max_offset,
            pretrained_syncnet_path=None,
            auto_load_pretrained=False
        )
        checkpoint = torch.load(checkpoint_path, map_location='cpu')
        encoder_state = {
            key: value
            for key, value in checkpoint['model_state_dict'].items()
            if 'audio_encoder' in key or 'video_encoder' in key
        }
        model.load_state_dict(encoder_state, strict=False)
        epoch = checkpoint.get('epoch', 'unknown')
        print(f"✓ Loaded model from {checkpoint_path} (epoch {epoch})")
    else:
        # Fall back to pretrained SyncNet. Previously a throwaway model
        # was constructed before this branch; now we build exactly one.
        print(f"! Checkpoint not found: {checkpoint_path}")
        print(" Loading pretrained SyncNet weights...")
        model = StreamSyncFCN(
            max_offset=max_offset,
            pretrained_syncnet_path='data/syncnet_v2.model',
            auto_load_pretrained=True
        )
    model.eval()
    return model
def detect_offset(model, video_path, verbose=False, fps=25.0):
    """
    Detect the AV offset in a video file.

    Args:
        model: loaded FCN-SyncNet model exposing detect_offset_correlation().
        video_path: path to the video file to analyze.
        verbose: forward detailed processing output from the model.
        fps: frame rate used to convert the frame offset to seconds
            (default 25.0, matching the previous hard-coded assumption).

    Returns:
        dict with video path, offset (frames and seconds), confidence,
        raw (uncalibrated) offset, and wall-clock processing time.
    """
    start_time = time.time()
    # NOTE(review): calibration constants appear fit to a specific
    # checkpoint — confirm they still apply if another model is loaded.
    offset, confidence, raw_offset = model.detect_offset_correlation(
        video_path,
        calibration_offset=3,
        calibration_scale=-0.5,
        calibration_baseline=-15,
        verbose=verbose
    )
    processing_time = time.time() - start_time
    return {
        'video': video_path,
        'offset_frames': int(offset),
        'offset_seconds': round(offset / fps, 3),
        'confidence': round(float(confidence), 6),
        'raw_offset': int(raw_offset),
        'processing_time': round(processing_time, 2)
    }
def print_result(result, verbose=False):
    """Print detection result in a nice format."""
    rule = "=" * 50
    frames = result['offset_frames']
    seconds = result['offset_seconds']

    print()
    print(rule)
    print(" FCN-SyncNet Detection Result")
    print(rule)
    print(f" Video: {os.path.basename(result['video'])}")
    print(f" Offset: {frames:+d} frames ({seconds:+.3f}s)")
    print(f" Confidence: {result['confidence']:.6f}")
    print(f" Time: {result['processing_time']:.2f}s")
    print(rule)

    # Human-readable interpretation of the offset's sign.
    if abs(frames) <= 1:
        print(" ✓ Audio and video are IN SYNC")
    elif frames > 0:
        print(f" ! Audio is {abs(frames)} frames BEHIND video")
        print(f" (delay audio by {abs(seconds):.3f}s to fix)")
    else:
        print(f" ! Audio is {abs(frames)} frames AHEAD of video")
        print(f" (advance audio by {abs(seconds):.3f}s to fix)")
    print()
def main():
    """CLI entry point: parse arguments, run detection, emit/save the result.

    Returns:
        The detected offset in frames (propagated as the process exit code).
    """
    parser = argparse.ArgumentParser(
        description='FCN-SyncNet: Detect audio-video sync offset',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python detect_sync.py video.mp4
  python detect_sync.py video.mp4 --verbose
  python detect_sync.py video.mp4 --output result.json
  python detect_sync.py video.mp4 --model checkpoints/custom.pth

Output:
  Positive offset = audio behind video (delay audio to fix)
  Negative offset = audio ahead of video (advance audio to fix)
"""
    )
    parser.add_argument('video', help='Path to video file (MP4, AVI, MOV, etc.)')
    parser.add_argument('--model', '-m', default='checkpoints/syncnet_fcn_epoch2.pth',
                        help='Path to model checkpoint (default: checkpoints/syncnet_fcn_epoch2.pth)')
    parser.add_argument('--output', '-o', help='Save result to JSON file')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Show detailed processing info')
    parser.add_argument('--json', '-j', action='store_true',
                        help='Output only JSON (for scripting)')
    args = parser.parse_args()

    # Validate input before the (slow) model load. Errors go to stderr so
    # --json consumers never see non-JSON text on stdout.
    if not os.path.exists(args.video):
        print(f"Error: Video file not found: {args.video}", file=sys.stderr)
        sys.exit(1)

    # Load model
    if not args.json:
        print()
        print("FCN-SyncNet Audio-Video Sync Detector")
        print("-" * 40)
    try:
        model = load_model(args.model)
    except Exception as e:
        # Broad catch is deliberate at this top-level CLI boundary.
        print(f"Error loading model: {e}", file=sys.stderr)
        sys.exit(1)

    # Detect offset
    try:
        result = detect_offset(model, args.video, verbose=args.verbose)
    except Exception as e:
        print(f"Error processing video: {e}", file=sys.stderr)
        sys.exit(1)

    # Output result
    if args.json:
        print(json.dumps(result, indent=2))
    else:
        print_result(result, verbose=args.verbose)

    # Save to file if requested
    if args.output:
        with open(args.output, 'w') as f:
            json.dump(result, f, indent=2)
        if not args.json:
            print(f"Result saved to: {args.output}")

    # NOTE(review): the offset doubles as the process exit code, so any
    # non-zero offset reads as "failure" to shells and negative offsets
    # wrap modulo 256 — confirm this is intentional before scripting on it.
    return result['offset_frames']


if __name__ == '__main__':
    sys.exit(main())