#!/usr/bin/env python3
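"""Run the SyncNet face-track preprocessing pipeline on a single video.

The pipeline converts the input video to a constant-frame-rate AVI, extracts
frames and 16 kHz mono audio, detects faces with S3FD, splits the video into
shots, tracks faces within each shot, and writes cropped 224x224 face-track
clips plus pickled metadata under --data_dir.

Example invocation (paths are illustrative):

    python run_pipeline.py --videofile /path/to/input.mp4 --reference demo
"""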
import time, os, argparse, pickle, subprocess, glob, cv2
import numpy as np
import torch
from shutil import rmtree
from scenedetect.video_manager import VideoManager
from scenedetect.scene_manager import SceneManager
from scenedetect.stats_manager import StatsManager
from scenedetect.detectors import ContentDetector
from scipy.interpolate import interp1d
from scipy.io import wavfile
from scipy import signal
from detectors import S3FD
# ========== ========== ========== ==========
# # PARSE ARGS
# ========== ========== ========== ==========
parser = argparse.ArgumentParser(description="FaceTracker")
parser.add_argument('--data_dir', type=str, default='data/work', help='Output directory')
parser.add_argument('--videofile', type=str, default='', help='Input video file')
parser.add_argument('--reference', type=str, default='', help='Video reference')
parser.add_argument('--facedet_scale', type=float, default=0.25, help='Scale factor for face detection')
parser.add_argument('--crop_scale', type=float, default=0.40, help='Scale bounding box')
parser.add_argument('--min_track', type=int, default=100, help='Minimum facetrack duration (frames)')
parser.add_argument('--frame_rate', type=int, default=25, help='Frame rate')
parser.add_argument('--num_failed_det', type=int, default=25, help='Number of missed detections allowed before tracking is stopped')
parser.add_argument('--min_face_size', type=int, default=100, help='Minimum face size in pixels')
opt = parser.parse_args()
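# Derived sub-directories under data_dir for intermediate pipeline outputs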
opt.avi_dir = os.path.join(opt.data_dir, 'pyavi')
opt.tmp_dir = os.path.join(opt.data_dir, 'pytmp')
opt.work_dir = os.path.join(opt.data_dir, 'pywork')
opt.crop_dir = os.path.join(opt.data_dir, 'pycrop')
opt.frames_dir = os.path.join(opt.data_dir, 'pyframes')
# ========== ========== ========== ==========
# # IOU FUNCTION
# ========== ========== ========== ==========
def bb_intersection_over_union(boxA, boxB):
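  """Return the intersection over union of two boxes in (x1, y1, x2, y2) form."""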
xA = max(boxA[0], boxB[0])
yA = max(boxA[1], boxB[1])
xB = min(boxA[2], boxB[2])
yB = min(boxA[3], boxB[3])
interArea = max(0, xB - xA) * max(0, yB - yA)
boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
iou = interArea / float(boxAArea + boxBArea - interArea)
return iou
# ========== ========== ========== ==========
# # FACE TRACKING
# ========== ========== ========== ==========
def track_shot(opt,scenefaces):
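  """Greedily link per-frame face detections into face tracks.

  A detection extends the current track when it occurs at most
  opt.num_failed_det frames after the last tracked face and overlaps it with
  IOU above iouThres. Boxes are linearly interpolated across missed frames;
  tracks shorter than opt.min_track frames or with a mean size below
  opt.min_face_size pixels are discarded.
  """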
iouThres = 0.5 # Minimum IOU between consecutive face detections
tracks = []
while True:
track = []
    for framefaces in scenefaces:
      # Iterate over a copy so matched faces can be removed from framefaces
      # without skipping the next candidate
      for face in list(framefaces):
        if track == []:
          track.append(face)
          framefaces.remove(face)
        elif face['frame'] - track[-1]['frame'] <= opt.num_failed_det:
          iou = bb_intersection_over_union(face['bbox'], track[-1]['bbox'])
          if iou > iouThres:
            track.append(face)
            framefaces.remove(face)
            continue
        else:
          break
if track == []:
break
elif len(track) > opt.min_track:
framenum = np.array([ f['frame'] for f in track ])
bboxes = np.array([np.array(f['bbox']) for f in track])
frame_i = np.arange(framenum[0],framenum[-1]+1)
bboxes_i = []
for ij in range(0,4):
interpfn = interp1d(framenum, bboxes[:,ij])
bboxes_i.append(interpfn(frame_i))
bboxes_i = np.stack(bboxes_i, axis=1)
if max(np.mean(bboxes_i[:,2]-bboxes_i[:,0]), np.mean(bboxes_i[:,3]-bboxes_i[:,1])) > opt.min_face_size:
tracks.append({'frame':frame_i,'bbox':bboxes_i})
return tracks
# ========== ========== ========== ==========
# # VIDEO CROP AND SAVE
# ========== ========== ========== ==========
def crop_video(opt,track,cropfile):
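  """Crop a single face track to a 224x224 AVI with matching audio.

  Box size and centre are median-filtered for stability, each frame is
  padded and cropped around the smoothed centre, and the corresponding
  audio span is cut out and muxed back in with ffmpeg.
  """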
flist = glob.glob(os.path.join(opt.frames_dir,opt.reference,'*.jpg'))
flist.sort()
fourcc = cv2.VideoWriter_fourcc(*'XVID')
vOut = cv2.VideoWriter(cropfile+'t.avi', fourcc, opt.frame_rate, (224,224))
dets = {'x':[], 'y':[], 's':[]}
for det in track['bbox']:
dets['s'].append(max((det[3]-det[1]),(det[2]-det[0]))/2)
    dets['y'].append((det[1]+det[3])/2) # crop center y
    dets['x'].append((det[0]+det[2])/2) # crop center x
# Smooth detections
dets['s'] = signal.medfilt(dets['s'],kernel_size=13)
dets['x'] = signal.medfilt(dets['x'],kernel_size=13)
dets['y'] = signal.medfilt(dets['y'],kernel_size=13)
for fidx, frame in enumerate(track['frame']):
cs = opt.crop_scale
bs = dets['s'][fidx] # Detection box size
bsi = int(bs*(1+2*cs)) # Pad videos by this amount
image = cv2.imread(flist[frame])
    padded = np.pad(image,((bsi,bsi),(bsi,bsi),(0,0)), 'constant', constant_values=(110,110))
    my = dets['y'][fidx]+bsi  # BBox center Y in the padded image
    mx = dets['x'][fidx]+bsi  # BBox center X in the padded image
    face = padded[int(my-bs):int(my+bs*(1+2*cs)),int(mx-bs*(1+cs)):int(mx+bs*(1+cs))]
vOut.write(cv2.resize(face,(224,224)))
audiotmp = os.path.join(opt.tmp_dir,opt.reference,'audio.wav')
audiostart = (track['frame'][0])/opt.frame_rate
audioend = (track['frame'][-1]+1)/opt.frame_rate
vOut.release()
# ========== CROP AUDIO FILE ==========
command = ("ffmpeg -y -i %s -ss %.3f -to %.3f %s" % (os.path.join(opt.avi_dir,opt.reference,'audio.wav'),audiostart,audioend,audiotmp))
output = subprocess.call(command, shell=True, stdout=None)
  if output != 0:
    raise RuntimeError('Audio crop failed: %s' % command)
  sample_rate, audio = wavfile.read(audiotmp)  # sanity-check that the cropped audio is readable
# ========== COMBINE AUDIO AND VIDEO FILES ==========
command = ("ffmpeg -y -i %st.avi -i %s -c:v copy -c:a copy %s.avi" % (cropfile,audiotmp,cropfile))
output = subprocess.call(command, shell=True, stdout=None)
  if output != 0:
    raise RuntimeError('Audio/video mux failed: %s' % command)
print('Written %s'%cropfile)
os.remove(cropfile+'t.avi')
print('Mean pos: x %.2f y %.2f s %.2f'%(np.mean(dets['x']),np.mean(dets['y']),np.mean(dets['s'])))
return {'track':track, 'proc_track':dets}
# ========== ========== ========== ==========
# # FACE DETECTION
# ========== ========== ========== ==========
def inference_video(opt):
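  """Run S3FD face detection on every extracted frame.

  Returns a per-frame list of {'frame', 'bbox', 'conf'} detections and
  pickles it to pywork/<reference>/faces.pckl.
  """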
device = 'cuda' if torch.cuda.is_available() else 'cpu'
DET = S3FD(device=device)
flist = glob.glob(os.path.join(opt.frames_dir,opt.reference,'*.jpg'))
flist.sort()
dets = []
for fidx, fname in enumerate(flist):
start_time = time.time()
image = cv2.imread(fname)
image_np = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
bboxes = DET.detect_faces(image_np, conf_th=0.9, scales=[opt.facedet_scale])
    dets.append([])
for bbox in bboxes:
dets[-1].append({'frame':fidx, 'bbox':(bbox[:-1]).tolist(), 'conf':bbox[-1]})
elapsed_time = time.time() - start_time
print('%s-%05d; %d dets; %.2f Hz' % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),fidx,len(dets[-1]),(1/elapsed_time)))
savepath = os.path.join(opt.work_dir,opt.reference,'faces.pckl')
with open(savepath, 'wb') as fil:
pickle.dump(dets, fil)
return dets
# ========== ========== ========== ==========
# # SCENE DETECTION
# ========== ========== ========== ==========
def scene_detect(opt):
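  """Split the converted video into shots with PySceneDetect.

  Falls back to treating the whole video as a single scene when detection
  fails or finds no cuts. The scene list is pickled to
  pywork/<reference>/scene.pckl.
  """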
video_manager = VideoManager([os.path.join(opt.avi_dir,opt.reference,'video.avi')])
stats_manager = StatsManager()
scene_manager = SceneManager(stats_manager)
# Add ContentDetector algorithm (constructor takes detector options like threshold).
scene_manager.add_detector(ContentDetector())
base_timecode = video_manager.get_base_timecode()
video_manager.set_downscale_factor()
video_manager.start()
try:
scene_manager.detect_scenes(frame_source=video_manager)
scene_list = scene_manager.get_scene_list(base_timecode)
except TypeError as e:
# Handle OpenCV/scenedetect compatibility issue
print(f'Scene detection failed ({e}), treating entire video as single scene')
scene_list = []
savepath = os.path.join(opt.work_dir,opt.reference,'scene.pckl')
if scene_list == []:
scene_list = [(video_manager.get_base_timecode(),video_manager.get_current_timecode())]
with open(savepath, 'wb') as fil:
pickle.dump(scene_list, fil)
print('%s - scenes detected %d'%(os.path.join(opt.avi_dir,opt.reference,'video.avi'),len(scene_list)))
return scene_list
# ========== ========== ========== ==========
# # EXECUTE DEMO
# ========== ========== ========== ==========
# ========== DELETE EXISTING DIRECTORIES ==========
if os.path.exists(os.path.join(opt.work_dir,opt.reference)):
rmtree(os.path.join(opt.work_dir,opt.reference))
if os.path.exists(os.path.join(opt.crop_dir,opt.reference)):
rmtree(os.path.join(opt.crop_dir,opt.reference))
if os.path.exists(os.path.join(opt.avi_dir,opt.reference)):
rmtree(os.path.join(opt.avi_dir,opt.reference))
if os.path.exists(os.path.join(opt.frames_dir,opt.reference)):
rmtree(os.path.join(opt.frames_dir,opt.reference))
if os.path.exists(os.path.join(opt.tmp_dir,opt.reference)):
rmtree(os.path.join(opt.tmp_dir,opt.reference))
# ========== MAKE NEW DIRECTORIES ==========
os.makedirs(os.path.join(opt.work_dir,opt.reference))
os.makedirs(os.path.join(opt.crop_dir,opt.reference))
os.makedirs(os.path.join(opt.avi_dir,opt.reference))
os.makedirs(os.path.join(opt.frames_dir,opt.reference))
os.makedirs(os.path.join(opt.tmp_dir,opt.reference))
# ========== CONVERT VIDEO AND EXTRACT FRAMES ==========
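# Re-encode to a constant-frame-rate AVI, dump frames as JPEGs, and extract 16 kHz mono audio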
command = ("ffmpeg -y -i %s -qscale:v 2 -async 1 -r %d %s" % (opt.videofile,opt.frame_rate,os.path.join(opt.avi_dir,opt.reference,'video.avi')))
output = subprocess.call(command, shell=True, stdout=None)
command = ("ffmpeg -y -i %s -qscale:v 2 -threads 1 -f image2 %s" % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),os.path.join(opt.frames_dir,opt.reference,'%06d.jpg')))
output = subprocess.call(command, shell=True, stdout=None)
command = ("ffmpeg -y -i %s -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),os.path.join(opt.avi_dir,opt.reference,'audio.wav')))
output = subprocess.call(command, shell=True, stdout=None)
# ========== FACE DETECTION ==========
faces = inference_video(opt)
# ========== SCENE DETECTION ==========
scene = scene_detect(opt)
# ========== FACE TRACKING ==========
alltracks = []
vidtracks = []
for shot in scene:
  if shot[1].frame_num - shot[0].frame_num >= opt.min_track:
alltracks.extend(track_shot(opt,faces[shot[0].frame_num:shot[1].frame_num]))
# ========== FACE TRACK CROP ==========
for ii, track in enumerate(alltracks):
vidtracks.append(crop_video(opt,track,os.path.join(opt.crop_dir,opt.reference,'%05d'%ii)))
# ========== SAVE RESULTS ==========
savepath = os.path.join(opt.work_dir,opt.reference,'tracks.pckl')
with open(savepath, 'wb') as fil:
pickle.dump(vidtracks, fil)
rmtree(os.path.join(opt.tmp_dir,opt.reference))