import os
import sys
import json
import argparse
import subprocess
from typing import Tuple, List, Union

import torch
import numpy as np
import decord
import cv2
from tqdm import tqdm
from omegaconf import OmegaConf

from musetalk.utils.face_detection import FaceAlignment, LandmarksType
from mmpose.apis import inference_topdown, init_model
from mmpose.structures import merge_data_samples


def fast_check_ffmpeg() -> bool:
    """Return True if an ffmpeg binary is reachable on the current PATH."""
    try:
        subprocess.run(["ffmpeg", "-version"], capture_output=True, check=True)
        return True
    except Exception:
        return False


ffmpeg_path = "./ffmpeg-4.4-amd64-static/"
if not fast_check_ffmpeg():
    print("Adding ffmpeg to PATH")
    # Choose the PATH separator based on the operating system
    path_separator = ';' if sys.platform == 'win32' else ':'
    os.environ["PATH"] = f"{ffmpeg_path}{path_separator}{os.environ['PATH']}"
    if not fast_check_ffmpeg():
        print("Warning: Unable to find ffmpeg, please ensure ffmpeg is properly installed")


class AnalyzeFace:
    def __init__(self, device: Union[str, torch.device], config_file: str, checkpoint_file: str):
        """
        Initialize the AnalyzeFace class with the given device, config file, and checkpoint file.

        Parameters:
        device (Union[str, torch.device]): The device to run the models on ('cuda' or 'cpu').
        config_file (str): Path to the mmpose model configuration file.
        checkpoint_file (str): Path to the mmpose model checkpoint file.
        """
        self.device = device
        self.dwpose = init_model(config_file, checkpoint_file, device=self.device)
        self.facedet = FaceAlignment(LandmarksType._2D, flip_input=False, device=self.device)

    def __call__(self, im: np.ndarray) -> Tuple[np.ndarray, List]:
        """
        Detect faces and facial landmarks in the given image.

        Parameters:
        im (np.ndarray): The input image, shaped (H, W, C) or (1, H, W, C).

        Returns:
        Tuple[np.ndarray, List]: The 68 facial landmarks and the detected
        bounding boxes. On failure, an empty array and an empty list.
        """
        try:
            # Ensure the input image has the correct shape
            if im.ndim == 3:
                im = np.expand_dims(im, axis=0)
            elif im.ndim != 4 or im.shape[0] != 1:
                raise ValueError("Input image must have shape (1, H, W, C)")

            bbox = self.facedet.get_detections_for_batch(np.asarray(im))
            results = inference_topdown(self.dwpose, np.asarray(im)[0])
            results = merge_data_samples(results)
            keypoints = results.pred_instances.keypoints
            # Indices 23..90 of the whole-body keypoints are the 68 face landmarks
            face_land_mark = keypoints[0][23:91]
            face_land_mark = face_land_mark.astype(np.int32)

            return face_land_mark, bbox
        except Exception as e:
            print(f"Error during face analysis: {e}")
            return np.array([]), []


def convert_video(org_path: str, dst_path: str, vid_list: List[str]) -> None:
    """
    Convert video files to 25 fps H.264 and save them to the destination path.

    Parameters:
    org_path (str): The directory containing the original video files.
    dst_path (str): The directory where the converted video files will be saved.
    vid_list (List[str]): A list of video file names to process.

    Returns:
    None
    """
    for idx, vid in enumerate(vid_list):
        if vid.endswith('.mp4'):
            org_vid_path = os.path.join(org_path, vid)
            dst_vid_path = os.path.join(dst_path, vid)
            if org_vid_path != dst_vid_path:
                cmd = [
                    "ffmpeg", "-hide_banner", "-y",
                    "-i", org_vid_path,
                    "-r", "25",
                    "-crf", "15",
                    "-c:v", "libx264",
                    "-pix_fmt", "yuv420p",
                    dst_vid_path,
                ]
                subprocess.run(cmd, check=True)
        if idx % 1000 == 0:
            print(f"### {idx} videos converted ###")
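
# For reference, the conversion above amounts to running, per file (the
# paths here are illustrative):
#
#   ffmpeg -hide_banner -y -i raw/vid.mp4 -r 25 -crf 15 \
#          -c:v libx264 -pix_fmt yuv420p 25fps/vid.mp4
#
# i.e. every video is resampled to a fixed 25 fps and re-encoded as
# high-quality (CRF 15) H.264 with the widely compatible yuv420p pixel format.
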
def segment_video(org_path: str, dst_path: str, vid_list: List[str], segment_duration: int = 30) -> None:
    """
    Segment video files into smaller clips of the specified duration.

    Parameters:
    org_path (str): The directory containing the original video files.
    dst_path (str): The directory where the segmented video files will be saved.
    vid_list (List[str]): A list of video file names to process.
    segment_duration (int): The duration of each segment in seconds. Default is 30.

    Returns:
    None
    """
    for idx, vid in enumerate(vid_list):
        if vid.endswith('.mp4'):
            input_file = os.path.join(org_path, vid)
            original_filename = os.path.basename(input_file)
            # Stream copy ('-c copy') avoids re-encoding, so cuts land on keyframes
            command = [
                'ffmpeg',
                '-i', input_file,
                '-c', 'copy',
                '-map', '0',
                '-segment_time', str(segment_duration),
                '-f', 'segment',
                '-reset_timestamps', '1',
                os.path.join(dst_path, f'clip%03d_{original_filename}')
            ]
            subprocess.run(command, check=True)


def extract_audio(org_path: str, dst_path: str, vid_list: List[str]) -> None:
    """
    Extract audio from video files and save it as 16 kHz mono WAV.

    Parameters:
    org_path (str): The directory containing the original video files.
    dst_path (str): The directory where the extracted audio files will be saved.
    vid_list (List[str]): A list of video file names to process.

    Returns:
    None
    """
    for idx, vid in enumerate(vid_list):
        if vid.endswith('.mp4'):
            video_path = os.path.join(org_path, vid)
            audio_output_path = os.path.join(dst_path, os.path.splitext(vid)[0] + ".wav")
            try:
                command = [
                    'ffmpeg', '-hide_banner', '-y',
                    '-i', video_path,
                    '-vn',
                    '-acodec', 'pcm_s16le',
                    '-f', 'wav',
                    '-ar', '16000',
                    '-ac', '1',
                    audio_output_path,
                ]
                subprocess.run(command, check=True)
                print(f"Audio saved to: {audio_output_path}")
            except subprocess.CalledProcessError as e:
                print(f"Error extracting audio from {vid}: {e}")


def split_data(video_files: List[str], val_list_hdtf: List[str]) -> Tuple[List[str], List[str]]:
    """
    Split video files into training and validation sets based on val_list_hdtf.

    Parameters:
    video_files (List[str]): A list of video file names.
    val_list_hdtf (List[str]): A list of validation file identifiers.

    Returns:
    Tuple[List[str], List[str]]: The training and validation file lists.
    """
    val_files = [f for f in video_files if any(val_id in f for val_id in val_list_hdtf)]
    train_files = [f for f in video_files if f not in val_files]
    return train_files, val_files


def save_list_to_file(file_path: str, data_list: List[str]) -> None:
    """
    Save a list of strings to a file, one string per line.

    Parameters:
    file_path (str): The path to the file where the list will be saved.
    data_list (List[str]): The list of strings to save.

    Returns:
    None
    """
    with open(file_path, 'w') as file:
        for item in data_list:
            file.write(f"{item}\n")


def generate_train_list(cfg) -> None:
    """Split the generated metadata files into training and validation lists."""
    train_file_path = cfg.video_clip_file_list_train
    val_file_path = cfg.video_clip_file_list_val
    val_list_hdtf = cfg.val_list_hdtf

    sorted_meta_list = sorted(os.listdir(cfg.meta_root))
    train_files, val_files = split_data(sorted_meta_list, val_list_hdtf)
    save_list_to_file(train_file_path, train_files)
    save_list_to_file(val_file_path, val_files)
    print(f"Validation identifiers: {val_list_hdtf}")
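
# A small worked example of the substring match in split_data, assuming a
# hypothetical validation identifier "WDA_BernieSanders_000":
#
#   train, val = split_data(
#       ["clip000_WDA_BernieSanders_000.json", "clip000_OtherSpeaker_000.json"],
#       ["WDA_BernieSanders_000"],
#   )
#   # val   == ["clip000_WDA_BernieSanders_000.json"]
#   # train == ["clip000_OtherSpeaker_000.json"]
#
# Any file whose name contains a validation identifier as a substring goes to
# the validation set; everything else is used for training.
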
def analyze_video(org_path: str, dst_path: str, vid_list: List[str]) -> None:
    """
    Run face detection and landmark extraction on each video and save the
    results as one JSON metadata file per clip.

    Parameters:
    org_path (str): The directory containing the original video files.
    dst_path (str): The directory where the meta JSON files will be saved.
    vid_list (List[str]): A list of video file names to process.

    Returns:
    None
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    config_file = './musetalk/utils/dwpose/rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py'
    checkpoint_file = './models/dwpose/dw-ll_ucoco_384.pth'
    analyze_face = AnalyzeFace(device, config_file, checkpoint_file)

    for vid in tqdm(vid_list, desc="Processing videos"):
        if not vid.endswith('.mp4'):
            continue
        vid_path = os.path.join(org_path, vid)
        wav_path = vid_path.replace(".mp4", ".wav")
        vid_meta = os.path.join(dst_path, os.path.splitext(vid)[0] + ".json")
        if os.path.exists(vid_meta):
            continue
        print(f'process video {vid}')

        total_bbox_list = []
        total_pts_list = []
        isvalid = True
        # Defaults in case detection fails on the very first frame
        video_height = video_width = 0
        face_height = face_width = 0

        try:
            cap = decord.VideoReader(vid_path, fault_tol=1)
        except Exception as e:
            print(e)
            continue

        total_frames = len(cap)
        for frame_idx in range(total_frames):
            frame = cap[frame_idx]
            if frame_idx == 0:
                video_height, video_width, _ = frame.shape
            # decord returns RGB frames; convert to BGR for the detectors
            frame_bgr = cv2.cvtColor(frame.asnumpy(), cv2.COLOR_RGB2BGR)
            pts_list, bbox_list = analyze_face(frame_bgr)

            if len(bbox_list) > 0 and None not in bbox_list:
                bbox = bbox_list[0]
            else:
                isvalid = False
                print(f"set isvalid to False as broken img in {frame_idx} of {vid}")
                break

            if pts_list is not None and len(pts_list) > 0:
                pts = pts_list.tolist()
            else:
                isvalid = False
                break

            if frame_idx == 0:
                x1, y1, x2, y2 = bbox
                face_height, face_width = y2 - y1, x2 - x1

            total_pts_list.append(pts)
            total_bbox_list.append(bbox)

        # One JSON file per clip: paths, sizes, per-frame boxes and landmarks,
        # plus a validity flag used to filter broken clips downstream
        meta_data = {
            "mp4_path": vid_path,
            "wav_path": wav_path,
            "video_size": [video_height, video_width],
            "face_size": [face_height, face_width],
            "frames": total_frames,
            "face_list": total_bbox_list,
            "landmark_list": total_pts_list,
            "isvalid": isvalid,
        }
        with open(vid_meta, 'w') as f:
            json.dump(meta_data, f, indent=4)


def main(cfg):
    # Ensure all necessary directories exist
    os.makedirs(cfg.video_root_25fps, exist_ok=True)
    os.makedirs(cfg.video_audio_clip_root, exist_ok=True)
    os.makedirs(cfg.meta_root, exist_ok=True)
    os.makedirs(os.path.dirname(cfg.video_file_list), exist_ok=True)
    os.makedirs(os.path.dirname(cfg.video_clip_file_list_train), exist_ok=True)
    os.makedirs(os.path.dirname(cfg.video_clip_file_list_val), exist_ok=True)

    sorted_vid_list = sorted(os.listdir(cfg.video_root_raw))

    # Save the raw video file list
    with open(cfg.video_file_list, 'w') as file:
        for vid in sorted_vid_list:
            file.write(vid + '\n')

    # 1. Convert videos to 25 FPS
    convert_video(cfg.video_root_raw, cfg.video_root_25fps, sorted_vid_list)

    # 2. Segment videos into fixed-length clips
    segment_video(cfg.video_root_25fps, cfg.video_audio_clip_root, sorted_vid_list,
                  segment_duration=cfg.clip_len_second)

    # 3. Extract audio
    clip_vid_list = sorted(os.listdir(cfg.video_audio_clip_root))
    extract_audio(cfg.video_audio_clip_root, cfg.video_audio_clip_root, clip_vid_list)

    # 4. Generate video metadata
    analyze_video(cfg.video_audio_clip_root, cfg.meta_root, clip_vid_list)

    # 5. Generate training and validation set lists
    generate_train_list(cfg)
    print("done")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, default="./configs/training/preprocess.yaml")
    args = parser.parse_args()
    config = OmegaConf.load(args.config)
    main(config)
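
# A minimal sketch of the expected preprocess.yaml, inferred from the config
# keys read above; all paths, the script name, and the val_list_hdtf entries
# are illustrative, not prescribed by this file:
#
#   video_root_raw: ./data/hdtf/raw
#   video_root_25fps: ./data/hdtf/25fps
#   video_audio_clip_root: ./data/hdtf/clips
#   meta_root: ./data/hdtf/meta
#   video_file_list: ./data/hdtf/lists/videos.txt
#   video_clip_file_list_train: ./data/hdtf/lists/train.txt
#   video_clip_file_list_val: ./data/hdtf/lists/val.txt
#   clip_len_second: 30
#   val_list_hdtf:
#     - WDA_BernieSanders_000
#
# Typical invocation (script name assumed):
#   python preprocess.py --config ./configs/training/preprocess.yaml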