|
|
import os |
|
|
import cv2 |
|
|
import numpy as np |
|
|
import torch |
|
|
import matplotlib as mpl |
|
|
|
|
|
from .video_utils import ( |
|
|
read_video_frames, |
|
|
resize_frames_to_long_side, |
|
|
save_to_video, |
|
|
add_overlay_text |
|
|
) |
|
|
from typing import Optional, List, Tuple |
|
|
from libs.models.mano_wrapper import MANO |
|
|
from .render_utils import Renderer |
|
|
|
|
|
class Config:
    """
    Bundle of file locations, rendering parameters, and color settings.

    Every path has a built-in default; an attribute of the same (lower-case)
    name on ``args`` takes precedence when it exists.
    """

    def __init__(self, args=None):
        """
        Args:
            args: Optional object (e.g. an argparse namespace) that may carry
                ``video_root``, ``label_root``, ``save_path`` and
                ``mano_model_path`` attributes. Missing attributes, or
                ``args=None``, fall back to the defaults below.
        """

        def from_args(attr_name, fallback):
            # getattr on None simply yields the fallback, so no None check is needed.
            return getattr(args, attr_name, fallback)

        # --- Paths (overridable through args) ---
        self.VIDEO_ROOT = from_args('video_root', 'data/examples/videos')
        self.LABEL_ROOT = from_args('label_root', 'data/examples/annotations')
        self.SAVE_PATH = from_args('save_path', 'data/examples/visualize')
        self.MANO_MODEL_PATH = from_args('mano_model_path', './weights/mano')

        # --- Rendering parameters ---
        self.RENDER_SIZE_LONG_SIDE = 480
        self.FPS = 15

        # --- Colormap names used for the gradual ('full') trajectory colors ---
        self.LEFT_CMAP = "inferno"
        self.RIGHT_CMAP = "inferno"

        # --- Fixed per-hand colors: three components, each in [0, 1] ---
        self.LEFT_COLOR = np.array([0.6594, 0.6259, 0.7451])
        self.RIGHT_COLOR = np.array([0.4078, 0.4980, 0.7451])
|
|
|
|
|
|
|
|
class HandVisualizer:
    """
    Loads one episode's video and hand annotations, renders the MANO hand
    meshes over the video frames in several modes, and writes the
    side-by-side result to a video file.
    """

    def __init__(self, config: Config, render_gradual_traj: bool = False):
        """
        Args:
            config: Paths and rendering settings.
            render_gradual_traj: When True, the 'full' trajectory mode is
                rendered in addition to the 'cam' and 'first' modes.
        """
        self.config = config
        self.render_gradual_traj = render_gradual_traj
        if self.render_gradual_traj:
            self.all_modes = ['cam', 'full', 'first']
        else:
            self.all_modes = ['cam', 'first']

        # The MANO model and the face tensors are placed on the GPU.
        self.mano = MANO(model_path=self.config.MANO_MODEL_PATH).cuda()
        faces_right = torch.from_numpy(self.mano.faces).float().cuda()

        # Swapping two vertex indices per triangle reverses the winding order.
        # Left-hand vertices are produced by mirroring along X (see
        # process_single_hand_labels), so the faces are flipped to match.
        self.faces_left = faces_right[:, [0, 2, 1]]
        self.faces_right = faces_right

    @staticmethod
    def _to_uint8_rgb(image):
        """Scale a float image by 255, cast to uint8 and apply cv2's BGR->RGB channel swap."""
        frame = (image * 255).astype(np.uint8)
        return cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    @staticmethod
    def _mesh_inputs(verts, color):
        """Return GPU tensors (vertices, per-vertex colors) for one hand mesh."""
        verts_t = torch.from_numpy(verts).float().cuda()
        # One color row per vertex (778 rows for the MANO meshes used here).
        color_t = torch.from_numpy(color).float().unsqueeze(0).repeat(verts_t.shape[0], 1).cuda()
        return verts_t, color_t

    def _render_hand_trajectory(self, video_frames, hand_traj_wordspace, hand_mask, extrinsics, renderer: Renderer, mode: str):
        """
        Render hand meshes over the video frames according to ``mode``.

        Modes:
            'cam':   each output frame shows only that frame's hands, seen
                     from that frame's camera, in the fixed per-hand colors.
            'first': every trajectory step is rendered on its own over a
                     fresh copy of the first frame, using the first frame's
                     camera; one output frame per trajectory step.
            'full':  each output frame shows the hands from the current frame
                     through the last one, seen from the current camera, with
                     colormap colors and opacity ramping from 0.4 to 0.7.

        Args:
            video_frames: Sequence of frames; each is converted to float32 and
                divided by 255 before blending.
            hand_traj_wordspace: ``(verts_left, verts_right)`` world-space
                vertex arrays indexed as ``[frame, vertex, xyz]``.
            hand_mask: ``(left_mask, right_mask)``; a hand is drawn for frame
                ``k`` only when its mask entry is non-zero.
            extrinsics: ``(R_w2c, t_w2c)`` per-frame world-to-camera rotation
                and translation.
            renderer: Renderer whose ``render(verts, faces, colors)`` returns
                ``(image, mask)``.
            mode: One of 'cam', 'first', 'full'.

        Returns:
            List of uint8 frames, one per input frame.

        Raises:
            ValueError: If ``mode`` is not one of the supported modes.
        """
        verts_left_worldspace, verts_right_worldspace = hand_traj_wordspace
        left_hand_mask, right_hand_mask = hand_mask
        R_w2c, t_w2c = extrinsics

        num_total_frames = len(video_frames)
        all_save_frames = []

        if mode in ('cam', 'first'):
            # Same fixed color for every frame.
            left_colors = self.config.LEFT_COLOR[np.newaxis, :].repeat(num_total_frames, axis=0)
            right_colors = self.config.RIGHT_COLOR[np.newaxis, :].repeat(num_total_frames, axis=0)
        elif mode == 'full':
            # Colors vary along the trajectory according to the configured colormaps.
            left_colors, right_colors = generate_hand_colors(
                num_total_frames, self.config.LEFT_CMAP, self.config.RIGHT_CMAP
            )
        else:
            raise ValueError(f'Unknown rendering mode: {mode}')

        # 'first' only ever uses frame 0 as background/camera; the other modes visit every frame.
        num_loop_frames = 1 if mode == 'first' else num_total_frames

        for current_frame_idx in range(num_loop_frames):
            if mode != 'first':
                print(f'Processing frame {current_frame_idx + 1}/{num_loop_frames}', end='\r')

            # astype() already returns a new array, so the source frame is never modified.
            curr_img_overlay = video_frames[current_frame_idx].astype(np.float32) / 255.0

            # Range of trajectory steps drawn for this frame, and their opacities.
            if mode == 'cam':
                start_traj_idx = current_frame_idx
                end_traj_idx = current_frame_idx + 1
                transparency = [1.0]
            elif mode == 'first':
                start_traj_idx = 0
                end_traj_idx = num_total_frames
                transparency = [1.0] * (end_traj_idx - start_traj_idx)
            else:  # 'full' (mode was validated above)
                start_traj_idx = current_frame_idx
                end_traj_idx = num_total_frames
                transparency = np.linspace(0.4, 0.7, end_traj_idx - start_traj_idx)

            # Move only the trajectory steps that will be drawn into the current
            # camera's frame (transforming all frames every time is wasted work).
            R_w2c_cur = R_w2c[current_frame_idx]
            t_w2c_cur = t_w2c[current_frame_idx]
            verts_left_camspace = (
                R_w2c_cur @ verts_left_worldspace[start_traj_idx:end_traj_idx].transpose(0, 2, 1) + t_w2c_cur
            ).transpose(0, 2, 1)
            verts_right_camspace = (
                R_w2c_cur @ verts_right_worldspace[start_traj_idx:end_traj_idx].transpose(0, 2, 1) + t_w2c_cur
            ).transpose(0, 2, 1)

            for traj_idx, kk in enumerate(range(start_traj_idx, end_traj_idx)):
                if mode == 'first':
                    print(f'Processing frame {traj_idx + 1}/{num_total_frames}', end='\r')
                    # Every trajectory step starts from an untouched copy of the first frame.
                    curr_img_overlay = video_frames[current_frame_idx].astype(np.float32) / 255

                transp_k = transparency[traj_idx]

                # Collect the meshes to draw at this step; left hand first, then right.
                verts_list, faces_list, colors_list = [], [], []
                if left_hand_mask[kk] != 0:
                    verts_t, color_t = self._mesh_inputs(verts_left_camspace[traj_idx], left_colors[kk])
                    verts_list.append(verts_t)
                    colors_list.append(color_t)
                    faces_list.append(self.faces_left)
                if right_hand_mask[kk] != 0:
                    verts_t, color_t = self._mesh_inputs(verts_right_camspace[traj_idx], right_colors[kk])
                    verts_list.append(verts_t)
                    colors_list.append(color_t)
                    faces_list.append(self.faces_right)

                if verts_list:
                    rend, mask = renderer.render(verts_list, faces_list, colors_list)
                    # Reverse the channel order of the rendering so it matches the
                    # frames being blended (presumably RGB -> BGR — confirm against Renderer).
                    rend = rend[..., ::-1]

                    color_mesh = rend.astype(np.float32) / 255.0
                    valid_mask = mask[..., None].astype(np.float32)

                    # Alpha blend: keep the background outside the mesh; inside it,
                    # mix mesh and background with weight transp_k.
                    background = curr_img_overlay[:, :, :3]
                    curr_img_overlay = (
                        background * (1 - valid_mask)
                        + color_mesh[:, :, :3] * valid_mask * transp_k
                        + background * valid_mask * (1 - transp_k)
                    )

                if mode == 'first':
                    # One output frame per trajectory step, even when no hand was drawn.
                    all_save_frames.append(self._to_uint8_rgb(curr_img_overlay))

            if mode == 'cam' or mode == 'full':
                # One output frame per video frame, after all steps were blended in.
                all_save_frames.append(self._to_uint8_rgb(curr_img_overlay))

        print(f'Finished rendering with mode: {mode}')
        return all_save_frames

    def process_episode(self, episode_name: str):
        """
        Load one episode's data, render every configured mode, and save the video.

        The episode name is split on '_': the first token is taken as the
        dataset name and the last two tokens as the episode identifier; what
        remains in between is the video file name (without extension).

        The episode is skipped (with a printed message) when the label file is
        missing, the video cannot be opened, or no frames could be read.

        Args:
            episode_name: Episode identifier; also the stem of the label
                ``.npy`` file and of the output ``.mp4`` file.
        """
        print(f'\nProcessing episode: {episode_name}')

        name_parts = episode_name.split('_')
        dataset_name = name_parts[0]
        ep_name = name_parts[-2] + '_' + name_parts[-1]
        # Strip only the leading dataset prefix and the trailing episode suffix;
        # str.replace would also remove matching text inside the video name.
        video_name = episode_name.removeprefix(f'{dataset_name}_').removesuffix(f'_{ep_name}')
        video_path = os.path.join(self.config.VIDEO_ROOT, f'{video_name}.mp4')
        label_path = os.path.join(self.config.LABEL_ROOT, episode_name + '.npy')

        if not os.path.exists(label_path):
            print(f'Episode file {label_path} does not exist, skipping...')
            return

        # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load
        # label files from trusted sources.
        episode_info = np.load(label_path, allow_pickle=True).item()
        start_frame, end_frame = get_frame_interval(episode_info)

        # Read the frames first so an unreadable video is detected before the
        # (GPU) MANO forward pass, and always release the capture handle.
        cap = cv2.VideoCapture(video_path)
        try:
            if not cap.isOpened():
                print(f'Video file {video_path} could not be opened, skipping...')
                return
            video_frames = read_video_frames(cap, start_frame=start_frame, end_frame=end_frame, interval=1)
        finally:
            cap.release()

        if len(video_frames) == 0:
            print(f'No frames could be read from {video_path}, skipping...')
            return

        R_w2c, t_w2c, normalized_intrinsics = get_camera_info(episode_info)
        caption_left, caption_right, hand_type = get_caption_info(episode_info)
        (verts_left_worldspace, left_hand_mask), (verts_right_worldspace, right_hand_mask) = \
            get_hand_labels(episode_info, self.mano)

        resize_video_frames = resize_frames_to_long_side(video_frames, self.config.RENDER_SIZE_LONG_SIDE)
        H, W, _ = resize_video_frames[0].shape

        # Scale the normalized intrinsics to the resized frame size to obtain
        # the focal lengths in pixels.
        intrinsics_denorm = normalized_intrinsics.copy()
        intrinsics_denorm[0] *= W
        intrinsics_denorm[1] *= H
        fx_exo = intrinsics_denorm[0, 0]
        fy_exo = intrinsics_denorm[1, 1]

        renderer = Renderer(W, H, (fx_exo, fy_exo), 'cuda')

        hand_traj_wordspace = (verts_left_worldspace, verts_right_worldspace)
        hand_mask = (left_hand_mask, right_hand_mask)
        extrinsics = (R_w2c, t_w2c)

        # One list of rendered frames per mode, in the order of self.all_modes.
        all_rendered_frames = [
            self._render_hand_trajectory(
                resize_video_frames,
                hand_traj_wordspace,
                hand_mask,
                extrinsics,
                renderer,
                mode=mode,
            )
            for mode in self.all_modes
        ]

        num_frames = len(all_rendered_frames[0])

        # The annotated ("primary") hand always shows its first caption; the
        # other hand's caption is looked up per frame by interval.
        caption_primary = caption_right if hand_type == 'right' else caption_left
        caption_opposite = caption_left if hand_type == 'right' else caption_right
        opposite_intervals = [interval for _, interval in caption_opposite]
        overlay_text_primary = caption_primary[0][0]

        final_save_frames = []
        for frame_idx in range(num_frames):
            # Place the renderings of all modes side by side.
            curr_img_overlay = np.concatenate(
                [mode_frames[frame_idx] for mode_frames in all_rendered_frames],
                axis=1,
            )

            opposite_idx = find_caption_index(frame_idx, opposite_intervals)
            overlay_text_opposite = caption_opposite[opposite_idx][0] if opposite_idx is not None else 'None.'

            overlay_text_full = generate_overlay_text(
                overlay_text_primary,
                overlay_text_opposite,
                hand_type,
            )
            # The return value is not used; the frame appended below is the one passed in.
            add_overlay_text(curr_img_overlay, overlay_text_full)

            final_save_frames.append(curr_img_overlay)

        os.makedirs(self.config.SAVE_PATH, exist_ok=True)
        save_file = f'{self.config.SAVE_PATH}/{episode_name}.mp4'
        save_to_video(final_save_frames, save_file, fps=self.config.FPS)
        print(f'\nSuccessfully saved episode to {save_file}')
|
|
|
|
|
def find_caption_index(frame_index: int, intervals: list[tuple[int, int]]) -> Optional[int]:
    """
    Return the position of the first interval containing ``frame_index``.

    Both interval bounds are inclusive. ``None`` is returned when no interval
    matches (including when ``intervals`` is empty).
    """
    matches = (
        position
        for position, (lower, upper) in enumerate(intervals)
        if lower <= frame_index <= upper
    )
    return next(matches, None)
|
|
|
|
|
def generate_hand_colors(T: int, left_cmap: str, right_cmap: str) -> tuple[np.ndarray, np.ndarray]:
    """
    Sample per-frame colors for the left and right hand from two colormaps.

    ``T`` evenly spaced positions in [0, 0.95] are evaluated on each named
    matplotlib colormap; only the first three channels of each sample are
    kept, so each returned array has shape (T, 3).
    """
    positions = np.linspace(0, 0.95, T)

    def sample(cmap_name):
        # Evaluate the colormap and drop the trailing (alpha) channel.
        return mpl.colormaps.get_cmap(cmap_name)(positions)[:, :3]

    left_colors = sample(left_cmap)
    right_colors = sample(right_cmap)
    return left_colors, right_colors
|
|
|
|
|
def get_frame_interval(episode_info: dict) -> tuple[int, int]:
    """
    Return the ``(start, end)`` frame range of an episode.

    ``start`` is the first entry of ``episode_info['video_decode_frame']``
    (inclusive) and ``end`` is its last entry plus one (exclusive).
    """
    decode_frames = episode_info['video_decode_frame']
    first, last = decode_frames[0], decode_frames[-1]
    return first, last + 1
|
|
|
|
|
def normalize_camera_intrinsics(intrinsics: np.ndarray) -> np.ndarray:
    """
    Normalize camera intrinsics by the image size implied by the principal point.

    The principal point is assumed to sit at the image center, so the image
    width is taken as ``2 * cx`` and the height as ``2 * cy``. Row 0 is
    divided by the width and row 1 by the height; after normalization the
    principal point entries are therefore 0.5.

    Args:
        intrinsics: Intrinsic matrix indexed as ``[row, col]`` with ``cx`` at
            ``[0, 2]`` and ``cy`` at ``[1, 2]``. The input is never modified.

    Returns:
        A new array holding the normalized intrinsics. Floating-point inputs
        keep their dtype; any other dtype is promoted to float64.
    """
    normalized_intrinsics = np.array(intrinsics, copy=True)
    # In-place true division cannot write fractional results into an integer
    # array (numpy raises a casting error), so promote non-float inputs first.
    if not np.issubdtype(normalized_intrinsics.dtype, np.floating):
        normalized_intrinsics = normalized_intrinsics.astype(np.float64)

    # The divisor is evaluated to a scalar before the row is updated in place.
    normalized_intrinsics[0] /= normalized_intrinsics[0, 2] * 2
    normalized_intrinsics[1] /= normalized_intrinsics[1, 2] * 2
    return normalized_intrinsics
|
|
|
|
|
def get_camera_info(episode_info: dict) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Split the episode's camera data into rotation, translation and intrinsics.

    From ``episode_info['extrinsics']`` (indexed ``[frame, row, col]``) the
    upper-left 3x3 block is returned as the rotation and the first three rows
    of the last column (kept as a column, i.e. trailing dimension of 1) as the
    translation. ``episode_info['intrinsics']`` is passed through
    ``normalize_camera_intrinsics``. All returned arrays are copies.
    """
    world_to_cam = episode_info['extrinsics']
    rotation = world_to_cam[:, :3, :3].copy()
    # The 3: slice (rather than index 3) keeps a trailing axis of length 1.
    translation = world_to_cam[:, :3, 3:].copy()

    raw_intrinsics = episode_info['intrinsics'].copy()
    return rotation, translation, normalize_camera_intrinsics(raw_intrinsics)
|
|
|
|
|
def get_caption_info(episode_info: dict) -> tuple[list, list, str]:
    """
    Collect the left/right captions and the annotated hand type of an episode.

    Captions are read from ``episode_info['text']`` under the keys 'left' and
    'right'. A missing or empty caption list is replaced by a single
    ``['None.', (0, 10000)]`` entry, whose wide interval is meant to cover
    every frame.

    Returns:
        ``(caption_left, caption_right, hand_type)`` where ``hand_type`` is
        ``episode_info['anno_type']``.
    """
    hand_type = episode_info['anno_type']
    texts = episode_info['text']

    # `or` substitutes the placeholder for both a missing key and an empty value.
    caption_right = texts.get('right', []) or [['None.', (0, 10000)]]
    caption_left = texts.get('left', []) or [['None.', (0, 10000)]]

    return caption_left, caption_right, hand_type
|
|
|
|
|
def get_hand_labels(episode_info: dict, mano: MANO):
    """
    Compute world-space hand vertices for both hands of an episode.

    The 'left' and 'right' label dicts of ``episode_info`` are each run
    through ``process_single_hand_labels`` together with their
    ``'kept_frames'`` mask (the left hand with ``is_left=True``).

    Returns:
        ``((verts_left, left_mask), (verts_right, right_mask))``.
    """
    labels_left, labels_right = episode_info['left'], episode_info['right']

    # Left hand.
    mask_left = labels_left['kept_frames']
    left_result = (
        process_single_hand_labels(labels_left, mask_left, mano, is_left=True)[0],
        mask_left,
    )

    # Right hand.
    mask_right = labels_right['kept_frames']
    right_result = (
        process_single_hand_labels(labels_right, mask_right, mano)[0],
        mask_right,
    )

    return left_result, right_result
|
|
|
|
|
def process_single_hand_labels(hand_labels: dict, hand_mask: np.ndarray, mano: MANO, is_left: bool = False):
    """
    Compute world-space MANO vertices for a single hand (left or right).

    The MANO model is evaluated with an identity global orientation; the
    resulting mesh is then centered on its first joint, rotated by the
    annotated world-space wrist orientation and translated to the annotated
    world-space wrist position.

    Args:
        hand_labels: Dict providing 'transl_worldspace',
            'global_orient_worldspace', 'beta' and 'hand_pose'. It is not
            modified by this function.
        hand_mask: Per-frame validity mask; frames whose entry equals 0 get
            identity rotations as hand pose.
        mano: MANO model, called with ``betas``, ``hand_pose`` and
            ``global_orient``; its output must expose ``vertices`` and
            ``joints``.
        is_left: When True, the X coordinate of vertices and joints is negated
            (presumably mirroring a right-hand model — confirm against MANO).

    Returns:
        ``(verts_worldspace, first_joint)`` where ``first_joint`` is
        ``joints[:, 0]`` of the MANO output (after optional mirroring, before
        the world-space transform).
    """
    T = len(hand_mask)

    wrist_worldspace = hand_labels['transl_worldspace'].reshape(-1, 1, 3)
    wrist_orientation = hand_labels['global_orient_worldspace']
    beta = hand_labels['beta']
    pose = hand_labels['hand_pose']

    # Replace the pose of masked-out frames with identity rotations. Work on a
    # copy so the caller's label array is left untouched.
    invalid_frames = (np.asarray(hand_mask) == 0)
    if np.any(invalid_frames):
        pose = pose.copy()
        identity = np.eye(3, dtype=pose.dtype)
        # One 3x3 identity per entry along pose's second axis.
        pose[invalid_frames] = np.broadcast_to(identity, (pose.shape[1], 3, 3))

    # The same shape parameters are used for every frame.
    beta_torch = torch.from_numpy(beta).float().cuda().unsqueeze(0).repeat(T, 1)
    pose_torch = torch.from_numpy(pose).float().cuda()

    # Identity global rotation: the wrist orientation is applied manually below.
    global_rot_placeholder = torch.eye(3).float().unsqueeze(0).unsqueeze(0).cuda().repeat(T, 1, 1, 1)

    mano_out = mano(betas=beta_torch, hand_pose=pose_torch, global_orient=global_rot_placeholder)

    verts = mano_out.vertices.cpu().numpy()
    joints = mano_out.joints.cpu().numpy()

    if is_left:
        # Mirror along the X axis.
        verts[:, :, 0] *= -1
        joints[:, :, 0] *= -1

    # Center on the first joint, rotate by the wrist orientation, then move to
    # the world-space wrist position.
    verts_worldspace = (
        wrist_orientation @
        (verts - joints[:, 0][:, None]).transpose(0, 2, 1)
    ).transpose(0, 2, 1) + wrist_worldspace

    return verts_worldspace, joints[:, 0]
|
|
|
|
|
def generate_overlay_text(overlay_text: str, overlay_text_opposite: str, hand_type: str) -> str:
    """
    Build the 'Left: ... | Right: ...' caption line.

    ``overlay_text`` belongs to the primary hand named by ``hand_type``; it is
    placed on the right side when ``hand_type`` is 'right' and on the left
    side for any other value. ``overlay_text_opposite`` fills the other side.
    """
    if hand_type == 'right':
        left_text, right_text = overlay_text_opposite, overlay_text
    else:
        left_text, right_text = overlay_text, overlay_text_opposite
    return f'Left: {left_text} | Right: {right_text}'