HikariDawn committed on
Commit 5000b0a · 1 Parent(s): 1577493

feat: initial push

.gitignore ADDED
@@ -0,0 +1,33 @@
1
+ *.csv
2
+ *.mp4
3
+ *.png
4
+ *.jpg
5
+ *.err
6
+ *.txt
7
+ *.log
8
+ *.pyc
9
+ *.pth
10
+ *.DS_Store*
11
+ *.o
12
+ *.so
13
+ *.egg*
14
+ *.json
15
+ *.zip
16
+ *.jpeg
17
+ *.pkl
18
+ *.gif
19
+ *.pem
20
+ *.npy
21
+ *.sh
22
+
23
+
24
+ pretrained/*
25
+ checkpoints/*
26
+ preprocess/sam2_code
27
+
28
+ !preprocess/oneformer_code/oneformer/data/bpe_simple_vocab_16e6.txt
29
+ !config/*.json
30
+ !requirements.txt
31
+ !requirements/*
32
+ !__assets__/*
33
+ !__assets__/page/*
app.py ADDED
@@ -0,0 +1,893 @@
1
+ import os, sys, shutil
2
+ import csv
3
+ import numpy as np
4
+ import ffmpeg
5
+ import cv2
6
+ import collections
7
+ import json
8
+ import math
9
+ import time
10
+ import imageio
11
+ import random
12
+ import ast
13
+ import gradio as gr
14
+ from omegaconf import OmegaConf
15
+ from PIL import Image
16
+ from segment_anything import SamPredictor, sam_model_registry
17
+
18
+
19
+ os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
20
+ import torch
21
+ from torch.utils.data import DataLoader, Dataset
22
+ from torchvision import transforms
23
+ from diffusers import AutoencoderKLCogVideoX
24
+ from transformers import T5EncoderModel
25
+ from diffusers.utils import export_to_video, load_image
26
+
27
+
28
+ # Import files from the local folder
29
+ root_path = os.path.abspath('.')
30
+ sys.path.append(root_path)
31
+ from pipelines.pipeline_cogvideox_i2v_motion_FrameINO import CogVideoXImageToVideoPipeline
32
+ from architecture.cogvideox_transformer_3d import CogVideoXTransformer3DModel
33
+ from data_loader.video_dataset_motion import VideoDataset_Motion
34
+ from architecture.transformer_wan import WanTransformer3DModel
35
+ from pipelines.pipeline_wan_i2v_motion_FrameINO import WanImageToVideoPipeline
36
+ from architecture.autoencoder_kl_wan import AutoencoderKLWan
37
+
38
+
39
+
40
+ MARKDOWN = \
41
+ """
42
+ <div align='center'>
43
+ <h1> Frame In-N-Out </h1> \
44
+ <h2 style='font-weight: 450; font-size: 1rem; margin-bottom: 1rem;'>\
45
+ <a href='https://kiteretsu77.github.io/BoyangWang/'>Boyang Wang</a>, <a href='https://xuweiyichen.github.io/'>Xuweiyi Chen</a>, <a href='http://mgadelha.me/'>Matheus Gadelha</a>, <a href='https://sites.google.com/site/zezhoucheng/'>Zezhou Cheng</a>\
46
+ </h2> \
47
+
48
+ <div style="display: flex; flex-wrap: wrap; justify-content: center; gap: 2rem; margin-bottom: 1rem;">
49
+ <!-- First row of buttons -->
50
+ <a href="https://arxiv.org/abs/2505.21491" target="_blank"
51
+ style="display: inline-flex; align-items: center; padding: 0.5rem 1rem; background-color: #f0f0f0; /* light gray background */ color: #333; /* dark text */ text-decoration: none; border-radius: 9999px; font-weight: 500; transition: background-color 0.3s;">
52
+ <span style="margin-right: 0.5rem;">📄</span> <!-- Document icon -->
53
+ <span>Paper</span>
54
+ </a>
55
+ <a href="https://github.com/UVA-Computer-Vision-Lab/FrameINO" target="_blank"
56
+ style="display: inline-flex; align-items: center; padding: 0.5rem 1rem; background-color: #f0f0f0; color: #333; text-decoration: none; border-radius: 9999px; font-weight: 500; transition: background-color 0.3s;">
57
+ <span style="margin-right: 0.5rem;">💻</span> <!-- Computer icon -->
58
+ <span>GitHub</span>
59
+ </a>
60
+ <a href="https://uva-computer-vision-lab.github.io/Frame-In-N-Out" target="_blank"
61
+ style="display: inline-flex; align-items: center; padding: 0.5rem 1rem; background-color: #f0f0f0; color: #333; text-decoration: none; border-radius: 9999px; font-weight: 500; transition: background-color 0.3s;">
62
+ <span style="margin-right: 0.5rem;">🤖</span>
63
+ <span>Project Page</span>
64
+ </a>
65
+ <a href="https://huggingface.co/collections/uva-cv-lab/frame-in-n-out" target="_blank"
66
+ style="display: inline-flex; align-items: center; padding: 0.5rem 1rem; background-color: #f0f0f0; color: #333; text-decoration: none; border-radius: 9999px; font-weight: 500; transition: background-color 0.3s;">
67
+ <span style="margin-right: 0.5rem;">🤗</span>
68
+ <span>HF Model and Data</span>
69
+ </a>
70
+ </div>
71
+
72
+
73
+ </div>
74
+
75
+ Frame In-N-Out expands the first-frame condition to a broader canvas region, set by the top-left and bottom-right expansion amounts.
76
+ Users can provide a motion trajectory for existing objects, bring a brand-new identity into the scene along a motion trajectory, or do both. <br>
77
+ The model used here is <b>Wan2.2-5B</b> trained with our Frame In-N-Out control mechanism.
78
+
79
+
80
+ <br>
81
+ <b>Easiest way:</b>
82
+ Choose one example and then simply click <b>Generate</b>.
83
+
84
+ <br>
85
+ <br>
86
+ ❗️❗️❗️Instruction Steps:<br>
87
+ 1️⃣ Upload your first frame image. Set the size you want to resize to for <b>Resized Height for Input Image</b> and <b>Resized Width for Input Image</b>. <br>
88
+ 2️⃣ Set your <b>canvas top-left</b> and <b>bottom-right expansion</b>. The combined canvas height and width must each be a multiple of 32. <br>
89
+ PLEASE ENSURE that <b>Canvas HEIGHT = 704</b> and <b>Canvas WIDTH = 1280</b> for the best performance (current training resolution). <br>
90
+ 3️⃣ Click <b>Build the Canvas</b>. <br>
91
+ 4️⃣ Provide the trajectory of the main object in the canvas by clicking on the <b>Expanded Canvas</b>. <br>
92
+ 5️⃣ Provide the ID reference image and its trajectory (optional). Also, write a detailed <b>text prompt</b>. <br>
93
+ 6️⃣ Click the <b>Generate</b> button to start the video generation. <br>
94
+
95
+
96
+ If **Frame In-N-Out** is helpful, please help star the [GitHub Repo](https://github.com/UVA-Computer-Vision-Lab/FrameINO?tab=readme-ov-file). Thanks!
97
+
98
+ """
99
+
100
+
101
+
102
+ # Color
103
+ all_color_codes = [(255, 0, 0), (255, 255, 0), (0, 255, 0), (0, 255, 255),
104
+ (255, 0, 255), (0, 0, 255), (128, 128, 128), (64, 224, 208),
105
+ (233, 150, 122)]
106
+ for _ in range(100): # Should not be over 100 colors
107
+ all_color_codes.append((random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)))
108
+
109
+ # Data Transforms
110
+ train_transforms = transforms.Compose(
111
+ [
112
+ transforms.Lambda(lambda x: x / 255.0 * 2.0 - 1.0),
113
+ ]
114
+ )
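A quick check of the normalization above: the Lambda maps uint8 pixel values from [0, 255] into the [-1, 1] range used by the rest of the pipeline.

# x / 255.0 * 2.0 - 1.0
#   0.0  -> -1.0
# 127.5  ->  0.0
# 255.0  ->  1.0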
115
+
116
+
117
+
118
+
119
+
120
+ ######################################################## CogVideoX #################################################################
121
+
122
+ # Path Setting
123
+ model_code_name = "CogVideoX"
124
+ base_model_id = "zai-org/CogVideoX-5b-I2V"
125
+ transformer_ckpt_path = "uva-cv-lab/FrameINO_CogVideoX_Stage2_MotionINO_v1.0"
126
+
127
+ # Load Model
128
+ transformer = CogVideoXTransformer3DModel.from_pretrained(transformer_ckpt_path, torch_dtype=torch.float16)
129
+ text_encoder = T5EncoderModel.from_pretrained(base_model_id, subfolder="text_encoder", torch_dtype=torch.float16)
130
+ vae = AutoencoderKLCogVideoX.from_pretrained(base_model_id, subfolder="vae", torch_dtype=torch.float16)
131
+
132
+ # Create pipeline and run inference
133
+ pipe = CogVideoXImageToVideoPipeline.from_pretrained(
134
+ base_model_id,
135
+ text_encoder = text_encoder,
136
+ transformer = transformer,
137
+ vae = vae,
138
+ torch_dtype = torch.float16,
139
+ )
140
+ pipe.enable_model_cpu_offload()
141
+
142
+ #####################################################################################################################################
143
+
144
+
145
+
146
+
147
+ ######################################################## Wan2.2 5B #################################################################
148
+
149
+ # Path Setting
150
+ model_code_name = "Wan"
151
+ base_model_id = "Wan-AI/Wan2.2-TI2V-5B-Diffusers"
152
+ transformer_ckpt_path = "uva-cv-lab/FrameINO_Wan2.2_5B_Stage2_MotionINO_v1.5"
153
+
154
+
155
+ # Load model
156
+ print("Loading the model!")
157
+ transformer = WanTransformer3DModel.from_pretrained(transformer_ckpt_path, torch_dtype=torch.float16)
158
+ vae = AutoencoderKLWan.from_pretrained(base_model_id, subfolder="vae", torch_dtype=torch.float32)
159
+
160
+ # Create the pipeline
161
+ print("Loading the pipeline!")
162
+ pipe = WanImageToVideoPipeline.from_pretrained(base_model_id, transformer=transformer, vae=vae, torch_dtype=torch.bfloat16)
163
+ pipe.to("cuda")
164
+ pipe.enable_model_cpu_offload()
165
+
166
+ #####################################################################################################################################
167
+
168
+
169
+
170
+
171
+ ########################################################## Other Auxiliary Func #################################################################
172
+
173
+ # Init SAM model
174
+ model_type = "vit_h" # vit_h has the largest number of parameters
175
+ sam_pretrained_path = "pretrained/sam_vit_h_4b8939.pth"
176
+ if not os.path.exists(sam_pretrained_path):
177
+ os.system("wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth -P pretrained/")
178
+ sam = sam_model_registry[model_type](checkpoint = sam_pretrained_path).to(device="cuda")
179
+ sam_predictor = SamPredictor(sam) # Wraps SAM with its default prediction settings
180
+
181
+ #####################################################################################################################################
182
+
183
+
184
+
185
+
186
+ # Examples Sample
187
+ def get_example():
188
+ case = [
189
+ [
190
+ '__assets__/horse.jpg',
191
+ 480,
192
+ 736,
193
+ 128,
194
+ 224,
195
+ 96,
196
+ 320,
197
+ '__assets__/sheep.png',
198
+ "A brown horse with a black mane walks to the right on a wooden path in a green forest, and then a white sheep enters from the left and walks toward it. Natural daylight, realistic texture, smooth motion, cinematic focus, 4K detail.",
199
+ [[[[299, 241], [390, 236], [461, 245], [521, 249], [565, 240], [612, 246], [666, 245]], [[449, 224], [488, 212], [512, 206], [531, 209], [552, 202], [581, 204], [609, 210], [657, 206], [703, 202], [716, 211]]], [[[24, 305], [104, 300], [167, 299], [219, 303], [270, 296], [295, 304]]]],
200
+ ],
201
+
202
+ [
203
+ '__assets__/cup.jpg',
204
+ 448,
205
+ 736,
206
+ 256,
207
+ 64,
208
+ 0,
209
+ 480,
210
+ '__assets__/hand2.png',
211
+ "A human hand reaches into the frame, gently grabbing the black metal cup with a golden character design on the front, lifting it off the table and taking it away.",
212
+ [[[[565, 324], [473, 337], [386, 345], [346, 340], [339, 324], [352, 212], [328, 114], [328, 18], [348, 0]]]],
213
+ ],
214
+
215
+ [
216
+ '__assets__/grass.jpg',
217
+ 512,
218
+ 800,
219
+ 64,
220
+ 64,
221
+ 160,
222
+ 416,
223
+ '__assets__/dog.png',
224
+ "A fluffy, adorable puppy joyfully sprints onto the bright green grass, its fur bouncing with each step as sunlight highlights its soft coat. The scene takes place in a peaceful park filled with tall trees casting gentle shadows across the lawn. After dashing forward with enthusiasm, the puppy slows to a happy trot, continuing farther ahead into the deeper area of the park, disappearing toward the more shaded grass beneath the trees.",
225
+ [[[[600, 412], [512, 394], [408, 358], [333, 336], [270, 313], [259, 260], [236, 222], [231, 180]], [[592, 392], [295, 305], [256, 217], [243, 163]]]],
226
+ ],
227
+
228
+ [
229
+ '__assets__/man_scene.jpg',
230
+ 576,
231
+ 1024,
232
+ 64,
233
+ 32,
234
+ 64,
235
+ 224,
236
+ None,
237
+ "A single hiker, equipped with a backpack, walks toward the right side of a rugged mountainside trail. The bright sunlight highlights the pale rocky terrain around him, while massive stone cliffs loom in the background. Sparse patches of grass and scattered boulders sit along the path, emphasizing the isolation and vastness of the mountain environment as he steadily continues his journey.",
238
+ [[[[342, 247], [415, 247], [478, 262], [518, 271], [570, 275], [613, 283], [646, 308], [690, 307], [705, 325]], [[349, 227], [461, 232], [536, 254], [595, 252], [638, 269], [691, 289], [715, 291]], [[341, 283], [415, 291], [500, 316], [590, 317], [632, 354], [675, 362], [711, 372]]]],
239
+ ]
240
+
241
+ ]
242
+ return case
243
+
244
+
245
+
246
+
247
+ def on_example_click(
248
+ input_image, resized_height, resized_width,
249
+ top_left_height, top_left_width, bottom_right_height, bottom_right_width,
250
+ identity_image, text_prompt, traj_lists,
251
+ ):
252
+
253
+ # Convert
254
+ traj_lists = ast.literal_eval(traj_lists)
255
+ # Note: the remaining arguments (e.g., resized_width and resized_height) need no conversion here; they are handled inside the called functions
256
+
257
+
258
+ # First build the canvas (the empty traj_lists & traj_instance_idx returned by build_canvas are discarded)
259
+ visual_canvas, initial_visual_canvas, inference_canvas, _, _ = build_canvas(input_image, resized_height, resized_width, top_left_height, top_left_width, bottom_right_height, bottom_right_width)
260
+
261
+
262
+ # Sequentially load the Trajs of all instances on the canvas
263
+ visual_canvas, traj_instance_idx = fn_vis_all_instance_traj(visual_canvas, traj_lists)
264
+
265
+
266
+ return visual_canvas, initial_visual_canvas, inference_canvas, traj_instance_idx
267
+
268
+
269
+
270
+ def build_canvas(input_image_path, resized_height, resized_width, top_left_height, top_left_width, bottom_right_height, bottom_right_width):
271
+
272
+ # Init
273
+ canvas_color = (250, 249, 246) # An off-white similar to painting paper
274
+
275
+
276
+ # Convert the string to integer
277
+ if not resized_height.isdigit():
278
+ raise gr.Error("resized_height must be an integer!")
279
+ resized_height = int(resized_height)
280
+
281
+ if not resized_width.isdigit():
282
+ raise gr.Error("resized_width must be an integer!")
283
+ resized_width = int(resized_width)
284
+
285
+ if not top_left_height.isdigit():
286
+ raise gr.Error("top_left_height must be an integer!")
287
+ top_left_height = int(top_left_height)
288
+
289
+ if not top_left_width.isdigit():
290
+ raise gr.Error("top_left_width must be an integer!")
291
+ top_left_width = int(top_left_width)
292
+
293
+ if not bottom_right_height.isdigit():
294
+ raise gr.Error("bottom_right_height must be an integer!")
295
+ bottom_right_height = int(bottom_right_height)
296
+
297
+ if not bottom_right_width.isdigit():
298
+ raise gr.Error("bottom_right_width must be an integer!")
299
+ bottom_right_width = int(bottom_right_width)
300
+
301
+
302
+
303
+ # Read the original image and prepare the placeholder
304
+ first_frame_img = np.uint8(np.asarray(Image.open(input_image_path))) # NOTE: PIL loads RGB; mind the channel order in the later OpenCV calls for the ID Reference
305
+ # print("first_frame_img shape is ", first_frame_img.shape)
306
+
307
+ # Resize to a uniform resolution
308
+ first_frame_img = cv2.resize(first_frame_img, (resized_width, resized_height), interpolation = cv2.INTER_AREA)
309
+
310
+ # Expand to Outside Region to form the Canvas
311
+ expand_height = resized_height + top_left_height + bottom_right_height
312
+ expand_width = resized_width + top_left_width + bottom_right_width
313
+ inference_canvas = np.uint8(np.zeros((expand_height, expand_width, 3))) # Whole Black Canvas, same as other inference
314
+ visual_canvas = np.full((expand_height, expand_width, 3), canvas_color, dtype=np.uint8)
315
+
316
+
317
+ # Sanity Check
318
+ if expand_height % 32 != 0:
319
+ raise gr.Error("The Height of resized_height + top_left_height + bottom_right_height must be divisible by 32!")
320
+ if expand_width % 32 != 0:
321
+ raise gr.Error("The Width of resized_width + top_left_width + bottom_right_width must be divisible by 32!")
322
+
323
+
324
+ # Draw the Region Box Region (Original Resolution)
325
+ bottom_len = inference_canvas.shape[0] - bottom_right_height
326
+ right_len = inference_canvas.shape[1] - bottom_right_width
327
+ inference_canvas[top_left_height:bottom_len, top_left_width:right_len, :] = first_frame_img
328
+ visual_canvas[top_left_height:bottom_len, top_left_width:right_len, :] = first_frame_img
329
+
330
+
331
+ # Resize to the uniform height and width
332
+ visual_canvas = cv2.resize(visual_canvas, (uniform_width, uniform_height), interpolation = cv2.INTER_AREA)
333
+
334
+
335
+
336
+ # Return the visual_canvas (for visualization) and the canvas map
337
+ # Corresponds to: visual_canvas, initial_visual_canvas, inference_canvas, traj_instance_idx, traj_lists
338
+ return visual_canvas, visual_canvas.copy(), inference_canvas, 0, [ [ [] ] ] # The last two are the initialized trajectory instance idx and trajectory list
339
+
340
+
341
+
342
+
343
+ def process_points(traj_list, num_frames=49):
344
+
345
+
346
+ if len(traj_list) < 2: # First point
347
+ return [traj_list[0]] * num_frames
348
+
349
+ elif len(traj_list) >= num_frames:
350
+ gr.Info("The number of trajectory points exceeds the frame limit; the trajectory will be subsampled.") # gr.Info is shown, not raised
351
+ skip = len(traj_list) // num_frames
352
+ return traj_list[::skip][: num_frames - 1] + traj_list[-1:]
353
+
354
+ else:
355
+
356
+ insert_num = num_frames - len(traj_list)
357
+ insert_num_dict = {}
358
+ interval = len(traj_list) - 1
359
+ n = insert_num // interval
360
+ m = insert_num % interval
361
+
362
+ for i in range(interval):
363
+ insert_num_dict[i] = n
364
+
365
+ for i in range(m):
366
+ insert_num_dict[i] += 1
367
+
368
+ res = []
369
+ for i in range(interval):
370
+ insert_points = []
371
+ x0, y0 = traj_list[i]
372
+ x1, y1 = traj_list[i + 1]
373
+
374
+ delta_x = x1 - x0
375
+ delta_y = y1 - y0
376
+ for j in range(insert_num_dict[i]):
377
+ x = x0 + (j + 1) / (insert_num_dict[i] + 1) * delta_x
378
+ y = y0 + (j + 1) / (insert_num_dict[i] + 1) * delta_y
379
+ insert_points.append([int(x), int(y)])
380
+
381
+ res += traj_list[i : i + 1] + insert_points
382
+ res += traj_list[-1:]
383
+
384
+ # return
385
+ return res
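A short worked example of the interpolation branch above: with fewer clicked points than frames, intermediate points are inserted evenly between consecutive clicks.

# process_points([[0, 0], [6, 0]], num_frames=4)
# -> [[0, 0], [2, 0], [4, 0], [6, 0]]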
386
+
387
+
388
+
389
+ def fn_vis_realtime_traj(visual_canvas, traj_list, traj_instance_idx): # Visualize the traj on canvas
390
+
391
+ # Process Points
392
+ points = process_points(traj_list)
393
+
394
+ # Draw straight line to connect
395
+ for i in range(len(points) - 1):
396
+ p = points[i]
397
+ p1 = points[i + 1]
398
+ cv2.line(visual_canvas, p, p1, all_color_codes[traj_instance_idx], 5)
399
+
400
+ return visual_canvas
401
+
402
+
403
+ def fn_vis_all_instance_traj(visual_canvas, traj_lists): # Visualize all traj from all instances on canvas
404
+
405
+ for traj_instance_idx, traj_list_instance in enumerate(traj_lists):
406
+ for traj_list_line in traj_list_instance:
407
+ visual_canvas = fn_vis_realtime_traj(visual_canvas, traj_list_line, traj_instance_idx)
408
+
409
+ return visual_canvas, traj_instance_idx # Also return the instance idx
410
+
411
+
412
+ def add_traj_point(
413
+ visual_canvas,
414
+ traj_lists,
415
+ traj_instance_idx,
416
+ evt: gr.SelectData,
417
+ ): # Add new Traj and then visualize
418
+
419
+ # Convert
420
+ traj_lists = ast.literal_eval(traj_lists)
421
+
422
+ # Mark New Trajectory Key Point
423
+ horizontal, vertical = evt.index
424
+
425
+ # traj_lists data structure is: (Num of Instances, Num of Trajectories, Num of Points, [X, Y])
426
+ traj_lists[-1][-1].append( [int(horizontal), int(vertical)] )
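# An illustrative example of this nested structure (coordinates taken from the first case
# in get_example() above): two instances, the first with two trajectory lines, the second with one:
#   [ [ [ [299, 241], [390, 236], ... ], [ [449, 224], [488, 212], ... ] ],
#     [ [ [24, 305], [104, 300], ... ] ] ]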
427
+
428
+ # Draw new trajectory on the Canvas image
429
+ visual_canvas = fn_vis_realtime_traj(visual_canvas, traj_lists[-1][-1], traj_instance_idx)
430
+
431
+
432
+ # Return New Traj Marked Canvas image
433
+ return visual_canvas, traj_lists
434
+
435
+
436
+
437
+ def clear_traj_points(initial_visual_canvas):
438
+
439
+
440
+ return initial_visual_canvas.copy(), 0, [ [ [] ] ] # 1st is the initial-state canvas; 2nd is the traj instance idx; 3rd is the traj list (same data structure)
441
+
442
+
443
+ def traj_point_update(traj_lists):
444
+
445
+ # Convert
446
+ traj_lists = ast.literal_eval(traj_lists)
447
+
448
+ # Append a new trajectory line to the last instance
449
+ traj_lists[-1].append([])
450
+
451
+ return traj_lists
452
+
453
+
454
+
455
+ def traj_instance_update(traj_instance_idx, traj_lists):
456
+
457
+ # Convert
458
+ traj_lists = ast.literal_eval(traj_lists)
459
+
460
+ # Update one index
461
+ if traj_instance_idx >= len(all_color_codes):
462
+ raise gr.Error("The trajectory instance number is over the limit!")
463
+
464
+ # Add one for the traj instance
465
+ traj_instance_idx = traj_instance_idx + 1
466
+
467
+ # Append a new empty list to the traj lists
468
+ traj_lists.append([[]])
469
+
470
+ # Return
471
+ return traj_instance_idx, traj_lists
472
+
473
+
474
+
475
+ def sample_traj_by_length(points, num_samples):
476
+ # Sample points evenly along the trajectory based on Euclidean arc length
477
+
478
+ pts = np.array(points, dtype=float) # shape (M, 2)
479
+
480
+ # 1) Length of each segment
481
+ seg = pts[1:] - pts[:-1]
482
+ seg_len = np.sqrt((seg**2).sum(axis=1)) # shape (M-1,)
483
+
484
+ # 2) Cumulative length
485
+ cum = np.cumsum(seg_len)
486
+ total_length = cum[-1]
487
+
488
+ # 3) Target positions, equally spaced along the total length
489
+ target = np.linspace(0, total_length, num_samples)
490
+
491
+ res = []
492
+ for t in target:
493
+ # 4) Find which segment it falls in
494
+ idx = np.searchsorted(cum, t)
495
+ if idx == 0:
496
+ prev = 0.
497
+ else:
498
+ prev = cum[idx-1]
499
+
500
+ # 5) Interpolate within that segment
501
+ ratio = (t - prev) / seg_len[idx]
502
+ p = pts[idx] * (1 - ratio) + pts[idx+1] * ratio # linear interpolation within the segment: start * (1 - ratio) + end * ratio
505
+ res.append(p)
506
+ return np.array(res)
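A small worked example of the arc-length sampling above: for a polyline made of two equal segments, the samples are evenly spaced along the path.

# sample_traj_by_length([[0, 0], [10, 0], [20, 0]], num_samples=5)
# -> array([[ 0.,  0.], [ 5.,  0.], [10.,  0.], [15.,  0.], [20.,  0.]])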
507
+
508
+
509
+
510
+ def inference(inference_canvas, visual_canvas, text_prompt, traj_lists, main_reference_img,
511
+ resized_height, resized_width, top_left_height, top_left_width, bottom_right_height, bottom_right_width):
512
+
513
+ # TODO: enhance the text prompt by Qwen3-VL-32B?
514
+
515
+
516
+ # Convert
517
+ resized_height = int(resized_height)
518
+ resized_width = int(resized_width)
519
+ top_left_height = int(top_left_height)
520
+ top_left_width = int(top_left_width)
521
+ bottom_right_height = int(bottom_right_height)
522
+ bottom_right_width = int(bottom_right_width)
523
+ traj_lists = ast.literal_eval(traj_lists)
524
+
525
+
526
+
527
+ # Init Some Fixed Setting
528
+ if model_code_name == "Wan":
529
+ config_path = "config/train_wan_motion_FrameINO.yaml"
530
+ dot_radius = 7
531
+ num_frames = 81
532
+ elif model_code_name == "CogVideoX":
533
+ config_path = "config/train_cogvideox_i2v_motion_FrameINO.yaml"
534
+ dot_radius = 6
535
+ num_frames = 49
536
+ config = OmegaConf.load(config_path)
537
+
538
+
539
+ # Prepare tmp folders
540
+ print()
541
+ store_folder_path = "tmp_app_example_" + str(int(time.time()))
542
+ if os.path.exists(store_folder_path):
543
+ shutil.rmtree(store_folder_path)
544
+ os.makedirs(store_folder_path)
545
+
546
+
547
+ # Write the visual canvas
548
+ visual_canvas_store_path = os.path.join(store_folder_path, "visual_canvas.png")
549
+ cv2.imwrite( visual_canvas_store_path, cv2.cvtColor(visual_canvas, cv2.COLOR_BGR2RGB) )
550
+
551
+
552
+
553
+ # Resize the map
554
+ canvas_width = resized_width + top_left_width + bottom_right_width
555
+ canvas_height = resized_height + top_left_height + bottom_right_height
556
+ # inference_canvas = cv2.resize(visual_canvas, (canvas_width, canvas_height), interpolation = cv2.INTER_AREA)
557
+ print("Canvas Shape is", str(canvas_height) + "x" + str(canvas_width) )
558
+
559
+
560
+ # TODO: also enhance this text prompt so its complexity feels consistent with the Qwen-generated prompts...
561
+
562
+ # Save the text prompt
563
+ print("Text Prompt is", text_prompt)
564
+ with open(os.path.join(store_folder_path, 'text_prompt.txt'), 'w') as file:
565
+ file.write(text_prompt)
566
+
567
+
568
+ ################################################## Motion Trajectory Condition #####################################################
569
+
570
+ # Prepare the points in a linear way
571
+ full_pred_tracks = [[] for _ in range(num_frames)]
572
+ ID_tensor = None
573
+
574
+ # Iterate all tracking information for all objects
575
+ print("traj_lists is", traj_lists)
576
+ for instance_idx, traj_list_per_object in enumerate(traj_lists):
577
+
578
+ # Iterate all trajectory lines in one instance
579
+ for traj_idx, single_trajectory in enumerate(traj_list_per_object):
580
+
581
+ # Sanity Check
582
+ if len(single_trajectory) < 2:
583
+ raise gr.Error("One of the provided trajectories is too short!")
584
+
585
+
586
+ # Sample the points based on the Euclidean arc length
587
+ sampled_points = sample_traj_by_length(single_trajectory, num_frames)
588
+
589
+
590
+ # Iterate all points
591
+ temporal_idx = 0
592
+ for (raw_point_x, raw_point_y) in sampled_points:
593
+
594
+ # Scale the point coordinate to the inference size (actual canvas size)
595
+ point_x, point_y = int(raw_point_x * canvas_width / uniform_width), int(raw_point_y * canvas_height / uniform_height) # Clicking on the board is with respect to the Uniform Preset Height and Width
596
+
597
+ if traj_idx == 0: # Needs to init the list in list
598
+ full_pred_tracks[temporal_idx].append( [] )
599
+ full_pred_tracks[temporal_idx][-1].append( (point_x, point_y) ) # [-1] and [instance_idx] should have the same effect
600
+ temporal_idx += 1
601
+
602
+
603
+ # Create the traj tensor
604
+ traj_tensor, traj_imgs_np, _, img_with_traj = VideoDataset_Motion.prepare_traj_tensor(
605
+ full_pred_tracks, canvas_height, canvas_width,
606
+ [], dot_radius, canvas_width, canvas_height,
607
+ idx=0, first_frame_img = inference_canvas
608
+ )
609
+
610
+
611
+ # Store Trajectory
612
+ imageio.mimsave(os.path.join(store_folder_path, "traj_video.mp4"), traj_imgs_np, fps=8)
613
+
614
+ ######################################################################################################################################################
615
+
616
+
617
+
618
+ ########################################## Prepare the Identity Reference Condition #####################################################
619
+
620
+
621
+ # ID reference preparation
622
+ if main_reference_img is not None:
623
+ print("We have an ID reference being used!")
624
+
625
+ # Fetch
626
+ ref_h, ref_w, _ = main_reference_img.shape
627
+
628
+
629
+ # Set the reference image for SAM prediction
630
+ sam_predictor.set_image(np.uint8(main_reference_img))
631
+
632
+
633
+ # Define the sample point
634
+ sam_points = [(ref_w//2, ref_h//2)] # A single center point is enough as the SAM prompt
635
+
636
+
637
+ # Build positive point prompts for SAM
638
+ positive_point_cords = np.array(sam_points)
639
+ positive_point_labels = np.ones(len(positive_point_cords))
640
+
641
+ # Predict the mask based on the point and bounding box designed
642
+ masks, scores, logits = sam_predictor.predict(
643
+ point_coords = positive_point_cords,
644
+ point_labels = positive_point_labels,
645
+ multimask_output = False,
646
+ )
647
+ mask = masks[0]
648
+ main_reference_img[mask == False] = 0 # Zero out everything outside the predicted mask
649
+
650
+
651
+ # Resize so that the reference fits within the canvas resolution
652
+ scale_h = canvas_height / max(ref_h, ref_w)
653
+ scale_w = canvas_width / max(ref_h, ref_w)
654
+ new_h, new_w = int(ref_h * scale_h), int(ref_w * scale_w)
655
+ main_reference_img = cv2.resize(main_reference_img, (new_w, new_h), interpolation = cv2.INTER_AREA)
656
+
657
+ # Calculate padding amounts on all direction
658
+ pad_height1 = (canvas_height - main_reference_img.shape[0]) // 2
659
+ pad_height2 = canvas_height - main_reference_img.shape[0] - pad_height1
660
+ pad_width1 = (canvas_width - main_reference_img.shape[1]) // 2
661
+ pad_width2 = canvas_width - main_reference_img.shape[1] - pad_width1
662
+
663
+ # Apply padding to match the resolution of the training frames
664
+ main_reference_img = np.pad(
665
+ main_reference_img,
666
+ ((pad_height1, pad_height2), (pad_width1, pad_width2), (0, 0)),
667
+ mode = 'constant',
668
+ constant_values = 0
669
+ )
670
+
671
+ cv2.imwrite(os.path.join(store_folder_path, "ID.png"), cv2.cvtColor(main_reference_img, cv2.COLOR_BGR2RGB))
672
+
673
+ elif main_reference_img is None:
674
+ # Whole Black Color placeholder
675
+ main_reference_img = np.uint8(np.zeros((canvas_height, canvas_width, 3)))
676
+
677
+
678
+ # Convert to tensor
679
+ ID_tensor = torch.tensor(main_reference_img)
680
+ ID_tensor = train_transforms(ID_tensor).permute(2, 0, 1).contiguous()
681
+
682
+ if model_code_name == "Wan": # Needs to be the shape (B, C, F, H, W)
683
+ ID_tensor = ID_tensor.unsqueeze(0).unsqueeze(2)
684
+
685
+ ###############################################################################################################################################
686
+
687
+
688
+
689
+ ############################################# Call the Inference Pipeline ##########################################################
690
+
691
+ image = Image.fromarray(inference_canvas)
692
+
693
+ if model_code_name == "Wan":
694
+ video = pipe(
695
+ image = image,
696
+ prompt = text_prompt, negative_prompt = "", # Empty string as negative text prompt
697
+ traj_tensor = traj_tensor, # Should be shape (F, C, H, W)
698
+ ID_tensor = ID_tensor, # Should be shape (B, C, F, H, W)
699
+ height = canvas_height, width = canvas_width, num_frames = num_frames,
700
+ num_inference_steps = 50, # 38 is also ok
701
+ guidance_scale = 5.0,
702
+ ).frames[0]
703
+
704
+ elif model_code_name == "CogVideoX":
705
+ video = pipe(
706
+ image = image,
707
+ prompt = text_prompt,
708
+ traj_tensor = traj_tensor,
709
+ ID_tensor = ID_tensor,
710
+ height = canvas_height, width = canvas_width, num_frames = len(traj_tensor),
711
+ guidance_scale = 6, use_dynamic_cfg = False,
712
+ num_inference_steps = 50,
713
+ add_ID_reference_augment_noise = True,
714
+ ).frames[0]
715
+
716
+
717
+
718
+ # Store the result
719
+ export_to_video(video, os.path.join(store_folder_path, "generated_video_padded.mp4"), fps=8)
720
+
721
+
722
+
723
+ # Save frames
724
+ print("Writing as Frames")
725
+ video_file_path = os.path.join(store_folder_path, "generated_video.mp4")
726
+ writer = imageio.get_writer(video_file_path, fps = 8)
727
+ for frame_idx, frame in enumerate(video):
728
+
729
+ # Extract Unpadded version
730
+ # frame = np.uint8(frame)
731
+ if model_code_name == "CogVideoX":
732
+ frame = np.asarray(frame) # PIL to RGB
733
+ bottom_right_y = frame.shape[0] - bottom_right_height
734
+ bottom_right_x = frame.shape[1] - bottom_right_width
735
+ cropped_region_frame = np.uint8(frame[top_left_height: bottom_right_y, top_left_width : bottom_right_x] * 255)
736
+ writer.append_data(cropped_region_frame)
737
+
738
+ writer.close()
739
+
740
+ #####################################################################################################################################
741
+
742
+
743
+ return gr.update(value = video_file_path, width = uniform_width, height = uniform_height)
744
+
745
+
746
+
747
+
748
+ if __name__ == '__main__':
749
+
750
+
751
+ # Global Setting
752
+ uniform_height = 480 # Visual Canvas as 480x720 is decent
753
+ uniform_width = 720
754
+
755
+
756
+ # Draw the Website
757
+ block = gr.Blocks().queue(max_size=10)
758
+ with block:
759
+
760
+
761
+ with gr.Row():
762
+ gr.Markdown(MARKDOWN)
763
+
764
+ with gr.Row(elem_classes=["container"]):
765
+
766
+ with gr.Column(scale=2):
767
+ # Input image
768
+ input_image = gr.Image(type="filepath", label="Input Image 🖼️ ")
769
+ # uploaded_files = gr.Gallery(label="Your images", visible=False, columns=5, rows=1, height=200)
770
+
771
+ with gr.Column(scale=2):
772
+
773
+ # Input image
774
+ resized_height = gr.Textbox(label="Resized Height for Input Image")
775
+ resized_width = gr.Textbox(label="Resized Width for Input Image")
776
+ # gr.Number(value=unit_height, label="Fixed", interactive=False)
777
+ # gr.Number(value=unit_height * 1.77777, label="Fixed", interactive=False)
778
+
779
+ # Input the expansion factor
780
+ top_left_height = gr.Textbox(label="Top-Left Expand Height")
781
+ top_left_width = gr.Textbox(label="Top-Left Expand Width")
782
+ bottom_right_height = gr.Textbox(label="Bottom-Right Expand Height")
783
+ bottom_right_width = gr.Textbox(label="Bottom-Right Expand Width")
784
+
785
+ # Button
786
+ build_canvas_btn = gr.Button(value="Build the Canvas")
787
+
788
+
789
+ with gr.Row():
790
+
791
+ with gr.Column(scale=3):
792
+ with gr.Row(scale=3):
793
+ visual_canvas = gr.Image(height = uniform_height, width = uniform_width, type="numpy", label='Expanded Canvas 🖼️ ')
794
+ # inference_canvas = gr.Image(height = uniform_height, width = uniform_width, type="numpy")
795
+ # inference_canvas = None
796
+
797
+ with gr.Row(scale=1):
798
+ # TODO: still missing an option to clear a single trajectory
799
+ add_point = gr.Button(value = "Add New Traj Line (Same Obj)", visible = True) # Add new trajectory for the same instance
800
+ add_traj = gr.Button(value = "Add New Instance (New Obj, including new ID)", visible = True)
801
+ clear_traj_button = gr.Button("Clear All Traj", visible=True)
802
+
803
+ with gr.Column(scale=2):
804
+
805
+ with gr.Row(scale=2):
806
+ identity_image = gr.Image(type="numpy", label="Identity Reference (SAM on center point only) 🖼️ ")
807
+
808
+ with gr.Row(scale=2):
809
+ text_prompt = gr.Textbox(label="Text Prompt", lines=3)
810
+
811
+
812
+ with gr.Row():
813
+
814
+ # Button
815
+ generation_btn = gr.Button(value="Generate")
816
+
817
+
818
+ with gr.Row():
819
+ generated_video = gr.Video(value = None, label="Generated Video", show_label = True, height = uniform_height, width = uniform_width)
820
+
821
+
822
+
823
+ ################################################################## Click + Select + Any Effect Area ###########################################################################
824
+
825
+ # Init some states that will be supporting purposes
826
+ traj_lists = gr.Textbox(label="Trajectory", visible = False) # Alternative: gr.State(None). Data structure: (Number of Instances, Number of Trajectories, Points); initialized as [ [ [] ] ]
827
+ inference_canvas = gr.State(None)
828
+ traj_instance_idx = gr.State(0)
829
+ initial_visual_canvas = gr.State(None) # Alternative: gr.Image(height = uniform_height, width = uniform_width, type="numpy", label='Canvas Expanded Image (Initial State)'). This is the initial visual, restored when clearing trajectories
830
+
831
+
832
+ # Canvas Click
833
+ build_canvas_btn.click(
834
+ build_canvas,
835
+ inputs = [input_image, resized_height, resized_width, top_left_height, top_left_width, bottom_right_height, bottom_right_width],
836
+ outputs = [visual_canvas, initial_visual_canvas, inference_canvas, traj_instance_idx, traj_lists] # inference_canvas is used for inference; visual_canvas is for gradio visualization
837
+ )
838
+
839
+
840
+ # Draw Trajectory for each click on the canvas
841
+ visual_canvas.select(
842
+ fn = add_traj_point,
843
+ inputs = [visual_canvas, traj_lists, traj_instance_idx],
844
+ outputs = [visual_canvas, traj_lists]
845
+ )
846
+
847
+
848
+ # Add new Trajectory
849
+ add_point.click(
850
+ fn = traj_point_update,
851
+ inputs = [traj_lists],
852
+ outputs = [traj_lists],
853
+ )
854
+ add_traj.click(
855
+ fn = traj_instance_update,
856
+ inputs = [traj_instance_idx, traj_lists],
857
+ outputs = [traj_instance_idx, traj_lists],
858
+ )
859
+
860
+ # Clean all the traj points
861
+ clear_traj_button.click(
862
+ clear_traj_points,
863
+ [initial_visual_canvas],
864
+ [visual_canvas, traj_instance_idx, traj_lists],
865
+ )
866
+
867
+
868
+ # Inference Generation
869
+ generation_btn.click(
870
+ inference,
871
+ inputs = [inference_canvas, visual_canvas, text_prompt, traj_lists, identity_image, resized_height, resized_width, top_left_height, top_left_width, bottom_right_height, bottom_right_width],
872
+ outputs = [generated_video],
873
+ )
874
+
875
+
876
+
877
+
878
+ # Load Examples
879
+ with gr.Row(elem_classes=["container"]):
880
+ gr.Examples(
881
+ examples = get_example(),
882
+ inputs = [input_image, resized_height, resized_width, top_left_height, top_left_width, bottom_right_height, bottom_right_width, identity_image, text_prompt, traj_lists],
883
+ run_on_click = True,
884
+ fn = on_example_click,
885
+ outputs = [visual_canvas, initial_visual_canvas, inference_canvas, traj_instance_idx],
886
+ )
887
+
888
+
889
+ block.launch(share=True)
890
+
891
+
892
+
893
+
architecture/attention_processor.py ADDED
The diff for this file is too large to render. See raw diff
 
architecture/autoencoder_kl_wan.py ADDED
@@ -0,0 +1,1419 @@
1
+ # Copyright 2025 The Wan Team and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import List, Optional, Tuple, Union
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+ import torch.nn.functional as F
20
+ import torch.utils.checkpoint
21
+
22
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
23
+ from diffusers.loaders import FromOriginalModelMixin
24
+ from diffusers.utils import logging
25
+ from diffusers.utils.accelerate_utils import apply_forward_hook
26
+ from diffusers.models.activations import get_activation
27
+ from diffusers.models.modeling_outputs import AutoencoderKLOutput
28
+ from diffusers.models.modeling_utils import ModelMixin
29
+ from diffusers.models.autoencoders.vae import DecoderOutput, DiagonalGaussianDistribution
30
+
31
+
32
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
33
+
34
+ CACHE_T = 2
35
+
36
+
37
+ class AvgDown3D(nn.Module):
38
+ def __init__(
39
+ self,
40
+ in_channels,
41
+ out_channels,
42
+ factor_t,
43
+ factor_s=1,
44
+ ):
45
+ super().__init__()
46
+ self.in_channels = in_channels
47
+ self.out_channels = out_channels
48
+ self.factor_t = factor_t
49
+ self.factor_s = factor_s
50
+ self.factor = self.factor_t * self.factor_s * self.factor_s
51
+
52
+ assert in_channels * self.factor % out_channels == 0
53
+ self.group_size = in_channels * self.factor // out_channels
54
+
55
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
56
+ pad_t = (self.factor_t - x.shape[2] % self.factor_t) % self.factor_t
57
+ pad = (0, 0, 0, 0, pad_t, 0)
58
+ x = F.pad(x, pad)
59
+ B, C, T, H, W = x.shape
60
+ x = x.view(
61
+ B,
62
+ C,
63
+ T // self.factor_t,
64
+ self.factor_t,
65
+ H // self.factor_s,
66
+ self.factor_s,
67
+ W // self.factor_s,
68
+ self.factor_s,
69
+ )
70
+ x = x.permute(0, 1, 3, 5, 7, 2, 4, 6).contiguous()
71
+ x = x.view(
72
+ B,
73
+ C * self.factor,
74
+ T // self.factor_t,
75
+ H // self.factor_s,
76
+ W // self.factor_s,
77
+ )
78
+ x = x.view(
79
+ B,
80
+ self.out_channels,
81
+ self.group_size,
82
+ T // self.factor_t,
83
+ H // self.factor_s,
84
+ W // self.factor_s,
85
+ )
86
+ x = x.mean(dim=2)
87
+ return x
88
+
89
+
90
+ class DupUp3D(nn.Module):
91
+ def __init__(
92
+ self,
93
+ in_channels: int,
94
+ out_channels: int,
95
+ factor_t,
96
+ factor_s=1,
97
+ ):
98
+ super().__init__()
99
+ self.in_channels = in_channels
100
+ self.out_channels = out_channels
101
+
102
+ self.factor_t = factor_t
103
+ self.factor_s = factor_s
104
+ self.factor = self.factor_t * self.factor_s * self.factor_s
105
+
106
+ assert out_channels * self.factor % in_channels == 0
107
+ self.repeats = out_channels * self.factor // in_channels
108
+
109
+ def forward(self, x: torch.Tensor, first_chunk=False) -> torch.Tensor:
110
+ x = x.repeat_interleave(self.repeats, dim=1)
111
+ x = x.view(
112
+ x.size(0),
113
+ self.out_channels,
114
+ self.factor_t,
115
+ self.factor_s,
116
+ self.factor_s,
117
+ x.size(2),
118
+ x.size(3),
119
+ x.size(4),
120
+ )
121
+ x = x.permute(0, 1, 5, 2, 6, 3, 7, 4).contiguous()
122
+ x = x.view(
123
+ x.size(0),
124
+ self.out_channels,
125
+ x.size(2) * self.factor_t,
126
+ x.size(4) * self.factor_s,
127
+ x.size(6) * self.factor_s,
128
+ )
129
+ if first_chunk:
130
+ x = x[:, :, self.factor_t - 1 :, :, :]
131
+ return x
132
+
133
+
134
+ class WanCausalConv3d(nn.Conv3d):
135
+ r"""
136
+ A custom 3D causal convolution layer with feature caching support.
137
+
138
+ This layer extends the standard Conv3D layer by ensuring causality in the time dimension and handling feature
139
+ caching for efficient inference.
140
+
141
+ Args:
142
+ in_channels (int): Number of channels in the input image
143
+ out_channels (int): Number of channels produced by the convolution
144
+ kernel_size (int or tuple): Size of the convolving kernel
145
+ stride (int or tuple, optional): Stride of the convolution. Default: 1
146
+ padding (int or tuple, optional): Zero-padding added to all three sides of the input. Default: 0
147
+ """
148
+
149
+ def __init__(
150
+ self,
151
+ in_channels: int,
152
+ out_channels: int,
153
+ kernel_size: Union[int, Tuple[int, int, int]],
154
+ stride: Union[int, Tuple[int, int, int]] = 1,
155
+ padding: Union[int, Tuple[int, int, int]] = 0,
156
+ ) -> None:
157
+ super().__init__(
158
+ in_channels=in_channels,
159
+ out_channels=out_channels,
160
+ kernel_size=kernel_size,
161
+ stride=stride,
162
+ padding=padding,
163
+ )
164
+
165
+ # Set up causal padding
166
+ self._padding = (self.padding[2], self.padding[2], self.padding[1], self.padding[1], 2 * self.padding[0], 0)
167
+ self.padding = (0, 0, 0)
168
+
169
+ def forward(self, x, cache_x=None):
170
+ padding = list(self._padding)
171
+ if cache_x is not None and self._padding[4] > 0:
172
+ cache_x = cache_x.to(x.device)
173
+ x = torch.cat([cache_x, x], dim=2)
174
+ padding[4] -= cache_x.shape[2]
175
+ x = F.pad(x, padding)
176
+ return super().forward(x)
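A shape sketch of the causal padding above, assuming kernel_size=3 and padding=1 for illustration: _padding becomes (1, 1, 1, 1, 2, 0), so F.pad adds two zero frames before the clip and none after it, and the output at time t only depends on inputs at times <= t.

# x: (B, C, T, H, W) = (1, 16, 4, 32, 32)
# F.pad(x, (1, 1, 1, 1, 2, 0)) -> (1, 16, 6, 34, 34)
# Conv3d(kernel_size=3, stride=1, padding=0) -> (1, 16, 4, 32, 32)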
177
+
178
+
179
+ class WanRMS_norm(nn.Module):
180
+ r"""
181
+ A custom RMS normalization layer.
182
+
183
+ Args:
184
+ dim (int): The number of dimensions to normalize over.
185
+ channel_first (bool, optional): Whether the input tensor has channels as the first dimension.
186
+ Default is True.
187
+ images (bool, optional): Whether the input represents image data. Default is True.
188
+ bias (bool, optional): Whether to include a learnable bias term. Default is False.
189
+ """
190
+
191
+ def __init__(self, dim: int, channel_first: bool = True, images: bool = True, bias: bool = False) -> None:
192
+ super().__init__()
193
+ broadcastable_dims = (1, 1, 1) if not images else (1, 1)
194
+ shape = (dim, *broadcastable_dims) if channel_first else (dim,)
195
+
196
+ self.channel_first = channel_first
197
+ self.scale = dim**0.5
198
+ self.gamma = nn.Parameter(torch.ones(shape))
199
+ self.bias = nn.Parameter(torch.zeros(shape)) if bias else 0.0
200
+
201
+ def forward(self, x):
202
+ return F.normalize(x, dim=(1 if self.channel_first else -1)) * self.scale * self.gamma + self.bias
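# In formula form, the forward pass above computes
#   y = gamma * sqrt(dim) * x / ||x||_2 + bias
# along the normalized dimension, i.e. the usual RMS norm y = gamma * x / RMS(x) + bias
# with RMS(x) = ||x||_2 / sqrt(dim).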
203
+
204
+
205
+ class WanUpsample(nn.Upsample):
206
+ r"""
207
+ Perform upsampling while ensuring the output tensor has the same data type as the input.
208
+
209
+ Args:
210
+ x (torch.Tensor): Input tensor to be upsampled.
211
+
212
+ Returns:
213
+ torch.Tensor: Upsampled tensor with the same data type as the input.
214
+ """
215
+
216
+ def forward(self, x):
217
+ return super().forward(x.float()).type_as(x)
218
+
219
+
220
+ class WanResample(nn.Module):
221
+ r"""
222
+ A custom resampling module for 2D and 3D data.
223
+
224
+ Args:
225
+ dim (int): The number of input/output channels.
226
+ mode (str): The resampling mode. Must be one of:
227
+ - 'none': No resampling (identity operation).
228
+ - 'upsample2d': 2D upsampling with nearest-exact interpolation and convolution.
229
+ - 'upsample3d': 3D upsampling with nearest-exact interpolation, convolution, and causal 3D convolution.
230
+ - 'downsample2d': 2D downsampling with zero-padding and convolution.
231
+ - 'downsample3d': 3D downsampling with zero-padding, convolution, and causal 3D convolution.
232
+ """
233
+
234
+ def __init__(self, dim: int, mode: str, upsample_out_dim: int = None) -> None:
235
+ super().__init__()
236
+ self.dim = dim
237
+ self.mode = mode
238
+
239
+ # default to dim //2
240
+ if upsample_out_dim is None:
241
+ upsample_out_dim = dim // 2
242
+
243
+ # layers
244
+ if mode == "upsample2d":
245
+ self.resample = nn.Sequential(
246
+ WanUpsample(scale_factor=(2.0, 2.0), mode="nearest-exact"),
247
+ nn.Conv2d(dim, upsample_out_dim, 3, padding=1),
248
+ )
249
+ elif mode == "upsample3d":
250
+ self.resample = nn.Sequential(
251
+ WanUpsample(scale_factor=(2.0, 2.0), mode="nearest-exact"),
252
+ nn.Conv2d(dim, upsample_out_dim, 3, padding=1),
253
+ )
254
+ self.time_conv = WanCausalConv3d(dim, dim * 2, (3, 1, 1), padding=(1, 0, 0))
255
+
256
+ elif mode == "downsample2d":
257
+ self.resample = nn.Sequential(nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2)))
258
+ elif mode == "downsample3d":
259
+ self.resample = nn.Sequential(nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2)))
260
+ self.time_conv = WanCausalConv3d(dim, dim, (3, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0))
261
+
262
+ else:
263
+ self.resample = nn.Identity()
264
+
265
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
266
+ b, c, t, h, w = x.size()
267
+ if self.mode == "upsample3d":
268
+ if feat_cache is not None:
269
+ idx = feat_idx[0]
270
+ if feat_cache[idx] is None:
271
+ feat_cache[idx] = "Rep"
272
+ feat_idx[0] += 1
273
+ else:
274
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
275
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx] != "Rep":
276
+ # cache last frame of last two chunk
277
+ cache_x = torch.cat(
278
+ [feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2
279
+ )
280
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx] == "Rep":
281
+ cache_x = torch.cat([torch.zeros_like(cache_x).to(cache_x.device), cache_x], dim=2)
282
+ if feat_cache[idx] == "Rep":
283
+ x = self.time_conv(x)
284
+ else:
285
+ x = self.time_conv(x, feat_cache[idx])
286
+ feat_cache[idx] = cache_x
287
+ feat_idx[0] += 1
288
+
289
+ x = x.reshape(b, 2, c, t, h, w)
290
+ x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]), 3)
291
+ x = x.reshape(b, c, t * 2, h, w)
292
+ t = x.shape[2]
293
+ x = x.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
294
+ x = self.resample(x)
295
+ x = x.view(b, t, x.size(1), x.size(2), x.size(3)).permute(0, 2, 1, 3, 4)
296
+
297
+ if self.mode == "downsample3d":
298
+ if feat_cache is not None:
299
+ idx = feat_idx[0]
300
+ if feat_cache[idx] is None:
301
+ feat_cache[idx] = x.clone()
302
+ feat_idx[0] += 1
303
+ else:
304
+ cache_x = x[:, :, -1:, :, :].clone()
305
+ x = self.time_conv(torch.cat([feat_cache[idx][:, :, -1:, :, :], x], 2))
306
+ feat_cache[idx] = cache_x
307
+ feat_idx[0] += 1
308
+ return x
309
+
310
+
311
+ class WanResidualBlock(nn.Module):
312
+ r"""
313
+ A custom residual block module.
314
+
315
+ Args:
316
+ in_dim (int): Number of input channels.
317
+ out_dim (int): Number of output channels.
318
+ dropout (float, optional): Dropout rate for the dropout layer. Default is 0.0.
319
+ non_linearity (str, optional): Type of non-linearity to use. Default is "silu".
320
+ """
321
+
322
+ def __init__(
323
+ self,
324
+ in_dim: int,
325
+ out_dim: int,
326
+ dropout: float = 0.0,
327
+ non_linearity: str = "silu",
328
+ ) -> None:
329
+ super().__init__()
330
+ self.in_dim = in_dim
331
+ self.out_dim = out_dim
332
+ self.nonlinearity = get_activation(non_linearity)
333
+
334
+ # layers
335
+ self.norm1 = WanRMS_norm(in_dim, images=False)
336
+ self.conv1 = WanCausalConv3d(in_dim, out_dim, 3, padding=1)
337
+ self.norm2 = WanRMS_norm(out_dim, images=False)
338
+ self.dropout = nn.Dropout(dropout)
339
+ self.conv2 = WanCausalConv3d(out_dim, out_dim, 3, padding=1)
340
+ self.conv_shortcut = WanCausalConv3d(in_dim, out_dim, 1) if in_dim != out_dim else nn.Identity()
341
+
342
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
343
+ # Apply shortcut connection
344
+ h = self.conv_shortcut(x)
345
+
346
+ # First normalization and activation
347
+ x = self.norm1(x)
348
+ x = self.nonlinearity(x)
349
+
350
+ if feat_cache is not None:
351
+ idx = feat_idx[0]
352
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
353
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
354
+ cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
355
+
356
+ x = self.conv1(x, feat_cache[idx])
357
+ feat_cache[idx] = cache_x
358
+ feat_idx[0] += 1
359
+ else:
360
+ x = self.conv1(x)
361
+
362
+ # Second normalization and activation
363
+ x = self.norm2(x)
364
+ x = self.nonlinearity(x)
365
+
366
+ # Dropout
367
+ x = self.dropout(x)
368
+
369
+ if feat_cache is not None:
370
+ idx = feat_idx[0]
371
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
372
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
373
+ cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
374
+
375
+ x = self.conv2(x, feat_cache[idx])
376
+ feat_cache[idx] = cache_x
377
+ feat_idx[0] += 1
378
+ else:
379
+ x = self.conv2(x)
380
+
381
+ # Add residual connection
382
+ return x + h
383
+
384
+
385
+ class WanAttentionBlock(nn.Module):
386
+ r"""
387
+ Causal self-attention with a single head.
388
+
389
+ Args:
390
+ dim (int): The number of channels in the input tensor.
391
+ """
392
+
393
+ def __init__(self, dim):
394
+ super().__init__()
395
+ self.dim = dim
396
+
397
+ # layers
398
+ self.norm = WanRMS_norm(dim)
399
+ self.to_qkv = nn.Conv2d(dim, dim * 3, 1)
400
+ self.proj = nn.Conv2d(dim, dim, 1)
401
+
402
+ def forward(self, x):
403
+ identity = x
404
+ batch_size, channels, time, height, width = x.size()
405
+
406
+ x = x.permute(0, 2, 1, 3, 4).reshape(batch_size * time, channels, height, width)
407
+ x = self.norm(x)
408
+
409
+ # compute query, key, value
410
+ qkv = self.to_qkv(x)
411
+ qkv = qkv.reshape(batch_size * time, 1, channels * 3, -1)
412
+ qkv = qkv.permute(0, 1, 3, 2).contiguous()
413
+ q, k, v = qkv.chunk(3, dim=-1)
414
+
415
+ # apply attention
416
+ x = F.scaled_dot_product_attention(q, k, v)
417
+
418
+ x = x.squeeze(1).permute(0, 2, 1).reshape(batch_size * time, channels, height, width)
419
+
420
+ # output projection
421
+ x = self.proj(x)
422
+
423
+ # Reshape back: [(b*t), c, h, w] -> [b, c, t, h, w]
424
+ x = x.view(batch_size, time, channels, height, width)
425
+ x = x.permute(0, 2, 1, 3, 4)
426
+
427
+ return x + identity
428
+
429
+
430
+ class WanMidBlock(nn.Module):
431
+ """
432
+ Middle block for WanVAE encoder and decoder.
433
+
434
+ Args:
435
+ dim (int): Number of input/output channels.
436
+ dropout (float): Dropout rate.
437
+ non_linearity (str): Type of non-linearity to use.
438
+ """
439
+
440
+ def __init__(self, dim: int, dropout: float = 0.0, non_linearity: str = "silu", num_layers: int = 1):
441
+ super().__init__()
442
+ self.dim = dim
443
+
444
+ # Create the components
445
+ resnets = [WanResidualBlock(dim, dim, dropout, non_linearity)]
446
+ attentions = []
447
+ for _ in range(num_layers):
448
+ attentions.append(WanAttentionBlock(dim))
449
+ resnets.append(WanResidualBlock(dim, dim, dropout, non_linearity))
450
+ self.attentions = nn.ModuleList(attentions)
451
+ self.resnets = nn.ModuleList(resnets)
452
+
453
+ self.gradient_checkpointing = False
454
+
455
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
456
+ # First residual block
457
+ x = self.resnets[0](x, feat_cache, feat_idx)
458
+
459
+ # Process through attention and residual blocks
460
+ for attn, resnet in zip(self.attentions, self.resnets[1:]):
461
+ if attn is not None:
462
+ x = attn(x)
463
+
464
+ x = resnet(x, feat_cache, feat_idx)
465
+
466
+ return x
467
+
468
+
469
+ class WanResidualDownBlock(nn.Module):
470
+ def __init__(self, in_dim, out_dim, dropout, num_res_blocks, temperal_downsample=False, down_flag=False):
471
+ super().__init__()
472
+
473
+ # Shortcut path with downsample
474
+ self.avg_shortcut = AvgDown3D(
475
+ in_dim,
476
+ out_dim,
477
+ factor_t=2 if temperal_downsample else 1,
478
+ factor_s=2 if down_flag else 1,
479
+ )
480
+
481
+ # Main path with residual blocks and downsample
482
+ resnets = []
483
+ for _ in range(num_res_blocks):
484
+ resnets.append(WanResidualBlock(in_dim, out_dim, dropout))
485
+ in_dim = out_dim
486
+ self.resnets = nn.ModuleList(resnets)
487
+
488
+ # Add the final downsample block
489
+ if down_flag:
490
+ mode = "downsample3d" if temperal_downsample else "downsample2d"
491
+ self.downsampler = WanResample(out_dim, mode=mode)
492
+ else:
493
+ self.downsampler = None
494
+
495
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
496
+ x_copy = x.clone()
497
+ for resnet in self.resnets:
498
+ x = resnet(x, feat_cache, feat_idx)
499
+ if self.downsampler is not None:
500
+ x = self.downsampler(x, feat_cache, feat_idx)
501
+
502
+ return x + self.avg_shortcut(x_copy)
503
+
504
+
505
+ class WanEncoder3d(nn.Module):
506
+ r"""
507
+ A 3D encoder module.
508
+
509
+ Args:
510
+ dim (int): The base number of channels in the first layer.
511
+ z_dim (int): The dimensionality of the latent space.
512
+ dim_mult (list of int): Multipliers for the number of channels in each block.
513
+ num_res_blocks (int): Number of residual blocks in each block.
514
+ attn_scales (list of float): Scales at which to apply attention mechanisms.
515
+ temperal_downsample (list of bool): Whether to downsample temporally in each block.
516
+ dropout (float): Dropout rate for the dropout layers.
517
+ non_linearity (str): Type of non-linearity to use.
518
+ """
519
+
520
+ def __init__(
521
+ self,
522
+ in_channels: int = 3,
523
+ dim=128,
524
+ z_dim=4,
525
+ dim_mult=[1, 2, 4, 4],
526
+ num_res_blocks=2,
527
+ attn_scales=[],
528
+ temperal_downsample=[True, True, False],
529
+ dropout=0.0,
530
+ non_linearity: str = "silu",
531
+ is_residual: bool = False, # the Wan 2.2 VAE uses a residual downblock
532
+ ):
533
+ super().__init__()
534
+ self.dim = dim
535
+ self.z_dim = z_dim
536
+ self.dim_mult = dim_mult
537
+ self.num_res_blocks = num_res_blocks
538
+ self.attn_scales = attn_scales
539
+ self.temperal_downsample = temperal_downsample
540
+ self.nonlinearity = get_activation(non_linearity)
541
+
542
+ # dimensions
543
+ dims = [dim * u for u in [1] + dim_mult]
544
+ scale = 1.0
545
+
546
+ # init block
547
+ self.conv_in = WanCausalConv3d(in_channels, dims[0], 3, padding=1)
548
+
549
+ # downsample blocks
550
+ self.down_blocks = nn.ModuleList([])
551
+ for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
552
+ # residual (+attention) blocks
553
+ if is_residual:
554
+ self.down_blocks.append(
555
+ WanResidualDownBlock(
556
+ in_dim,
557
+ out_dim,
558
+ dropout,
559
+ num_res_blocks,
560
+ temperal_downsample=temperal_downsample[i] if i != len(dim_mult) - 1 else False,
561
+ down_flag=i != len(dim_mult) - 1,
562
+ )
563
+ )
564
+ else:
565
+ for _ in range(num_res_blocks):
566
+ self.down_blocks.append(WanResidualBlock(in_dim, out_dim, dropout))
567
+ if scale in attn_scales:
568
+ self.down_blocks.append(WanAttentionBlock(out_dim))
569
+ in_dim = out_dim
570
+
571
+ # downsample block
572
+ if i != len(dim_mult) - 1:
573
+ mode = "downsample3d" if temperal_downsample[i] else "downsample2d"
574
+ self.down_blocks.append(WanResample(out_dim, mode=mode))
575
+ scale /= 2.0
576
+
577
+ # middle blocks
578
+ self.mid_block = WanMidBlock(out_dim, dropout, non_linearity, num_layers=1)
579
+
580
+ # output blocks
581
+ self.norm_out = WanRMS_norm(out_dim, images=False)
582
+ self.conv_out = WanCausalConv3d(out_dim, z_dim, 3, padding=1)
583
+
584
+ self.gradient_checkpointing = False
585
+
586
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
587
+ if feat_cache is not None:
588
+ idx = feat_idx[0]
589
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
590
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
591
+ # cache the last frame of the last two chunks
592
+ cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
593
+ x = self.conv_in(x, feat_cache[idx])
594
+ feat_cache[idx] = cache_x
595
+ feat_idx[0] += 1
596
+ else:
597
+ x = self.conv_in(x)
598
+
599
+ ## downsamples
600
+ for layer in self.down_blocks:
601
+ if feat_cache is not None:
602
+ x = layer(x, feat_cache, feat_idx)
603
+ else:
604
+ x = layer(x)
605
+
606
+ ## middle
607
+ x = self.mid_block(x, feat_cache, feat_idx)
608
+
609
+ ## head
610
+ x = self.norm_out(x)
611
+ x = self.nonlinearity(x)
612
+ if feat_cache is not None:
613
+ idx = feat_idx[0]
614
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
615
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
616
+ # cache the last frame of the last two chunks
617
+ cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
618
+ x = self.conv_out(x, feat_cache[idx])
619
+ feat_cache[idx] = cache_x
620
+ feat_idx[0] += 1
621
+ else:
622
+ x = self.conv_out(x)
623
+ return x
624
+
625
+
626
+ class WanResidualUpBlock(nn.Module):
627
+ """
628
+ A block that handles upsampling for the WanVAE decoder.
629
+
630
+ Args:
631
+ in_dim (int): Input dimension
632
+ out_dim (int): Output dimension
633
+ num_res_blocks (int): Number of residual blocks
634
+ dropout (float): Dropout rate
635
+ temperal_upsample (bool): Whether to upsample on temporal dimension
636
+ up_flag (bool): Whether to upsample or not
637
+ non_linearity (str): Type of non-linearity to use
638
+ """
639
+
640
+ def __init__(
641
+ self,
642
+ in_dim: int,
643
+ out_dim: int,
644
+ num_res_blocks: int,
645
+ dropout: float = 0.0,
646
+ temperal_upsample: bool = False,
647
+ up_flag: bool = False,
648
+ non_linearity: str = "silu",
649
+ ):
650
+ super().__init__()
651
+ self.in_dim = in_dim
652
+ self.out_dim = out_dim
653
+
654
+ if up_flag:
655
+ self.avg_shortcut = DupUp3D(
656
+ in_dim,
657
+ out_dim,
658
+ factor_t=2 if temperal_upsample else 1,
659
+ factor_s=2,
660
+ )
661
+ else:
662
+ self.avg_shortcut = None
663
+
664
+ # create residual blocks
665
+ resnets = []
666
+ current_dim = in_dim
667
+ for _ in range(num_res_blocks + 1):
668
+ resnets.append(WanResidualBlock(current_dim, out_dim, dropout, non_linearity))
669
+ current_dim = out_dim
670
+
671
+ self.resnets = nn.ModuleList(resnets)
672
+
673
+ # Add upsampling layer if needed
674
+ if up_flag:
675
+ upsample_mode = "upsample3d" if temperal_upsample else "upsample2d"
676
+ self.upsampler = WanResample(out_dim, mode=upsample_mode, upsample_out_dim=out_dim)
677
+ else:
678
+ self.upsampler = None
679
+
680
+ self.gradient_checkpointing = False
681
+
682
+ def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False):
683
+ """
684
+ Forward pass through the upsampling block.
685
+
686
+ Args:
687
+ x (torch.Tensor): Input tensor
688
+ feat_cache (list, optional): Feature cache for causal convolutions
689
+ feat_idx (list, optional): Feature index for cache management
690
+
691
+ Returns:
692
+ torch.Tensor: Output tensor
693
+ """
694
+ x_copy = x.clone()
695
+
696
+ for resnet in self.resnets:
697
+ if feat_cache is not None:
698
+ x = resnet(x, feat_cache, feat_idx)
699
+ else:
700
+ x = resnet(x)
701
+
702
+ if self.upsampler is not None:
703
+ if feat_cache is not None:
704
+ x = self.upsampler(x, feat_cache, feat_idx)
705
+ else:
706
+ x = self.upsampler(x)
707
+
708
+ if self.avg_shortcut is not None:
709
+ x = x + self.avg_shortcut(x_copy, first_chunk=first_chunk)
710
+
711
+ return x
712
+
713
+
714
+ class WanUpBlock(nn.Module):
715
+ """
716
+ A block that handles upsampling for the WanVAE decoder.
717
+
718
+ Args:
719
+ in_dim (int): Input dimension
720
+ out_dim (int): Output dimension
721
+ num_res_blocks (int): Number of residual blocks
722
+ dropout (float): Dropout rate
723
+ upsample_mode (str, optional): Mode for upsampling ('upsample2d' or 'upsample3d')
724
+ non_linearity (str): Type of non-linearity to use
725
+ """
726
+
727
+ def __init__(
728
+ self,
729
+ in_dim: int,
730
+ out_dim: int,
731
+ num_res_blocks: int,
732
+ dropout: float = 0.0,
733
+ upsample_mode: Optional[str] = None,
734
+ non_linearity: str = "silu",
735
+ ):
736
+ super().__init__()
737
+ self.in_dim = in_dim
738
+ self.out_dim = out_dim
739
+
740
+ # Create layers list
741
+ resnets = []
742
+ # Add residual blocks and attention if needed
743
+ current_dim = in_dim
744
+ for _ in range(num_res_blocks + 1):
745
+ resnets.append(WanResidualBlock(current_dim, out_dim, dropout, non_linearity))
746
+ current_dim = out_dim
747
+
748
+ self.resnets = nn.ModuleList(resnets)
749
+
750
+ # Add upsampling layer if needed
751
+ self.upsamplers = None
752
+ if upsample_mode is not None:
753
+ self.upsamplers = nn.ModuleList([WanResample(out_dim, mode=upsample_mode)])
754
+
755
+ self.gradient_checkpointing = False
756
+
757
+ def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=None):
758
+ """
759
+ Forward pass through the upsampling block.
760
+
761
+ Args:
762
+ x (torch.Tensor): Input tensor
763
+ feat_cache (list, optional): Feature cache for causal convolutions
764
+ feat_idx (list, optional): Feature index for cache management
765
+
766
+ Returns:
767
+ torch.Tensor: Output tensor
768
+ """
769
+ for resnet in self.resnets:
770
+ if feat_cache is not None:
771
+ x = resnet(x, feat_cache, feat_idx)
772
+ else:
773
+ x = resnet(x)
774
+
775
+ if self.upsamplers is not None:
776
+ if feat_cache is not None:
777
+ x = self.upsamplers[0](x, feat_cache, feat_idx)
778
+ else:
779
+ x = self.upsamplers[0](x)
780
+ return x
781
+
782
+
783
+ class WanDecoder3d(nn.Module):
784
+ r"""
785
+ A 3D decoder module.
786
+
787
+ Args:
788
+ dim (int): The base number of channels in the first layer.
789
+ z_dim (int): The dimensionality of the latent space.
790
+ dim_mult (list of int): Multipliers for the number of channels in each block.
791
+ num_res_blocks (int): Number of residual blocks in each block.
792
+ attn_scales (list of float): Scales at which to apply attention mechanisms.
793
+ temperal_upsample (list of bool): Whether to upsample temporally in each block.
794
+ dropout (float): Dropout rate for the dropout layers.
795
+ non_linearity (str): Type of non-linearity to use.
796
+ """
797
+
798
+ def __init__(
799
+ self,
800
+ dim=128,
801
+ z_dim=4,
802
+ dim_mult=[1, 2, 4, 4],
803
+ num_res_blocks=2,
804
+ attn_scales=[],
805
+ temperal_upsample=[False, True, True],
806
+ dropout=0.0,
807
+ non_linearity: str = "silu",
808
+ out_channels: int = 3,
809
+ is_residual: bool = False,
810
+ ):
811
+ super().__init__()
812
+ self.dim = dim
813
+ self.z_dim = z_dim
814
+ self.dim_mult = dim_mult
815
+ self.num_res_blocks = num_res_blocks
816
+ self.attn_scales = attn_scales
817
+ self.temperal_upsample = temperal_upsample
818
+
819
+ self.nonlinearity = get_activation(non_linearity)
820
+
821
+ # dimensions
822
+ dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
823
+
824
+ # init block
825
+ self.conv_in = WanCausalConv3d(z_dim, dims[0], 3, padding=1)
826
+
827
+ # middle blocks
828
+ self.mid_block = WanMidBlock(dims[0], dropout, non_linearity, num_layers=1)
829
+
830
+ # upsample blocks
831
+ self.up_blocks = nn.ModuleList([])
832
+ for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
833
+ # residual (+attention) blocks
834
+ if i > 0 and not is_residual:
835
+ # wan vae 2.1
836
+ in_dim = in_dim // 2
837
+
838
+ # determine if we need upsampling
839
+ up_flag = i != len(dim_mult) - 1
840
+ # determine upsampling mode, if not upsampling, set to None
841
+ upsample_mode = None
842
+ if up_flag and temperal_upsample[i]:
843
+ upsample_mode = "upsample3d"
844
+ elif up_flag:
845
+ upsample_mode = "upsample2d"
846
+ # Create and add the upsampling block
847
+ if is_residual:
848
+ up_block = WanResidualUpBlock(
849
+ in_dim=in_dim,
850
+ out_dim=out_dim,
851
+ num_res_blocks=num_res_blocks,
852
+ dropout=dropout,
853
+ temperal_upsample=temperal_upsample[i] if up_flag else False,
854
+ up_flag=up_flag,
855
+ non_linearity=non_linearity,
856
+ )
857
+ else:
858
+ up_block = WanUpBlock(
859
+ in_dim=in_dim,
860
+ out_dim=out_dim,
861
+ num_res_blocks=num_res_blocks,
862
+ dropout=dropout,
863
+ upsample_mode=upsample_mode,
864
+ non_linearity=non_linearity,
865
+ )
866
+ self.up_blocks.append(up_block)
867
+
868
+ # output blocks
869
+ self.norm_out = WanRMS_norm(out_dim, images=False)
870
+ self.conv_out = WanCausalConv3d(out_dim, out_channels, 3, padding=1)
871
+
872
+ self.gradient_checkpointing = False
873
+
874
+ def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False):
875
+ ## conv1
876
+ if feat_cache is not None:
877
+ idx = feat_idx[0]
878
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
879
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
880
+ # cache the last frame of the last two chunks
881
+ cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
882
+ x = self.conv_in(x, feat_cache[idx])
883
+ feat_cache[idx] = cache_x
884
+ feat_idx[0] += 1
885
+ else:
886
+ x = self.conv_in(x)
887
+
888
+ ## middle
889
+ x = self.mid_block(x, feat_cache, feat_idx)
890
+
891
+ ## upsamples
892
+ for up_block in self.up_blocks:
893
+ x = up_block(x, feat_cache, feat_idx, first_chunk=first_chunk)
894
+
895
+ ## head
896
+ x = self.norm_out(x)
897
+ x = self.nonlinearity(x)
898
+ if feat_cache is not None:
899
+ idx = feat_idx[0]
900
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
901
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
902
+ # cache the last frame of the last two chunks
903
+ cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
904
+ x = self.conv_out(x, feat_cache[idx])
905
+ feat_cache[idx] = cache_x
906
+ feat_idx[0] += 1
907
+ else:
908
+ x = self.conv_out(x)
909
+ return x
910
+
911
+
912
+ def patchify(x, patch_size):
913
+ if patch_size == 1:
914
+ return x
915
+
916
+ if x.dim() != 5:
917
+ raise ValueError(f"Invalid input shape: {x.shape}")
918
+ # x shape: [batch_size, channels, frames, height, width]
919
+ batch_size, channels, frames, height, width = x.shape
920
+
921
+ # Ensure height and width are divisible by patch_size
922
+ if height % patch_size != 0 or width % patch_size != 0:
923
+ raise ValueError(f"Height ({height}) and width ({width}) must be divisible by patch_size ({patch_size})")
924
+
925
+ # Reshape to [batch_size, channels, frames, height//patch_size, patch_size, width//patch_size, patch_size]
926
+ x = x.view(batch_size, channels, frames, height // patch_size, patch_size, width // patch_size, patch_size)
927
+
928
+ # Rearrange to [batch_size, channels * patch_size * patch_size, frames, height//patch_size, width//patch_size]
929
+ x = x.permute(0, 1, 6, 4, 2, 3, 5).contiguous()
930
+ x = x.view(batch_size, channels * patch_size * patch_size, frames, height // patch_size, width // patch_size)
931
+
932
+ return x
933
+
934
+
935
+ def unpatchify(x, patch_size):
936
+ if patch_size == 1:
937
+ return x
938
+
939
+ if x.dim() != 5:
940
+ raise ValueError(f"Invalid input shape: {x.shape}")
941
+ # x shape: [batch_size, (channels * patch_size * patch_size), frame, height, width]
942
+ batch_size, c_patches, frames, height, width = x.shape
943
+ channels = c_patches // (patch_size * patch_size)
944
+
945
+ # Reshape to [b, c, patch_size, patch_size, f, h, w]
946
+ x = x.view(batch_size, channels, patch_size, patch_size, frames, height, width)
947
+
948
+ # Rearrange to [b, c, f, h * patch_size, w * patch_size]
949
+ x = x.permute(0, 1, 4, 5, 3, 6, 2).contiguous()
950
+ x = x.view(batch_size, channels, frames, height * patch_size, width * patch_size)
951
+
952
+ return x
953
+
954
+
955
+ class AutoencoderKLWan(ModelMixin, ConfigMixin, FromOriginalModelMixin):
956
+ r"""
957
+ A VAE model with KL loss for encoding videos into latents and decoding latent representations into videos.
958
+ Introduced in [Wan 2.1].
959
+
960
+ This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
961
+ for all models (such as downloading or saving).
962
+ """
963
+
964
+ _supports_gradient_checkpointing = False
965
+
966
+ @register_to_config
967
+ def __init__(
968
+ self,
969
+ base_dim: int = 96,
970
+ decoder_base_dim: Optional[int] = None,
971
+ z_dim: int = 16,
972
+ dim_mult: Tuple[int] = [1, 2, 4, 4],
973
+ num_res_blocks: int = 2,
974
+ attn_scales: List[float] = [],
975
+ temperal_downsample: List[bool] = [False, True, True],
976
+ dropout: float = 0.0,
977
+ latents_mean: List[float] = [
978
+ -0.7571,
979
+ -0.7089,
980
+ -0.9113,
981
+ 0.1075,
982
+ -0.1745,
983
+ 0.9653,
984
+ -0.1517,
985
+ 1.5508,
986
+ 0.4134,
987
+ -0.0715,
988
+ 0.5517,
989
+ -0.3632,
990
+ -0.1922,
991
+ -0.9497,
992
+ 0.2503,
993
+ -0.2921,
994
+ ],
995
+ latents_std: List[float] = [
996
+ 2.8184,
997
+ 1.4541,
998
+ 2.3275,
999
+ 2.6558,
1000
+ 1.2196,
1001
+ 1.7708,
1002
+ 2.6052,
1003
+ 2.0743,
1004
+ 3.2687,
1005
+ 2.1526,
1006
+ 2.8652,
1007
+ 1.5579,
1008
+ 1.6382,
1009
+ 1.1253,
1010
+ 2.8251,
1011
+ 1.9160,
1012
+ ],
1013
+ is_residual: bool = False,
1014
+ in_channels: int = 3,
1015
+ out_channels: int = 3,
1016
+ patch_size: Optional[int] = None,
1017
+ scale_factor_temporal: Optional[int] = 4,
1018
+ scale_factor_spatial: Optional[int] = 8,
1019
+ ) -> None:
1020
+ super().__init__()
1021
+
1022
+ self.z_dim = z_dim
1023
+ self.temperal_downsample = temperal_downsample
1024
+ self.temperal_upsample = temperal_downsample[::-1]
1025
+
1026
+ if decoder_base_dim is None:
1027
+ decoder_base_dim = base_dim
1028
+
1029
+ self.encoder = WanEncoder3d(
1030
+ in_channels=in_channels,
1031
+ dim=base_dim,
1032
+ z_dim=z_dim * 2,
1033
+ dim_mult=dim_mult,
1034
+ num_res_blocks=num_res_blocks,
1035
+ attn_scales=attn_scales,
1036
+ temperal_downsample=temperal_downsample,
1037
+ dropout=dropout,
1038
+ is_residual=is_residual,
1039
+ )
1040
+ self.quant_conv = WanCausalConv3d(z_dim * 2, z_dim * 2, 1)
1041
+ self.post_quant_conv = WanCausalConv3d(z_dim, z_dim, 1)
1042
+
1043
+ self.decoder = WanDecoder3d(
1044
+ dim=decoder_base_dim,
1045
+ z_dim=z_dim,
1046
+ dim_mult=dim_mult,
1047
+ num_res_blocks=num_res_blocks,
1048
+ attn_scales=attn_scales,
1049
+ temperal_upsample=self.temperal_upsample,
1050
+ dropout=dropout,
1051
+ out_channels=out_channels,
1052
+ is_residual=is_residual,
1053
+ )
1054
+
1055
+ self.spatial_compression_ratio = 2 ** len(self.temperal_downsample)
1056
+
1057
+ # When decoding a batch of video latents at a time, one can save memory by slicing across the batch dimension
1058
+ # to perform decoding of a single video latent at a time.
1059
+ self.use_slicing = False
1060
+
1061
+ # When decoding spatially large video latents, the memory requirement is very high. By breaking the video latent
1062
+ # frames spatially into smaller tiles and performing multiple forward passes for decoding, and then blending the
1063
+ # intermediate tiles together, the memory requirement can be lowered.
1064
+ self.use_tiling = False
1065
+
1066
+ # The minimal tile height and width for spatial tiling to be used
1067
+ self.tile_sample_min_height = 256
1068
+ self.tile_sample_min_width = 256
1069
+
1070
+ # The minimal distance between two spatial tiles
1071
+ self.tile_sample_stride_height = 192
1072
+ self.tile_sample_stride_width = 192
1073
+
1074
+ # Precompute and cache conv counts for encoder and decoder for clear_cache speedup
1075
+ self._cached_conv_counts = {
1076
+ "decoder": sum(isinstance(m, WanCausalConv3d) for m in self.decoder.modules())
1077
+ if self.decoder is not None
1078
+ else 0,
1079
+ "encoder": sum(isinstance(m, WanCausalConv3d) for m in self.encoder.modules())
1080
+ if self.encoder is not None
1081
+ else 0,
1082
+ }
1083
+
1084
+ def enable_tiling(
1085
+ self,
1086
+ tile_sample_min_height: Optional[int] = None,
1087
+ tile_sample_min_width: Optional[int] = None,
1088
+ tile_sample_stride_height: Optional[float] = None,
1089
+ tile_sample_stride_width: Optional[float] = None,
1090
+ ) -> None:
1091
+ r"""
1092
+ Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
1093
+ compute decoding and encoding in several steps. This is useful for saving a large amount of memory and allows
1094
+ processing larger images.
1095
+
1096
+ Args:
1097
+ tile_sample_min_height (`int`, *optional*):
1098
+ The minimum height required for a sample to be separated into tiles across the height dimension.
1099
+ tile_sample_min_width (`int`, *optional*):
1100
+ The minimum width required for a sample to be separated into tiles across the width dimension.
1101
+ tile_sample_stride_height (`int`, *optional*):
1102
+ The minimum amount of overlap between two consecutive vertical tiles. This is to ensure that there are
1103
+ no tiling artifacts produced across the height dimension.
1104
+ tile_sample_stride_width (`int`, *optional*):
1105
+ The stride between two consecutive horizontal tiles. This is to ensure that there are no tiling
1106
+ artifacts produced across the width dimension.
1107
+ """
1108
+ self.use_tiling = True
1109
+ self.tile_sample_min_height = tile_sample_min_height or self.tile_sample_min_height
1110
+ self.tile_sample_min_width = tile_sample_min_width or self.tile_sample_min_width
1111
+ self.tile_sample_stride_height = tile_sample_stride_height or self.tile_sample_stride_height
1112
+ self.tile_sample_stride_width = tile_sample_stride_width or self.tile_sample_stride_width
1113
+
1114
+ def disable_tiling(self) -> None:
1115
+ r"""
1116
+ Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
1117
+ decoding in one step.
1118
+ """
1119
+ self.use_tiling = False
1120
+
1121
+ def enable_slicing(self) -> None:
1122
+ r"""
1123
+ Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
1124
+ compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
1125
+ """
1126
+ self.use_slicing = True
1127
+
1128
+ def disable_slicing(self) -> None:
1129
+ r"""
1130
+ Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
1131
+ decoding in one step.
1132
+ """
1133
+ self.use_slicing = False
1134
+
1135
+ def clear_cache(self):
1136
+ # Use cached conv counts for decoder and encoder to avoid re-iterating modules each call
1137
+ self._conv_num = self._cached_conv_counts["decoder"]
1138
+ self._conv_idx = [0]
1139
+ self._feat_map = [None] * self._conv_num
1140
+ # cache encode
1141
+ self._enc_conv_num = self._cached_conv_counts["encoder"]
1142
+ self._enc_conv_idx = [0]
1143
+ self._enc_feat_map = [None] * self._enc_conv_num
1144
+
1145
+ def _encode(self, x: torch.Tensor):
1146
+ _, _, num_frame, height, width = x.shape
1147
+
1148
+ if self.use_tiling and (width > self.tile_sample_min_width or height > self.tile_sample_min_height):
1149
+ return self.tiled_encode(x)
1150
+
1151
+ self.clear_cache()
1152
+ if self.config.patch_size is not None:
1153
+ x = patchify(x, patch_size=self.config.patch_size)
1154
+ iter_ = 1 + (num_frame - 1) // 4
1155
+ for i in range(iter_):
1156
+ self._enc_conv_idx = [0]
1157
+ if i == 0:
1158
+ out = self.encoder(x[:, :, :1, :, :], feat_cache=self._enc_feat_map, feat_idx=self._enc_conv_idx)
1159
+ else:
1160
+ out_ = self.encoder(
1161
+ x[:, :, 1 + 4 * (i - 1) : 1 + 4 * i, :, :],
1162
+ feat_cache=self._enc_feat_map,
1163
+ feat_idx=self._enc_conv_idx,
1164
+ )
1165
+ out = torch.cat([out, out_], 2)
1166
+
1167
+ enc = self.quant_conv(out)
1168
+ self.clear_cache()
1169
+ return enc
1170
+
1171
+ @apply_forward_hook
1172
+ def encode(
1173
+ self, x: torch.Tensor, return_dict: bool = True
1174
+ ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
1175
+ r"""
1176
+ Encode a batch of images into latents.
1177
+
1178
+ Args:
1179
+ x (`torch.Tensor`): Input batch of images.
1180
+ return_dict (`bool`, *optional*, defaults to `True`):
1181
+ Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
1182
+
1183
+ Returns:
1184
+ The latent representations of the encoded videos. If `return_dict` is True, a
1185
+ [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
1186
+ """
1187
+ if self.use_slicing and x.shape[0] > 1:
1188
+ encoded_slices = [self._encode(x_slice) for x_slice in x.split(1)]
1189
+ h = torch.cat(encoded_slices)
1190
+ else:
1191
+ h = self._encode(x)
1192
+ posterior = DiagonalGaussianDistribution(h)
1193
+
1194
+ if not return_dict:
1195
+ return (posterior,)
1196
+ return AutoencoderKLOutput(latent_dist=posterior)
1197
+
1198
+ def _decode(self, z: torch.Tensor, return_dict: bool = True):
1199
+ _, _, num_frame, height, width = z.shape
1200
+ tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
1201
+ tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio
1202
+
1203
+ if self.use_tiling and (width > tile_latent_min_width or height > tile_latent_min_height):
1204
+ return self.tiled_decode(z, return_dict=return_dict)
1205
+
1206
+ self.clear_cache()
1207
+ x = self.post_quant_conv(z)
1208
+ for i in range(num_frame):
1209
+ self._conv_idx = [0]
1210
+ if i == 0:
1211
+ out = self.decoder(
1212
+ x[:, :, i : i + 1, :, :], feat_cache=self._feat_map, feat_idx=self._conv_idx, first_chunk=True
1213
+ )
1214
+ else:
1215
+ out_ = self.decoder(x[:, :, i : i + 1, :, :], feat_cache=self._feat_map, feat_idx=self._conv_idx)
1216
+ out = torch.cat([out, out_], 2)
1217
+
1218
+ if self.config.patch_size is not None:
1219
+ out = unpatchify(out, patch_size=self.config.patch_size)
1220
+
1221
+ out = torch.clamp(out, min=-1.0, max=1.0)
1222
+
1223
+ self.clear_cache()
1224
+ if not return_dict:
1225
+ return (out,)
1226
+
1227
+ return DecoderOutput(sample=out)
1228
+
1229
+ @apply_forward_hook
1230
+ def decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
1231
+ r"""
1232
+ Decode a batch of images.
1233
+
1234
+ Args:
1235
+ z (`torch.Tensor`): Input batch of latent vectors.
1236
+ return_dict (`bool`, *optional*, defaults to `True`):
1237
+ Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
1238
+
1239
+ Returns:
1240
+ [`~models.vae.DecoderOutput`] or `tuple`:
1241
+ If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
1242
+ returned.
1243
+ """
1244
+ if self.use_slicing and z.shape[0] > 1:
1245
+ decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)]
1246
+ decoded = torch.cat(decoded_slices)
1247
+ else:
1248
+ decoded = self._decode(z).sample
1249
+
1250
+ if not return_dict:
1251
+ return (decoded,)
1252
+ return DecoderOutput(sample=decoded)
1253
+
1254
+ def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
1255
+ blend_extent = min(a.shape[-2], b.shape[-2], blend_extent)
1256
+ for y in range(blend_extent):
1257
+ b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * (
1258
+ y / blend_extent
1259
+ )
1260
+ return b
1261
+
1262
+ def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
1263
+ blend_extent = min(a.shape[-1], b.shape[-1], blend_extent)
1264
+ for x in range(blend_extent):
1265
+ b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * (
1266
+ x / blend_extent
1267
+ )
1268
+ return b
1269
+
1270
+ def tiled_encode(self, x: torch.Tensor) -> AutoencoderKLOutput:
1271
+ r"""Encode a batch of images using a tiled encoder.
1272
+
1273
+ Args:
1274
+ x (`torch.Tensor`): Input batch of videos.
1275
+
1276
+ Returns:
1277
+ `torch.Tensor`:
1278
+ The latent representation of the encoded videos.
1279
+ """
1280
+ _, _, num_frames, height, width = x.shape
1281
+ latent_height = height // self.spatial_compression_ratio
1282
+ latent_width = width // self.spatial_compression_ratio
1283
+
1284
+ tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
1285
+ tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio
1286
+ tile_latent_stride_height = self.tile_sample_stride_height // self.spatial_compression_ratio
1287
+ tile_latent_stride_width = self.tile_sample_stride_width // self.spatial_compression_ratio
1288
+
1289
+ blend_height = tile_latent_min_height - tile_latent_stride_height
1290
+ blend_width = tile_latent_min_width - tile_latent_stride_width
1291
+
1292
+ # Split x into overlapping tiles and encode them separately.
1293
+ # The tiles have an overlap to avoid seams between tiles.
1294
+ rows = []
1295
+ for i in range(0, height, self.tile_sample_stride_height):
1296
+ row = []
1297
+ for j in range(0, width, self.tile_sample_stride_width):
1298
+ self.clear_cache()
1299
+ time = []
1300
+ frame_range = 1 + (num_frames - 1) // 4
1301
+ for k in range(frame_range):
1302
+ self._enc_conv_idx = [0]
1303
+ if k == 0:
1304
+ tile = x[:, :, :1, i : i + self.tile_sample_min_height, j : j + self.tile_sample_min_width]
1305
+ else:
1306
+ tile = x[
1307
+ :,
1308
+ :,
1309
+ 1 + 4 * (k - 1) : 1 + 4 * k,
1310
+ i : i + self.tile_sample_min_height,
1311
+ j : j + self.tile_sample_min_width,
1312
+ ]
1313
+ tile = self.encoder(tile, feat_cache=self._enc_feat_map, feat_idx=self._enc_conv_idx)
1314
+ tile = self.quant_conv(tile)
1315
+ time.append(tile)
1316
+ row.append(torch.cat(time, dim=2))
1317
+ rows.append(row)
1318
+ self.clear_cache()
1319
+
1320
+ result_rows = []
1321
+ for i, row in enumerate(rows):
1322
+ result_row = []
1323
+ for j, tile in enumerate(row):
1324
+ # blend the above tile and the left tile
1325
+ # to the current tile and add the current tile to the result row
1326
+ if i > 0:
1327
+ tile = self.blend_v(rows[i - 1][j], tile, blend_height)
1328
+ if j > 0:
1329
+ tile = self.blend_h(row[j - 1], tile, blend_width)
1330
+ result_row.append(tile[:, :, :, :tile_latent_stride_height, :tile_latent_stride_width])
1331
+ result_rows.append(torch.cat(result_row, dim=-1))
1332
+
1333
+ enc = torch.cat(result_rows, dim=3)[:, :, :, :latent_height, :latent_width]
1334
+ return enc
1335
+
1336
+ def tiled_decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
1337
+ r"""
1338
+ Decode a batch of images using a tiled decoder.
1339
+
1340
+ Args:
1341
+ z (`torch.Tensor`): Input batch of latent vectors.
1342
+ return_dict (`bool`, *optional*, defaults to `True`):
1343
+ Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
1344
+
1345
+ Returns:
1346
+ [`~models.vae.DecoderOutput`] or `tuple`:
1347
+ If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
1348
+ returned.
1349
+ """
1350
+ _, _, num_frames, height, width = z.shape
1351
+ sample_height = height * self.spatial_compression_ratio
1352
+ sample_width = width * self.spatial_compression_ratio
1353
+
1354
+ tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
1355
+ tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio
1356
+ tile_latent_stride_height = self.tile_sample_stride_height // self.spatial_compression_ratio
1357
+ tile_latent_stride_width = self.tile_sample_stride_width // self.spatial_compression_ratio
1358
+
1359
+ blend_height = self.tile_sample_min_height - self.tile_sample_stride_height
1360
+ blend_width = self.tile_sample_min_width - self.tile_sample_stride_width
1361
+
1362
+ # Split z into overlapping tiles and decode them separately.
1363
+ # The tiles have an overlap to avoid seams between tiles.
1364
+ rows = []
1365
+ for i in range(0, height, tile_latent_stride_height):
1366
+ row = []
1367
+ for j in range(0, width, tile_latent_stride_width):
1368
+ self.clear_cache()
1369
+ time = []
1370
+ for k in range(num_frames):
1371
+ self._conv_idx = [0]
1372
+ tile = z[:, :, k : k + 1, i : i + tile_latent_min_height, j : j + tile_latent_min_width]
1373
+ tile = self.post_quant_conv(tile)
1374
+ decoded = self.decoder(tile, feat_cache=self._feat_map, feat_idx=self._conv_idx)
1375
+ time.append(decoded)
1376
+ row.append(torch.cat(time, dim=2))
1377
+ rows.append(row)
1378
+ self.clear_cache()
1379
+
1380
+ result_rows = []
1381
+ for i, row in enumerate(rows):
1382
+ result_row = []
1383
+ for j, tile in enumerate(row):
1384
+ # blend the above tile and the left tile
1385
+ # to the current tile and add the current tile to the result row
1386
+ if i > 0:
1387
+ tile = self.blend_v(rows[i - 1][j], tile, blend_height)
1388
+ if j > 0:
1389
+ tile = self.blend_h(row[j - 1], tile, blend_width)
1390
+ result_row.append(tile[:, :, :, : self.tile_sample_stride_height, : self.tile_sample_stride_width])
1391
+ result_rows.append(torch.cat(result_row, dim=-1))
1392
+
1393
+ dec = torch.cat(result_rows, dim=3)[:, :, :, :sample_height, :sample_width]
1394
+
1395
+ if not return_dict:
1396
+ return (dec,)
1397
+ return DecoderOutput(sample=dec)
1398
+
1399
+ def forward(
1400
+ self,
1401
+ sample: torch.Tensor,
1402
+ sample_posterior: bool = False,
1403
+ return_dict: bool = True,
1404
+ generator: Optional[torch.Generator] = None,
1405
+ ) -> Union[DecoderOutput, torch.Tensor]:
1406
+ """
1407
+ Args:
1408
+ sample (`torch.Tensor`): Input sample.
1409
+ return_dict (`bool`, *optional*, defaults to `True`):
1410
+ Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
1411
+ """
1412
+ x = sample
1413
+ posterior = self.encode(x).latent_dist
1414
+ if sample_posterior:
1415
+ z = posterior.sample(generator=generator)
1416
+ else:
1417
+ z = posterior.mode()
1418
+ dec = self.decode(z, return_dict=return_dict)
1419
+ return dec
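
A minimal usage sketch of the VAE defined above (illustration only, not part of the commit; it assumes the class is importable as `AutoencoderKLWan` from this repository and relies only on the `encode`, `decode` and `enable_tiling` methods shown in the diff; frame counts follow the 4x temporal compression, i.e. F = 4k + 1):

    import torch
    # Hypothetical import path; adjust to wherever this file lives in the repo.
    from architecture.autoencoder_kl_wan import AutoencoderKLWan

    vae = AutoencoderKLWan()                 # defaults from the config above: base_dim=96, z_dim=16
    vae.enable_tiling()                      # split large frames into overlapping spatial tiles
    video = torch.randn(1, 3, 9, 256, 256)   # [B, C, F, H, W], F = 4k + 1
    with torch.no_grad():
        latents = vae.encode(video).latent_dist.sample()   # AutoencoderKLOutput -> latents
        recon = vae.decode(latents).sample                 # DecoderOutput -> reconstructed video
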
architecture/cogvideox_transformer_3d.py ADDED
@@ -0,0 +1,563 @@
1
+ # Copyright 2024 The CogVideoX team, Tsinghua University & ZhipuAI and The HuggingFace Team.
2
+ # All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from typing import Any, Dict, Optional, Tuple, Union
17
+ import os, sys, shutil
18
+ import torch
19
+ from torch import nn
20
+
21
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
22
+ from diffusers.loaders import PeftAdapterMixin
23
+ from diffusers.utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers
24
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
25
+ from diffusers.models.attention import Attention, FeedForward
26
+ from diffusers.models.modeling_outputs import Transformer2DModelOutput
27
+ from diffusers.models.modeling_utils import ModelMixin
28
+ from diffusers.models.normalization import AdaLayerNorm, CogVideoXLayerNormZero
29
+
30
+
31
+ # Import files from the local folder
32
+ root_path = os.path.abspath('.')
33
+ sys.path.append(root_path)
34
+ from architecture.embeddings import CogVideoXPatchEmbed, TimestepEmbedding, Timesteps
35
+ from architecture.attention_processor import AttentionProcessor, CogVideoXAttnProcessor2_0, FusedCogVideoXAttnProcessor2_0
36
+
37
+
38
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
39
+
40
+
41
+ @maybe_allow_in_graph
42
+ class CogVideoXBlock(nn.Module):
43
+ r"""
44
+ Transformer block used in [CogVideoX](https://github.com/THUDM/CogVideo) model.
45
+
46
+ Parameters:
47
+ dim (`int`):
48
+ The number of channels in the input and output.
49
+ num_attention_heads (`int`):
50
+ The number of heads to use for multi-head attention.
51
+ attention_head_dim (`int`):
52
+ The number of channels in each head.
53
+ time_embed_dim (`int`):
54
+ The number of channels in timestep embedding.
55
+ dropout (`float`, defaults to `0.0`):
56
+ The dropout probability to use.
57
+ activation_fn (`str`, defaults to `"gelu-approximate"`):
58
+ Activation function to be used in feed-forward.
59
+ attention_bias (`bool`, defaults to `False`):
60
+ Whether or not to use bias in attention projection layers.
61
+ qk_norm (`bool`, defaults to `True`):
62
+ Whether or not to use normalization after query and key projections in Attention.
63
+ norm_elementwise_affine (`bool`, defaults to `True`):
64
+ Whether to use learnable elementwise affine parameters for normalization.
65
+ norm_eps (`float`, defaults to `1e-5`):
66
+ Epsilon value for normalization layers.
67
+ final_dropout (`bool`, defaults to `True`):
68
+ Whether to apply a final dropout after the last feed-forward layer.
69
+ ff_inner_dim (`int`, *optional*, defaults to `None`):
70
+ Custom hidden dimension of Feed-forward layer. If not provided, `4 * dim` is used.
71
+ ff_bias (`bool`, defaults to `True`):
72
+ Whether or not to use bias in Feed-forward layer.
73
+ attention_out_bias (`bool`, defaults to `True`):
74
+ Whether or not to use bias in Attention output projection layer.
75
+ """
76
+
77
+ def __init__(
78
+ self,
79
+ dim: int,
80
+ num_attention_heads: int,
81
+ attention_head_dim: int,
82
+ time_embed_dim: int,
83
+ dropout: float = 0.0,
84
+ activation_fn: str = "gelu-approximate",
85
+ attention_bias: bool = False,
86
+ qk_norm: bool = True,
87
+ norm_elementwise_affine: bool = True,
88
+ norm_eps: float = 1e-5,
89
+ final_dropout: bool = True,
90
+ ff_inner_dim: Optional[int] = None,
91
+ ff_bias: bool = True,
92
+ attention_out_bias: bool = True,
93
+ ):
94
+ super().__init__()
95
+
96
+ # 1. Self Attention
97
+ self.norm1 = CogVideoXLayerNormZero(time_embed_dim, dim, norm_elementwise_affine, norm_eps, bias=True)
98
+
99
+ self.attn1 = Attention(
100
+ query_dim=dim,
101
+ dim_head=attention_head_dim,
102
+ heads=num_attention_heads,
103
+ qk_norm="layer_norm" if qk_norm else None,
104
+ eps=1e-6,
105
+ bias=attention_bias,
106
+ out_bias=attention_out_bias,
107
+ processor=CogVideoXAttnProcessor2_0(),
108
+ )
109
+
110
+ # 2. Feed Forward
111
+ self.norm2 = CogVideoXLayerNormZero(time_embed_dim, dim, norm_elementwise_affine, norm_eps, bias=True)
112
+
113
+ self.ff = FeedForward(
114
+ dim,
115
+ dropout=dropout,
116
+ activation_fn=activation_fn,
117
+ final_dropout=final_dropout,
118
+ inner_dim=ff_inner_dim,
119
+ bias=ff_bias,
120
+ )
121
+
122
+ def forward(
123
+ self,
124
+ hidden_states: torch.Tensor,
125
+ encoder_hidden_states: torch.Tensor,
126
+ temb: torch.Tensor,
127
+ image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
128
+ attention_kwargs: Optional[Dict[str, Any]] = None,
129
+ ) -> torch.Tensor:
130
+ text_seq_length = encoder_hidden_states.size(1)
131
+ attention_kwargs = attention_kwargs or {}
132
+
133
+ # norm & modulate
134
+ norm_hidden_states, norm_encoder_hidden_states, gate_msa, enc_gate_msa = self.norm1(
135
+ hidden_states, encoder_hidden_states, temb
136
+ )
137
+
138
+ # attention
139
+ attn_hidden_states, attn_encoder_hidden_states = self.attn1(
140
+ hidden_states = norm_hidden_states,
141
+ encoder_hidden_states = norm_encoder_hidden_states,
142
+ image_rotary_emb = image_rotary_emb,
143
+ **attention_kwargs,
144
+ )
145
+
146
+ hidden_states = hidden_states + gate_msa * attn_hidden_states
147
+ encoder_hidden_states = encoder_hidden_states + enc_gate_msa * attn_encoder_hidden_states
148
+
149
+ # norm & modulate
150
+ norm_hidden_states, norm_encoder_hidden_states, gate_ff, enc_gate_ff = self.norm2(
151
+ hidden_states, encoder_hidden_states, temb
152
+ )
153
+
154
+ # feed-forward
155
+ norm_hidden_states = torch.cat([norm_encoder_hidden_states, norm_hidden_states], dim=1)
156
+ ff_output = self.ff(norm_hidden_states)
157
+
158
+ hidden_states = hidden_states + gate_ff * ff_output[:, text_seq_length:]
159
+ encoder_hidden_states = encoder_hidden_states + enc_gate_ff * ff_output[:, :text_seq_length]
160
+
161
+ return hidden_states, encoder_hidden_states
162
+
163
+
164
+ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
165
+ """
166
+ A Transformer model for video-like data in [CogVideoX](https://github.com/THUDM/CogVideo).
167
+
168
+ Parameters:
169
+ num_attention_heads (`int`, defaults to `30`):
170
+ The number of heads to use for multi-head attention.
171
+ attention_head_dim (`int`, defaults to `64`):
172
+ The number of channels in each head.
173
+ in_channels (`int`, defaults to `16`):
174
+ The number of channels in the input.
175
+ out_channels (`int`, *optional*, defaults to `16`):
176
+ The number of channels in the output.
177
+ flip_sin_to_cos (`bool`, defaults to `True`):
178
+ Whether to flip the sin to cos in the time embedding.
179
+ time_embed_dim (`int`, defaults to `512`):
180
+ Output dimension of timestep embeddings.
181
+ ofs_embed_dim (`int`, defaults to `512`):
182
+ Output dimension of "ofs" embeddings used in CogVideoX-5b-I2V in version 1.5
183
+ text_embed_dim (`int`, defaults to `4096`):
184
+ Input dimension of text embeddings from the text encoder.
185
+ num_layers (`int`, defaults to `30`):
186
+ The number of layers of Transformer blocks to use.
187
+ dropout (`float`, defaults to `0.0`):
188
+ The dropout probability to use.
189
+ attention_bias (`bool`, defaults to `True`):
190
+ Whether to use bias in the attention projection layers.
191
+ sample_width (`int`, defaults to `90`):
192
+ The width of the input latents.
193
+ sample_height (`int`, defaults to `60`):
194
+ The height of the input latents.
195
+ sample_frames (`int`, defaults to `49`):
196
+ The number of frames in the input latents. Note that this parameter was incorrectly initialized to 49
197
+ instead of 13 because CogVideoX processed 13 latent frames at once in its default and recommended settings,
198
+ but cannot be changed to the correct value to ensure backwards compatibility. To create a transformer with
199
+ K latent frames, the correct value to pass here would be: ((K - 1) * temporal_compression_ratio + 1).
200
+ patch_size (`int`, defaults to `2`):
201
+ The size of the patches to use in the patch embedding layer.
202
+ temporal_compression_ratio (`int`, defaults to `4`):
203
+ The compression ratio across the temporal dimension. See documentation for `sample_frames`.
204
+ max_text_seq_length (`int`, defaults to `226`):
205
+ The maximum sequence length of the input text embeddings.
206
+ activation_fn (`str`, defaults to `"gelu-approximate"`):
207
+ Activation function to use in feed-forward.
208
+ timestep_activation_fn (`str`, defaults to `"silu"`):
209
+ Activation function to use when generating the timestep embeddings.
210
+ norm_elementwise_affine (`bool`, defaults to `True`):
211
+ Whether to use elementwise affine in normalization layers.
212
+ norm_eps (`float`, defaults to `1e-5`):
213
+ The epsilon value to use in normalization layers.
214
+ spatial_interpolation_scale (`float`, defaults to `1.875`):
215
+ Scaling factor to apply in 3D positional embeddings across spatial dimensions.
216
+ temporal_interpolation_scale (`float`, defaults to `1.0`):
217
+ Scaling factor to apply in 3D positional embeddings across temporal dimensions.
218
+ """
219
+
220
+ _supports_gradient_checkpointing = True
221
+ _no_split_modules = ["CogVideoXBlock", "CogVideoXPatchEmbed"]
222
+
223
+ @register_to_config
224
+ def __init__(
225
+ self,
226
+ num_attention_heads: int = 30,
227
+ attention_head_dim: int = 64,
228
+ in_channels: int = 16,
229
+ out_channels: Optional[int] = 16,
230
+ flip_sin_to_cos: bool = True,
231
+ freq_shift: int = 0,
232
+ time_embed_dim: int = 512,
233
+ ofs_embed_dim: Optional[int] = None,
234
+ text_embed_dim: int = 4096,
235
+ num_layers: int = 30,
236
+ dropout: float = 0.0,
237
+ attention_bias: bool = True,
238
+ sample_width: int = 90,
239
+ sample_height: int = 60,
240
+ sample_frames: int = 49,
241
+ patch_size: int = 2,
242
+ patch_size_t: Optional[int] = None,
243
+ temporal_compression_ratio: int = 4,
244
+ max_text_seq_length: int = 226,
245
+ activation_fn: str = "gelu-approximate",
246
+ timestep_activation_fn: str = "silu",
247
+ norm_elementwise_affine: bool = True,
248
+ norm_eps: float = 1e-5,
249
+ spatial_interpolation_scale: float = 1.875,
250
+ temporal_interpolation_scale: float = 1.0,
251
+ use_rotary_positional_embeddings: bool = False,
252
+ use_learned_positional_embeddings: bool = False,
253
+ patch_bias: bool = True,
254
+ extra_encoder_cond_channels: int = -1,
255
+ use_FrameIn: bool = False,
256
+ ):
257
+ super().__init__()
258
+ inner_dim = num_attention_heads * attention_head_dim
259
+
260
+
261
+ # breakpoint()
262
+ # if not use_rotary_positional_embeddings and use_learned_positional_embeddings:
263
+ # raise ValueError(
264
+ # "There are no CogVideoX checkpoints available with disable rotary embeddings and learned positional "
265
+ # "embeddings. If you're using a custom model and/or believe this should be supported, please open an "
266
+ # "issue at https://github.com/huggingface/diffusers/issues."
267
+ # )
268
+
269
+ # 1. Patch embedding
270
+ self.patch_embed = CogVideoXPatchEmbed(
271
+ patch_size = patch_size,
272
+ patch_size_t = patch_size_t,
273
+ in_channels = in_channels,
274
+ embed_dim = inner_dim,
275
+ text_embed_dim = text_embed_dim,
276
+ bias = patch_bias,
277
+ sample_width = sample_width,
278
+ sample_height = sample_height,
279
+ sample_frames = sample_frames,
280
+ temporal_compression_ratio = temporal_compression_ratio,
281
+ max_text_seq_length = max_text_seq_length,
282
+ spatial_interpolation_scale = spatial_interpolation_scale,
283
+ temporal_interpolation_scale = temporal_interpolation_scale,
284
+ use_positional_embeddings = not use_rotary_positional_embeddings, # HACK: use_positional_embeddings is the inverse of use_rotary_positional_embeddings
285
+ use_learned_positional_embeddings = use_learned_positional_embeddings,
286
+ extra_encoder_cond_channels = extra_encoder_cond_channels,
287
+ use_FrameIn = use_FrameIn,
288
+ )
289
+ self.embedding_dropout = nn.Dropout(dropout)
290
+
291
+ # 2. Time embeddings and ofs embedding (only CogVideoX 1.5-5B I2V has it)
292
+
293
+ self.time_proj = Timesteps(inner_dim, flip_sin_to_cos, freq_shift)
294
+ self.time_embedding = TimestepEmbedding(inner_dim, time_embed_dim, timestep_activation_fn)
295
+
296
+ self.ofs_proj = None
297
+ self.ofs_embedding = None
298
+ if ofs_embed_dim:
299
+ self.ofs_proj = Timesteps(ofs_embed_dim, flip_sin_to_cos, freq_shift)
300
+ self.ofs_embedding = TimestepEmbedding(
301
+ ofs_embed_dim, ofs_embed_dim, timestep_activation_fn
302
+ ) # same as time embeddings, for ofs
303
+
304
+ # 3. Define spatio-temporal transformers blocks
305
+ self.transformer_blocks = nn.ModuleList(
306
+ [
307
+ CogVideoXBlock(
308
+ dim=inner_dim,
309
+ num_attention_heads=num_attention_heads,
310
+ attention_head_dim=attention_head_dim,
311
+ time_embed_dim=time_embed_dim,
312
+ dropout=dropout,
313
+ activation_fn=activation_fn,
314
+ attention_bias=attention_bias,
315
+ norm_elementwise_affine=norm_elementwise_affine,
316
+ norm_eps=norm_eps,
317
+ )
318
+ for _ in range(num_layers)
319
+ ]
320
+ )
321
+ self.norm_final = nn.LayerNorm(inner_dim, norm_eps, norm_elementwise_affine)
322
+
323
+ # 4. Output blocks
324
+ self.norm_out = AdaLayerNorm(
325
+ embedding_dim=time_embed_dim,
326
+ output_dim=2 * inner_dim,
327
+ norm_elementwise_affine=norm_elementwise_affine,
328
+ norm_eps=norm_eps,
329
+ chunk_dim=1,
330
+ )
331
+
332
+ if patch_size_t is None:
333
+ # For CogVideox 1.0
334
+ output_dim = patch_size * patch_size * out_channels
335
+ else:
336
+ # For CogVideoX 1.5
337
+ output_dim = patch_size * patch_size * patch_size_t * out_channels
338
+
339
+ self.proj_out = nn.Linear(inner_dim, output_dim)
340
+
341
+ self.gradient_checkpointing = False
342
+
343
+ # def _set_gradient_checkpointing(self, module, value=False):
344
+ # self.gradient_checkpointing = value
345
+
346
+ @property
347
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
348
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
349
+ r"""
350
+ Returns:
351
+ `dict` of attention processors: A dictionary containing all attention processors used in the model,
352
+ indexed by their weight names.
353
+ """
354
+ # set recursively
355
+ processors = {}
356
+
357
+ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
358
+ if hasattr(module, "get_processor"):
359
+ processors[f"{name}.processor"] = module.get_processor()
360
+
361
+ for sub_name, child in module.named_children():
362
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
363
+
364
+ return processors
365
+
366
+ for name, module in self.named_children():
367
+ fn_recursive_add_processors(name, module, processors)
368
+
369
+ return processors
370
+
371
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
372
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
373
+ r"""
374
+ Sets the attention processor to use to compute attention.
375
+
376
+ Parameters:
377
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
378
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
379
+ for **all** `Attention` layers.
380
+
381
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
382
+ processor. This is strongly recommended when setting trainable attention processors.
383
+
384
+ """
385
+ count = len(self.attn_processors.keys())
386
+
387
+ if isinstance(processor, dict) and len(processor) != count:
388
+ raise ValueError(
389
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
390
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
391
+ )
392
+
393
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
394
+ if hasattr(module, "set_processor"):
395
+ if not isinstance(processor, dict):
396
+ module.set_processor(processor)
397
+ else:
398
+ module.set_processor(processor.pop(f"{name}.processor"))
399
+
400
+ for sub_name, child in module.named_children():
401
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
402
+
403
+ for name, module in self.named_children():
404
+ fn_recursive_attn_processor(name, module, processor)
405
+
406
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections with FusedAttnProcessor2_0->FusedCogVideoXAttnProcessor2_0
407
+ def fuse_qkv_projections(self):
408
+ """
409
+ Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
410
+ are fused. For cross-attention modules, key and value projection matrices are fused.
411
+
412
+ <Tip warning={true}>
413
+
414
+ This API is 🧪 experimental.
415
+
416
+ </Tip>
417
+ """
418
+ self.original_attn_processors = None
419
+
420
+ for _, attn_processor in self.attn_processors.items():
421
+ if "Added" in str(attn_processor.__class__.__name__):
422
+ raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
423
+
424
+ self.original_attn_processors = self.attn_processors
425
+
426
+ for module in self.modules():
427
+ if isinstance(module, Attention):
428
+ module.fuse_projections(fuse=True)
429
+
430
+ self.set_attn_processor(FusedCogVideoXAttnProcessor2_0())
431
+
432
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
433
+ def unfuse_qkv_projections(self):
434
+ """Disables the fused QKV projection if enabled.
435
+
436
+ <Tip warning={true}>
437
+
438
+ This API is 🧪 experimental.
439
+
440
+ </Tip>
441
+
442
+ """
443
+ if self.original_attn_processors is not None:
444
+ self.set_attn_processor(self.original_attn_processors)
445
+
446
+ def forward(
447
+ self,
448
+ hidden_states: torch.Tensor,
449
+ encoder_hidden_states: torch.Tensor,
450
+ timestep: Union[int, float, torch.LongTensor],
451
+ timestep_cond: Optional[torch.Tensor] = None,
452
+ ofs: Optional[Union[int, float, torch.LongTensor]] = None,
453
+ image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
454
+ attention_kwargs: Optional[Dict[str, Any]] = None,
455
+ return_dict: bool = True,
456
+ ):
457
+
458
+ if attention_kwargs is not None:
459
+ attention_kwargs = attention_kwargs.copy()
460
+ lora_scale = attention_kwargs.pop("scale", 1.0)
461
+ else:
462
+ lora_scale = 1.0
463
+
464
+ if USE_PEFT_BACKEND:
465
+ # weight the lora layers by setting `lora_scale` for each PEFT layer
466
+ scale_lora_layers(self, lora_scale)
467
+ else:
468
+ if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
469
+ logger.warning(
470
+ "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
471
+ )
472
+
473
+
474
+ batch_size, num_frames, channels, height, width = hidden_states.shape
475
+
476
+ # 1. Time embedding
477
+ timesteps = timestep
478
+ t_emb = self.time_proj(timesteps)
479
+
480
+
481
+ # timesteps does not contain any weights and will always return f32 tensors
482
+ # but time_embedding might actually be running in fp16. so we need to cast here.
483
+ # there might be better ways to encapsulate this.
484
+ t_emb = t_emb.to(dtype=hidden_states.dtype)
485
+ emb = self.time_embedding(t_emb, timestep_cond)
486
+
487
+ if self.ofs_embedding is not None:
488
+ ofs_emb = self.ofs_proj(ofs)
489
+ ofs_emb = ofs_emb.to(dtype = hidden_states.dtype)
490
+ ofs_emb = self.ofs_embedding(ofs_emb)
491
+ emb = emb + ofs_emb
492
+
493
+ # 2. Patch embedding
494
+ hidden_states = self.patch_embed(encoder_hidden_states, hidden_states) # Only use patch embedding at the very beginning
495
+ hidden_states = self.embedding_dropout(hidden_states)
496
+
497
+ # HACK: the patch_embed output is split back into text and video tokens only after the positional embedding has been added
498
+ text_seq_length = encoder_hidden_states.shape[1]
499
+ encoder_hidden_states = hidden_states[:, :text_seq_length] # the merged sequence is split back into its text part
500
+ hidden_states = hidden_states[:, text_seq_length:]
501
+
502
+ # 3. Transformer blocks
503
+ for i, block in enumerate(self.transformer_blocks):
504
+ if torch.is_grad_enabled() and self.gradient_checkpointing:
505
+
506
+ def create_custom_forward(module):
507
+ def custom_forward(*inputs):
508
+ return module(*inputs)
509
+
510
+ return custom_forward
511
+
512
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
513
+ hidden_states, encoder_hidden_states = torch.utils.checkpoint.checkpoint(
514
+ create_custom_forward(block),
515
+ hidden_states,
516
+ encoder_hidden_states,
517
+ emb,
518
+ image_rotary_emb,
519
+ attention_kwargs,
520
+ **ckpt_kwargs,
521
+ )
522
+ else:
523
+ hidden_states, encoder_hidden_states = block(
524
+ hidden_states = hidden_states,
525
+ encoder_hidden_states = encoder_hidden_states,
526
+ temb = emb,
527
+ image_rotary_emb = image_rotary_emb,
528
+ attention_kwargs = attention_kwargs,
529
+ )
530
+
531
+ if not self.config.use_rotary_positional_embeddings:
532
+ # CogVideoX-2B
533
+ hidden_states = self.norm_final(hidden_states)
534
+ else:
535
+ # CogVideoX-5B
536
+ hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
537
+ hidden_states = self.norm_final(hidden_states)
538
+ hidden_states = hidden_states[:, text_seq_length:]
539
+
540
+ # 4. Final block
541
+ hidden_states = self.norm_out(hidden_states, temb=emb)
542
+ hidden_states = self.proj_out(hidden_states)
543
+
544
+ # 5. Unpatchify
545
+ p = self.config.patch_size
546
+ p_t = self.config.patch_size_t
547
+
548
+ if p_t is None:
549
+ output = hidden_states.reshape(batch_size, num_frames, height // p, width // p, -1, p, p)
550
+ output = output.permute(0, 1, 4, 2, 5, 3, 6).flatten(5, 6).flatten(3, 4)
551
+ else:
552
+ output = hidden_states.reshape(
553
+ batch_size, (num_frames + p_t - 1) // p_t, height // p, width // p, -1, p_t, p, p
554
+ )
555
+ output = output.permute(0, 1, 5, 4, 2, 6, 3, 7).flatten(6, 7).flatten(4, 5).flatten(1, 2)
556
+
557
+ if USE_PEFT_BACKEND:
558
+ # remove `lora_scale` from each PEFT layer
559
+ unscale_lora_layers(self, lora_scale)
560
+
561
+ if not return_dict:
562
+ return (output,)
563
+ return Transformer2DModelOutput(sample=output)
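To make the final unpatchify step above easier to follow, here is a minimal standalone sketch (not part of the repository) of the spatial-only branch (the p_t is None case), using hypothetical toy sizes to show how each token of length channels*p*p is folded back onto the latent grid:

import torch

# Hypothetical toy sizes: latent channels/height/width plus a spatial patch size p.
batch_size, num_frames, channels, height, width, p = 1, 2, 3, 4, 6, 2
tokens = torch.randn(batch_size, num_frames * (height // p) * (width // p), channels * p * p)

# Same reshape/permute chain as the "5. Unpatchify" block above (p_t is None).
output = tokens.reshape(batch_size, num_frames, height // p, width // p, -1, p, p)
output = output.permute(0, 1, 4, 2, 5, 3, 6).flatten(5, 6).flatten(3, 4)

print(output.shape)  # torch.Size([1, 2, 3, 4, 6]) -> [batch, frames, channels, height, width]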
architecture/embeddings.py ADDED
The diff for this file is too large to render. See raw diff
 
architecture/noise_sampler.py ADDED
@@ -0,0 +1,54 @@
1
+
2
+ """Modified from https://github.com/THUDM/CogVideo/blob/3710a612d8760f5cdb1741befeebb65b9e0f2fe0/sat/sgm/modules/diffusionmodules/sigma_sampling.py
3
+ """
4
+ import torch
5
+
6
+ class DiscreteSampling:
7
+
8
+ def __init__(self, num_idx, uniform_sampling=False):
9
+ self.num_idx = num_idx
10
+ self.uniform_sampling = uniform_sampling
11
+ self.is_distributed = torch.distributed.is_available() and torch.distributed.is_initialized()
12
+
13
+ # print("self.is_distributed status is ", self.is_distributed)
14
+ if self.is_distributed and self.uniform_sampling:
15
+ world_size = torch.distributed.get_world_size()
16
+ self.rank = torch.distributed.get_rank()
17
+
18
+ i = 1
19
+ while True:
20
+ if world_size % i != 0 or num_idx % (world_size // i) != 0:
21
+ i += 1
22
+ else:
23
+ self.group_num = world_size // i
24
+ break
25
+ assert self.group_num > 0
26
+ assert world_size % self.group_num == 0
27
+ # the number of ranks in one group
28
+ self.group_width = world_size // self.group_num
29
+ self.sigma_interval = self.num_idx // self.group_num
30
+ print('rank=%d world_size=%d group_num=%d group_width=%d sigma_interval=%s' % (
31
+ self.rank, world_size, self.group_num,
32
+ self.group_width, self.sigma_interval))
33
+
34
+
35
+ def __call__(self, n_samples, generator=None, device=None):
36
+
37
+
38
+ if self.is_distributed and self.uniform_sampling:
39
+ group_index = self.rank // self.group_width
40
+ idx = torch.randint(
41
+ group_index * self.sigma_interval,
42
+ (group_index + 1) * self.sigma_interval,
43
+ (n_samples,),
44
+ generator=generator, device=device,
45
+ )
46
+ # print('proc[%d] idx=%s' % (self.rank, idx))
47
+ # print("Uniform sample range is ", group_index * self.sigma_interval, (group_index + 1) * self.sigma_interval)
48
+
49
+ else:
50
+ idx = torch.randint(
51
+ 0, self.num_idx, (n_samples,),
52
+ generator=generator, device=device,
53
+ )
54
+ return idx
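A short usage sketch for DiscreteSampling above (an assumed example, not taken from the repository's training scripts): in a single-process run, uniform_sampling has no effect and per-sample timestep indices are drawn uniformly from [0, num_idx). The import path simply mirrors the file location.

import torch
from architecture.noise_sampler import DiscreteSampling

# Non-distributed case: indices are sampled uniformly at random.
timestep_sampler = DiscreteSampling(num_idx=1000, uniform_sampling=False)
generator = torch.Generator().manual_seed(42)

timestep_idx = timestep_sampler(n_samples=4, generator=generator, device="cpu")
print(timestep_idx)  # a tensor of 4 integer timestep indices in [0, 1000)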
architecture/transformer_wan.py ADDED
@@ -0,0 +1,552 @@
1
+ # Copyright 2025 The Wan Team and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import math
16
+ from typing import Any, Dict, Optional, Tuple, Union
17
+
18
+ import torch
19
+ import torch.nn as nn
20
+ import torch.nn.functional as F
21
+
22
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
23
+ from diffusers.loaders import FromOriginalModelMixin, PeftAdapterMixin
24
+ from diffusers.utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
25
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
26
+ from diffusers.models.attention import FeedForward
27
+ from diffusers.models.attention_processor import Attention
28
+ from diffusers.models.cache_utils import CacheMixin
29
+ from diffusers.models.embeddings import PixArtAlphaTextProjection, TimestepEmbedding, Timesteps, get_1d_rotary_pos_embed
30
+ from diffusers.models.modeling_outputs import Transformer2DModelOutput
31
+ from diffusers.models.modeling_utils import ModelMixin
32
+ from diffusers.models.normalization import FP32LayerNorm
33
+
34
+
35
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
36
+
37
+
38
+ class WanAttnProcessor2_0:
39
+ def __init__(self):
40
+ if not hasattr(F, "scaled_dot_product_attention"):
41
+ raise ImportError("WanAttnProcessor2_0 requires PyTorch 2.0. To use it, please upgrade PyTorch to 2.0.")
42
+
43
+ def __call__(
44
+ self,
45
+ attn: Attention,
46
+ hidden_states: torch.Tensor,
47
+ encoder_hidden_states: Optional[torch.Tensor] = None,
48
+ attention_mask: Optional[torch.Tensor] = None,
49
+ rotary_emb: Optional[torch.Tensor] = None,
50
+ ) -> torch.Tensor:
51
+ encoder_hidden_states_img = None
52
+ if attn.add_k_proj is not None:
53
+ # 512 is the context length of the text encoder, hardcoded for now
54
+ image_context_length = encoder_hidden_states.shape[1] - 512
55
+ encoder_hidden_states_img = encoder_hidden_states[:, :image_context_length]
56
+ encoder_hidden_states = encoder_hidden_states[:, image_context_length:]
57
+ if encoder_hidden_states is None:
58
+ encoder_hidden_states = hidden_states
59
+
60
+ query = attn.to_q(hidden_states)
61
+ key = attn.to_k(encoder_hidden_states)
62
+ value = attn.to_v(encoder_hidden_states)
63
+
64
+ if attn.norm_q is not None:
65
+ query = attn.norm_q(query)
66
+ if attn.norm_k is not None:
67
+ key = attn.norm_k(key)
68
+
69
+ query = query.unflatten(2, (attn.heads, -1)).transpose(1, 2)
70
+ key = key.unflatten(2, (attn.heads, -1)).transpose(1, 2)
71
+ value = value.unflatten(2, (attn.heads, -1)).transpose(1, 2)
72
+
73
+ if rotary_emb is not None:
74
+
75
+ def apply_rotary_emb(
76
+ hidden_states: torch.Tensor,
77
+ freqs_cos: torch.Tensor,
78
+ freqs_sin: torch.Tensor,
79
+ ):
80
+ x = hidden_states.view(*hidden_states.shape[:-1], -1, 2)
81
+ x1, x2 = x[..., 0], x[..., 1]
82
+ cos = freqs_cos[..., 0::2]
83
+ sin = freqs_sin[..., 1::2]
84
+ out = torch.empty_like(hidden_states)
85
+ out[..., 0::2] = x1 * cos - x2 * sin
86
+ out[..., 1::2] = x1 * sin + x2 * cos
87
+ return out.type_as(hidden_states)
88
+
89
+ query = apply_rotary_emb(query, *rotary_emb)
90
+ key = apply_rotary_emb(key, *rotary_emb)
91
+
92
+ # I2V task
93
+ hidden_states_img = None
94
+ if encoder_hidden_states_img is not None:
95
+ key_img = attn.add_k_proj(encoder_hidden_states_img)
96
+ key_img = attn.norm_added_k(key_img)
97
+ value_img = attn.add_v_proj(encoder_hidden_states_img)
98
+
99
+ key_img = key_img.unflatten(2, (attn.heads, -1)).transpose(1, 2)
100
+ value_img = value_img.unflatten(2, (attn.heads, -1)).transpose(1, 2)
101
+
102
+ hidden_states_img = F.scaled_dot_product_attention(
103
+ query, key_img, value_img, attn_mask=None, dropout_p=0.0, is_causal=False
104
+ )
105
+ hidden_states_img = hidden_states_img.transpose(1, 2).flatten(2, 3)
106
+ hidden_states_img = hidden_states_img.type_as(query)
107
+
108
+ hidden_states = F.scaled_dot_product_attention(
109
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
110
+ )
111
+ hidden_states = hidden_states.transpose(1, 2).flatten(2, 3)
112
+ hidden_states = hidden_states.type_as(query)
113
+
114
+ if hidden_states_img is not None:
115
+ hidden_states = hidden_states + hidden_states_img
116
+
117
+ hidden_states = attn.to_out[0](hidden_states)
118
+ hidden_states = attn.to_out[1](hidden_states)
119
+ return hidden_states
120
+
121
+
122
+ class WanImageEmbedding(torch.nn.Module):
123
+ def __init__(self, in_features: int, out_features: int, pos_embed_seq_len=None):
124
+ super().__init__()
125
+
126
+ self.norm1 = FP32LayerNorm(in_features)
127
+ self.ff = FeedForward(in_features, out_features, mult=1, activation_fn="gelu")
128
+ self.norm2 = FP32LayerNorm(out_features)
129
+ if pos_embed_seq_len is not None:
130
+ self.pos_embed = nn.Parameter(torch.zeros(1, pos_embed_seq_len, in_features))
131
+ else:
132
+ self.pos_embed = None
133
+
134
+ def forward(self, encoder_hidden_states_image: torch.Tensor) -> torch.Tensor:
135
+ if self.pos_embed is not None:
136
+ batch_size, seq_len, embed_dim = encoder_hidden_states_image.shape
137
+ encoder_hidden_states_image = encoder_hidden_states_image.view(-1, 2 * seq_len, embed_dim)
138
+ encoder_hidden_states_image = encoder_hidden_states_image + self.pos_embed
139
+
140
+ hidden_states = self.norm1(encoder_hidden_states_image)
141
+ hidden_states = self.ff(hidden_states)
142
+ hidden_states = self.norm2(hidden_states)
143
+ return hidden_states
144
+
145
+
146
+ class WanTimeTextImageEmbedding(nn.Module):
147
+ def __init__(
148
+ self,
149
+ dim: int,
150
+ time_freq_dim: int,
151
+ time_proj_dim: int,
152
+ text_embed_dim: int,
153
+ image_embed_dim: Optional[int] = None,
154
+ pos_embed_seq_len: Optional[int] = None,
155
+ ):
156
+ super().__init__()
157
+
158
+ self.timesteps_proj = Timesteps(num_channels=time_freq_dim, flip_sin_to_cos=True, downscale_freq_shift=0)
159
+ self.time_embedder = TimestepEmbedding(in_channels=time_freq_dim, time_embed_dim=dim)
160
+ self.act_fn = nn.SiLU()
161
+ self.time_proj = nn.Linear(dim, time_proj_dim)
162
+ self.text_embedder = PixArtAlphaTextProjection(text_embed_dim, dim, act_fn="gelu_tanh")
163
+
164
+ self.image_embedder = None
165
+ if image_embed_dim is not None:
166
+ self.image_embedder = WanImageEmbedding(image_embed_dim, dim, pos_embed_seq_len=pos_embed_seq_len)
167
+
168
+ def forward(
169
+ self,
170
+ timestep: torch.Tensor,
171
+ encoder_hidden_states: torch.Tensor,
172
+ encoder_hidden_states_image: Optional[torch.Tensor] = None,
173
+ timestep_seq_len: Optional[int] = None,
174
+ ):
175
+ timestep = self.timesteps_proj(timestep)
176
+ if timestep_seq_len is not None:
177
+ timestep = timestep.unflatten(0, (1, timestep_seq_len))
178
+
179
+ time_embedder_dtype = next(iter(self.time_embedder.parameters())).dtype
180
+ if timestep.dtype != time_embedder_dtype and time_embedder_dtype != torch.int8:
181
+ timestep = timestep.to(time_embedder_dtype)
182
+ temb = self.time_embedder(timestep).type_as(encoder_hidden_states)
183
+ timestep_proj = self.time_proj(self.act_fn(temb))
184
+
185
+ encoder_hidden_states = self.text_embedder(encoder_hidden_states)
186
+ if encoder_hidden_states_image is not None:
187
+ encoder_hidden_states_image = self.image_embedder(encoder_hidden_states_image)
188
+
189
+ return temb, timestep_proj, encoder_hidden_states, encoder_hidden_states_image
190
+
191
+
192
+ class WanRotaryPosEmbed(nn.Module):
193
+ def __init__(
194
+ self,
195
+ attention_head_dim: int,
196
+ patch_size: Tuple[int, int, int],
197
+ max_seq_len: int,
198
+ theta: float = 10000.0,
199
+ ):
200
+ super().__init__()
201
+
202
+ self.attention_head_dim = attention_head_dim
203
+ self.patch_size = patch_size
204
+ self.max_seq_len = max_seq_len
205
+
206
+ h_dim = w_dim = 2 * (attention_head_dim // 6)
207
+ t_dim = attention_head_dim - h_dim - w_dim
208
+ freqs_dtype = torch.float32 if torch.backends.mps.is_available() else torch.float64
209
+
210
+ freqs_cos = []
211
+ freqs_sin = []
212
+
213
+ for dim in [t_dim, h_dim, w_dim]:
214
+ freq_cos, freq_sin = get_1d_rotary_pos_embed(
215
+ dim,
216
+ max_seq_len,
217
+ theta,
218
+ use_real=True,
219
+ repeat_interleave_real=True,
220
+ freqs_dtype=freqs_dtype,
221
+ )
222
+ freqs_cos.append(freq_cos)
223
+ freqs_sin.append(freq_sin)
224
+
225
+ self.register_buffer("freqs_cos", torch.cat(freqs_cos, dim=1), persistent=False)
226
+ self.register_buffer("freqs_sin", torch.cat(freqs_sin, dim=1), persistent=False)
227
+
228
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
229
+ batch_size, num_channels, num_frames, height, width = hidden_states.shape
230
+ p_t, p_h, p_w = self.patch_size
231
+ ppf, pph, ppw = num_frames // p_t, height // p_h, width // p_w
232
+
233
+ split_sizes = [
234
+ self.attention_head_dim - 2 * (self.attention_head_dim // 3),
235
+ self.attention_head_dim // 3,
236
+ self.attention_head_dim // 3,
237
+ ]
238
+
239
+ freqs_cos = self.freqs_cos.split(split_sizes, dim=1)
240
+ freqs_sin = self.freqs_sin.split(split_sizes, dim=1)
241
+
242
+ freqs_cos_f = freqs_cos[0][:ppf].view(ppf, 1, 1, -1).expand(ppf, pph, ppw, -1)
243
+ freqs_cos_h = freqs_cos[1][:pph].view(1, pph, 1, -1).expand(ppf, pph, ppw, -1)
244
+ freqs_cos_w = freqs_cos[2][:ppw].view(1, 1, ppw, -1).expand(ppf, pph, ppw, -1)
245
+
246
+ freqs_sin_f = freqs_sin[0][:ppf].view(ppf, 1, 1, -1).expand(ppf, pph, ppw, -1)
247
+ freqs_sin_h = freqs_sin[1][:pph].view(1, pph, 1, -1).expand(ppf, pph, ppw, -1)
248
+ freqs_sin_w = freqs_sin[2][:ppw].view(1, 1, ppw, -1).expand(ppf, pph, ppw, -1)
249
+
250
+ freqs_cos = torch.cat([freqs_cos_f, freqs_cos_h, freqs_cos_w], dim=-1).reshape(1, 1, ppf * pph * ppw, -1)
251
+ freqs_sin = torch.cat([freqs_sin_f, freqs_sin_h, freqs_sin_w], dim=-1).reshape(1, 1, ppf * pph * ppw, -1)
252
+
253
+ return freqs_cos, freqs_sin
254
+
255
+
256
+ @maybe_allow_in_graph
257
+ class WanTransformerBlock(nn.Module):
258
+ def __init__(
259
+ self,
260
+ dim: int,
261
+ ffn_dim: int,
262
+ num_heads: int,
263
+ qk_norm: str = "rms_norm_across_heads",
264
+ cross_attn_norm: bool = False,
265
+ eps: float = 1e-6,
266
+ added_kv_proj_dim: Optional[int] = None,
267
+ ):
268
+ super().__init__()
269
+
270
+ # 1. Self-attention
271
+ self.norm1 = FP32LayerNorm(dim, eps, elementwise_affine=False)
272
+ self.attn1 = Attention(
273
+ query_dim=dim,
274
+ heads=num_heads,
275
+ kv_heads=num_heads,
276
+ dim_head=dim // num_heads,
277
+ qk_norm=qk_norm,
278
+ eps=eps,
279
+ bias=True,
280
+ cross_attention_dim=None,
281
+ out_bias=True,
282
+ processor=WanAttnProcessor2_0(),
283
+ )
284
+
285
+ # 2. Cross-attention
286
+ self.attn2 = Attention(
287
+ query_dim=dim,
288
+ heads=num_heads,
289
+ kv_heads=num_heads,
290
+ dim_head=dim // num_heads,
291
+ qk_norm=qk_norm,
292
+ eps=eps,
293
+ bias=True,
294
+ cross_attention_dim=None,
295
+ out_bias=True,
296
+ added_kv_proj_dim=added_kv_proj_dim,
297
+ added_proj_bias=True,
298
+ processor=WanAttnProcessor2_0(),
299
+ )
300
+ self.norm2 = FP32LayerNorm(dim, eps, elementwise_affine=True) if cross_attn_norm else nn.Identity()
301
+
302
+ # 3. Feed-forward
303
+ self.ffn = FeedForward(dim, inner_dim=ffn_dim, activation_fn="gelu-approximate")
304
+ self.norm3 = FP32LayerNorm(dim, eps, elementwise_affine=False)
305
+
306
+ self.scale_shift_table = nn.Parameter(torch.randn(1, 6, dim) / dim**0.5)
307
+
308
+ def forward(
309
+ self,
310
+ hidden_states: torch.Tensor,
311
+ encoder_hidden_states: torch.Tensor,
312
+ temb: torch.Tensor,
313
+ rotary_emb: torch.Tensor,
314
+ ) -> torch.Tensor:
315
+ if temb.ndim == 4:
316
+ # temb: batch_size, seq_len, 6, inner_dim (wan2.2 ti2v)
317
+ shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = (
318
+ self.scale_shift_table.unsqueeze(0) + temb.float()
319
+ ).chunk(6, dim=2)
320
+ # batch_size, seq_len, 1, inner_dim
321
+ shift_msa = shift_msa.squeeze(2)
322
+ scale_msa = scale_msa.squeeze(2)
323
+ gate_msa = gate_msa.squeeze(2)
324
+ c_shift_msa = c_shift_msa.squeeze(2)
325
+ c_scale_msa = c_scale_msa.squeeze(2)
326
+ c_gate_msa = c_gate_msa.squeeze(2)
327
+ else:
328
+ # temb: batch_size, 6, inner_dim (wan2.1/wan2.2 14B)
329
+ shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = (
330
+ self.scale_shift_table + temb.float()
331
+ ).chunk(6, dim=1)
332
+
333
+ # 1. Self-attention
334
+ norm_hidden_states = (self.norm1(hidden_states.float()) * (1 + scale_msa) + shift_msa).type_as(hidden_states)
335
+ attn_output = self.attn1(hidden_states=norm_hidden_states, rotary_emb=rotary_emb)
336
+ hidden_states = (hidden_states.float() + attn_output * gate_msa).type_as(hidden_states)
337
+
338
+ # 2. Cross-attention
339
+ norm_hidden_states = self.norm2(hidden_states.float()).type_as(hidden_states)
340
+ attn_output = self.attn2(hidden_states=norm_hidden_states, encoder_hidden_states=encoder_hidden_states)
341
+ hidden_states = hidden_states + attn_output
342
+
343
+ # 3. Feed-forward
344
+ norm_hidden_states = (self.norm3(hidden_states.float()) * (1 + c_scale_msa) + c_shift_msa).type_as(
345
+ hidden_states
346
+ )
347
+ ff_output = self.ffn(norm_hidden_states)
348
+ hidden_states = (hidden_states.float() + ff_output.float() * c_gate_msa).type_as(hidden_states)
349
+
350
+ return hidden_states
351
+
352
+
353
+ class WanTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, CacheMixin):
354
+ r"""
355
+ A Transformer model for video-like data used in the Wan model.
356
+
357
+ Args:
358
+ patch_size (`Tuple[int]`, defaults to `(1, 2, 2)`):
359
+ 3D patch dimensions for video embedding (t_patch, h_patch, w_patch).
360
+ num_attention_heads (`int`, defaults to `40`):
361
+ The number of heads to use for multi-head attention.
362
+ attention_head_dim (`int`, defaults to `128`):
363
+ The number of channels in each head.
364
+ in_channels (`int`, defaults to `16`):
365
+ The number of channels in the input.
366
+ out_channels (`int`, defaults to `16`):
367
+ The number of channels in the output.
368
+ text_dim (`int`, defaults to `4096`):
369
+ Input dimension for text embeddings.
370
+ freq_dim (`int`, defaults to `256`):
371
+ Dimension for sinusoidal time embeddings.
372
+ ffn_dim (`int`, defaults to `13824`):
373
+ Intermediate dimension in feed-forward network.
374
+ num_layers (`int`, defaults to `40`):
375
+ The number of layers of transformer blocks to use.
376
+ window_size (`Tuple[int]`, defaults to `(-1, -1)`):
377
+ Window size for local attention (-1 indicates global attention).
378
+ cross_attn_norm (`bool`, defaults to `True`):
379
+ Enable cross-attention normalization.
380
+ qk_norm (`bool`, defaults to `True`):
381
+ Enable query/key normalization.
382
+ eps (`float`, defaults to `1e-6`):
383
+ Epsilon value for normalization layers.
384
+ add_img_emb (`bool`, defaults to `False`):
385
+ Whether to use img_emb.
386
+ added_kv_proj_dim (`int`, *optional*, defaults to `None`):
387
+ The number of channels to use for the added key and value projections. If `None`, no projection is used.
388
+ """
389
+
390
+ _supports_gradient_checkpointing = True
391
+ _skip_layerwise_casting_patterns = ["patch_embedding", "condition_embedder", "norm"]
392
+ _no_split_modules = ["WanTransformerBlock"]
393
+ _keep_in_fp32_modules = ["time_embedder", "scale_shift_table", "norm1", "norm2", "norm3"]
394
+ _keys_to_ignore_on_load_unexpected = ["norm_added_q"]
395
+ _repeated_blocks = ["WanTransformerBlock"]
396
+
397
+ @register_to_config
398
+ def __init__(
399
+ self,
400
+ patch_size: Tuple[int] = (1, 2, 2),
401
+ num_attention_heads: int = 40,
402
+ attention_head_dim: int = 128,
403
+ in_channels: int = 16,
404
+ out_channels: int = 16,
405
+ text_dim: int = 4096,
406
+ freq_dim: int = 256,
407
+ ffn_dim: int = 13824,
408
+ num_layers: int = 40,
409
+ cross_attn_norm: bool = True,
410
+ qk_norm: Optional[str] = "rms_norm_across_heads",
411
+ eps: float = 1e-6,
412
+ image_dim: Optional[int] = None,
413
+ added_kv_proj_dim: Optional[int] = None,
414
+ rope_max_seq_len: int = 1024,
415
+ pos_embed_seq_len: Optional[int] = None,
416
+ ) -> None:
417
+ super().__init__()
418
+
419
+ inner_dim = num_attention_heads * attention_head_dim
420
+ out_channels = out_channels or in_channels
421
+
422
+ # 1. Patch & position embedding
423
+ self.rope = WanRotaryPosEmbed(attention_head_dim, patch_size, rope_max_seq_len)
424
+ self.patch_embedding = nn.Conv3d(in_channels, inner_dim, kernel_size=patch_size, stride=patch_size)
425
+
426
+ # 2. Condition embeddings
427
+ # image_embedding_dim=1280 for I2V model
428
+ self.condition_embedder = WanTimeTextImageEmbedding(
429
+ dim=inner_dim,
430
+ time_freq_dim=freq_dim,
431
+ time_proj_dim=inner_dim * 6,
432
+ text_embed_dim=text_dim,
433
+ image_embed_dim=image_dim,
434
+ pos_embed_seq_len=pos_embed_seq_len,
435
+ )
436
+
437
+ # 3. Transformer blocks
438
+ self.blocks = nn.ModuleList(
439
+ [
440
+ WanTransformerBlock(
441
+ inner_dim, ffn_dim, num_attention_heads, qk_norm, cross_attn_norm, eps, added_kv_proj_dim
442
+ )
443
+ for _ in range(num_layers)
444
+ ]
445
+ )
446
+
447
+ # 4. Output norm & projection
448
+ self.norm_out = FP32LayerNorm(inner_dim, eps, elementwise_affine=False)
449
+ self.proj_out = nn.Linear(inner_dim, out_channels * math.prod(patch_size))
450
+ self.scale_shift_table = nn.Parameter(torch.randn(1, 2, inner_dim) / inner_dim**0.5)
451
+
452
+ self.gradient_checkpointing = False
453
+
454
+ def forward(
455
+ self,
456
+ hidden_states: torch.Tensor,
457
+ timestep: torch.LongTensor,
458
+ encoder_hidden_states: torch.Tensor,
459
+ encoder_hidden_states_image: Optional[torch.Tensor] = None,
460
+ return_dict: bool = True,
461
+ attention_kwargs: Optional[Dict[str, Any]] = None,
462
+ ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
463
+ if attention_kwargs is not None:
464
+ attention_kwargs = attention_kwargs.copy()
465
+ lora_scale = attention_kwargs.pop("scale", 1.0)
466
+ else:
467
+ lora_scale = 1.0
468
+
469
+ if USE_PEFT_BACKEND:
470
+ # weight the lora layers by setting `lora_scale` for each PEFT layer
471
+ scale_lora_layers(self, lora_scale)
472
+ else:
473
+ if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
474
+ logger.warning(
475
+ "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
476
+ )
477
+
478
+ batch_size, num_channels, num_frames, height, width = hidden_states.shape
479
+ p_t, p_h, p_w = self.config.patch_size
480
+ post_patch_num_frames = num_frames // p_t
481
+ post_patch_height = height // p_h
482
+ post_patch_width = width // p_w
483
+
484
+ rotary_emb = self.rope(hidden_states)
485
+
486
+ hidden_states = self.patch_embedding(hidden_states)
487
+ hidden_states = hidden_states.flatten(2).transpose(1, 2)
488
+
489
+ # timestep shape: batch_size, or batch_size, seq_len (wan 2.2 ti2v)
490
+ if timestep.ndim == 2:
491
+ ts_seq_len = timestep.shape[1]
492
+ timestep = timestep.flatten() # batch_size * seq_len
493
+ else:
494
+ ts_seq_len = None
495
+
496
+ temb, timestep_proj, encoder_hidden_states, encoder_hidden_states_image = self.condition_embedder(
497
+ timestep, encoder_hidden_states, encoder_hidden_states_image, timestep_seq_len=ts_seq_len
498
+ )
499
+ if ts_seq_len is not None:
500
+ # batch_size, seq_len, 6, inner_dim
501
+ timestep_proj = timestep_proj.unflatten(2, (6, -1))
502
+ else:
503
+ # batch_size, 6, inner_dim
504
+ timestep_proj = timestep_proj.unflatten(1, (6, -1))
505
+
506
+ if encoder_hidden_states_image is not None:
507
+ encoder_hidden_states = torch.concat([encoder_hidden_states_image, encoder_hidden_states], dim=1)
508
+
509
+ # 4. Transformer blocks
510
+ if torch.is_grad_enabled() and self.gradient_checkpointing:
511
+ for block in self.blocks:
512
+ hidden_states = self._gradient_checkpointing_func(
513
+ block, hidden_states, encoder_hidden_states, timestep_proj, rotary_emb
514
+ )
515
+ else:
516
+ for block in self.blocks:
517
+ hidden_states = block(hidden_states, encoder_hidden_states, timestep_proj, rotary_emb)
518
+
519
+ # 5. Output norm, projection & unpatchify
520
+ if temb.ndim == 3:
521
+ # batch_size, seq_len, inner_dim (wan 2.2 ti2v)
522
+ shift, scale = (self.scale_shift_table.unsqueeze(0) + temb.unsqueeze(2)).chunk(2, dim=2)
523
+ shift = shift.squeeze(2)
524
+ scale = scale.squeeze(2)
525
+ else:
526
+ # batch_size, inner_dim
527
+ shift, scale = (self.scale_shift_table + temb.unsqueeze(1)).chunk(2, dim=1)
528
+
529
+ # Move the shift and scale tensors to the same device as hidden_states.
530
+ # When using multi-GPU inference via accelerate these will be on the
531
+ # first device rather than the last device, which hidden_states ends up
532
+ # on.
533
+ shift = shift.to(hidden_states.device)
534
+ scale = scale.to(hidden_states.device)
535
+
536
+ hidden_states = (self.norm_out(hidden_states.float()) * (1 + scale) + shift).type_as(hidden_states)
537
+ hidden_states = self.proj_out(hidden_states)
538
+
539
+ hidden_states = hidden_states.reshape(
540
+ batch_size, post_patch_num_frames, post_patch_height, post_patch_width, p_t, p_h, p_w, -1
541
+ )
542
+ hidden_states = hidden_states.permute(0, 7, 1, 4, 2, 5, 3, 6)
543
+ output = hidden_states.flatten(6, 7).flatten(4, 5).flatten(2, 3)
544
+
545
+ if USE_PEFT_BACKEND:
546
+ # remove `lora_scale` from each PEFT layer
547
+ unscale_lora_layers(self, lora_scale)
548
+
549
+ if not return_dict:
550
+ return (output,)
551
+
552
+ return Transformer2DModelOutput(sample=output)
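A minimal smoke-test sketch for the WanTransformer3DModel defined above (assumed, not part of the repository): it instantiates a deliberately tiny, hypothetical configuration and runs one forward pass on CPU just to illustrate the expected input and output layouts.

import torch
from architecture.transformer_wan import WanTransformer3DModel

# Tiny hypothetical configuration; real checkpoints use much larger values.
model = WanTransformer3DModel(
    patch_size=(1, 2, 2),
    num_attention_heads=2,
    attention_head_dim=8,
    in_channels=4,
    out_channels=4,
    text_dim=16,
    freq_dim=32,
    ffn_dim=32,
    num_layers=1,
    rope_max_seq_len=32,
)

latents = torch.randn(1, 4, 1, 8, 8)             # [batch, channels, frames, height, width]
timestep = torch.tensor([500], dtype=torch.long)
text_states = torch.randn(1, 7, 16)              # [batch, text_seq_len, text_dim]

with torch.no_grad():
    output = model(latents, timestep, text_states, return_dict=False)[0]

print(output.shape)  # torch.Size([1, 4, 1, 8, 8]) -- same latent layout as the input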
config/accelerate_config_4GPU.json ADDED
@@ -0,0 +1,18 @@
1
+ {
2
+ "compute_environment": "LOCAL_MACHINE",
3
+ "debug": false,
4
+ "distributed_type": "MULTI_GPU",
5
+ "downcast_bf16": "no",
6
+ "gpu_ids": "all",
7
+ "machine_rank": 0,
8
+ "main_training_function": "main",
9
+ "mixed_precision": "bf16",
10
+ "num_machines": 1,
11
+ "num_processes": 4,
12
+ "rdzv_backend": "static",
13
+ "same_network": true,
14
+ "tpu_env": [],
15
+ "tpu_use_cluster": false,
16
+ "tpu_use_sudo": false,
17
+ "use_cpu": false
18
+ }
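A small sanity-check sketch (assumed, not part of the repository): before launching multi-GPU training with this accelerate config, it can be worth verifying that num_processes matches the GPUs actually visible on the machine.

import json

import torch

with open("config/accelerate_config_4GPU.json") as f:
    accel_cfg = json.load(f)

visible_gpus = torch.cuda.device_count()
if accel_cfg["num_processes"] != visible_gpus:
    print(f"Config expects {accel_cfg['num_processes']} processes, but {visible_gpus} GPU(s) are visible.")
else:
    print("Accelerate config matches the available GPUs.")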
config/train_cogvideox_motion.yaml ADDED
@@ -0,0 +1,88 @@
1
+
2
+ experiment_name: CogVideoX_5B_Motion_480P # Store Folder Name
3
+
4
+
5
+ # Model Setting
6
+ base_model_path: zai-org/CogVideoX-5b-I2V
7
+ pretrained_transformer_path: # No need to set; if you set, this will load transformer model with non-default Wan transformer
8
+ enable_slicing: True
9
+ enable_tiling: True
10
+ use_learned_positional_embeddings: True
11
+ use_rotary_positional_embeddings: True
12
+
13
+
14
+
15
+ # Dataset Setting
16
+ download_folder_path: FrameINO_data/ # Set the downloaded folder path, all the other csv will be read automatically
17
+ train_csv_relative_path: dataset_csv_files/train_sample_short_dataset # No need to change, Fixed
18
+ train_video_relative_path: video_dataset/train_sample_dataset # No need to change, Fixed
19
+ validation_csv_relative_path: dataset_csv_files/val_sample_short_dataset # No need to change, Fixed
20
+ validation_video_relative_path: video_dataset/val_sample_dataset # No need to change, Fixed
21
+ dataloader_num_workers: 4 # This should be per GPU; in debug, we set it to 1
22
+ # height_range: [480, 480] # Height range; by slightly modifying the dataloader code and using this setting, we can enable variable-resolution training
23
+ target_height: 480
24
+ target_width: 720
25
+ sample_accelerate_factor: 2 # Imitates the 12 FPS setting we used before.
26
+ train_frame_num_range: [49, 49] # Number of frames for training, required to be 4N+1
27
+ # min_train_frame_num: 49 # If a video has fewer frames than this, the dataloader will raise an Exception and skip to the next valid one!
28
+
29
+
30
+ # Motion Setting
31
+ dot_radius: 6 # Set with respect to a 384-pixel height; it will be adjusted based on the height change
32
+ point_keep_ratio: 0.4 # The ratio of points kept; drawn by random.choices for each tracking point, so it can be quite versatile; 0.33 is also recommended
33
+ faster_motion_prob: 0.0 # Probability of faster motion (~8 FPS); 0.0 - 0.1 is recommended (0.0 by default).
34
+
35
+
36
+ # Denoise + Text Setting
37
+ noised_image_dropout: 0.05 # Probability of dropping the first-frame condition, which turns the task into T2V
38
+ empty_text_prompt: False # For TI2V, we need to use a text prompt
39
+ text_mask_ratio: 0.05 # Follow InstructPix2Pix
40
+ max_text_seq_length: 226
41
+
42
+
43
+ # Training Setting
44
+ resume_from_checkpoint: False # latest / False; latest will automatically fetch the newest checkpoint
45
+ max_train_steps: 1002 # Adjust as needed; this is just a demo dataset, so long training is not required
46
+ train_batch_size: 1 # batch size per GPU
47
+ gradient_accumulation_steps: 2 # Equivalent to a larger effective batch size across all GPUs
48
+ checkpointing_steps: 2000 # Checkpoint frequency; not recommended to be too frequent
49
+ checkpoints_total_limit: 8 # The transformer is large (~32 GB per checkpoint), so keep this limit moderate
50
+ mixed_precision: bf16 # CogVideoX official code usually uses bf16
51
+ gradient_checkpointing: True # Saves memory but is slower; even with 80GB of GPU memory this still needs to be enabled, otherwise OOM
52
+ seed: # If a seed is set here, every resume reads data in the same order as the first run, so the full dataset cannot be covered in resume mode
53
+ output_folder: checkpoints/
54
+ logging_name: logging
55
+ nccl_timeout: 1800
56
+
57
+
58
+ # Validation Setting
59
+ validation_step: 2000 # Don't set this too frequently, as validation is very resource-consuming
60
+ first_iter_validation: True # Whether we do the first iter validation
61
+ num_inference_steps: 50
62
+
63
+
64
+ # Learning Rate and Optimizer
65
+ optimizer: adamw # Choose between ["adam", "adamw", "prodigy"]
66
+ learning_rate: 2e-5 # 1e-4 might be too big
67
+ scale_lr: False
68
+ lr_scheduler: constant_with_warmup # Most cases should be constant
69
+ adam_beta1: 0.9
70
+ adam_beta2: 0.95 # In the past, this used to be 0.999; smaller than usual
71
+ adam_beta3: 0.98
72
+ lr_power: 1.0
73
+ lr_num_cycles: 1.0
74
+ max_grad_norm: 1.0
75
+ prodigy_beta3: # Coefficients for computing the Prodigy optimizer's stepsize using running averages. If set to None, uses the value of square root of beta2
76
+ adam_weight_decay: 1e-04
77
+ adam_epsilon: 1e-08
78
+ lr_warmup_steps: 400
79
+
80
+
81
+
82
+ # Other Setting
83
+ report_to: tensorboard
84
+ allow_tf32: True
85
+ revision:
86
+ variant:
87
+ cache_dir:
88
+ tracker_name:
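A brief sketch of how this YAML might be consumed (an assumption; the repository's training script may read it differently): loading it with OmegaConf and checking the 4N+1 frame-count constraint mentioned in the comments above.

from omegaconf import OmegaConf

config = OmegaConf.load("config/train_cogvideox_motion.yaml")

# The frame count must be of the form 4N + 1 (e.g. 49), matching the comment above.
low, high = config.train_frame_num_range
assert (low - 1) % 4 == 0 and (high - 1) % 4 == 0, "train_frame_num_range must contain 4N+1 values"

print(config.experiment_name, config.target_height, config.target_width, low)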
config/train_cogvideox_motion_FrameINO.yaml ADDED
@@ -0,0 +1,96 @@
1
+
2
+ experiment_name: CogVideoX_5B_Motion_FINO_480P
3
+
4
+ # Model Setting
5
+ base_model_path: zai-org/CogVideoX-5b-I2V
6
+ pretrained_transformer_path: uva-cv-lab/FrameINO_CogVideoX_Stage1_Motion_v1.0 # Use the stage-1 weight here; if you use your own trained weight, it should point to the transformer folder (TODO: verify this)
7
+ enable_slicing: True
8
+ enable_tiling: True
9
+ use_learned_positional_embeddings: True
10
+ use_rotary_positional_embeddings: True
11
+
12
+
13
+
14
+ # Dataset Setting
15
+ download_folder_path: FrameINO_data/ # Set the downloaded folder path, all the other csv will be read automatically
16
+ train_csv_relative_path: dataset_csv_files/train_sample_short_dataset # No need to change, Fixed
17
+ train_video_relative_path: video_dataset/train_sample_dataset # No need to change, Fixed
18
+ train_ID_relative_path: video_dataset/train_ID_FrameIn # No need to change, Fixed
19
+ validation_csv_relative_path: dataset_csv_files/val_sample_short_dataset # No need to change, Fixed
20
+ validation_video_relative_path: video_dataset/val_sample_dataset # No need to change, Fixed
21
+ validation_ID_relative_path: video_dataset/val_ID_FrameIn # No need to change, Fixed
22
+ dataloader_num_workers: 4 # This should be per GPU
23
+ # height_range: [480, 704] # Height Range; By slightly modify the dataloader code and use this setting, we can use variable resolution training
24
+ target_height: 480
25
+ target_width: 720
26
+ sample_accelerate_factor: 2 # Imitate 12FPS we have set before.
27
+ train_frame_num_range: [49, 49] # Number of frames for training, required to be 4N+1
28
+ min_train_frame_num: 49 # If a video has fewer frames than this, the dataloader will raise an Exception and skip to the next valid one! We recommend using exactly 49 frames for CogVideoX.
29
+
30
+
31
+ # Motion Setting
32
+ dot_radius: 6 # Set with respect to a 384-pixel height; it will be adjusted based on the height change
33
+ point_keep_ratio_regular: 0.33 # Fewer points than for motion control; the ratio of points kept inside the region box; for non-main object motion
34
+ faster_motion_prob: 0.0 # Probability of faster motion (~8 FPS); 0.0 - 0.1 is recommended (0.0 by default).
35
+
36
+
37
+ # Frame In and Out Setting
38
+ drop_FrameIn_prob: 0.15 # These are the cases where only FrameOut occurs; the FrameIn condition becomes a whole-white placeholder (recommended: 0.15)
39
+ point_keep_ratio_ID: 0.33 # The ratio of points kept for the newly introduced ID
40
+
41
+
42
+ # Denoise + Text Setting
43
+ noised_image_dropout: 0.05 # No First Frame Setting, becomes T2V
44
+ empty_text_prompt: False # For TI2V, we need to use a text prompt
45
+ text_mask_ratio: 0.05 # Follow InstructPix2Pix
46
+ max_text_seq_length: 226
47
+
48
+
49
+ # Training Setting
50
+ resume_from_checkpoint: False # latest / False; latest will automatically fetch the newest checkpoint
51
+ max_train_steps: 1002 # Adjust as needed; this is just a demo dataset, so long training is not required
52
+ train_batch_size: 1 # batch size per GPU
53
+ gradient_accumulation_steps: 2 # This should usually be set to 1.
54
+ checkpointing_steps: 2000 # Checkpoint frequency; not recommended to be too frequent
55
+ checkpoints_total_limit: 8 # The transformer is large (~32 GB per checkpoint), so keep this limit moderate
56
+ mixed_precision: bf16 # CogVideoX official code usually uses bf16
57
+ gradient_checkpointing: True # Saves memory but is slower; even with 80GB of GPU memory this still needs to be enabled, otherwise OOM
58
+ seed: # If a seed is set here, every resume reads data in exactly the same order as before the resume; if not even one epoch has been trained, the same data cycles every time
59
+ output_folder: checkpoints/
60
+ logging_name: logging
61
+ nccl_timeout: 1800
62
+
63
+
64
+ # Validation Setting
65
+ validation_step: 2000 # Don't set this too frequently, as validation is very resource-consuming
66
+ first_iter_validation: True # Whether we do the first iter validation
67
+ num_inference_steps: 50
68
+
69
+
70
+
71
+ # Learning Rate and Optimizer
72
+ optimizer: adamw # Choose between ["adam", "adamw", "prodigy"]
73
+ learning_rate: 2e-5 # 1e-4 might be too big
74
+ scale_lr: False
75
+ lr_scheduler: constant_with_warmup # Most cases should be constant
76
+ adam_beta1: 0.9
77
+ adam_beta2: 0.95 # In the past, this used to be 0.999; smaller than usual
78
+ adam_beta3: 0.98
79
+ lr_power: 1.0
80
+ lr_num_cycles: 1.0
81
+ max_grad_norm: 1.0
82
+ prodigy_beta3: # Coefficients for computing the Prodigy optimizer's stepsize using running averages. If set to None, uses the value of square root of beta2
83
+ # use_8bit_adam: False # This saves a lot of GPU memory, but slightly slower
84
+ adam_weight_decay: 1e-04
85
+ adam_epsilon: 1e-08
86
+ lr_warmup_steps: 400
87
+
88
+
89
+
90
+ # Other Setting
91
+ report_to: tensorboard
92
+ allow_tf32: True
93
+ revision:
94
+ variant:
95
+ cache_dir:
96
+ tracker_name:
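To make the batch-size-related fields above concrete, a tiny sketch (assumed) that computes the effective batch size implied by this config; the GPU count used here is hypothetical and would normally come from the accelerate launch configuration.

from omegaconf import OmegaConf

config = OmegaConf.load("config/train_cogvideox_motion_FrameINO.yaml")

num_gpus = 4  # hypothetical; e.g. when launching with config/accelerate_config_4GPU.json
effective_batch_size = config.train_batch_size * config.gradient_accumulation_steps * num_gpus
print(f"Effective batch size per optimizer step: {effective_batch_size}")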
config/train_wan_motion.yaml ADDED
@@ -0,0 +1,104 @@
1
+
2
+ experiment_name: Wan_5B_Motion_704P
3
+
4
+
5
+ # Model Setting
6
+ base_model_path: Wan-AI/Wan2.2-TI2V-5B-Diffusers
7
+ pretrained_transformer_path: # No need to set; if you set, this will load transformer model with non-default Wan transformer
8
+ enable_slicing: True
9
+ enable_tiling: True
10
+
11
+
12
+
13
+ # Dataset Setting
14
+ download_folder_path: FrameINO_data/ # Set the downloaded folder path, all the other csv will be read automatically
15
+ train_csv_relative_path: dataset_csv_files/train_sample_short_dataset # No need to change, Fixed
16
+ train_video_relative_path: video_dataset/train_sample_dataset # No need to change, Fixed
17
+ validation_csv_relative_path: dataset_csv_files/val_sample_short_dataset # No need to change, Fixed
18
+ validation_video_relative_path: video_dataset/val_sample_dataset # No need to change, Fixed
19
+ dataloader_num_workers: 4 # This should be per GPU; in debug, we set it to 1
20
+ # height_range: [480, 704] # Height range; by slightly modifying the dataloader code and using this setting, we can enable variable-resolution training
21
+ target_height: 704
22
+ target_width: 1280
23
+ sample_accelerate_factor: 2 # Imitates the 12 FPS setting we used before.
24
+ train_frame_num_range: [81, 81] # Number of frames for training, required to be 4N+1; if the total number of frames is less than the minimum of the range, just use the minimum available; currently set to 81 frames
25
+ # min_train_frame_num: 49 # If a video has fewer frames than this, the dataloader will raise an Exception and skip to the next valid one!
26
+
27
+
28
+ # Motion Setting
29
+ dot_radius: 7 # Due to the Wan VAE, this is slightly larger than for CogVideoX; set with respect to a 384-pixel height, it will be adjusted based on the height change
30
+ point_keep_ratio: 0.4 # The ratio of points kept; drawn by random.choices for each tracking point, so it can be quite versatile; 0.33 is also recommended
31
+ faster_motion_prob: 0.0 # Probability of faster motion (~8 FPS); 0.0 - 0.1 is recommended (0.0 by default).
32
+
33
+
34
+ # Denoise (for flow matching-based models)
35
+ noised_image_dropout: 0.0 # Probability of dropping the first-frame condition, which turns the task into T2V; not used for Wan
36
+ train_sampling_steps: 1000
37
+ noise_scheduler_kwargs:
38
+ num_train_timesteps: 1000 # 1000 is the default value
39
+ shift: 5.0
40
+ use_dynamic_shifting: false # false is the default value
41
+ base_shift: 0.5 # 0.5 is the default value
42
+ max_shift: 1.15 # 1.15 is the default value
43
+ base_image_seq_len: 256 # 256 is the default value
44
+ max_image_seq_len: 4096 # 4096 is the default value
45
+
46
+
47
+ # Text Setting
48
+ text_mask_ratio: 0.0 # Follow InstructPix2Pix
49
+ empty_text_prompt: False # For TI2V, we start using the text prompt
50
+ max_text_seq_length: 512 # For the Wan
51
+
52
+
53
+
54
+ # Training Setting
55
+ resume_from_checkpoint: False # latest / False; latest will automatically fetch the newest checkpoint
56
+ max_train_steps: 1002 # Adjust as needed; this is just a demo dataset, so long training is not required
57
+ train_batch_size: 1 # batch size per GPU
58
+ gradient_accumulation_steps: 2 # Equivalent to a larger effective batch size across all GPUs
59
+ checkpointing_steps: 2000 # Checkpoint frequency; not recommended to be too frequent
60
+ checkpoints_total_limit: 8 # The transformer is large (~32 GB per checkpoint), so keep this limit moderate
61
+ mixed_precision: bf16 # CogVideoX official code usually uses bf16
62
+ gradient_checkpointing: True # Saves memory but is slower; even with 80GB of GPU memory this still needs to be enabled, otherwise OOM
63
+ seed: # If a seed is set here, the data is read in the same order on every resume as in the first run, so the full dataset cannot be covered in resume mode
64
+ output_folder: checkpoints/
65
+ logging_name: logging
66
+ nccl_timeout: 1800
67
+
68
+
69
+
70
+ # Validation Setting
71
+ validation_step: 2000 # Don't set this too frequently, as validation is very resource-consuming
72
+ first_iter_validation: True # Whether we do the first iter validation
73
+ num_inference_steps: 38
74
+
75
+
76
+
77
+ # Learning Rate and Optimizer
78
+ optimizer: adamw # Choose between ["adam", "adamw", "prodigy"]
79
+ learning_rate: 3e-5 # 1e-4 might be too big
80
+ scale_lr: False
81
+ lr_scheduler: constant_with_warmup # Most cases should be constant
82
+ adam_beta1: 0.9 # This Setting is different from CogVideoX, we follow VideoFun
83
+ adam_beta2: 0.999
84
+ # adam_beta3: 0.98
85
+ lr_power: 1.0
86
+ lr_num_cycles: 1.0
87
+ initial_grad_norm_ratio: 5
88
+ abnormal_norm_clip_start: 1000 # Follow VideoFun
89
+ max_grad_norm: 0.05 # Follow VideoFun
90
+ prodigy_beta3: # Coefficients for computing the Prodigy optimizer's stepsize using running averages. If set to None, uses the value of square root of beta2
91
+ # use_8bit_adam: False # This saves a lot of GPU memory but is slightly slower; recommended to enable
92
+ adam_weight_decay: 1e-4
93
+ adam_epsilon: 1e-10
94
+ lr_warmup_steps: 100
95
+
96
+
97
+
98
+ # Other Setting
99
+ report_to: tensorboard
100
+ allow_tf32: True
101
+ revision:
102
+ variant:
103
+ cache_dir:
104
+ tracker_name:
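The noise_scheduler_kwargs block above maps onto a flow-matching scheduler. As a hedged sketch (the exact scheduler class used by the training script is an assumption here), the kwargs could be passed straight to diffusers' FlowMatchEulerDiscreteScheduler:

from omegaconf import OmegaConf
from diffusers import FlowMatchEulerDiscreteScheduler

config = OmegaConf.load("config/train_wan_motion.yaml")

# Instantiate a flow-matching scheduler directly from the YAML block.
scheduler = FlowMatchEulerDiscreteScheduler(**OmegaConf.to_container(config.noise_scheduler_kwargs))
print(scheduler.config.shift)  # 5.0, the shift value set above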
config/train_wan_motion_FrameINO.yaml ADDED
@@ -0,0 +1,110 @@
1
+
2
+ experiment_name: Wan_5B_Motion_FINO_704P
3
+
4
+ # Model Setting
5
+ base_model_path: Wan-AI/Wan2.2-TI2V-5B-Diffusers
6
+ pretrained_transformer_path: uva-cv-lab/FrameINO_Wan2.2_5B_Stage1_Motion_v1.5 # Use the one trained with the motion
7
+ enable_slicing: True
8
+ enable_tiling: True
9
+
10
+
11
+
12
+ # Dataset Setting
13
+ download_folder_path: FrameINO_data/ # Set the downloaded folder path, all the other csv will be read automatically
14
+ train_csv_relative_path: dataset_csv_files/train_sample_short_dataset # No need to change, Fixed
15
+ train_video_relative_path: video_dataset/train_sample_dataset # No need to change, Fixed
16
+ train_ID_relative_path: video_dataset/train_ID_FrameIn # No need to change, Fixed
17
+ validation_csv_relative_path: dataset_csv_files/val_sample_short_dataset # No need to change, Fixed
18
+ validation_video_relative_path: video_dataset/val_sample_dataset # No need to change, Fixed
19
+ validation_ID_relative_path: video_dataset/val_ID_FrameIn # No need to change, Fixed
20
+ dataloader_num_workers: 4 # This should be per GPU; in debug, we set it to 1
21
+ # height_range: [480, 704] # Height Range; By slightly modify the dataloader code and use this setting, we can use variable resolution training
22
+ target_height: 704 # Recommend 704 x 1280 for the Wan2.2
23
+ target_width: 1280
24
+ sample_accelerate_factor: 2 # Imitate 12FPS we have set before.
25
+ train_frame_num_range: [81, 81] # Number of frames for training, required to be 4N+1
26
+ min_train_frame_num: 49 # If a video has fewer frames than this, the dataloader will raise an Exception and skip to the next valid one!
27
+
28
+
29
+ # Motion Setting
30
+ dot_radius: 7 # Due to the Wan VAE, this is slightly larger than for CogVideoX; set with respect to a 384-pixel height, it will be adjusted based on the height change
31
+ point_keep_ratio_regular: 0.33 # Fewer points than for motion control; the ratio of points kept inside the region box; for non-main object motion
32
+ faster_motion_prob: 0.0 # Probability of faster motion (~8 FPS); 0.0 - 0.1 is recommended (0.0 by default).
33
+
34
+
35
+ # Frame In and Out Setting
36
+ drop_FrameIn_prob: 0.15 # These are the cases where only FrameOut occurs; the ID tokens will be filled with a whole-white placeholder (recommended value: 0.15)
37
+ point_keep_ratio_ID: 0.33 # The ratio of points kept for the newly introduced ID; for main ID object motion
38
+
39
+
40
+ # Denoise
41
+ noised_image_dropout: 0.0 # No First Frame Setting, becomes T2V; not used for Wan
42
+ train_sampling_steps: 1000
43
+ noise_scheduler_kwargs:
44
+ num_train_timesteps: 1000 # 1000 is the default value
45
+ shift: 5.0
46
+ use_dynamic_shifting: false # false is the default value
47
+ base_shift: 0.5 # 0.5 is the default value
48
+ max_shift: 1.15 # 1.15 is the default value
49
+ base_image_seq_len: 256 # 256 is the default value
50
+ max_image_seq_len: 4096 # 4096 is the default value
51
+
52
+
53
+ # Text Setting
54
+ text_mask_ratio: 0.0 # Follows InstructPix2Pix; currently set to 0, and at most 0.05 is recommended
55
+ empty_text_prompt: False # For TI2V, we need to use a text prompt
56
+ max_text_seq_length: 512 # For the Wan
57
+
58
+
59
+
60
+ # Training setting
61
+ resume_from_checkpoint: False # latest / False; latest will automatically fetch the newest checkpoint
62
+ max_train_steps: 1002 # Adjust as needed; this is just a demo dataset, so long training is not required
63
+ train_batch_size: 1 # batch size per GPU
64
+ gradient_accumulation_steps: 2 # This should usually be set to 1.
65
+ checkpointing_steps: 2000 # Checkpoint frequency; not recommended to be too frequent
66
+ checkpoints_total_limit: 8 # The transformer is large (~32 GB per checkpoint), so keep this limit moderate
67
+ mixed_precision: bf16 # CogVideoX official code usually uses bf16
68
+ gradient_checkpointing: True # Saves memory but is slower; even with 80GB of GPU memory this still needs to be enabled, otherwise OOM
69
+ seed: # If a seed is set here, every resume reads data in exactly the same order as before the resume; if not even one epoch has been trained, the same data cycles every time
70
+ output_folder: checkpoints/
71
+ logging_name: logging
72
+ nccl_timeout: 1800
73
+
74
+
75
+
76
+ # Validation Setting
77
+ validation_step: 2000 # Don't set this too frequently, as validation is very resource-consuming
78
+ first_iter_validation: True # Whether we do the first iter validation
79
+ num_inference_steps: 38
80
+
81
+
82
+
83
+ # Learning Rate and Optimizer
84
+ optimizer: adamw # Choose between ["adam", "adamw", "prodigy"]
85
+ learning_rate: 3e-5 # 1e-4 might be too big
86
+ scale_lr: False
87
+ lr_scheduler: constant_with_warmup # Most cases should be constant
88
+ adam_beta1: 0.9 # This Setting is different from CogVideoX, we follow VideoFun
89
+ adam_beta2: 0.999
90
+ # adam_beta3: 0.98
91
+ lr_power: 1.0
92
+ lr_num_cycles: 1.0
93
+ initial_grad_norm_ratio: 5
94
+ abnormal_norm_clip_start: 1000 # Follow VideoFun
95
+ max_grad_norm: 0.05 # Follow VideoFun
96
+ prodigy_beta3: # Coefficients for computing the Prodigy optimizer's stepsize using running averages. If set to None, uses the value of square root of beta2
97
+ # use_8bit_adam: False # This saves a lot of GPU memory, but slightly slower
98
+ adam_weight_decay: 1e-4
99
+ adam_epsilon: 1e-10
100
+ lr_warmup_steps: 100
101
+
102
+
103
+
104
+ # Other Setting
105
+ report_to: tensorboard
106
+ allow_tf32: True
107
+ revision:
108
+ variant:
109
+ cache_dir:
110
+ tracker_name:
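A small sketch (assumed) of how the relative dataset paths in this config resolve against download_folder_path; the joins mirror what VideoDataset_Motion does with these fields in data_loader/video_dataset_motion.py.

import os

from omegaconf import OmegaConf

config = OmegaConf.load("config/train_wan_motion_FrameINO.yaml")

# Compose the paths the dataloader will actually read from.
train_csv_dir = os.path.join(config.download_folder_path, config.train_csv_relative_path)
train_video_dir = os.path.join(config.download_folder_path, config.train_video_relative_path)
train_id_dir = os.path.join(config.download_folder_path, config.train_ID_relative_path)

for path in (train_csv_dir, train_video_dir, train_id_dir):
    print(path, "exists" if os.path.exists(path) else "missing")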
data_loader/sampler.py ADDED
@@ -0,0 +1,110 @@
1
+ # Last modified: 2024-04-18
2
+ #
3
+ # Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ # --------------------------------------------------------------------------
17
+ # If you find this code useful, we kindly ask you to cite our paper in your work.
18
+ # Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
19
+ # If you use or adapt this code, please attribute to https://github.com/prs-eth/marigold.
20
+ # More information about the method can be found at https://marigoldmonodepth.github.io
21
+ # --------------------------------------------------------------------------
22
+
23
+ import torch
24
+ from torch.utils.data import (
25
+ BatchSampler,
26
+ RandomSampler,
27
+ SequentialSampler,
28
+ )
29
+
30
+
31
+ class MixedBatchSampler(BatchSampler):
32
+ """Sample one batch from a selected dataset with given probability.
33
+ Compatible with datasets at different resolutions
34
+ """
35
+
36
+ def __init__(
37
+ self, src_dataset_ls, batch_size, drop_last, shuffle, prob=None, generator=None
38
+ ):
39
+ self.base_sampler = None
40
+ self.batch_size = batch_size
41
+ self.shuffle = shuffle
42
+ self.drop_last = drop_last
43
+ self.generator = generator
44
+
45
+ self.src_dataset_ls = src_dataset_ls
46
+ self.n_dataset = len(self.src_dataset_ls)
47
+
48
+ # Dataset length
49
+ self.dataset_length = [len(ds) for ds in self.src_dataset_ls]
50
+ self.cum_dataset_length = [
51
+ sum(self.dataset_length[:i]) for i in range(self.n_dataset)
52
+ ] # cumulative dataset length
53
+
54
+ # BatchSamplers for each source dataset
55
+ if self.shuffle:
56
+ self.src_batch_samplers = [
57
+ BatchSampler(
58
+ sampler=RandomSampler(
59
+ ds, replacement=False, generator=self.generator
60
+ ),
61
+ batch_size=self.batch_size,
62
+ drop_last=self.drop_last,
63
+ )
64
+ for ds in self.src_dataset_ls
65
+ ]
66
+ else:
67
+ self.src_batch_samplers = [
68
+ BatchSampler(
69
+ sampler=SequentialSampler(ds),
70
+ batch_size=self.batch_size,
71
+ drop_last=self.drop_last,
72
+ )
73
+ for ds in self.src_dataset_ls
74
+ ]
75
+ self.raw_batches = [
76
+ list(bs) for bs in self.src_batch_samplers
77
+ ] # index in original dataset
78
+ self.n_batches = [len(b) for b in self.raw_batches]
79
+ self.n_total_batch = sum(self.n_batches)
80
+ # sampling probability
81
+ if prob is None:
82
+ # if not given, decide by dataset length
83
+ self.prob = torch.tensor(self.n_batches) / self.n_total_batch
84
+ else:
85
+ self.prob = torch.as_tensor(prob)
86
+
87
+ def __iter__(self):
88
+ """Yield batches, each drawn from one randomly selected source dataset.
89
+
90
+ Yields:
91
+ list(int): a batch of indices, corresponding to the ConcatDataset of src_dataset_ls
92
+ """
93
+ for _ in range(self.n_total_batch):
94
+ idx_ds = torch.multinomial(
95
+ self.prob, 1, replacement=True, generator=self.generator
96
+ ).item()
97
+ # if batch list is empty, generate new list
98
+ if 0 == len(self.raw_batches[idx_ds]):
99
+ self.raw_batches[idx_ds] = list(self.src_batch_samplers[idx_ds])
100
+ # get a batch from list
101
+ batch_raw = self.raw_batches[idx_ds].pop()
102
+ # shift by cumulative dataset length
103
+ shift = self.cum_dataset_length[idx_ds]
104
+ batch = [n + shift for n in batch_raw]
105
+
106
+ yield batch
107
+
108
+ def __len__(self):
109
+ return self.n_total_batch
110
+
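A usage sketch for MixedBatchSampler above (assumed, not from the repository): two toy datasets of different sizes are wrapped in a ConcatDataset, and the sampler draws each batch entirely from one source according to the given probabilities.

import torch
from torch.utils.data import ConcatDataset, DataLoader, TensorDataset

from data_loader.sampler import MixedBatchSampler

ds_a = TensorDataset(torch.arange(10).float())
ds_b = TensorDataset(torch.arange(100, 106).float())

batch_sampler = MixedBatchSampler(
    src_dataset_ls=[ds_a, ds_b],
    batch_size=2,
    drop_last=True,
    shuffle=True,
    prob=[0.7, 0.3],  # 70% of batches from ds_a, 30% from ds_b
    generator=torch.Generator().manual_seed(0),
)

loader = DataLoader(ConcatDataset([ds_a, ds_b]), batch_sampler=batch_sampler)
for (batch,) in loader:
    print(batch)  # each batch comes entirely from one of the two source datasets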
data_loader/video_dataset_motion.py ADDED
@@ -0,0 +1,407 @@
1
+ import os, sys, shutil
2
+ from typing import List, Optional, Tuple, Union
3
+ from pathlib import Path
4
+ import csv
5
+ import random
6
+ import math
7
+ import numpy as np
8
+ import ffmpeg
9
+ import json
10
+ import imageio
11
+ import collections
12
+ import cv2
13
+ import pdb
14
+ csv.field_size_limit(sys.maxsize) # The default limit is 131072; a 100x expansion should be enough
15
+
16
+ import torch
17
+ from torch.utils.data import Dataset
18
+ from torchvision import transforms
19
+
20
+ # Import files from the local folder
21
+ root_path = os.path.abspath('.')
22
+ sys.path.append(root_path)
23
+ from utils.optical_flow_utils import flow_to_image, filter_uv, bivariate_Gaussian
24
+
25
+
26
+ # Init parameters and global shared settings
27
+
28
+ # Blurring Kernel
29
+ blur_kernel = bivariate_Gaussian(45, 3, 3, 0, grid = None, isotropic = True)
30
+
31
+ # Color
32
+ all_color_codes = [(255, 0, 0), (255, 255, 0), (0, 255, 0), (0, 255, 255),
33
+ (255, 0, 255), (0, 0, 255), (128, 128, 128), (64, 224, 208),
34
+ (233, 150, 122)]
35
+ for _ in range(100): # Should not be over 100 colors
36
+ all_color_codes.append((random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)))
37
+
38
+ # Data Transforms
39
+ train_transforms = transforms.Compose(
40
+ [
41
+ transforms.Lambda(lambda x: x / 255.0 * 2.0 - 1.0),
42
+ ]
43
+ )
44
+
45
+
46
+
47
+ class VideoDataset_Motion(Dataset):
48
+
49
+ def __init__(
50
+ self,
51
+ config,
52
+ download_folder_path,
53
+ csv_relative_path,
54
+ video_relative_path,
55
+ is_diy_test = False,
56
+ ) -> None:
57
+ super().__init__()
58
+
59
+ # Gen Size Settings
60
+ # self.height_range = config["height_range"]
61
+ # self.max_aspect_ratio = config["max_aspect_ratio"]
62
+ self.target_height = config["target_height"]
63
+ self.target_width = config["target_width"]
64
+ self.sample_accelerate_factor = config["sample_accelerate_factor"]
65
+ self.train_frame_num_range = config["train_frame_num_range"]
66
+
67
+ # Condition Settings (Text, Motion, etc.)
68
+ self.empty_text_prompt = config["empty_text_prompt"]
69
+ self.dot_radius = int(config["dot_radius"])
70
+ self.point_keep_ratio = config["point_keep_ratio"] # Point selection mechanism
71
+ self.faster_motion_prob = config["faster_motion_prob"]
72
+
73
+ # Other Settings
74
+ self.download_folder_path = download_folder_path
75
+ self.is_diy_test = is_diy_test
76
+ self.config = config
77
+ self.video_folder_path = os.path.join(download_folder_path, video_relative_path)
78
+ csv_folder_path = os.path.join(download_folder_path, csv_relative_path)
79
+
80
+
81
+ # Sanity Check
82
+ assert(os.path.exists(csv_folder_path))
83
+ assert(self.point_keep_ratio <= 1.0)
84
+
85
+
86
+
87
+ # Read the CSV files
88
+ info_lists = []
89
+ for csv_file_name in os.listdir(csv_folder_path): # Read all csv files
90
+ csv_file_path = os.path.join(csv_folder_path, csv_file_name)
91
+
92
+ with open(csv_file_path) as file_obj:
93
+ reader_obj = csv.reader(file_obj)
94
+
95
+ # Iterate over each row in the csv
96
+ for idx, row in enumerate(reader_obj):
97
+ if idx == 0:
98
+ elements = dict()
99
+ for element_idx, key in enumerate(row):
100
+ elements[key] = element_idx
101
+ continue
102
+
103
+ # Read the important information
104
+ info_lists.append(row)
105
+
106
+ # Organize
107
+ self.info_lists = info_lists
108
+ self.element_idx_dict = elements
109
+
110
+ # Log
111
+ print("The number of videos for ", csv_folder_path, " is ", len(self.info_lists))
112
+ # print("The memory cost is ", sys.getsizeof(self.info_lists))
113
+
114
+
115
+ def __len__(self):
116
+ return len(self.info_lists)
117
+
118
+
119
+ @staticmethod
120
+ def prepare_traj_tensor(full_pred_tracks, original_height, original_width, selected_frames,
121
+ dot_radius, target_width, target_height, idx = 0, first_frame_img = None):
122
+
123
+ # Prepare the color
124
+ target_color_codes = all_color_codes[:len(full_pred_tracks[0])] # This means how many objects in total we have
125
+
126
+ # Prepare the traj image
127
+ traj_img_lists = []
128
+
129
+ # Set a new dot radius based on the resolution fluctuating
130
+ dot_radius_resize = int( dot_radius * original_height / 384 ) # This is set with respect to the default 384 height and will be adjusted based on the height change
131
+
132
+ # Prepare base draw image if there is
133
+ if first_frame_img is not None:
134
+ img_with_traj = first_frame_img.copy()
135
+
136
+ # Iterate all temporal sequence
137
+ merge_frames = []
138
+ for temporal_idx, points_per_frame in enumerate(full_pred_tracks): # Iterate all downsampled frames, should be 13
139
+
140
+ # Init the base img for the traj figures
141
+ base_img = np.zeros((original_height, original_width, 3)).astype(np.float32) # Use the original image size
142
+ base_img.fill(255) # Whole white frames
143
+
144
+ # Iterate all points in each object
145
+ for obj_idx, points_per_obj in enumerate(points_per_frame):
146
+
147
+ # Basic setting
148
+ color_code = target_color_codes[obj_idx] # Color across frames should be consistent
149
+
150
+ # Process all points in this current object
151
+ for (horizontal, vertical) in points_per_obj:
152
+ if horizontal < 0 or horizontal >= original_width or vertical < 0 or vertical >= original_height:
153
+ continue # If the point is already out of the range, Don't draw
154
+
155
+ # Draw square around the target position
156
+ vertical_start = min(original_height, max(0, vertical - dot_radius_resize))
157
+ vertical_end = min(original_height, max(0, vertical + dot_radius_resize)) # Diameter, used to be 10, but want smaller if there are too many points now
158
+ horizontal_start = min(original_width, max(0, horizontal - dot_radius_resize))
159
+ horizontal_end = min(original_width, max(0, horizontal + dot_radius_resize))
160
+
161
+ # Paint
162
+ base_img[vertical_start:vertical_end, horizontal_start:horizontal_end, :] = color_code
163
+
164
+ # Draw the visual of traj if needed
165
+ if first_frame_img is not None:
166
+ img_with_traj[vertical_start:vertical_end, horizontal_start:horizontal_end, :] = color_code
167
+
168
+ # Resize frames; don't use negative sizes and don't resize while values are in [0, 1]
169
+ base_img = cv2.resize(base_img, (target_width, target_height), interpolation = cv2.INTER_CUBIC)
170
+
171
+ # Dilate (Default to be True)
172
+ base_img = cv2.filter2D(base_img, -1, blur_kernel).astype(np.uint8)
173
+
174
+
175
+ # Append selected_frames and the color together for visualization
176
+ if len(selected_frames) != 0:
177
+ merge_frame = selected_frames[temporal_idx].copy()
178
+ merge_frame[base_img < 250] = base_img[base_img < 250]
179
+ merge_frames.append(merge_frame)
180
+ # cv2.imwrite("Video"+str(idx) + "_traj" + str(temporal_idx).zfill(2) + ".png", cv2.cvtColor(merge_frame, cv2.COLOR_RGB2BGR)) # Comment Out Later
181
+
182
+
183
+ # Append to the temporal index
184
+ traj_img_lists.append(base_img)
185
+
186
+
187
+ # Convert to tensor
188
+ traj_imgs_np = np.array(traj_img_lists)
189
+ traj_tensor = torch.tensor(traj_imgs_np)
190
+
191
+ # Transform
192
+ traj_tensor = traj_tensor.float()
193
+ traj_tensor = torch.stack([train_transforms(traj_frame) for traj_frame in traj_tensor], dim=0)
194
+ traj_tensor = traj_tensor.permute(0, 3, 1, 2).contiguous() # [F, C, H, W]
195
+
196
+
197
+ # Write to video (Comment Out Later)
198
+ # imageio.mimsave("merge_cond" + str(idx) + ".mp4", merge_frames, fps=12)
199
+
200
+
201
+ # Return
202
+ merge_frames = np.array(merge_frames)
203
+ if first_frame_img is not None:
204
+ return traj_tensor, traj_imgs_np, merge_frames, img_with_traj
205
+ else:
206
+ return traj_tensor, traj_imgs_np, merge_frames # Need to return traj_imgs_np for other purpose
207
+
208
+
209
+
210
+ def __getitem__(self, idx):
211
+
212
+ while True: # Iterate until there is a valid video read
213
+
214
+ # try:
215
+
216
+ # Fetch the information
217
+ info = self.info_lists[idx]
218
+ video_path = os.path.join(self.video_folder_path, info[self.element_idx_dict["video_path"]])
219
+ original_height = int(info[self.element_idx_dict["height"]])
220
+ original_width = int(info[self.element_idx_dict["width"]])
221
+ # num_frames = int(info[self.element_idx_dict["num_frames"]]) # Deprecated, this is about the whole frame duration, not just one
222
+
223
+ valid_duration = json.loads(info[self.element_idx_dict["valid_duration"]])
224
+ All_Frame_Panoptic_Segmentation = json.loads(info[self.element_idx_dict["Panoptic_Segmentation"]])
225
+ text_prompt_all = json.loads(info[self.element_idx_dict["Structured_Text_Prompt"]])
226
+ Track_Traj_all = json.loads(info[self.element_idx_dict["Track_Traj"]]) # NOTE: Same as above, only consider the first panoptic segmented frame
227
+ Obj_Info_all = json.loads(info[self.element_idx_dict["Obj_Info"]])
228
+
229
+
230
+ # Sanity check
231
+ if not os.path.exists(video_path):
232
+ raise Exception("This video path", video_path, "doesn't exists!")
233
+
234
+
235
+ ########################################## Manage Resolution and Selected Clip Settings ##########################################
236
+
237
+ # Option1: Variable Resolution Gen
238
+ # # Check the resolution size
239
+ # aspect_ratio = min(self.max_aspect_ratio, original_width / original_height)
240
+ # target_height_raw = min(original_height, random.randint(*self.height_range))
241
+ # target_width_raw = min(original_width, int(target_height_raw * aspect_ratio))
242
+ # # Must be the multiplier of 32
243
+ # target_height = (target_height_raw // 32) * 32
244
+ # target_width = (target_width_raw // 32) * 32
245
+ # print("New Height and Width are ", target_height, target_width)
246
+
247
+ # Option2: Fixed Resolution Gen (Assume that the provided is 32x valid)
248
+ target_width = self.target_width
249
+ target_height = self.target_height
250
+
251
+
252
+ # Only choose the first clip
253
+ Obj_Info = Obj_Info_all[0] # For the Motion training, we have enough data, so we just choose the first panoptic section
254
+ Track_Traj = Track_Traj_all[0]
255
+ text_prompt = text_prompt_all[0]
256
+ resolution = str(target_width) + "x" + str(target_height) # Used for ffmpeg load
257
+ frame_start_idx = Obj_Info[0][1] # NOTE: If there are multiple objects, Obj_Info[X][1] should be the same for all of them
258
+
259
+
260
+ ##############################################################################################################################
261
+
262
+
263
+
264
+ ############################################## Read the video by ffmpeg #################################################
265
+
266
+ # Read the video by ffmpeg in the needed decode fps and resolution
267
+ video_stream, err = ffmpeg.input(
268
+ video_path
269
+ ).output(
270
+ "pipe:", format = "rawvideo", pix_fmt = "rgb24", s = resolution, vsync = 'passthrough',
271
+ ).run(
272
+ capture_stdout = True, capture_stderr = True # If there is a bug, inspect the captured stderr
273
+ ) # The resize is already included
274
+ video_np_full = np.frombuffer(video_stream, np.uint8).reshape(-1, target_height, target_width, 3)
275
+
276
+ # Fetch the valid duration
277
+ video_np = video_np_full[valid_duration[0] : valid_duration[1]]
278
+ valid_num_frames = len(video_np) # Update the number of frames
279
+
280
+
281
+ # Decide the accelerate factor
282
+ train_frame_num_raw = random.randint(*self.train_frame_num_range)
283
+ if frame_start_idx + 3 * train_frame_num_raw < valid_num_frames and random.random() < self.faster_motion_prob: # Requires (1) enough frames and (2) hitting the faster-motion probability
284
+ sample_accelerate_factor = self.sample_accelerate_factor + 1 # Hard Code
285
+ else:
286
+ sample_accelerate_factor = self.sample_accelerate_factor
287
+
288
+
289
+ # Check the number of frames needed this time
290
+ frame_end_idx = min(valid_num_frames, frame_start_idx + sample_accelerate_factor * train_frame_num_raw)
291
+ frame_end_idx = frame_start_idx + 4 * math.floor(( (frame_end_idx-frame_start_idx) - 1) / 4) + 1 # Rounded to the closest 4N + 1 size
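+ # e.g., a raw span of 50 frames is trimmed to 4*12 + 1 = 49; the 4N+1 frame count matches the 4x temporal compression of the CogVideoX 3D VAE (4N+1 frames -> N+1 latent frames)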
292
+
293
+
294
+ # Select Frames and Convert to Tensor
295
+ selected_frames = video_np[ frame_start_idx : frame_end_idx : sample_accelerate_factor] # NOTE: start from the first frame
296
+ video_tensor = torch.tensor(selected_frames) # Convert to tensor
297
+ first_frame_np = selected_frames[0] # Needs to return for Validation
298
+ train_frame_num = len(video_tensor) # Read the actual number of frames from the video (Must be 4N+1)
299
+
300
+
301
+ # Data transforms and shape organize
302
+ video_tensor = video_tensor.float()
303
+ video_tensor = torch.stack([train_transforms(frame) for frame in video_tensor], dim=0)
304
+ video_tensor = video_tensor.permute(0, 3, 1, 2).contiguous() # [F, C, H, W]
305
+
306
+
307
+ #############################################################################################################################
308
+
309
+
310
+
311
+ ######################################### Define the text prompt #######################################################
312
+
313
+ # NOTE: the text prompt is fetched above; here, we just decide whether to use an empty string
314
+ if self.empty_text_prompt or random.random() < self.config["text_mask_ratio"]:
315
+ text_prompt = ""
316
+ # print("Text Prompt for Video", idx, " is ", text_prompt)
317
+
318
+ ########################################################################################################################
319
+
320
+
321
+
322
+ ###################### Prepare the Tracking points for each object (each object has different color) #################################
323
+
324
+ # Iterate all the segmentation info
325
+ full_pred_tracks = [[] for _ in range(train_frame_num)] # The dim should be: (temporal, object, points, xy) The fps should be fixed to 12 fps, which is the same as training decode fps
326
+ for track_obj_idx in range(len(Obj_Info)):
327
+
328
+ # Read the basic info
329
+ text_name, frame_idx_raw = Obj_Info[track_obj_idx] # This is expected to be all the same in the video
330
+
331
+
332
+ # Sanity Check: make sure that the number of frames is consistent
333
+ if track_obj_idx > 0:
334
+ if frame_idx_raw != previous_frame_idx_raw:
335
+ raise Exception("The panoptic_frame_idx cannot pass the sanity check")
336
+
337
+
338
+ # Prepare the trajectory
339
+ pred_tracks_full = Track_Traj[track_obj_idx]
340
+ pred_tracks = pred_tracks_full[ frame_start_idx : frame_end_idx : sample_accelerate_factor]
341
+ if len(pred_tracks) != train_frame_num:
342
+ raise Exception("The length of tracking images does not match the video GT.")
343
+
344
+
345
+ # Randomly select the points based on the given prob; the number of points differs for each object
346
+ kept_point_status = random.choices([True, False], weights = [self.point_keep_ratio, 1 - self.point_keep_ratio], k = len(pred_tracks[0]))
347
+ if len(kept_point_status) != len(pred_tracks[-1]):
348
+ raise Exception("The number of points filterred is not match with the dataset")
349
+
350
+
351
+ # Iterate and add all temporally
352
+ for temporal_idx, pred_track in enumerate(pred_tracks):
353
+
354
+ # Iterate all point one by one
355
+ left_points = []
356
+ for point_idx in range(len(pred_track)):
357
+ if kept_point_status[point_idx]:
358
+ left_points.append(pred_track[point_idx])
359
+ # Append the left points to the list
360
+ full_pred_tracks[temporal_idx].append(left_points) # pred_tracks will be 49 frames, and each entry holds all tracking points for a single object; only one object here
361
+
362
+
363
+ # Other update
364
+ previous_frame_idx_raw = frame_idx_raw
365
+
366
+
367
+ # Draw the dilated traj points
368
+ traj_tensor, traj_imgs_np, merge_frames = self.prepare_traj_tensor(full_pred_tracks, original_height, original_width, selected_frames,
369
+ self.dot_radius, target_width, target_height, idx)
370
+
371
+ # Sanity check to make sure that the traj tensor and the ground truth have the same number of frames
372
+ if len(traj_tensor) != len(video_tensor): # If these two do not match, the torch.cat on latents will fail
373
+ raise Exception("Traj length and Video length does not matched!")
374
+
375
+ #########################################################################################################################################
376
+
377
+
378
+ # except Exception as e: # Note: You can uncomment this part to skip failure cases in mass training.
379
+ # print("The exception is ", e)
380
+ # old_idx = idx
381
+ # idx = (idx + 1) % len(self.info_lists)
382
+ # print("We cannot process the video", old_idx, " and we choose a new idx of ", idx)
383
+ # continue # If any error occurs, we retry with a new idx (the next index, wrapping around)
384
+
385
+
386
+ # If everything is ok, we should break at the end
387
+ break
388
+
389
+
390
+ # Return the information
391
+ return {
392
+ "video_tensor": video_tensor,
393
+ "traj_tensor": traj_tensor,
394
+ "text_prompt": text_prompt,
395
+
396
+ # The rest are auxiliary data for the validation/testing purposes
397
+ "video_gt_np": selected_frames,
398
+ "first_frame_np": first_frame_np,
399
+ "traj_imgs_np": traj_imgs_np,
400
+ "merge_frames": merge_frames,
401
+ "gt_video_path": video_path,
402
+ }
403
+
404
+
405
+
406
+
407
+
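A minimal usage sketch (not part of this commit) of how VideoDataset_Motion might be wrapped in a PyTorch DataLoader; the config keys mirror what __init__ reads above, while the paths and values below are placeholders:

    from torch.utils.data import DataLoader

    # Hypothetical config; keys follow what __init__ reads above.
    config = {
        "target_height": 384, "target_width": 480,
        "sample_accelerate_factor": 1, "train_frame_num_range": [41, 49],
        "empty_text_prompt": False, "dot_radius": 6,
        "point_keep_ratio": 0.3, "faster_motion_prob": 0.1,
        "text_mask_ratio": 0.1,
    }
    dataset = VideoDataset_Motion(
        config,
        download_folder_path="datasets/",   # placeholder paths
        csv_relative_path="csv/",
        video_relative_path="videos/",
    )
    # batch_size=1 because the sampled frame count (4N+1) can vary per video
    loader = DataLoader(dataset, batch_size=1, shuffle=True, num_workers=4)
    batch = next(iter(loader))
    print(batch["video_tensor"].shape)   # [1, F, C, H, W]
    print(batch["traj_tensor"].shape)    # same F as video_tensor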
data_loader/video_dataset_motion_FrameINO.py ADDED
@@ -0,0 +1,578 @@
1
+ import os, sys, shutil
2
+ from typing import List, Optional, Tuple, Union
3
+ from pathlib import Path
4
+ import csv
5
+ import random
6
+ import numpy as np
7
+ import ffmpeg
8
+ import json
9
+ import imageio
10
+ import collections
11
+ import cv2
12
+ import pdb
13
+ import math
14
+ import PIL.Image as Image
15
+ csv.field_size_limit(sys.maxsize) # Default limit is 131072; raise it so that long JSON columns can be read
16
+
17
+ import torch
18
+ from torch.utils.data import Dataset
19
+ from torchvision import transforms
20
+
21
+ # Import files from the local folder
22
+ root_path = os.path.abspath('.')
23
+ sys.path.append(root_path)
24
+ from utils.optical_flow_utils import flow_to_image, filter_uv, bivariate_Gaussian
25
+
26
+
27
+ # Init parameters and global shared settings
28
+
29
+ # Blurring Kernel
30
+ blur_kernel = bivariate_Gaussian(45, 3, 3, 0, grid = None, isotropic = True)
31
+
32
+ # Color
33
+ all_color_codes = [(255, 0, 0), (255, 255, 0), (0, 255, 0), (0, 255, 255),
34
+ (255, 0, 255), (0, 0, 255), (128, 128, 128), (64, 224, 208),
35
+ (233, 150, 122)]
36
+ for _ in range(100): # Should not be over 100 colors
37
+ all_color_codes.append((random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)))
38
+
39
+ # Data Transforms
40
+ train_transforms = transforms.Compose(
41
+ [
42
+ transforms.Lambda(lambda x: x / 255.0 * 2.0 - 1.0),
43
+ ]
44
+ )
45
+
46
+
47
+ class VideoDataset_Motion_FrameINO(Dataset):
48
+ def __init__(
49
+ self,
50
+ config,
51
+ download_folder_path,
52
+ csv_relative_path,
53
+ video_relative_path,
54
+ ID_relative_path,
55
+ FrameOut_only = False,
56
+ one_point_one_obj = False,
57
+ strict_validation_match = False,
58
+ ) -> None:
59
+ super().__init__()
60
+
61
+ # Gen Size Settings
62
+ # self.height_range = config["height_range"]
63
+ # self.max_aspect_ratio = config["max_aspect_ratio"]
64
+ self.target_height = config["target_height"]
65
+ self.target_width = config["target_width"]
66
+ self.sample_accelerate_factor = config["sample_accelerate_factor"]
67
+ self.train_frame_num_range = config["train_frame_num_range"]
68
+ self.min_train_frame_num = config["min_train_frame_num"]
69
+
70
+
71
+ # Condition Settings (Text, Motion, etc.)
72
+ self.empty_text_prompt = config["empty_text_prompt"]
73
+ self.dot_radius = int(config["dot_radius"])
74
+ self.point_keep_ratio_ID = config["point_keep_ratio_ID"]
75
+ self.point_keep_ratio_regular = config["point_keep_ratio_regular"]
76
+ self.faster_motion_prob = config["faster_motion_prob"]
77
+
78
+ # Other Settings
79
+ self.FrameOut_only = FrameOut_only
80
+ self.one_point_one_obj = one_point_one_obj # Currently, this is only enabled when FrameOut_only = True
81
+ self.strict_validation_match = strict_validation_match
82
+ self.config = config
83
+ self.video_folder_path = os.path.join(download_folder_path, video_relative_path)
84
+ self.ID_folder_path = os.path.join(download_folder_path, ID_relative_path)
85
+ csv_folder_path = os.path.join(download_folder_path, csv_relative_path)
86
+
87
+
88
+ # Sanity Check
89
+ assert(os.path.exists(csv_folder_path))
90
+ assert(self.point_keep_ratio_ID <= 1.0)
91
+ assert(self.point_keep_ratio_regular <= 1.0)
92
+
93
+
94
+ # Read the CSV files
95
+ info_lists = []
96
+ for csv_file_name in os.listdir(csv_folder_path): # Read all csv files
97
+ csv_file_path = os.path.join(csv_folder_path, csv_file_name)
98
+
99
+ with open(csv_file_path) as file_obj:
100
+ reader_obj = csv.reader(file_obj)
101
+
102
+ # Iterate over each row in the csv
103
+ for idx, row in enumerate(reader_obj):
104
+ if idx == 0:
105
+ elements = dict()
106
+ for element_idx, key in enumerate(row):
107
+ elements[key] = element_idx
108
+ continue
109
+
110
+ # Read the important information
111
+ info_lists.append(row)
112
+
113
+ # Organize
114
+ self.info_lists = info_lists
115
+ self.element_idx_dict = elements
116
+
117
+ # Log
118
+ print("The number of videos for ", csv_folder_path, " is ", len(self.info_lists))
119
+ # print("The memory cost is ", sys.getsizeof(self.info_lists))
120
+
121
+
122
+ def __len__(self):
123
+ return len(self.info_lists)
124
+
125
+
126
+ @staticmethod
127
+ def prepare_traj_tensor(full_pred_tracks, original_height, original_width, selected_frames,
128
+ dot_radius, target_width, target_height, region_box, idx = 0, first_frame_img = None):
129
+
130
+ # Prepare the color and other stuff
131
+ target_color_codes = all_color_codes[:len(full_pred_tracks[0])] # This means how many objects in total we have
132
+ (top_left_x, top_left_y), (bottom_right_x, bottom_right_y) = region_box
133
+
134
+ # Prepare the traj image
135
+ traj_img_lists = []
136
+
137
+ # Set a new dot radius based on the resolution fluctuating
138
+ dot_radius_resize = int( dot_radius * original_height / 384 ) # This is set with respect to the default 384 height and will be adjusted based on the height change
139
+
140
+ # Prepare base draw image if there is
141
+ if first_frame_img is not None:
142
+ img_with_traj = first_frame_img.copy()
143
+
144
+ # Iterate over all frames (and the objects within each frame)
145
+ merge_frames = []
146
+ for temporal_idx, obj_points in enumerate(full_pred_tracks): # Iterate all downsampled frames, should be 13
147
+
148
+ # Init the base img for the traj figures
149
+ base_img = np.zeros((original_height, original_width, 3)).astype(np.float32) # Use the original image size
150
+ base_img.fill(255) # Whole white frames
151
+
152
+ # Iterate for the per object
153
+ for obj_idx, points in enumerate(obj_points):
154
+
155
+ # Basic setting
156
+ color_code = target_color_codes[obj_idx] # Color across frames should be consistent
157
+
158
+
159
+ # Process all points in this current object
160
+ for (horizontal, vertical) in points:
161
+ if horizontal < 0 or horizontal >= original_width or vertical < 0 or vertical >= original_height:
162
+ continue # If the point is already out of the range, Don't draw
163
+
164
+ # Draw square around the target position
165
+ vertical_start = min(original_height, max(0, vertical - dot_radius_resize))
166
+ vertical_end = min(original_height, max(0, vertical + dot_radius_resize)) # Diameter, used to be 10, but want smaller if there are too many points now
167
+ horizontal_start = min(original_width, max(0, horizontal - dot_radius_resize))
168
+ horizontal_end = min(original_width, max(0, horizontal + dot_radius_resize))
169
+
170
+ # Paint
171
+ base_img[vertical_start:vertical_end, horizontal_start:horizontal_end, :] = color_code
172
+
173
+ # Draw the visual of traj if needed
174
+ if first_frame_img is not None:
175
+ img_with_traj[vertical_start:vertical_end, horizontal_start:horizontal_end, :] = color_code
176
+
177
+ # Resize frames; don't use negative sizes and don't resize while values are in [0, 1]
178
+ base_img = cv2.resize(base_img, (target_width, target_height), interpolation = cv2.INTER_CUBIC)
179
+
180
+ # Dilate (Default to be True)
181
+ base_img = cv2.filter2D(base_img, -1, blur_kernel).astype(np.uint8)
182
+
183
+ # Append selected_frames and the color together for visualization
184
+ merge_frame = selected_frames[temporal_idx].copy()
185
+ merge_frame = cv2.rectangle(merge_frame, (top_left_x, top_left_y), (bottom_right_x, bottom_right_y), (255, 0, 0), 5) # Draw the Region Box Area
186
+ merge_frame[base_img < 250] = base_img[base_img < 250]
187
+ merge_frames.append(merge_frame)
188
+
189
+
190
+ # Append to the temporal index
191
+ traj_img_lists.append(base_img)
192
+
193
+ # Convert to tensor
194
+ traj_imgs_np = np.array(traj_img_lists)
195
+ traj_tensor = torch.tensor(traj_imgs_np)
196
+
197
+ # Transform
198
+ traj_tensor = traj_tensor.float()
199
+ traj_tensor = torch.stack([train_transforms(traj_frame) for traj_frame in traj_tensor], dim=0)
200
+ traj_tensor = traj_tensor.permute(0, 3, 1, 2).contiguous() # [F, C, H, W]
201
+
202
+
203
+ # Write to video (For Debug Purpose)
204
+ # imageio.mimsave("merge_cond" + str(idx) + ".mp4", merge_frames, fps=12)
205
+
206
+
207
+
208
+ # Return
209
+ merge_frames = np.array(merge_frames)
210
+ if first_frame_img is not None:
211
+ return traj_tensor, traj_imgs_np, merge_frames, img_with_traj
212
+ else:
213
+ return traj_tensor, traj_imgs_np, merge_frames # Need to return traj_imgs_np for other purpose
214
+
215
+
216
+
217
+ def __getitem__(self, idx):
218
+
219
+ while True: # Iterate until there is a valid video read
220
+
221
+ # try:
222
+
223
+ # Fetch the information
224
+ info = self.info_lists[idx]
225
+ video_path = os.path.join(self.video_folder_path, info[self.element_idx_dict["video_path"]])
226
+ original_height = int(info[self.element_idx_dict["height"]])
227
+ original_width = int(info[self.element_idx_dict["width"]])
228
+ # num_frames = int(info[self.element_idx_dict["num_frames"]]) # Deprecated, this is about the whole frame duration, not just one
229
+
230
+ valid_duration = json.loads(info[self.element_idx_dict["valid_duration"]])
231
+ All_Frame_Panoptic_Segmentation = json.loads(info[self.element_idx_dict["Panoptic_Segmentation"]])
232
+ text_prompt_all = json.loads(info[self.element_idx_dict["Structured_Text_Prompt"]])
233
+ Track_Traj_all = json.loads(info[self.element_idx_dict["Track_Traj"]])
234
+ Obj_Info_all = json.loads(info[self.element_idx_dict["Obj_Info"]])
235
+ ID_info_all = json.loads(info[self.element_idx_dict["ID_info"]]) # New elements compared to motion data loader
236
+
237
+
238
+ # Sanity check
239
+ if not os.path.exists(video_path):
240
+ raise Exception("This video path", video_path, "doesn't exists!")
241
+
242
+
243
+ ########################################## Manage Resolution and Selected Clip Settings ##########################################
244
+
245
+ # Option1: Variable Resolution Gen
246
+ # # Check the resolution size
247
+ # aspect_ratio = min(self.max_aspect_ratio, original_width / original_height)
248
+ # target_height_raw = min(original_height, random.randint(*self.height_range))
249
+ # target_width_raw = min(original_width, int(target_height_raw * aspect_ratio))
250
+ # # Must be the multiplier of 32
251
+ # target_height = (target_height_raw // 32) * 32
252
+ # target_width = (target_width_raw // 32) * 32
253
+ # print("New Height and Width are ", target_height, target_width)
254
+
255
+ # Option2: Fixed Resolution Gen (Assume that the provided is 32x valid)
256
+ target_width = self.target_width
257
+ target_height = self.target_height
258
+
259
+
260
+ # NOTE: Here, we only choose the first Panoptic choice, to avoid multiple panoptic choices.
261
+ Obj_Info = Obj_Info_all[0] # For panoptic Segmentation
262
+ Track_Traj = Track_Traj_all[0]
263
+ text_prompt = text_prompt_all[0]
264
+ ID_info = ID_info_all[0] # For Frame In ID information, Just one Panoptic Frame
265
+ resolution = str(target_width) + "x" + str(target_height)
266
+ frame_start_idx = Obj_Info[0][1] # NOTE: If there are multiple objects, Obj_Info[X][1] should be the same for all of them
267
+
268
+
269
+ ##############################################################################################################################
270
+
271
+
272
+
273
+ #################################################### Fetch FrameIn ID information ###############################################################
274
+
275
+ # FrameIn drop
276
+ if self.FrameOut_only or random.random() < self.config["drop_FrameIn_prob"]:
277
+ drop_FrameIn = True
278
+ else:
279
+ drop_FrameIn = False
280
+
281
+ # Not all objects are ideal for FrameIn, so we need to select
282
+ if not self.strict_validation_match:
283
+ effective_ID_idxs = []
284
+ for ID_idx, ID_Info_obj in enumerate(ID_info):
285
+ if ID_Info_obj != []:
286
+ effective_ID_idxs.append(ID_idx)
287
+ main_target_ID_idx = random.choice(effective_ID_idxs) # NOTE: I think we should only have one object to be processed for now
288
+ else:
289
+ main_target_ID_idx = 0 # Always choose the first one
290
+
291
+ # Fetch the FrameIn ID info
292
+ segmentation_info, useful_region_box = ID_info[main_target_ID_idx] # There might be multiple ideal objects, but we just randomly choose one
293
+ if not self.FrameOut_only:
294
+ _, first_frame_reference_path, _ = segmentation_info # bbox_info, first_frame_reference_path, store_img_path_lists
295
+ first_frame_reference_path = os.path.join(self.ID_folder_path, first_frame_reference_path)
296
+ if not os.path.exists(first_frame_reference_path):
297
+ raise Exception("Cannot find ID path", first_frame_reference_path)
298
+ ##################################################################################################################################################
299
+
300
+
301
+
302
+ ################ Randomly choose one mask among the multiple choices available (resolution is with respect to the original resolution) #################
303
+
304
+ # Choose one region box
305
+ useful_region_box.sort(key=lambda x: x[0]) # Sort based on the BBox size
306
+ if not self.strict_validation_match:
307
+ mask_region = random.choice(useful_region_box[-5:])[1:] # Choose among the largest 5 BBox available
308
+ else:
309
+ mask_region = useful_region_box[-1][1:] # Choose the last one
310
+
311
+ # Fetch
312
+ (top_left_x_raw, top_left_y_raw), (bottom_right_x_raw, bottom_right_y_raw) = mask_region # As Original Resolution
313
+
314
+ # Resize the mask based on the CURRENT target resolution (now the 384x480 training resolution)
315
+ top_left_x = int(top_left_x_raw * target_width / original_width)
316
+ top_left_y = int(top_left_y_raw * target_height / original_height)
317
+ bottom_right_x = int(bottom_right_x_raw * target_width / original_width)
318
+ bottom_right_y = int(bottom_right_y_raw * target_height / original_height)
319
+ resized_mask_region_box = (top_left_x, top_left_y), (bottom_right_x, bottom_right_y)
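+ # e.g., with a 1920x1080 source and a 480x384 target, a corner at (640, 360) maps to (640*480/1920, 360*384/1080) = (160, 128)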
320
+
321
+
322
+ ###################################################################################################################################################
323
+
324
+
325
+
326
+ ################################################ Read the video by ffmpeg #########################################################################
327
+
328
+ # Read the video by ffmpeg in the needed decode fps and resolution
329
+ video_stream, err = ffmpeg.input(
330
+ video_path
331
+ ).output(
332
+ "pipe:", format = "rawvideo", pix_fmt = "rgb24", s = resolution, vsync = 'passthrough',
333
+ ).run(
334
+ capture_stdout = True, capture_stderr = True # If there is a bug, inspect the captured stderr
335
+ ) # The resize is already included
336
+ video_np_full = np.frombuffer(video_stream, np.uint8).reshape(-1, target_height, target_width, 3)
337
+
338
+ # Fetch the valid duration
339
+ video_np = video_np_full[valid_duration[0] : valid_duration[1]]
340
+ valid_num_frames = len(video_np) # Update the number of frames
341
+
342
+
343
+ # Decide the accelerate factor
344
+ train_frame_num_raw = random.randint(*self.train_frame_num_range)
345
+ if frame_start_idx + 3 * train_frame_num_raw < valid_num_frames and random.random() < self.faster_motion_prob: # Requires (1) enough frames and (2) hitting the faster-motion probability
346
+ sample_accelerate_factor = self.sample_accelerate_factor + 1 # Hard Code
347
+ else:
348
+ sample_accelerate_factor = self.sample_accelerate_factor
349
+
350
+ # Check the number of frames needed this time
351
+ frame_end_idx = min(valid_num_frames, frame_start_idx + sample_accelerate_factor * train_frame_num_raw)
352
+ frame_end_idx = frame_start_idx + 4 * math.floor(( (frame_end_idx-frame_start_idx) - 1) / 4) + 1 # Rounded to the closest 4N + 1 size
353
+
354
+
355
+ # Select Frames based on the start and end idx; then, Convert to Tensor
356
+ selected_frames = video_np[ frame_start_idx : frame_end_idx : sample_accelerate_factor] # NOTE: start from the first frame
357
+ if len(selected_frames) < self.min_train_frame_num:
358
+ print(len(selected_frames), len(video_np), frame_start_idx, frame_end_idx, sample_accelerate_factor)
359
+ raise Exception(f"selected_frames is less than {self.min_train_frame_num} frames preset! We jump to the next valid one!") # 我这里让Number of Frames Exactly = 49
360
+ video_tensor = torch.tensor(selected_frames) # Convert to tensor
361
+ train_frame_num = len(video_tensor) # Read the actual number of frames from the video (Must be 4N+1)
362
+ # print("Number of frames is", train_frame_num)
363
+
364
+
365
+ # Data transforms and shape organize
366
+ video_tensor = video_tensor.float()
367
+ video_tensor = torch.stack([train_transforms(frame) for frame in video_tensor], dim=0)
368
+ video_tensor = video_tensor.permute(0, 3, 1, 2).contiguous() # [F, C, H, W]
369
+
370
+
371
+ # Crop the tensor so that every non-interest region becomes blank (black, 0-valued); the region is at the target training resolution with the VAE step-size adjustment
372
+ video_np_masked = np.zeros(selected_frames.shape, dtype = np.uint8)
373
+ video_np_masked[:, top_left_y:bottom_right_y, top_left_x:bottom_right_x, :] = selected_frames[:, top_left_y:bottom_right_y, top_left_x:bottom_right_x, :]
374
+
375
+
376
+ # Decide the first frame with the masked one instead of the full one.
377
+ first_frame_np = video_np_masked[0] # Needs to return for Validation
378
+ # cv2.imwrite("first_frame"+str(idx)+".png", cv2.cvtColor(first_frame_np, cv2.COLOR_BGR2RGB)) # Comment Out Later
379
+
380
+ # Convert to Tensor and then Transforms
381
+ first_frame_tensor = torch.tensor(first_frame_np)
382
+ first_frame_tensor = train_transforms(first_frame_tensor).permute(2, 0, 1).contiguous()
383
+
384
+ #########################################################################################################################################
385
+
386
+
387
+
388
+ ############################################# Define the text prompt #######################################################
389
+
390
+ # NOTE: the text prompt was already extracted above; here we just decide whether to set it to the empty case
391
+ if self.empty_text_prompt or random.random() < self.config["text_mask_ratio"]:
392
+ text_prompt = ""
393
+ # print("Text Prompt for Video", idx, " is ", text_prompt) # Comment Out Later
394
+
395
+ #############################################################################################################################
396
+
397
+
398
+
399
+ ########################### Prepare the Tracking points for each object (each object has different color) #################################
400
+
401
+ # Iterate all the Segmentation Info
402
+ full_pred_tracks = [[] for _ in range(train_frame_num)] # The dim should be: (temporal, object, points, xy) The fps should be fixed to 12 fps, which is the same as training decode fps
403
+ for track_obj_idx in range(len(Obj_Info)):
404
+
405
+ # Read the basic info
406
+ text_name, frame_idx_raw = Obj_Info[track_obj_idx] # This is expected to be all the same in the video
407
+
408
+ # Sanity Check: make sure that the number of frames is consistent
409
+ if track_obj_idx > 0:
410
+ if frame_idx_raw != previous_frame_idx_raw:
411
+ raise Exception("The panoptic_frame_idx cannot pass the sanity check")
412
+
413
+
414
+ # Prepare the trajectory
415
+ pred_tracks_full = Track_Traj[track_obj_idx]
416
+ pred_tracks = pred_tracks_full[ frame_start_idx : frame_end_idx : sample_accelerate_factor]
417
+ if len(pred_tracks) != train_frame_num:
418
+ raise Exception("The length of tracking images does not match the video GT.")
419
+
420
+
421
+ # FrameINO-specific kept-point setting: for a non-main obj idx, we must ensure all points are inside the region box; if it is the main obj, the ID must be outside the region box
422
+ if track_obj_idx != main_target_ID_idx or self.FrameOut_only: # Non-main obj (Usually, for Frame Out cases)
423
+
424
+ # Randomly select the points based on the given prob; the number of points differs for each object
425
+ kept_point_status = random.choices([True, False], weights = [self.point_keep_ratio_regular, 1 - self.point_keep_ratio_regular], k = len(pred_tracks[0]))
426
+
427
+ # Check whether each point of the object lies inside the region box in the first frame; no need to check the following frames (the FrameOut effect is allowed)
428
+ first_frame_points = pred_tracks[0]
429
+ for point_idx in range(len(first_frame_points)):
430
+ (horizontal, vertical) = first_frame_points[point_idx]
431
+ if horizontal < top_left_x_raw or horizontal >= bottom_right_x_raw or vertical < top_left_y_raw or vertical >= bottom_right_y_raw: # Whether Outside the BBox region
432
+ kept_point_status[point_idx] = False
433
+
434
+ else: # For main object
435
+
436
+ # Randomly select the points based on the given prob; the number of points differs for each object
437
+ if drop_FrameIn:
438
+ # No motion provided on ID for Drop FrameIn cases
439
+ kept_point_status = random.choices([False], k = len(pred_tracks[0]))
440
+
441
+ else: # Regular FrameIn case
442
+ kept_point_status = random.choices([True, False], weights = [self.point_keep_ratio_ID, 1 - self.point_keep_ratio_ID], k = len(pred_tracks[0]))
443
+
444
+
445
+ # Sanity Check
446
+ if len(kept_point_status) != len(pred_tracks[-1]):
447
+ raise Exception("The number of points filterred does not match with the dataset")
448
+
449
+
450
+ # Iterate and add all temporally
451
+ for temporal_idx, pred_track in enumerate(pred_tracks): # The length = number of frames
452
+
453
+ # Iterate all point one by one
454
+ left_points = []
455
+ for point_idx in range(len(pred_track)):
456
+ # Select kept points
457
+ if kept_point_status[point_idx]:
458
+ left_points.append(pred_track[point_idx])
459
+
460
+ # Append the left points to the list
461
+ full_pred_tracks[temporal_idx].append(left_points) # pred_tracks will be 49 frames, and each entry holds all tracking points for a single object; only one object here
462
+
463
+ # Other update
464
+ previous_frame_idx_raw = frame_idx_raw
465
+
466
+
467
+ # Fetch One Point
468
+ if self.one_point_one_obj:
469
+ one_track_point = []
470
+ for full_pred_track_per_frame in full_pred_tracks:
471
+ one_track_point.append( [[full_pred_track_per_frame[0][0]]])
472
+
473
+ #######################################################################################################################################
474
+
475
+
476
+
477
+ ############################### Process the Video Tensor (based on info fetched from traj) ############################################
478
+
479
+
480
+ if drop_FrameIn:
481
+
482
+ ID_img = np.uint8(np.zeros((target_height, target_width, 3))) # Whole Black (0-value) pixel placeholder
483
+
484
+ else:
485
+
486
+ # Fetch the reference and resize
487
+ ID_img = np.asarray(Image.open(first_frame_reference_path))
488
+
489
+ # Resize to the same size as the video
490
+ ref_h, ref_w = ID_img.shape[:2]
491
+ scale_h = target_height / max(ref_h, ref_w)
492
+ scale_w = target_width / max(ref_h, ref_w)
493
+ new_h, new_w = int(ref_h * scale_h), int(ref_w * scale_w)
494
+ ID_img = cv2.resize(ID_img, (new_w, new_h), interpolation = cv2.INTER_AREA)
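+ # Both scales divide by the longer reference side, so the result always fits inside the target frame; e.g., a 1280x720 reference with a 480x384 (width x height) target is resized to 480x216 before padding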
495
+
496
+ # Calculate padding amounts on all direction
497
+ pad_height1 = (target_height - ID_img.shape[0]) // 2
498
+ pad_height2 = target_height - ID_img.shape[0] - pad_height1
499
+ pad_width1 = (target_width - ID_img.shape[1]) // 2
500
+ pad_width2 = target_width - ID_img.shape[1] - pad_width1
501
+
502
+ # Apply padding to reach the same resolution as the training frames
503
+ ID_img = np.pad(
504
+ ID_img,
505
+ ((pad_height1, pad_height2), (pad_width1, pad_width2), (0, 0)),
506
+ mode = 'constant',
507
+ constant_values = 0
508
+ )
509
+
510
+ # Visualize; Comment Out Later
511
+ # cv2.imwrite("ID_img_padded"+str(idx)+".png", cv2.cvtColor(ID_img, cv2.COLOR_BGR2RGB))
512
+
513
+
514
+ # Convert to tensor (Same as others)
515
+ ID_tensor = torch.tensor(ID_img)
516
+ ID_tensor = train_transforms(ID_tensor).permute(2, 0, 1).contiguous()
517
+
518
+ #######################################################################################################################################
519
+
520
+
521
+
522
+ ############################################## Draw the Traj Points and Transform to Tensor #############################################
523
+
524
+ # Draw the dilated points
525
+ if self.one_point_one_obj:
526
+ target_pred_tracks = one_track_point # In this case, we only have one point per object
527
+ else:
528
+ target_pred_tracks = full_pred_tracks
529
+
530
+ traj_tensor, traj_imgs_np, merge_frames = self.prepare_traj_tensor(target_pred_tracks, original_height, original_width, selected_frames,
531
+ self.dot_radius, target_width, target_height, resized_mask_region_box, idx)
532
+
533
+ # Sanity check to make sure that the traj tensor and the ground truth have the same number of frames
534
+ if len(traj_tensor) != len(video_tensor): # If these two do not match, the torch.cat on latents will fail
535
+ raise Exception("Traj length and Video length does not matched!")
536
+
537
+ #########################################################################################################################################
538
+
539
+
540
+ # Write some processed meta data
541
+ processed_meta_data = {
542
+ "full_pred_tracks": full_pred_tracks,
543
+ "original_width": original_width,
544
+ "original_height": original_height,
545
+ "mask_region": mask_region,
546
+ "resized_mask_region_box": resized_mask_region_box,
547
+ }
548
+
549
+ # except Exception as e: # Note: You can uncomment this part to skip failure cases in mass training.
550
+ # print("The exception is ", e)
551
+ # old_idx = idx
552
+ # idx = (idx + 1) % len(self.info_lists)
553
+ # print("We cannot process the video", old_idx, " and we choose a new idx of ", idx)
554
+ # continue # If any error occurs, we retry with a new idx (the next index, wrapping around)
555
+
556
+
557
+ # If everything is ok, we should break at the end
558
+ break
559
+
560
+
561
+ # Return the information
562
+ return {
563
+ "video_tensor": video_tensor,
564
+ "traj_tensor": traj_tensor,
565
+ "first_frame_tensor": first_frame_tensor,
566
+ "ID_tensor": ID_tensor,
567
+ "text_prompt": text_prompt,
568
+
569
+ # The rest are auxiliary data for the validation/testing purposes
570
+ "video_gt_np": selected_frames,
571
+ "first_frame_np": first_frame_np,
572
+ "ID_np": ID_img,
573
+ "processed_meta_data": processed_meta_data,
574
+ "traj_imgs_np": traj_imgs_np,
575
+ "merge_frames" : merge_frames,
576
+ "gt_video_path": video_path,
577
+ }
578
+
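A brief usage note (a sketch, not part of this commit): VideoDataset_Motion_FrameINO is constructed like VideoDataset_Motion above, but additionally takes ID_relative_path pointing at the cropped identity references, and its extra config keys include min_train_frame_num, point_keep_ratio_ID, point_keep_ratio_regular, and drop_FrameIn_prob. Each returned item then also carries the region-masked first frame and the padded identity image:

    # Paths below are placeholders
    dataset = VideoDataset_Motion_FrameINO(
        config,
        download_folder_path="datasets/",
        csv_relative_path="csv/",
        video_relative_path="videos/",
        ID_relative_path="ID_refs/",
    )
    item = dataset[0]
    item["first_frame_tensor"].shape   # [C, H, W], masked outside the sampled region box
    item["ID_tensor"].shape            # [C, H, W], identity reference padded to the target size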
data_loader/video_dataset_motion_FrameINO_old.py ADDED
@@ -0,0 +1,538 @@
1
+ import os, sys, shutil
2
+ from typing import List, Optional, Tuple, Union
3
+ from pathlib import Path
4
+ import csv
5
+ import random
6
+ import numpy as np
7
+ import ffmpeg
8
+ import json
9
+ import imageio
10
+ import collections
11
+ import cv2
12
+ import pdb
13
+ import math
14
+ import PIL.Image as Image
15
+ csv.field_size_limit(13107200) # Default setting is 131072, 100x expand should be enough
16
+
17
+ import torch
18
+ from torch.utils.data import Dataset
19
+ from torchvision import transforms
20
+
21
+ # Import files from the local folder
22
+ root_path = os.path.abspath('.')
23
+ sys.path.append(root_path)
24
+ from utils.optical_flow_utils import flow_to_image, filter_uv, bivariate_Gaussian
25
+
26
+ # Init parameters and global shared settings
27
+
28
+ # Blurring Kernel
29
+ blur_kernel = bivariate_Gaussian(45, 3, 3, 0, grid = None, isotropic = True)
30
+
31
+ # Color
32
+ all_color_codes = [(255, 0, 0), (255, 255, 0), (0, 255, 0), (0, 255, 255),
33
+ (255, 0, 255), (0, 0, 255), (128, 128, 128), (64, 224, 208),
34
+ (233, 150, 122)]
35
+ for _ in range(100): # Should not be over 100 colors
36
+ all_color_codes.append((random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)))
37
+
38
+ # Data Transforms
39
+ train_transforms = transforms.Compose(
40
+ [
41
+ transforms.Lambda(lambda x: x / 255.0 * 2.0 - 1.0),
42
+ ]
43
+ )
44
+
45
+
46
+ class VideoDataset_Motion_FrameINO(Dataset):
47
+ def __init__(
48
+ self,
49
+ config,
50
+ csv_folder_path,
51
+ FrameOut_only = False,
52
+ one_point_one_obj = False,
53
+ strict_validation_match = False,
54
+ ) -> None:
55
+ super().__init__()
56
+
57
+ # Fetch the Fundamental Setting
58
+ self.dataset_folder_path = config["dataset_folder_path"]
59
+ if not FrameOut_only: # Frame In mode
60
+ self.ID_folder_path = config["ID_folder_path"]
61
+ self.target_height = config["height"]
62
+ self.target_width = config["width"]
63
+ # self.ref_cond_size = config["ref_cond_size"]
64
+ self.preset_decode_fps = config["preset_decode_fps"] # Set to be 16
65
+ self.train_frame_num = config["train_frame_num"]
66
+ self.empty_text_prompt = config["empty_text_prompt"]
67
+ self.start_skip = config["start_skip"]
68
+ self.end_skip = config["end_skip"]
69
+ self.dot_radius = int(config["dot_radius"]) # Set to be 6
70
+ self.point_keep_ratio_ID = config["point_keep_ratio_ID"]
71
+ self.point_keep_ratio_regular = config["point_keep_ratio_regular"]
72
+ self.faster_motion_prob = config["faster_motion_prob"]
73
+ self.FrameOut_only = FrameOut_only
74
+ self.one_point_one_obj = one_point_one_obj # Currently, this is only enabled when FrameOut_only = True
75
+ self.strict_validation_match = strict_validation_match
76
+ self.config = config
77
+
78
+ # Sanity Check
79
+ assert(self.point_keep_ratio_ID <= 1.0)
80
+ assert(self.point_keep_ratio_regular <= 1.0)
81
+
82
+
83
+ # Read the CSV files
84
+ info_lists = []
85
+ for csv_file_name in os.listdir(csv_folder_path): # Read all csv files
86
+ csv_file_path = os.path.join(csv_folder_path, csv_file_name)
87
+ with open(csv_file_path) as file_obj:
88
+ reader_obj = csv.reader(file_obj)
89
+
90
+ # Iterate over each row in the csv
91
+ for idx, row in enumerate(reader_obj):
92
+ if idx == 0:
93
+ elements = dict()
94
+ for element_idx, key in enumerate(row):
95
+ elements[key] = element_idx
96
+ continue
97
+
98
+ # Read the important information
99
+ info_lists.append(row)
100
+
101
+
102
+ # Organize
103
+ self.info_lists = info_lists
104
+ self.element_idx_dict = elements
105
+
106
+ # Log
107
+ print("The number of videos for ", csv_folder_path, " is ", len(self.info_lists))
108
+ # print("The memory cost is ", sys.getsizeof(self.info_lists))
109
+
110
+
111
+ def __len__(self):
112
+ return len(self.info_lists)
113
+
114
+
115
+ @staticmethod
116
+ def prepare_traj_tensor(full_pred_tracks, original_height, original_width, selected_frames,
117
+ dot_radius, target_width, target_height, region_box, idx = 0, first_frame_img = None):
118
+
119
+ # Prepare the color and other stuff
120
+ target_color_codes = all_color_codes[:len(full_pred_tracks[0])] # This means how many objects in total we have
121
+ (top_left_x, top_left_y), (bottom_right_x, bottom_right_y) = region_box
122
+
123
+ # Prepare the traj image
124
+ traj_img_lists = []
125
+
126
+ # Set a new dot radius based on the resolution fluctuating
127
+ dot_radius_resize = int( dot_radius * original_height / 384 ) # This is set with respect to the default 384 height and will be adjusted based on the height change
128
+
129
+ # Prepare base draw image if there is
130
+ if first_frame_img is not None:
131
+ img_with_traj = first_frame_img.copy()
132
+
133
+ # Iterate over all frames (and the objects within each frame)
134
+ merge_frames = []
135
+ for temporal_idx, obj_points in enumerate(full_pred_tracks): # Iterate all downsampled frames, should be 13
136
+
137
+ # Init the base img for the traj figures
138
+ base_img = np.zeros((original_height, original_width, 3)).astype(np.float32) # Use the original image size
139
+ base_img.fill(255) # Whole white frames
140
+
141
+ # Iterate for the per object
142
+ for obj_idx, points in enumerate(obj_points):
143
+
144
+ # Basic setting
145
+ color_code = target_color_codes[obj_idx] # Color across frames should be consistent
146
+
147
+
148
+ # Process all points in this current object
149
+ for (horizontal, vertical) in points:
150
+ if horizontal < 0 or horizontal >= original_width or vertical < 0 or vertical >= original_height:
151
+ continue # If the point is already out of the range, Don't draw
152
+
153
+ # Draw square around the target position
154
+ vertical_start = min(original_height, max(0, vertical - dot_radius_resize))
155
+ vertical_end = min(original_height, max(0, vertical + dot_radius_resize)) # Diameter, used to be 10, but want smaller if there are too many points now
156
+ horizontal_start = min(original_width, max(0, horizontal - dot_radius_resize))
157
+ horizontal_end = min(original_width, max(0, horizontal + dot_radius_resize))
158
+
159
+ # Paint
160
+ base_img[vertical_start:vertical_end, horizontal_start:horizontal_end, :] = color_code
161
+
162
+ # Draw the visual of traj if needed
163
+ if first_frame_img is not None:
164
+ img_with_traj[vertical_start:vertical_end, horizontal_start:horizontal_end, :] = color_code
165
+
166
+ # Resize frames; don't use negative sizes and don't resize while values are in [0, 1]
167
+ base_img = cv2.resize(base_img, (target_width, target_height), interpolation = cv2.INTER_CUBIC)
168
+
169
+ # Dilate (Default to be True)
170
+ base_img = cv2.filter2D(base_img, -1, blur_kernel).astype(np.uint8)
171
+
172
+ # Append selected_frames and the color together for visualization
173
+ merge_frame = selected_frames[temporal_idx].copy()
174
+ merge_frame = cv2.rectangle(merge_frame, (top_left_x, top_left_y), (bottom_right_x, bottom_right_y), (255, 0, 0), 5) # Draw the Region Box Area
175
+ merge_frame[base_img < 250] = base_img[base_img < 250]
176
+ merge_frames.append(merge_frame)
177
+
178
+
179
+ # Append to the temporal index
180
+ traj_img_lists.append(base_img)
181
+
182
+ # Convert to tensor
183
+ traj_imgs_np = np.array(traj_img_lists)
184
+ traj_tensor = torch.tensor(traj_imgs_np)
185
+
186
+ # Transform
187
+ traj_tensor = traj_tensor.float()
188
+ traj_tensor = torch.stack([train_transforms(traj_frame) for traj_frame in traj_tensor], dim=0)
189
+ traj_tensor = traj_tensor.permute(0, 3, 1, 2).contiguous() # [F, C, H, W]
190
+
191
+
192
+ # Write to video (For Debug Purpose)
193
+ # imageio.mimsave("merge_cond" + str(idx) + ".mp4", merge_frames, fps=12)
194
+
195
+
196
+ # Return
197
+ merge_frames = np.array(merge_frames)
198
+ if first_frame_img is not None:
199
+ return traj_tensor, traj_imgs_np, merge_frames, img_with_traj
200
+ else:
201
+ return traj_tensor, traj_imgs_np, merge_frames # Need to return traj_imgs_np for other purpose
202
+
203
+
204
+
205
+
206
+ def __getitem__(self, idx):
207
+
208
+ while True: # Iterate until there is a valid video read
209
+
210
+ try:
211
+
212
+ # Fetch the information
213
+ info = self.info_lists[idx]
214
+ video_path = os.path.join(self.dataset_folder_path, info[self.element_idx_dict["video_path"]])
215
+ original_height = int(info[self.element_idx_dict["height"]])
216
+ original_width = int(info[self.element_idx_dict["width"]])
217
+ num_frames = int(info[self.element_idx_dict["num_frames"]])
218
+ fps = float(info[self.element_idx_dict["fps"]])
219
+
220
+ # Fetch all panoptic frames
221
+ FrameIN_info_all = json.loads(info[self.element_idx_dict["FrameIN_info"]])
222
+ Track_Traj_all = json.loads(info[self.element_idx_dict["Track_Traj"]])
223
+ text_prompt_all = json.loads(info[self.element_idx_dict["Improved_Text_Prompt"]])
224
+ ID_info_all = json.loads(info[self.element_idx_dict["ID_info"]])
225
+
226
+
227
+ # Randomly Choose one available
228
+ panoptic_idx = random.choice(range(len(FrameIN_info_all)))
229
+ FrameIN_info = FrameIN_info_all[panoptic_idx]
230
+ Track_Traj = Track_Traj_all[panoptic_idx]
231
+ text_prompt = text_prompt_all[panoptic_idx]
232
+ ID_info_panoptic = ID_info_all[panoptic_idx]
233
+
234
+
235
+ # Organize
236
+ resolution = str(self.target_width) + "x" + str(self.target_height)
237
+ fps_scale = self.preset_decode_fps / fps
238
+ downsample_num_frames = int(num_frames * fps_scale)
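+ # e.g., a 30 fps source with 300 frames decoded at 16 fps gives fps_scale = 16/30 and roughly 160 downsampled frames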
239
+
240
+
241
+ # FrameIn drop
242
+ if self.FrameOut_only or random.random() < self.config["drop_FrameIn_prob"]:
243
+ drop_FrameIn = True
244
+ else:
245
+ drop_FrameIn = False
246
+
247
+
248
+
249
+ # Sanity check
250
+ if not os.path.exists(video_path):
251
+ raise Exception("This video path ", video_path, " doesn't exists!")
252
+
253
+
254
+ # Not all objects are ideal for FrameIn, so we need to select
255
+ if not self.strict_validation_match:
256
+ effective_obj_idxs = []
257
+ for obj_idx, obj_info in enumerate(ID_info_panoptic):
258
+ if obj_info != []:
259
+ effective_obj_idxs.append(obj_idx)
260
+ main_target_obj_idx = random.choice(effective_obj_idxs) # NOTE: I think we should only have one object to be processed for now
261
+ else:
262
+ main_target_obj_idx = 0 # Always choose the first one
263
+
264
+ #################################################### Fetch FrameIn ID information ###############################################################
265
+
266
+ # Fetch the FrameIn ID info
267
+ segmentation_info, useful_region_box = ID_info_panoptic[main_target_obj_idx] # There might be multiple ideal objects, but we just randomly choose one
268
+ if not self.FrameOut_only:
269
+ _, first_frame_reference_path, _ = segmentation_info # bbox_info, first_frame_reference_path, store_img_path_lists
270
+ first_frame_reference_path = os.path.join(self.ID_folder_path, first_frame_reference_path)
271
+
272
+ ##################################################################################################################################################
273
+
274
+
275
+
276
+ ############ Randomly choose one mask among the multiple choices available (resolution is with respect to the original resolution) ############
277
+ useful_region_box.sort(key=lambda x: x[0])
278
+
279
+ # Choose one region box
280
+ if not self.strict_validation_match:
281
+ mask_region = random.choice(useful_region_box[-5:])[1:] # Choose among the largest 5 available
282
+ else:
283
+ mask_region = useful_region_box[-1][1:] # Choose the last one
284
+
285
+ # Fetch
286
+ (top_left_x_raw, top_left_y_raw), (bottom_right_x_raw, bottom_right_y_raw) = mask_region # As Original Resolution
287
+
288
+ # Resize the mask based on the CURRENT target resolution (now the 384x480 training resolution)
289
+ top_left_x = int(top_left_x_raw * self.target_width / original_width)
290
+ top_left_y = int(top_left_y_raw * self.target_height / original_height)
291
+ bottom_right_x = int(bottom_right_x_raw * self.target_width / original_width)
292
+ bottom_right_y = int(bottom_right_y_raw * self.target_height / original_height)
293
+ resized_mask_region_box = (top_left_x, top_left_y), (bottom_right_x, bottom_right_y)
294
+
295
+ ###########################################################################################################################################
296
+
297
+
298
+
299
+ ############################################## Read the video by ffmpeg #############################################################
300
+
301
+ # Read the video by ffmpeg in the needed decode fps and resolution
302
+ video_stream, err = ffmpeg.input(
303
+ video_path
304
+ ).filter(
305
+ 'fps', fps = self.preset_decode_fps, round = 'up'
306
+ ).output(
307
+ "pipe:", format = "rawvideo", pix_fmt = "rgb24", s = resolution
308
+ ).run(
309
+ capture_stdout = True, capture_stderr = True
310
+ ) # The resize is already included
311
+ video_np_raw = np.frombuffer(video_stream, np.uint8).reshape(-1, self.target_height, self.target_width, 3)
312
+
313
+ # Sanity Check
314
+ if len(video_np_raw) - self.start_skip - self.end_skip < self.train_frame_num:
315
+ raise Exception("The number of frames from the video is not enough")
316
+
317
+ # Crop the tensor so that every non-interest region becomes blank (black, 0-valued); the region is at the target training resolution with the VAE step-size adjustment
318
+ video_np_masked = np.zeros(video_np_raw.shape, dtype = np.uint8)
319
+ video_np_masked[:, top_left_y:bottom_right_y, top_left_x:bottom_right_x, :] = video_np_raw[:, top_left_y:bottom_right_y, top_left_x:bottom_right_x, :]
320
+
321
+ #########################################################################################################################################
322
+
323
+
324
+
325
+ ######################################### Define the text prompt #######################################################
326
+
327
+ # Whether empty text prompt; Text Prompt already exists above
328
+ if self.empty_text_prompt or random.random() < self.config["text_mask_ratio"]:
329
+ text_prompt = ""
330
+
331
+ ########################################################################################################################
332
+
333
+
334
+
335
+ ###################### Prepare the Tracking points for each object (each object has different color) #################################
336
+
337
+ # Make sure that the frame from the FrameIN_info has enough number of frames
338
+ _, original_start_frame_idx, fps_scale = FrameIN_info[main_target_obj_idx] # This is expected to be all the same in the video
339
+ downsample_start_frame_idx = max(0, int(original_start_frame_idx * fps_scale))
340
+
341
+
342
+ # Check the max number of frames available (NOTE: Recommended to use Full Text Prompt Version)
343
+ max_step_num = (downsample_num_frames - downsample_start_frame_idx) // self.train_frame_num
344
+ if max_step_num == 0:
345
+ print("This video is ", video_path)
346
+ raise Exception("The video is too short!")
347
+ elif max_step_num >= 2 and random.random() < self.faster_motion_prob:
348
+ iter_gap = 2 # Maximum setting now is 2x; otherwise, the VAE might not work well
349
+ else:
350
+ iter_gap = 1
351
+
352
+
353
+ # Iterate all the Segmentation Info
354
+ full_pred_tracks = [[] for _ in range(self.train_frame_num)] # The dim should be: (temporal, object, points, xy) The fps should be fixed to 12 fps, which is the same as training decode fps
355
+
356
+ # Iterate over all objects (the main object is handled separately below)
357
+ for obj_idx in range(len(ID_info_panoptic)):
358
+
359
+ # Prepare the trajectory
360
+ pred_tracks = Track_Traj[obj_idx]
361
+ pred_tracks = pred_tracks[downsample_start_frame_idx : downsample_start_frame_idx + iter_gap * self.train_frame_num : iter_gap]
362
+ if len(pred_tracks) != self.train_frame_num:
363
+ raise Exception("The len of pre_track does not match")
364
+
365
+
366
+ # For a non-main obj idx, we must ensure all points are inside the region box; if it is the main obj, the ID must be outside the region box
367
+ if obj_idx != main_target_obj_idx or self.FrameOut_only:
368
+
369
+ # Randomly select the points based on the given probability; the number of kept points differs for each object
370
+ kept_point_status = random.choices([True, False], weights = [self.point_keep_ratio_regular, 1 - self.point_keep_ratio_regular], k = len(pred_tracks[0]))
371
+
372
+ # Check with the first frame only; no need to check the following frames (the FrameOut effect is allowed)
373
+ first_frame_points = pred_tracks[0]
374
+ for point_idx in range(len(first_frame_points)):
375
+ (horizontal, vertical) = first_frame_points[point_idx]
376
+ if horizontal < top_left_x_raw or horizontal >= bottom_right_x_raw or vertical < top_left_y_raw or vertical >= bottom_right_y_raw:
377
+ kept_point_status[point_idx] = False
378
+
379
+ else: # For main object
380
+
381
+ # Randomly select the points based on the given probability; the number of kept points differs for each object
382
+ if drop_FrameIn:
383
+ # No motion provided on ID for Drop FrameIn cases
384
+ kept_point_status = random.choices([False], k = len(pred_tracks[0]))
385
+
386
+ else: # Regular FrameIn case
387
+ kept_point_status = random.choices([True, False], weights = [self.point_keep_ratio_ID, 1 - self.point_keep_ratio_ID], k = len(pred_tracks[0]))
388
+
389
+
390
+ # Sanity Check
391
+ if len(kept_point_status) != len(pred_tracks[-1]):
392
+ raise Exception("The number of points filterred is not match with the dataset")
393
+
394
+ # Iterate and add all temporally
395
+ for temporal_idx, pred_track in enumerate(pred_tracks):
396
+
397
+ # Iterate all point one by one
398
+ left_points = []
399
+ for point_idx in range(len(pred_track)):
400
+ # Select kept points
401
+ if kept_point_status[point_idx]:
402
+ left_points.append(pred_track[point_idx])
403
+
404
+ # Append the left points to the list
405
+ full_pred_tracks[temporal_idx].append(left_points) # pred_tracks has 49 frames, each entry holding all tracking points of a single object; only one object is appended here
406
+
407
+ # Fetch One Point
408
+ if self.one_point_one_obj:
409
+ one_track_point = []
410
+ for full_pred_track_per_frame in full_pred_tracks:
411
+ one_track_point.append( [[full_pred_track_per_frame[0][0]]])
412
+
413
+ #######################################################################################################################################
414
+
415
+
416
+
417
+ ############################### Process the Video Tensor (based on info fetched from traj) ############################################
418
+
419
+ # Select Frames based on the panoptic range (No Mask here)
420
+ selected_frames = video_np_raw[downsample_start_frame_idx : downsample_start_frame_idx + iter_gap * self.train_frame_num : iter_gap]
421
+
422
+ # Prepare the Video Tensor; NOTE: in this branch, the video tensor is the full image without a mask
423
+ video_tensor = torch.tensor(selected_frames) # Convert to tensor
424
+ if len(video_tensor) != self.train_frame_num:
425
+ raise Exception("The len of train frames does not match")
426
+
427
+ # Training transforms for the Video and condition
428
+ video_tensor = video_tensor.float()
429
+ video_tensor = torch.stack([train_transforms(frame) for frame in video_tensor], dim=0)
430
+ video_tensor = video_tensor.permute(0, 3, 1, 2).contiguous() # [F, C, H, W]
431
+
432
+
433
+
434
+ if drop_FrameIn:
435
+ main_reference_img = np.uint8(np.zeros((self.target_height, self.target_width, 3))) # All-black (0-valued) pixel placeholder
436
+
437
+ else:
438
+
439
+ # Fetch the reference and resize
440
+ main_reference_img = np.asarray(Image.open(first_frame_reference_path))
441
+
442
+ # Resize to the same size as the video
443
+ ref_h, ref_w = main_reference_img.shape[:2]
444
+ scale_h = self.target_height / max(ref_h, ref_w)
445
+ scale_w = self.target_width / max(ref_h, ref_w)
446
+ new_h, new_w = int(ref_h * scale_h), int(ref_w * scale_w)
447
+ main_reference_img = cv2.resize(main_reference_img, (new_w, new_h), interpolation = cv2.INTER_AREA)
448
+
449
+ # Calculate the padding amounts in all directions
450
+ pad_height1 = (self.target_height - main_reference_img.shape[0]) // 2
451
+ pad_height2 = self.target_height - main_reference_img.shape[0] - pad_height1
452
+ pad_width1 = (self.target_width - main_reference_img.shape[1]) // 2
453
+ pad_width2 = self.target_width - main_reference_img.shape[1] - pad_width1
454
+
455
+ # Apply padding to reach the same resolution as the training frames
456
+ main_reference_img = np.pad(
457
+ main_reference_img,
458
+ ((pad_height1, pad_height2), (pad_width1, pad_width2), (0, 0)),
459
+ mode = 'constant',
460
+ constant_values = 0
461
+ )
462
+ # cv2.imwrite("main_reference_img_padded"+str(idx)+".png", cv2.cvtColor(main_reference_img, cv2.COLOR_BGR2RGB))
463
+
464
+
465
+ # Convert to tensor
466
+ main_reference_tensor = torch.tensor(main_reference_img)
467
+ main_reference_tensor = train_transforms(main_reference_tensor).permute(2, 0, 1).contiguous()
468
+
469
+
470
+ # Fetch the first frame and then do ID merge for this branch of training
471
+ first_frame_np = video_np_masked[downsample_start_frame_idx] # Needs to be returned for validation
472
+ # cv2.imwrite("first_frame"+str(idx)+".png", cv2.cvtColor(first_frame_np, cv2.COLOR_BGR2RGB))
473
+
474
+ # Convert to Tensor and then Transforms
475
+ first_frame_tensor = torch.tensor(first_frame_np)
476
+ first_frame_tensor = train_transforms(first_frame_tensor).permute(2, 0, 1).contiguous()
477
+
478
+ #######################################################################################################################################
479
+
480
+
481
+
482
+ ############################################## Draw the Traj Points and Transform to Tensor #############################################
483
+
484
+ # Draw the dilated points
485
+ if self.one_point_one_obj:
486
+ target_pred_tracks = one_track_point # In this case, we only have one point per object
487
+ else:
488
+ target_pred_tracks = full_pred_tracks
489
+
490
+ traj_tensor, traj_imgs_np, merge_frames = self.prepare_traj_tensor(target_pred_tracks, original_height, original_width, selected_frames,
491
+ self.dot_radius, self.target_width, self.target_height, resized_mask_region_box, idx)
492
+
493
+ #########################################################################################################################################
494
+
495
+
496
+ # Write some processed meta data
497
+ processed_meta_data = {
498
+ "full_pred_tracks": full_pred_tracks,
499
+ "original_width": original_width,
500
+ "original_height": original_height,
501
+ "mask_region": mask_region,
502
+ "resized_mask_region_box": resized_mask_region_box,
503
+ }
504
+
505
+ except Exception as e:
506
+ print("The exception is ", e)
507
+ old_idx = idx
508
+ idx = random.randint(0, len(self.info_lists) - 1)
509
+ print("We cannot process the video", old_idx, " and we choose a new idx of ", idx)
510
+ continue # If any error occurs, retry with the newly proposed random idx
511
+
512
+
513
+ # If everything is OK, break out of the retry loop
514
+ break
515
+
516
+
517
+ # Return the information
518
+ return {
519
+ "video_tensor": video_tensor,
520
+ "traj_tensor": traj_tensor,
521
+ "first_frame_tensor": first_frame_tensor,
522
+ "main_reference_tensor": main_reference_tensor,
523
+ "text_prompt": text_prompt,
524
+
525
+ # The rest is auxiliary data for validation/testing purposes
526
+ "video_gt_np": selected_frames,
527
+ "first_frame_np": first_frame_np,
528
+ "main_reference_np": main_reference_img,
529
+ "processed_meta_data": processed_meta_data,
530
+ "traj_imgs_np": traj_imgs_np,
531
+ "merge_frames" : merge_frames,
532
+ "gt_video_path": video_path,
533
+ }
534
+
535
+
536
+
537
+
538
+
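For orientation, the dictionary returned by `__getitem__` above can be consumed like any map-style PyTorch dataset. The sketch below is an assumed minimal usage: `dataset` stands for a `VideoDataset_Motion` instance built elsewhere from the training config, and the trivial `collate_fn` avoids collating the ragged auxiliary fields (numpy frames, nested track lists, strings).

```py
from torch.utils.data import DataLoader

# Return the raw sample dict instead of letting default_collate handle the ragged aux fields.
loader = DataLoader(dataset, batch_size=1, shuffle=True, collate_fn=lambda batch: batch[0])

for sample in loader:
    video = sample["video_tensor"]               # [F, C, H, W] normalized training frames
    traj = sample["traj_tensor"]                 # trajectory-point condition frames
    first_frame = sample["first_frame_tensor"]   # masked first frame
    reference = sample["main_reference_tensor"]  # padded ID reference image
    prompt = sample["text_prompt"]               # may be "" when the text prompt is dropped
    break
```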
pipelines/pipeline_cogvideox_i2v_motion.py ADDED
@@ -0,0 +1,931 @@
1
+ # Copyright 2024 The CogVideoX team, Tsinghua University & ZhipuAI and The HuggingFace Team.
2
+ # All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import os, sys, shutil
17
+ import inspect
18
+ import math
19
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
20
+
21
+ import PIL
22
+ import torch
23
+ from transformers import T5EncoderModel, T5Tokenizer
24
+
25
+ from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
26
+ from diffusers.image_processor import PipelineImageInput
27
+ from diffusers.loaders import CogVideoXLoraLoaderMixin
28
+ from diffusers.models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
29
+ # from diffusers.models.embeddings import get_3d_rotary_pos_embed
30
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
31
+ from diffusers.schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
32
+ from diffusers.utils import (
33
+ is_torch_xla_available,
34
+ logging,
35
+ replace_example_docstring,
36
+ )
37
+ from diffusers.utils.torch_utils import randn_tensor
38
+ from diffusers.video_processor import VideoProcessor
39
+ from diffusers.pipelines.cogvideo.pipeline_output import CogVideoXPipelineOutput
40
+
41
+
42
+ # Import files from the local folder
43
+ root_path = os.path.abspath('.')
44
+ sys.path.append(root_path)
45
+ from architecture.embeddings import get_3d_rotary_pos_embed
46
+
47
+
48
+ if is_torch_xla_available():
49
+ import torch_xla.core.xla_model as xm
50
+
51
+ XLA_AVAILABLE = True
52
+ else:
53
+ XLA_AVAILABLE = False
54
+
55
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
56
+
57
+
58
+ EXAMPLE_DOC_STRING = """
59
+ Examples:
60
+ ```py
61
+ >>> import torch
62
+ >>> from diffusers import CogVideoXImageToVideoPipeline
63
+ >>> from diffusers.utils import export_to_video, load_image
64
+
65
+ >>> pipe = CogVideoXImageToVideoPipeline.from_pretrained("THUDM/CogVideoX-5b-I2V", torch_dtype=torch.bfloat16)
66
+ >>> pipe.to("cuda")
67
+
68
+ >>> prompt = "An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot."
69
+ >>> image = load_image(
70
+ ... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"
71
+ ... )
72
+ >>> video = pipe(image, prompt, use_dynamic_cfg=True)
73
+ >>> export_to_video(video.frames[0], "output.mp4", fps=8)
74
+ ```
75
+ """
76
+
77
+
78
+ # Similar to diffusers.pipelines.hunyuandit.pipeline_hunyuandit.get_resize_crop_region_for_grid
79
+ def get_resize_crop_region_for_grid(src, tgt_width, tgt_height):
80
+
81
+ tw = tgt_width
82
+ th = tgt_height
83
+ h, w = src
84
+ r = h / w
85
+ if r > (th / tw): # NOTE: this aligns the aspect ratio to the target (similar to the Reference Resize method seen earlier)
86
+ resize_height = th
87
+ resize_width = int(round(th / h * w)) # NOTE: this is one of the two branches; there will be redundant positions left over here
88
+ else:
89
+ resize_width = tw
90
+ resize_height = int(round(tw / w * h))
91
+
92
+ crop_top = int(round((th - resize_height) / 2.0))
93
+ crop_left = int(round((tw - resize_width) / 2.0)) # NOTE: this takes the midpoint (center crop)
94
+
95
+
96
+ return (crop_top, crop_left), (crop_top + resize_height, crop_left + resize_width)
97
+
98
+
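+ # Worked example: for src = (30, 45) with tgt_width = 45 and tgt_height = 30 the aspect ratios match,
+ # so the function returns ((0, 0), (30, 45)), i.e. no crop offset. For a wider grid such as src = (30, 68)
+ # it returns ((5, 0), (25, 45)): the source aspect ratio is fitted to the base width and centered vertically.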
99
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
100
+ def retrieve_timesteps(
101
+ scheduler,
102
+ num_inference_steps: Optional[int] = None,
103
+ device: Optional[Union[str, torch.device]] = None,
104
+ timesteps: Optional[List[int]] = None,
105
+ sigmas: Optional[List[float]] = None,
106
+ **kwargs,
107
+ ):
108
+ r"""
109
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
110
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
111
+
112
+ Args:
113
+ scheduler (`SchedulerMixin`):
114
+ The scheduler to get timesteps from.
115
+ num_inference_steps (`int`):
116
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
117
+ must be `None`.
118
+ device (`str` or `torch.device`, *optional*):
119
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
120
+ timesteps (`List[int]`, *optional*):
121
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
122
+ `num_inference_steps` and `sigmas` must be `None`.
123
+ sigmas (`List[float]`, *optional*):
124
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
125
+ `num_inference_steps` and `timesteps` must be `None`.
126
+
127
+ Returns:
128
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
129
+ second element is the number of inference steps.
130
+ """
131
+ if timesteps is not None and sigmas is not None:
132
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
133
+ if timesteps is not None:
134
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
135
+ if not accepts_timesteps:
136
+ raise ValueError(
137
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
138
+ f" timestep schedules. Please check whether you are using the correct scheduler."
139
+ )
140
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
141
+ timesteps = scheduler.timesteps
142
+ num_inference_steps = len(timesteps)
143
+ elif sigmas is not None:
144
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
145
+ if not accept_sigmas:
146
+ raise ValueError(
147
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
148
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
149
+ )
150
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
151
+ timesteps = scheduler.timesteps
152
+ num_inference_steps = len(timesteps)
153
+ else:
154
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
155
+ timesteps = scheduler.timesteps
156
+ return timesteps, num_inference_steps
157
+
158
+
159
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
160
+ def retrieve_latents(
161
+ encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
162
+ ):
163
+ if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
164
+ return encoder_output.latent_dist.sample(generator)
165
+ elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
166
+ return encoder_output.latent_dist.mode()
167
+ elif hasattr(encoder_output, "latents"):
168
+ return encoder_output.latents
169
+ else:
170
+ raise AttributeError("Could not access latents of provided encoder_output")
171
+
172
+
173
+ class CogVideoXImageToVideoPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
174
+ r"""
175
+ Pipeline for image-to-video generation using CogVideoX.
176
+
177
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
178
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
179
+
180
+ Args:
181
+ vae ([`AutoencoderKL`]):
182
+ Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
183
+ text_encoder ([`T5EncoderModel`]):
184
+ Frozen text-encoder. CogVideoX uses
185
+ [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel); specifically the
186
+ [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant.
187
+ tokenizer (`T5Tokenizer`):
188
+ Tokenizer of class
189
+ [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
190
+ transformer ([`CogVideoXTransformer3DModel`]):
191
+ A text conditioned `CogVideoXTransformer3DModel` to denoise the encoded video latents.
192
+ scheduler ([`SchedulerMixin`]):
193
+ A scheduler to be used in combination with `transformer` to denoise the encoded video latents.
194
+ """
195
+
196
+ _optional_components = []
197
+ model_cpu_offload_seq = "text_encoder->transformer->vae"
198
+
199
+ _callback_tensor_inputs = [
200
+ "latents",
201
+ "prompt_embeds",
202
+ "negative_prompt_embeds",
203
+ ]
204
+
205
+ def __init__(
206
+ self,
207
+ tokenizer: T5Tokenizer,
208
+ text_encoder: T5EncoderModel,
209
+ vae: AutoencoderKLCogVideoX,
210
+ transformer: CogVideoXTransformer3DModel,
211
+ scheduler: Union[CogVideoXDDIMScheduler, CogVideoXDPMScheduler],
212
+ ):
213
+ super().__init__()
214
+
215
+ self.register_modules(
216
+ tokenizer=tokenizer,
217
+ text_encoder=text_encoder,
218
+ vae=vae,
219
+ transformer=transformer,
220
+ scheduler=scheduler,
221
+ )
222
+ self.vae_scale_factor_spatial = (
223
+ 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
224
+ )
225
+ self.vae_scale_factor_temporal = (
226
+ self.vae.config.temporal_compression_ratio if getattr(self, "vae", None) else 4
227
+ )
228
+ self.vae_scaling_factor_image = self.vae.config.scaling_factor if getattr(self, "vae", None) else 0.7
229
+
230
+ self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
231
+
232
+ # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline._get_t5_prompt_embeds
233
+ def _get_t5_prompt_embeds(
234
+ self,
235
+ prompt: Union[str, List[str]] = None,
236
+ num_videos_per_prompt: int = 1,
237
+ max_sequence_length: int = 226,
238
+ device: Optional[torch.device] = None,
239
+ dtype: Optional[torch.dtype] = None,
240
+ ):
241
+ device = device or self._execution_device
242
+ dtype = dtype or self.text_encoder.dtype
243
+
244
+ prompt = [prompt] if isinstance(prompt, str) else prompt
245
+ batch_size = len(prompt)
246
+
247
+ text_inputs = self.tokenizer(
248
+ prompt,
249
+ padding="max_length",
250
+ max_length=max_sequence_length,
251
+ truncation=True,
252
+ add_special_tokens=True,
253
+ return_tensors="pt",
254
+ )
255
+ text_input_ids = text_inputs.input_ids
256
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
257
+
258
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
259
+ removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_sequence_length - 1 : -1])
260
+ logger.warning(
261
+ "The following part of your input was truncated because `max_sequence_length` is set to "
262
+ f" {max_sequence_length} tokens: {removed_text}"
263
+ )
264
+
265
+ prompt_embeds = self.text_encoder(text_input_ids.to(device))[0]
266
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
267
+
268
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
269
+ _, seq_len, _ = prompt_embeds.shape
270
+ prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
271
+ prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
272
+
273
+ return prompt_embeds
274
+
275
+ # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.encode_prompt
276
+ def encode_prompt(
277
+ self,
278
+ prompt: Union[str, List[str]],
279
+ negative_prompt: Optional[Union[str, List[str]]] = None,
280
+ do_classifier_free_guidance: bool = True,
281
+ num_videos_per_prompt: int = 1,
282
+ prompt_embeds: Optional[torch.Tensor] = None,
283
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
284
+ max_sequence_length: int = 226,
285
+ device: Optional[torch.device] = None,
286
+ dtype: Optional[torch.dtype] = None,
287
+ ):
288
+ r"""
289
+ Encodes the prompt into text encoder hidden states.
290
+
291
+ Args:
292
+ prompt (`str` or `List[str]`, *optional*):
293
+ prompt to be encoded
294
+ negative_prompt (`str` or `List[str]`, *optional*):
295
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
296
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
297
+ less than `1`).
298
+ do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
299
+ Whether to use classifier free guidance or not.
300
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
301
+ Number of videos that should be generated per prompt.
302
+ prompt_embeds (`torch.Tensor`, *optional*):
303
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
304
+ provided, text embeddings will be generated from `prompt` input argument.
305
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
306
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
307
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
308
+ argument.
309
+ device: (`torch.device`, *optional*):
310
+ torch device
311
+ dtype: (`torch.dtype`, *optional*):
312
+ torch dtype
313
+ """
314
+ device = device or self._execution_device
315
+
316
+ prompt = [prompt] if isinstance(prompt, str) else prompt
317
+ if prompt is not None:
318
+ batch_size = len(prompt)
319
+ else:
320
+ batch_size = prompt_embeds.shape[0]
321
+
322
+ if prompt_embeds is None:
323
+ prompt_embeds = self._get_t5_prompt_embeds(
324
+ prompt=prompt,
325
+ num_videos_per_prompt=num_videos_per_prompt,
326
+ max_sequence_length=max_sequence_length,
327
+ device=device,
328
+ dtype=dtype,
329
+ )
330
+
331
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
332
+ negative_prompt = negative_prompt or ""
333
+ negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
334
+
335
+ if prompt is not None and type(prompt) is not type(negative_prompt):
336
+ raise TypeError(
337
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
338
+ f" {type(prompt)}."
339
+ )
340
+ elif batch_size != len(negative_prompt):
341
+ raise ValueError(
342
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
343
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
344
+ " the batch size of `prompt`."
345
+ )
346
+
347
+ negative_prompt_embeds = self._get_t5_prompt_embeds(
348
+ prompt=negative_prompt,
349
+ num_videos_per_prompt=num_videos_per_prompt,
350
+ max_sequence_length=max_sequence_length,
351
+ device=device,
352
+ dtype=dtype,
353
+ )
354
+
355
+ return prompt_embeds, negative_prompt_embeds
356
+
357
+ def prepare_latents(
358
+ self,
359
+ image: torch.Tensor,
360
+ batch_size: int = 1,
361
+ num_channels_latents: int = 16,
362
+ num_frames: int = 13,
363
+ height: int = 60,
364
+ width: int = 90,
365
+ dtype: Optional[torch.dtype] = None,
366
+ device: Optional[torch.device] = None,
367
+ generator: Optional[torch.Generator] = None,
368
+ latents: Optional[torch.Tensor] = None,
369
+ ):
370
+ if isinstance(generator, list) and len(generator) != batch_size:
371
+ raise ValueError(
372
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
373
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
374
+ )
375
+
376
+ num_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
377
+ shape = (
378
+ batch_size,
379
+ num_frames,
380
+ num_channels_latents,
381
+ height // self.vae_scale_factor_spatial,
382
+ width // self.vae_scale_factor_spatial,
383
+ )
384
+
385
+ # For CogVideoX 1.5, the latent should add 1 frame of padding (not used here)
386
+ if self.transformer.config.patch_size_t is not None:
387
+ shape = shape[:1] + (shape[1] + shape[1] % self.transformer.config.patch_size_t,) + shape[2:]
388
+
389
+ image = image.unsqueeze(2) # [B, C, F, H, W]
390
+
391
+ if isinstance(generator, list):
392
+ image_latents = [
393
+ retrieve_latents(self.vae.encode(image[i].unsqueeze(0)), generator[i]) for i in range(batch_size)
394
+ ]
395
+ else:
396
+ image_latents = [retrieve_latents(self.vae.encode(img.unsqueeze(0)), generator) for img in image]
397
+
398
+ image_latents = torch.cat(image_latents, dim=0).to(dtype).permute(0, 2, 1, 3, 4) # [B, F, C, H, W]
399
+
400
+ if not self.vae.config.invert_scale_latents:
401
+ image_latents = self.vae_scaling_factor_image * image_latents
402
+ else:
403
+ # This is awkward but required because the CogVideoX team forgot to multiply the
404
+ # scaling factor during training :)
405
+ image_latents = 1 / self.vae_scaling_factor_image * image_latents
406
+
407
+ padding_shape = (
408
+ batch_size,
409
+ num_frames - 1,
410
+ num_channels_latents,
411
+ height // self.vae_scale_factor_spatial,
412
+ width // self.vae_scale_factor_spatial,
413
+ )
414
+
415
+ latent_padding = torch.zeros(padding_shape, device=device, dtype=dtype)
416
+ image_latents = torch.cat([image_latents, latent_padding], dim=1)
417
+
418
+ # Select the first frame along the second dimension
419
+ if self.transformer.config.patch_size_t is not None:
420
+ first_frame = image_latents[:, : image_latents.size(1) % self.transformer.config.patch_size_t, ...]
421
+ image_latents = torch.cat([first_frame, image_latents], dim=1)
422
+
423
+ if latents is None:
424
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
425
+ else:
426
+ latents = latents.to(device)
427
+
428
+ # scale the initial noise by the standard deviation required by the scheduler
429
+ latents = latents * self.scheduler.init_noise_sigma
430
+ return latents, image_latents
431
+
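+ # Frame-count example for prepare_latents above: with the usual temporal compression ratio of 4,
+ # num_frames = 49 pixel frames map to (49 - 1) // 4 + 1 = 13 latent frames; the single encoded image
+ # latent is padded with 12 zero latent frames so it lines up with the 13-frame noise latents.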
432
+ # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.decode_latents
433
+ def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
434
+ latents = latents.permute(0, 2, 1, 3, 4) # [batch_size, num_channels, num_frames, height, width]
435
+ latents = 1 / self.vae_scaling_factor_image * latents
436
+
437
+ frames = self.vae.decode(latents).sample
438
+ return frames
439
+
440
+ # Copied from diffusers.pipelines.animatediff.pipeline_animatediff_video2video.AnimateDiffVideoToVideoPipeline.get_timesteps
441
+ def get_timesteps(self, num_inference_steps, timesteps, strength, device):
442
+ # get the original timestep using init_timestep
443
+ init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
444
+
445
+ t_start = max(num_inference_steps - init_timestep, 0)
446
+ timesteps = timesteps[t_start * self.scheduler.order :]
447
+
448
+ return timesteps, num_inference_steps - t_start
449
+
450
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
451
+ def prepare_extra_step_kwargs(self, generator, eta):
452
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
453
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
454
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
455
+ # and should be between [0, 1]
456
+
457
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
458
+ extra_step_kwargs = {}
459
+ if accepts_eta:
460
+ extra_step_kwargs["eta"] = eta
461
+
462
+ # check if the scheduler accepts generator
463
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
464
+ if accepts_generator:
465
+ extra_step_kwargs["generator"] = generator
466
+ return extra_step_kwargs
467
+
468
+ def check_inputs(
469
+ self,
470
+ image,
471
+ prompt,
472
+ height,
473
+ width,
474
+ negative_prompt,
475
+ callback_on_step_end_tensor_inputs,
476
+ latents=None,
477
+ prompt_embeds=None,
478
+ negative_prompt_embeds=None,
479
+ ):
480
+ if (
481
+ not isinstance(image, torch.Tensor)
482
+ and not isinstance(image, PIL.Image.Image)
483
+ and not isinstance(image, list)
484
+ ):
485
+ raise ValueError(
486
+ "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
487
+ f" {type(image)}"
488
+ )
489
+
490
+ if height % 8 != 0 or width % 8 != 0:
491
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
492
+
493
+ if callback_on_step_end_tensor_inputs is not None and not all(
494
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
495
+ ):
496
+ raise ValueError(
497
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
498
+ )
499
+ if prompt is not None and prompt_embeds is not None:
500
+ raise ValueError(
501
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
502
+ " only forward one of the two."
503
+ )
504
+ elif prompt is None and prompt_embeds is None:
505
+ raise ValueError(
506
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
507
+ )
508
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
509
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
510
+
511
+ if prompt is not None and negative_prompt_embeds is not None:
512
+ raise ValueError(
513
+ f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
514
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
515
+ )
516
+
517
+ if negative_prompt is not None and negative_prompt_embeds is not None:
518
+ raise ValueError(
519
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
520
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
521
+ )
522
+
523
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
524
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
525
+ raise ValueError(
526
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
527
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
528
+ f" {negative_prompt_embeds.shape}."
529
+ )
530
+
531
+ # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.fuse_qkv_projections
532
+ def fuse_qkv_projections(self) -> None:
533
+ r"""Enables fused QKV projections."""
534
+ self.fusing_transformer = True
535
+ self.transformer.fuse_qkv_projections()
536
+
537
+ # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.unfuse_qkv_projections
538
+ def unfuse_qkv_projections(self) -> None:
539
+ r"""Disable QKV projection fusion if enabled."""
540
+ if not self.fusing_transformer:
541
+ logger.warning("The Transformer was not initially fused for QKV projections. Doing nothing.")
542
+ else:
543
+ self.transformer.unfuse_qkv_projections()
544
+ self.fusing_transformer = False
545
+
546
+ # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline._prepare_rotary_positional_embeddings
547
+ def _prepare_rotary_positional_embeddings(
548
+ self,
549
+ height: int,
550
+ width: int,
551
+ num_frames: int,
552
+ device: torch.device,
553
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
554
+
555
+
556
+ grid_height = height // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
557
+ grid_width = width // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
558
+
559
+ p = self.transformer.config.patch_size
560
+ p_t = self.transformer.config.patch_size_t
561
+
562
+ base_size_width = self.transformer.config.sample_width // p
563
+ base_size_height = self.transformer.config.sample_height // p
564
+
565
+ # RoPE extrapolation factor in NTK
566
+ # token_factor_ratio = (grid_height * grid_width) / (base_size_width * base_size_height)
567
+ # if token_factor_ratio > 1.0:
568
+ # ntk_factor = token_factor_ratio
569
+ # else:
570
+ # ntk_factor = 1.0
571
+
572
+
573
+ if p_t is None: # HACK: this is the branch taken here
574
+ # CogVideoX 1.0
575
+ grid_crops_coords = get_resize_crop_region_for_grid(
576
+ (grid_height, grid_width), base_size_width, base_size_height
577
+ )
578
+ freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
579
+ embed_dim=self.transformer.config.attention_head_dim,
580
+ crops_coords=grid_crops_coords, # ((0, 0), (30, 45))
581
+ grid_size=(grid_height, grid_width), # (30, 45)
582
+ # ntk_factor = ntk_factor, # For the extrapolation
583
+ temporal_size=num_frames,
584
+ device=device,
585
+ )
586
+ else:
587
+ # CogVideoX 1.5
588
+ base_num_frames = (num_frames + p_t - 1) // p_t
589
+
590
+ freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
591
+ embed_dim=self.transformer.config.attention_head_dim,
592
+ crops_coords=None,
593
+ grid_size=(grid_height, grid_width),
594
+ temporal_size=base_num_frames,
595
+ grid_type="slice",
596
+ max_size=(base_size_height, base_size_width),
597
+ device=device,
598
+ )
599
+
600
+ return freqs_cos, freqs_sin
601
+
602
+ @property
603
+ def guidance_scale(self):
604
+ return self._guidance_scale
605
+
606
+ @property
607
+ def num_timesteps(self):
608
+ return self._num_timesteps
609
+
610
+ @property
611
+ def attention_kwargs(self):
612
+ return self._attention_kwargs
613
+
614
+ @property
615
+ def interrupt(self):
616
+ return self._interrupt
617
+
618
+ @torch.no_grad()
619
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
620
+ def __call__(
621
+ self,
622
+ image: PipelineImageInput,
623
+ traj_tensor = None,
624
+ prompt: Optional[Union[str, List[str]]] = None,
625
+ negative_prompt: Optional[Union[str, List[str]]] = None,
626
+ height: Optional[int] = None,
627
+ width: Optional[int] = None,
628
+ num_frames: int = 49,
629
+ num_inference_steps: int = 50,
630
+ timesteps: Optional[List[int]] = None,
631
+ guidance_scale: float = 6,
632
+ use_dynamic_cfg: bool = False,
633
+ num_videos_per_prompt: int = 1,
634
+ eta: float = 0.0,
635
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
636
+ latents: Optional[torch.FloatTensor] = None,
637
+ prompt_embeds: Optional[torch.FloatTensor] = None,
638
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
639
+ output_type: str = "pil",
640
+ return_dict: bool = True,
641
+ attention_kwargs: Optional[Dict[str, Any]] = None,
642
+ callback_on_step_end: Optional[
643
+ Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
644
+ ] = None,
645
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
646
+ max_sequence_length: int = 226,
647
+ ) -> Union[CogVideoXPipelineOutput, Tuple]:
648
+ """
649
+ Function invoked when calling the pipeline for generation.
650
+
651
+ Args:
652
+ image (`PipelineImageInput`):
653
+ The input image to condition the generation on. Must be an image, a list of images or a `torch.Tensor`.
654
+ prompt (`str` or `List[str]`, *optional*):
655
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
656
+ instead.
657
+ negative_prompt (`str` or `List[str]`, *optional*):
658
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
659
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
660
+ less than `1`).
661
+ height (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
662
+ The height in pixels of the generated image. This is set to 480 by default for the best results.
663
+ width (`int`, *optional*, defaults to self.transformer.config.sample_width * self.vae_scale_factor_spatial):
664
+ The width in pixels of the generated image. This is set to 720 by default for the best results.
665
+ num_frames (`int`, defaults to `49`):
666
+ Number of frames to generate. Must be divisible by self.vae_scale_factor_temporal. Generated video will
667
+ contain 1 extra frame because CogVideoX is conditioned with (num_seconds * fps + 1) frames where
668
+ num_seconds is 6 and fps is 8. However, since videos can be saved at any fps, the only condition that
669
+ needs to be satisfied is that of divisibility mentioned above.
670
+ num_inference_steps (`int`, *optional*, defaults to 50):
671
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
672
+ expense of slower inference.
673
+ timesteps (`List[int]`, *optional*):
674
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
675
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
676
+ passed will be used. Must be in descending order.
677
+ guidance_scale (`float`, *optional*, defaults to 6):
678
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
679
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
680
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
681
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
682
+ usually at the expense of lower image quality.
683
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
684
+ The number of videos to generate per prompt.
685
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
686
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
687
+ to make generation deterministic.
688
+ latents (`torch.FloatTensor`, *optional*):
689
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
690
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
691
+ tensor will be generated by sampling using the supplied random `generator`.
692
+ prompt_embeds (`torch.FloatTensor`, *optional*):
693
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
694
+ provided, text embeddings will be generated from `prompt` input argument.
695
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
696
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
697
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
698
+ argument.
699
+ output_type (`str`, *optional*, defaults to `"pil"`):
700
+ The output format of the generate image. Choose between
701
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
702
+ return_dict (`bool`, *optional*, defaults to `True`):
703
+ Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
704
+ of a plain tuple.
705
+ attention_kwargs (`dict`, *optional*):
706
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
707
+ `self.processor` in
708
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
709
+ callback_on_step_end (`Callable`, *optional*):
710
+ A function that is called at the end of each denoising step during inference. The function is called
711
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
712
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
713
+ `callback_on_step_end_tensor_inputs`.
714
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
715
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
716
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
717
+ `._callback_tensor_inputs` attribute of your pipeline class.
718
+ max_sequence_length (`int`, defaults to `226`):
719
+ Maximum sequence length in encoded prompt. Must be consistent with
720
+ `self.transformer.config.max_text_seq_length` otherwise may lead to poor results.
721
+
722
+ Examples:
723
+
724
+ Returns:
725
+ [`~pipelines.cogvideo.pipeline_output.CogVideoXPipelineOutput`] or `tuple`:
726
+ [`~pipelines.cogvideo.pipeline_output.CogVideoXPipelineOutput`] if `return_dict` is True, otherwise a
727
+ `tuple`. When returning a tuple, the first element is a list with the generated images.
728
+ """
729
+
730
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
731
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
732
+
733
+ height = height or self.transformer.config.sample_height * self.vae_scale_factor_spatial
734
+ width = width or self.transformer.config.sample_width * self.vae_scale_factor_spatial
735
+ num_frames = num_frames or self.transformer.config.sample_frames
736
+
737
+ num_videos_per_prompt = 1
738
+
739
+ # 1. Check inputs. Raise error if not correct
740
+ self.check_inputs(
741
+ image=image,
742
+ prompt=prompt,
743
+ height=height,
744
+ width=width,
745
+ negative_prompt=negative_prompt,
746
+ callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
747
+ latents=latents,
748
+ prompt_embeds=prompt_embeds,
749
+ negative_prompt_embeds=negative_prompt_embeds,
750
+ )
751
+ self._guidance_scale = guidance_scale
752
+ self._attention_kwargs = attention_kwargs
753
+ self._interrupt = False
754
+
755
+ # 2. Default call parameters
756
+ if prompt is not None and isinstance(prompt, str):
757
+ batch_size = 1
758
+ elif prompt is not None and isinstance(prompt, list):
759
+ batch_size = len(prompt)
760
+ else:
761
+ batch_size = prompt_embeds.shape[0]
762
+
763
+ device = self._execution_device
764
+
765
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
766
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
767
+ # corresponds to doing no classifier free guidance.
768
+ do_classifier_free_guidance = guidance_scale > 1.0
769
+
770
+ # 3. Encode input prompt
771
+ prompt_embeds, negative_prompt_embeds = self.encode_prompt(
772
+ prompt=prompt,
773
+ negative_prompt=negative_prompt,
774
+ do_classifier_free_guidance=do_classifier_free_guidance,
775
+ num_videos_per_prompt=num_videos_per_prompt,
776
+ prompt_embeds=prompt_embeds,
777
+ negative_prompt_embeds=negative_prompt_embeds,
778
+ max_sequence_length=max_sequence_length,
779
+ device=device,
780
+ )
781
+ if do_classifier_free_guidance:
782
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
783
+
784
+ # 4. Prepare timesteps
785
+ timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
786
+ self._num_timesteps = len(timesteps)
787
+
788
+ # 5. Prepare latents
789
+ latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
790
+
791
+ # For CogVideoX 1.5, the latent frames should be padded to make it divisible by patch_size_t
792
+ patch_size_t = self.transformer.config.patch_size_t
793
+ additional_frames = 0
794
+ if patch_size_t is not None and latent_frames % patch_size_t != 0:
795
+ additional_frames = patch_size_t - latent_frames % patch_size_t
796
+ num_frames += additional_frames * self.vae_scale_factor_temporal
797
+
798
+ image = self.video_processor.preprocess(image, height=height, width=width).to(
799
+ device, dtype=prompt_embeds.dtype
800
+ )
801
+
802
+ latent_channels = 16 # self.transformer.config.in_channels // 2
803
+ latents, image_latents = self.prepare_latents(
804
+ image,
805
+ batch_size * num_videos_per_prompt,
806
+ latent_channels,
807
+ num_frames,
808
+ height,
809
+ width,
810
+ prompt_embeds.dtype,
811
+ device,
812
+ generator,
813
+ latents,
814
+ )
815
+
816
+
817
+ # 5.5. Traj Preprocess
818
+ traj_tensor = traj_tensor.to(device, dtype = self.vae.dtype)[None] #.unsqueeze(0)
819
+ traj_tensor = traj_tensor.permute(0, 2, 1, 3, 4)
820
+ traj_latents = self.vae.encode(traj_tensor).latent_dist
821
+
822
+ # Scale, Permute, and other conversion
823
+ traj_latents = traj_latents.sample() * self.vae.config.scaling_factor
824
+ traj_latents = traj_latents.permute(0, 2, 1, 3, 4)
825
+ traj_latents = traj_latents.to(memory_format = torch.contiguous_format).float().to(dtype = prompt_embeds.dtype) # [B, F, C, H, W]
826
+
827
+
828
+
829
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
830
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
831
+
832
+ # 7. Create rotary embeds if required
833
+ image_rotary_emb = (
834
+ self._prepare_rotary_positional_embeddings(height, width, latents.size(1), device)
835
+ if self.transformer.config.use_rotary_positional_embeddings
836
+ else None
837
+ )
838
+
839
+ # 8. Create ofs embeds if required
840
+ ofs_emb = None if self.transformer.config.ofs_embed_dim is None else latents.new_full((1,), fill_value=2.0)
841
+
842
+ # 8. Denoising loop
843
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
844
+
845
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
846
+ # for DPM-solver++
847
+ old_pred_original_sample = None
848
+ for i, t in enumerate(timesteps):
849
+ if self.interrupt:
850
+ continue
851
+
852
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
853
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
854
+
855
+ latent_traj = torch.cat([traj_latents] * 2) if do_classifier_free_guidance else traj_latents
856
+
857
+ latent_image_input = torch.cat([image_latents] * 2) if do_classifier_free_guidance else image_latents
858
+ latent_model_input = torch.cat([latent_model_input, latent_image_input, latent_traj], dim=2) # The third (channel) dim grows from 16 to 48
859
+
860
+
861
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
862
+ timestep = t.expand(latent_model_input.shape[0])
863
+
864
+ # predict noise model_output
865
+ noise_pred = self.transformer(
866
+ hidden_states=latent_model_input,
867
+ encoder_hidden_states=prompt_embeds,
868
+ timestep=timestep,
869
+ ofs=ofs_emb,
870
+ image_rotary_emb=image_rotary_emb,
871
+ attention_kwargs=attention_kwargs,
872
+ return_dict=False,
873
+ )[0]
874
+ noise_pred = noise_pred.float()
875
+
876
+ # perform guidance
877
+ if use_dynamic_cfg:
878
+ self._guidance_scale = 1 + guidance_scale * (
879
+ (1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2
880
+ )
881
+ if do_classifier_free_guidance:
882
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
883
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
884
+
885
+ # compute the previous noisy sample x_t -> x_t-1
886
+ if not isinstance(self.scheduler, CogVideoXDPMScheduler):
887
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
888
+ else:
889
+ latents, old_pred_original_sample = self.scheduler.step(
890
+ noise_pred,
891
+ old_pred_original_sample,
892
+ t,
893
+ timesteps[i - 1] if i > 0 else None,
894
+ latents,
895
+ **extra_step_kwargs,
896
+ return_dict=False,
897
+ )
898
+ latents = latents.to(prompt_embeds.dtype)
899
+
900
+ # call the callback, if provided
901
+ if callback_on_step_end is not None:
902
+ callback_kwargs = {}
903
+ for k in callback_on_step_end_tensor_inputs:
904
+ callback_kwargs[k] = locals()[k]
905
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
906
+
907
+ latents = callback_outputs.pop("latents", latents)
908
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
909
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
910
+
911
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
912
+ progress_bar.update()
913
+
914
+ if XLA_AVAILABLE:
915
+ xm.mark_step()
916
+
917
+ if not output_type == "latent":
918
+ # Discard any padding frames that were added for CogVideoX 1.5
919
+ latents = latents[:, additional_frames:]
920
+ video = self.decode_latents(latents)
921
+ video = self.video_processor.postprocess_video(video=video, output_type=output_type)
922
+ else:
923
+ video = latents
924
+
925
+ # Offload all models
926
+ self.maybe_free_model_hooks()
927
+
928
+ if not return_dict:
929
+ return (video,)
930
+
931
+ return CogVideoXPipelineOutput(frames=video)
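As a rough orientation, a hedged usage sketch of this customized pipeline follows. The checkpoint paths, the prompt, and the zero-filled `traj_tensor` are placeholders: the base CogVideoX-5b-I2V transformer does not accept the extra trajectory latent channels, so the fine-tuned transformer from this repo has to be loaded in its place, and the real trajectory frames come from the data/preprocessing pipeline as a `[F, C, H, W]` tensor in the VAE's input range (roughly [-1, 1]).

```py
import torch
from diffusers.utils import export_to_video, load_image

from pipelines.pipeline_cogvideox_i2v_motion import CogVideoXImageToVideoPipeline
from architecture.cogvideox_transformer_3d import CogVideoXTransformer3DModel

# Placeholder paths: swap in the repo's fine-tuned transformer checkpoint and your own first frame.
transformer = CogVideoXTransformer3DModel.from_pretrained("checkpoints/transformer", torch_dtype=torch.bfloat16)
pipe = CogVideoXImageToVideoPipeline.from_pretrained(
    "THUDM/CogVideoX-5b-I2V", transformer=transformer, torch_dtype=torch.bfloat16
)
pipe.to("cuda")

image = load_image("first_frame.png")
traj_tensor = torch.zeros(49, 3, 480, 720)  # placeholder trajectory video (no motion hint)

video = pipe(image=image, traj_tensor=traj_tensor, prompt="an object moving across the scene",
             num_frames=49, use_dynamic_cfg=True)
export_to_video(video.frames[0], "output.mp4", fps=8)
```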
pipelines/pipeline_cogvideox_i2v_motion_FrameINO.py ADDED
@@ -0,0 +1,960 @@
1
+ # Copyright 2024 The CogVideoX team, Tsinghua University & ZhipuAI and The HuggingFace Team.
2
+ # All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import inspect
17
+ import math
18
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
19
+
20
+ import PIL
21
+ import torch
22
+ from transformers import T5EncoderModel, T5Tokenizer
23
+
24
+ from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
25
+ from diffusers.image_processor import PipelineImageInput
26
+ from diffusers.loaders import CogVideoXLoraLoaderMixin
27
+ from diffusers.models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
28
+ from diffusers.models.embeddings import get_3d_rotary_pos_embed
29
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
30
+ from diffusers.schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
31
+ from diffusers.utils import (
32
+ is_torch_xla_available,
33
+ logging,
34
+ replace_example_docstring,
35
+ )
36
+ from diffusers.utils.torch_utils import randn_tensor
37
+ from diffusers.video_processor import VideoProcessor
38
+ from diffusers.pipelines.cogvideo.pipeline_output import CogVideoXPipelineOutput
39
+
40
+
41
+ if is_torch_xla_available():
42
+ import torch_xla.core.xla_model as xm
43
+
44
+ XLA_AVAILABLE = True
45
+ else:
46
+ XLA_AVAILABLE = False
47
+
48
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
49
+
50
+
51
+ EXAMPLE_DOC_STRING = """
52
+ Examples:
53
+ ```py
54
+ >>> import torch
55
+ >>> from diffusers import CogVideoXImageToVideoPipeline
56
+ >>> from diffusers.utils import export_to_video, load_image
57
+
58
+ >>> pipe = CogVideoXImageToVideoPipeline.from_pretrained("THUDM/CogVideoX-5b-I2V", torch_dtype=torch.bfloat16)
59
+ >>> pipe.to("cuda")
60
+
61
+ >>> prompt = "An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot."
62
+ >>> image = load_image(
63
+ ... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"
64
+ ... )
65
+ >>> video = pipe(image, prompt, use_dynamic_cfg=True)
66
+ >>> export_to_video(video.frames[0], "output.mp4", fps=8)
67
+ ```
68
+ """
69
+
70
+
71
+ # Similar to diffusers.pipelines.hunyuandit.pipeline_hunyuandit.get_resize_crop_region_for_grid
72
+ def get_resize_crop_region_for_grid(src, tgt_width, tgt_height):
73
+
74
+ tw = tgt_width
75
+ th = tgt_height
76
+ h, w = src
77
+ r = h / w
78
+ if r > (th / tw):
79
+ resize_height = th
80
+ resize_width = int(round(th / h * w))
81
+ else:
82
+ resize_width = tw
83
+ resize_height = int(round(tw / w * h))
84
+
85
+ crop_top = int(round((th - resize_height) / 2.0))
86
+ crop_left = int(round((tw - resize_width) / 2.0))
87
+
88
+
89
+ return (crop_top, crop_left), (crop_top + resize_height, crop_left + resize_width)
90
+
91
+
92
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
93
+ def retrieve_timesteps(
94
+ scheduler,
95
+ num_inference_steps: Optional[int] = None,
96
+ device: Optional[Union[str, torch.device]] = None,
97
+ timesteps: Optional[List[int]] = None,
98
+ sigmas: Optional[List[float]] = None,
99
+ **kwargs,
100
+ ):
101
+ r"""
102
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
103
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
104
+
105
+ Args:
106
+ scheduler (`SchedulerMixin`):
107
+ The scheduler to get timesteps from.
108
+ num_inference_steps (`int`):
109
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
110
+ must be `None`.
111
+ device (`str` or `torch.device`, *optional*):
112
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
113
+ timesteps (`List[int]`, *optional*):
114
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
115
+ `num_inference_steps` and `sigmas` must be `None`.
116
+ sigmas (`List[float]`, *optional*):
117
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
118
+ `num_inference_steps` and `timesteps` must be `None`.
119
+
120
+ Returns:
121
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
122
+ second element is the number of inference steps.
123
+ """
124
+ if timesteps is not None and sigmas is not None:
125
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
126
+ if timesteps is not None:
127
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
128
+ if not accepts_timesteps:
129
+ raise ValueError(
130
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
131
+ f" timestep schedules. Please check whether you are using the correct scheduler."
132
+ )
133
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
134
+ timesteps = scheduler.timesteps
135
+ num_inference_steps = len(timesteps)
136
+ elif sigmas is not None:
137
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
138
+ if not accept_sigmas:
139
+ raise ValueError(
140
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
141
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
142
+ )
143
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
144
+ timesteps = scheduler.timesteps
145
+ num_inference_steps = len(timesteps)
146
+ else:
147
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
148
+ timesteps = scheduler.timesteps
149
+ return timesteps, num_inference_steps
150
+
151
+
152
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
153
+ def retrieve_latents(
154
+ encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
155
+ ):
156
+ if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
157
+ return encoder_output.latent_dist.sample(generator)
158
+ elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
159
+ return encoder_output.latent_dist.mode()
160
+ elif hasattr(encoder_output, "latents"):
161
+ return encoder_output.latents
162
+ else:
163
+ raise AttributeError("Could not access latents of provided encoder_output")
164
+
165
+
166
+ class CogVideoXImageToVideoPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
167
+ r"""
168
+ Pipeline for image-to-video generation using CogVideoX.
169
+
170
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
171
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
172
+
173
+ Args:
174
+ vae ([`AutoencoderKL`]):
175
+ Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
176
+ text_encoder ([`T5EncoderModel`]):
177
+ Frozen text-encoder. CogVideoX uses
178
+ [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel); specifically the
179
+ [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant.
180
+ tokenizer (`T5Tokenizer`):
181
+ Tokenizer of class
182
+ [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
183
+ transformer ([`CogVideoXTransformer3DModel`]):
184
+ A text conditioned `CogVideoXTransformer3DModel` to denoise the encoded video latents.
185
+ scheduler ([`SchedulerMixin`]):
186
+ A scheduler to be used in combination with `transformer` to denoise the encoded video latents.
187
+ """
188
+
189
+ _optional_components = []
190
+ model_cpu_offload_seq = "text_encoder->transformer->vae"
191
+
192
+ _callback_tensor_inputs = [
193
+ "latents",
194
+ "prompt_embeds",
195
+ "negative_prompt_embeds",
196
+ ]
197
+
198
+ def __init__(
199
+ self,
200
+ tokenizer: T5Tokenizer,
201
+ text_encoder: T5EncoderModel,
202
+ vae: AutoencoderKLCogVideoX,
203
+ transformer: CogVideoXTransformer3DModel,
204
+ scheduler: Union[CogVideoXDDIMScheduler, CogVideoXDPMScheduler],
205
+ ):
206
+ super().__init__()
207
+
208
+ self.register_modules(
209
+ tokenizer=tokenizer,
210
+ text_encoder=text_encoder,
211
+ vae=vae,
212
+ transformer=transformer,
213
+ scheduler=scheduler,
214
+ )
215
+ self.vae_scale_factor_spatial = (
216
+ 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
217
+ )
218
+ self.vae_scale_factor_temporal = (
219
+ self.vae.config.temporal_compression_ratio if getattr(self, "vae", None) else 4
220
+ )
221
+ self.vae_scaling_factor_image = self.vae.config.scaling_factor if getattr(self, "vae", None) else 0.7
222
+
223
+ self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
224
+
225
+ # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline._get_t5_prompt_embeds
226
+ def _get_t5_prompt_embeds(
227
+ self,
228
+ prompt: Union[str, List[str]] = None,
229
+ num_videos_per_prompt: int = 1,
230
+ max_sequence_length: int = 226,
231
+ device: Optional[torch.device] = None,
232
+ dtype: Optional[torch.dtype] = None,
233
+ ):
234
+ device = device or self._execution_device
235
+ dtype = dtype or self.text_encoder.dtype
236
+
237
+ prompt = [prompt] if isinstance(prompt, str) else prompt
238
+ batch_size = len(prompt)
239
+
240
+ text_inputs = self.tokenizer(
241
+ prompt,
242
+ padding="max_length",
243
+ max_length=max_sequence_length,
244
+ truncation=True,
245
+ add_special_tokens=True,
246
+ return_tensors="pt",
247
+ )
248
+ text_input_ids = text_inputs.input_ids
249
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
250
+
251
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
252
+ removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_sequence_length - 1 : -1])
253
+ logger.warning(
254
+ "The following part of your input was truncated because `max_sequence_length` is set to "
255
+ f" {max_sequence_length} tokens: {removed_text}"
256
+ )
257
+
258
+ prompt_embeds = self.text_encoder(text_input_ids.to(device))[0]
259
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
260
+
261
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
262
+ _, seq_len, _ = prompt_embeds.shape
263
+ prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
264
+ prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
265
+
266
+ return prompt_embeds
267
+
268
+ # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.encode_prompt
269
+ def encode_prompt(
270
+ self,
271
+ prompt: Union[str, List[str]],
272
+ negative_prompt: Optional[Union[str, List[str]]] = None,
273
+ do_classifier_free_guidance: bool = True,
274
+ num_videos_per_prompt: int = 1,
275
+ prompt_embeds: Optional[torch.Tensor] = None,
276
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
277
+ max_sequence_length: int = 226,
278
+ device: Optional[torch.device] = None,
279
+ dtype: Optional[torch.dtype] = None,
280
+ ):
281
+ r"""
282
+ Encodes the prompt into text encoder hidden states.
283
+
284
+ Args:
285
+ prompt (`str` or `List[str]`, *optional*):
286
+ prompt to be encoded
287
+ negative_prompt (`str` or `List[str]`, *optional*):
288
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
289
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
290
+ less than `1`).
291
+ do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
292
+ Whether to use classifier free guidance or not.
293
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
294
+ Number of videos that should be generated per prompt.
295
+ prompt_embeds (`torch.Tensor`, *optional*):
296
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
297
+ provided, text embeddings will be generated from `prompt` input argument.
298
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
299
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
300
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
301
+ argument.
302
+ device: (`torch.device`, *optional*):
303
+ torch device
304
+ dtype: (`torch.dtype`, *optional*):
305
+ torch dtype
306
+ """
307
+ device = device or self._execution_device
308
+
309
+ prompt = [prompt] if isinstance(prompt, str) else prompt
310
+ if prompt is not None:
311
+ batch_size = len(prompt)
312
+ else:
313
+ batch_size = prompt_embeds.shape[0]
314
+
315
+ if prompt_embeds is None:
316
+ prompt_embeds = self._get_t5_prompt_embeds(
317
+ prompt=prompt,
318
+ num_videos_per_prompt=num_videos_per_prompt,
319
+ max_sequence_length=max_sequence_length,
320
+ device=device,
321
+ dtype=dtype,
322
+ )
323
+
324
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
325
+ negative_prompt = negative_prompt or ""
326
+ negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
327
+
328
+ if prompt is not None and type(prompt) is not type(negative_prompt):
329
+ raise TypeError(
330
+ f"`negative_prompt` should be the same type as `prompt`, but got {type(negative_prompt)} !="
331
+ f" {type(prompt)}."
332
+ )
333
+ elif batch_size != len(negative_prompt):
334
+ raise ValueError(
335
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
336
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
337
+ " the batch size of `prompt`."
338
+ )
339
+
340
+ negative_prompt_embeds = self._get_t5_prompt_embeds(
341
+ prompt=negative_prompt,
342
+ num_videos_per_prompt=num_videos_per_prompt,
343
+ max_sequence_length=max_sequence_length,
344
+ device=device,
345
+ dtype=dtype,
346
+ )
347
+
348
+ return prompt_embeds, negative_prompt_embeds
349
+
350
+ def prepare_latents(
351
+ self,
352
+ image: torch.Tensor,
353
+ batch_size: int = 1,
354
+ num_channels_latents: int = 16,
355
+ num_frames: int = 13,
356
+ height: int = 60,
357
+ width: int = 90,
358
+ dtype: Optional[torch.dtype] = None,
359
+ device: Optional[torch.device] = None,
360
+ generator: Optional[torch.Generator] = None,
361
+ latents: Optional[torch.Tensor] = None,
362
+ ):
363
+ if isinstance(generator, list) and len(generator) != batch_size:
364
+ raise ValueError(
365
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
366
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
367
+ )
368
+
369
+ num_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
370
+ shape = (
371
+ batch_size,
372
+ num_frames,
373
+ num_channels_latents,
374
+ height // self.vae_scale_factor_spatial,
375
+ width // self.vae_scale_factor_spatial,
376
+ )
377
+
378
+ # For CogVideoX 1.5, the latent frame count should be padded by 1 (not used here)
379
+ if self.transformer.config.patch_size_t is not None:
380
+ shape = shape[:1] + (shape[1] + shape[1] % self.transformer.config.patch_size_t,) + shape[2:]
381
+
382
+ image = image.unsqueeze(2) # [B, C, F, H, W]
383
+
384
+ if isinstance(generator, list):
385
+ image_latents = [
386
+ retrieve_latents(self.vae.encode(image[i].unsqueeze(0)), generator[i]) for i in range(batch_size)
387
+ ]
388
+ else:
389
+ image_latents = [retrieve_latents(self.vae.encode(img.unsqueeze(0)), generator) for img in image]
390
+
391
+ image_latents = torch.cat(image_latents, dim=0).to(dtype).permute(0, 2, 1, 3, 4) # [B, F, C, H, W]
392
+
393
+ if not self.vae.config.invert_scale_latents:
394
+ image_latents = self.vae_scaling_factor_image * image_latents
395
+ else:
396
+ # This is awkward but required because the CogVideoX team forgot to multiply the
397
+ # scaling factor during training :)
398
+ image_latents = 1 / self.vae_scaling_factor_image * image_latents
399
+
400
+ padding_shape = (
401
+ batch_size,
402
+ num_frames - 1,
403
+ num_channels_latents,
404
+ height // self.vae_scale_factor_spatial,
405
+ width // self.vae_scale_factor_spatial,
406
+ )
407
+
408
+ latent_padding = torch.zeros(padding_shape, device=device, dtype=dtype)
409
+ image_latents = torch.cat([image_latents, latent_padding], dim=1)
410
+
411
+ # Select the first frame along the second dimension
412
+ if self.transformer.config.patch_size_t is not None:
413
+ first_frame = image_latents[:, : image_latents.size(1) % self.transformer.config.patch_size_t, ...]
414
+ image_latents = torch.cat([first_frame, image_latents], dim=1)
415
+
416
+ if latents is None:
417
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
418
+ else:
419
+ latents = latents.to(device)
420
+
421
+ # scale the initial noise by the standard deviation required by the scheduler
422
+ latents = latents * self.scheduler.init_noise_sigma
423
+ return latents, image_latents
424
+
425
+ # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.decode_latents
426
+ def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
427
+ latents = latents.permute(0, 2, 1, 3, 4) # [batch_size, num_channels, num_frames, height, width]
428
+ latents = 1 / self.vae_scaling_factor_image * latents
429
+
430
+ frames = self.vae.decode(latents).sample
431
+ return frames
432
+
433
+ # Copied from diffusers.pipelines.animatediff.pipeline_animatediff_video2video.AnimateDiffVideoToVideoPipeline.get_timesteps
434
+ def get_timesteps(self, num_inference_steps, timesteps, strength, device):
435
+ # get the original timestep using init_timestep
436
+ init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
437
+
438
+ t_start = max(num_inference_steps - init_timestep, 0)
439
+ timesteps = timesteps[t_start * self.scheduler.order :]
440
+
441
+ return timesteps, num_inference_steps - t_start
442
+
443
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
444
+ def prepare_extra_step_kwargs(self, generator, eta):
445
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
446
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
447
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
448
+ # and should be between [0, 1]
449
+
450
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
451
+ extra_step_kwargs = {}
452
+ if accepts_eta:
453
+ extra_step_kwargs["eta"] = eta
454
+
455
+ # check if the scheduler accepts generator
456
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
457
+ if accepts_generator:
458
+ extra_step_kwargs["generator"] = generator
459
+ return extra_step_kwargs
460
+
461
+ def check_inputs(
462
+ self,
463
+ image,
464
+ prompt,
465
+ height,
466
+ width,
467
+ negative_prompt,
468
+ callback_on_step_end_tensor_inputs,
469
+ latents=None,
470
+ prompt_embeds=None,
471
+ negative_prompt_embeds=None,
472
+ ):
473
+ if (
474
+ not isinstance(image, torch.Tensor)
475
+ and not isinstance(image, PIL.Image.Image)
476
+ and not isinstance(image, list)
477
+ ):
478
+ raise ValueError(
479
+ "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
480
+ f" {type(image)}"
481
+ )
482
+
483
+ if height % 8 != 0 or width % 8 != 0:
484
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
485
+
486
+ if callback_on_step_end_tensor_inputs is not None and not all(
487
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
488
+ ):
489
+ raise ValueError(
490
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
491
+ )
492
+ if prompt is not None and prompt_embeds is not None:
493
+ raise ValueError(
494
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
495
+ " only forward one of the two."
496
+ )
497
+ elif prompt is None and prompt_embeds is None:
498
+ raise ValueError(
499
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
500
+ )
501
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
502
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
503
+
504
+ if prompt is not None and negative_prompt_embeds is not None:
505
+ raise ValueError(
506
+ f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
507
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
508
+ )
509
+
510
+ if negative_prompt is not None and negative_prompt_embeds is not None:
511
+ raise ValueError(
512
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
513
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
514
+ )
515
+
516
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
517
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
518
+ raise ValueError(
519
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
520
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
521
+ f" {negative_prompt_embeds.shape}."
522
+ )
523
+
524
+ # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.fuse_qkv_projections
525
+ def fuse_qkv_projections(self) -> None:
526
+ r"""Enables fused QKV projections."""
527
+ self.fusing_transformer = True
528
+ self.transformer.fuse_qkv_projections()
529
+
530
+ # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.unfuse_qkv_projections
531
+ def unfuse_qkv_projections(self) -> None:
532
+ r"""Disable QKV projection fusion if enabled."""
533
+ if not self.fusing_transformer:
534
+ logger.warning("The Transformer was not initially fused for QKV projections. Doing nothing.")
535
+ else:
536
+ self.transformer.unfuse_qkv_projections()
537
+ self.fusing_transformer = False
538
+
539
+ # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline._prepare_rotary_positional_embeddings
540
+ def _prepare_rotary_positional_embeddings(
541
+ self,
542
+ height: int,
543
+ width: int,
544
+ num_frames: int,
545
+ device: torch.device,
546
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
547
+
548
+
549
+ grid_height = height // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
550
+ grid_width = width // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
551
+
552
+ p = self.transformer.config.patch_size
553
+ p_t = self.transformer.config.patch_size_t
554
+
555
+ base_size_width = self.transformer.config.sample_width // p
556
+ base_size_height = self.transformer.config.sample_height // p
557
+
558
+ if p_t is None: # HACK: this is the branch taken here (CogVideoX 1.0)
559
+ # CogVideoX 1.0
560
+ grid_crops_coords = get_resize_crop_region_for_grid(
561
+ (grid_height, grid_width), base_size_width, base_size_height
562
+ )
563
+ freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
564
+ embed_dim=self.transformer.config.attention_head_dim,
565
+ crops_coords=grid_crops_coords, # ((0, 0), (30, 45))
566
+ grid_size=(grid_height, grid_width), # (30, 45)
567
+ temporal_size=num_frames,
568
+ device=device,
569
+ )
570
+ else:
571
+ # CogVideoX 1.5
572
+ base_num_frames = (num_frames + p_t - 1) // p_t
573
+
574
+ freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
575
+ embed_dim=self.transformer.config.attention_head_dim,
576
+ crops_coords=None,
577
+ grid_size=(grid_height, grid_width),
578
+ temporal_size=base_num_frames,
579
+ grid_type="slice",
580
+ max_size=(base_size_height, base_size_width),
581
+ device=device,
582
+ )
583
+
584
+ return freqs_cos, freqs_sin
585
+
586
+ @property
587
+ def guidance_scale(self):
588
+ return self._guidance_scale
589
+
590
+ @property
591
+ def num_timesteps(self):
592
+ return self._num_timesteps
593
+
594
+ @property
595
+ def attention_kwargs(self):
596
+ return self._attention_kwargs
597
+
598
+ @property
599
+ def interrupt(self):
600
+ return self._interrupt
601
+
602
+ @torch.no_grad()
603
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
604
+ def __call__(
605
+ self,
606
+ image: PipelineImageInput,
607
+ traj_tensor = None,
608
+ ID_tensor = None,
609
+ prompt: Optional[Union[str, List[str]]] = None,
610
+ negative_prompt: Optional[Union[str, List[str]]] = None,
611
+ height: Optional[int] = None,
612
+ width: Optional[int] = None,
613
+ num_frames: int = 49,
614
+ num_inference_steps: int = 50,
615
+ timesteps: Optional[List[int]] = None,
616
+ guidance_scale: float = 6,
617
+ use_dynamic_cfg: bool = False,
618
+ add_ID_reference_augment_noise: bool = True,
619
+ num_videos_per_prompt: int = 1,
620
+ eta: float = 0.0,
621
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
622
+ latents: Optional[torch.FloatTensor] = None,
623
+ prompt_embeds: Optional[torch.FloatTensor] = None,
624
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
625
+ output_type: str = "pil",
626
+ return_dict: bool = True,
627
+ attention_kwargs: Optional[Dict[str, Any]] = None,
628
+ callback_on_step_end: Optional[
629
+ Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
630
+ ] = None,
631
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
632
+ max_sequence_length: int = 226,
633
+ ) -> Union[CogVideoXPipelineOutput, Tuple]:
634
+ """
635
+ Function invoked when calling the pipeline for generation.
636
+
637
+ Args:
638
+ image (`PipelineImageInput`):
639
+ The input image to condition the generation on. Must be an image, a list of images or a `torch.Tensor`.
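+ traj_tensor (`torch.Tensor`, *optional*):
+ Trajectory-conditioning video tensor. It is VAE-encoded and channel-concatenated with the noisy latents
+ and the first-frame latents at every denoising step.
+ ID_tensor (`torch.Tensor`, *optional*):
+ ID-reference image tensor. Its latent is appended as one extra frame of tokens for identity conditioning
+ and is discarded from the noise prediction.
+ add_ID_reference_augment_noise (`bool`, *optional*, defaults to `True`):
+ Whether to add augment noise to the ID-reference latent before it is appended.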
640
+ prompt (`str` or `List[str]`, *optional*):
641
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
642
+ instead.
643
+ negative_prompt (`str` or `List[str]`, *optional*):
644
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
645
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
646
+ less than `1`).
647
+ height (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
648
+ The height in pixels of the generated image. This is set to 480 by default for the best results.
649
+ width (`int`, *optional*, defaults to self.transformer.config.sample_width * self.vae_scale_factor_spatial):
650
+ The width in pixels of the generated image. This is set to 720 by default for the best results.
651
+ num_frames (`int`, defaults to `49`):
652
+ Number of frames to generate. Must be divisible by self.vae_scale_factor_temporal. Generated video will
653
+ contain 1 extra frame because CogVideoX is conditioned with (num_seconds * fps + 1) frames where
654
+ num_seconds is 6 and fps is 8. However, since videos can be saved at any fps, the only condition that
655
+ needs to be satisfied is that of divisibility mentioned above.
656
+ num_inference_steps (`int`, *optional*, defaults to 50):
657
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
658
+ expense of slower inference.
659
+ timesteps (`List[int]`, *optional*):
660
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
661
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
662
+ passed will be used. Must be in descending order.
663
+ guidance_scale (`float`, *optional*, defaults to 6):
664
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
665
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
666
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
667
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
668
+ usually at the expense of lower image quality.
669
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
670
+ The number of videos to generate per prompt.
671
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
672
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
673
+ to make generation deterministic.
674
+ latents (`torch.FloatTensor`, *optional*):
675
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
676
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
677
+ tensor will be generated by sampling using the supplied random `generator`.
678
+ prompt_embeds (`torch.FloatTensor`, *optional*):
679
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
680
+ provided, text embeddings will be generated from `prompt` input argument.
681
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
682
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
683
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
684
+ argument.
685
+ output_type (`str`, *optional*, defaults to `"pil"`):
686
+ The output format of the generated image. Choose between
687
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
688
+ return_dict (`bool`, *optional*, defaults to `True`):
689
+ Whether or not to return a [`~pipelines.cogvideo.pipeline_output.CogVideoXPipelineOutput`] instead
690
+ of a plain tuple.
691
+ attention_kwargs (`dict`, *optional*):
692
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
693
+ `self.processor` in
694
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
695
+ callback_on_step_end (`Callable`, *optional*):
696
+ A function that is called at the end of each denoising step during inference. The function is called
697
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
698
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
699
+ `callback_on_step_end_tensor_inputs`.
700
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
701
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
702
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
703
+ `._callback_tensor_inputs` attribute of your pipeline class.
704
+ max_sequence_length (`int`, defaults to `226`):
705
+ Maximum sequence length in encoded prompt. Must be consistent with
706
+ `self.transformer.config.max_text_seq_length` otherwise may lead to poor results.
707
+
708
+ Examples:
709
+
710
+ Returns:
711
+ [`~pipelines.cogvideo.pipeline_output.CogVideoXPipelineOutput`] or `tuple`:
712
+ [`~pipelines.cogvideo.pipeline_output.CogVideoXPipelineOutput`] if `return_dict` is True, otherwise a
713
+ `tuple`. When returning a tuple, the first element is a list with the generated images.
714
+ """
715
+
716
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
717
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
718
+
719
+ height = height or self.transformer.config.sample_height * self.vae_scale_factor_spatial
720
+ width = width or self.transformer.config.sample_width * self.vae_scale_factor_spatial
721
+ num_frames = num_frames or self.transformer.config.sample_frames
722
+
723
+ num_videos_per_prompt = 1
724
+
725
+ # 1. Check inputs. Raise error if not correct
726
+ self.check_inputs(
727
+ image=image,
728
+ prompt=prompt,
729
+ height=height,
730
+ width=width,
731
+ negative_prompt=negative_prompt,
732
+ callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
733
+ latents=latents,
734
+ prompt_embeds=prompt_embeds,
735
+ negative_prompt_embeds=negative_prompt_embeds,
736
+ )
737
+ self._guidance_scale = guidance_scale
738
+ self._attention_kwargs = attention_kwargs
739
+ self._interrupt = False
740
+
741
+ # 2. Default call parameters
742
+ if prompt is not None and isinstance(prompt, str):
743
+ batch_size = 1
744
+ elif prompt is not None and isinstance(prompt, list):
745
+ batch_size = len(prompt)
746
+ else:
747
+ batch_size = prompt_embeds.shape[0]
748
+
749
+ device = self._execution_device
750
+
751
+ # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
752
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
753
+ # corresponds to doing no classifier free guidance.
754
+ do_classifier_free_guidance = guidance_scale > 1.0
755
+
756
+ # 3. Encode input prompt
757
+ prompt_embeds, negative_prompt_embeds = self.encode_prompt(
758
+ prompt=prompt,
759
+ negative_prompt=negative_prompt,
760
+ do_classifier_free_guidance=do_classifier_free_guidance,
761
+ num_videos_per_prompt=num_videos_per_prompt,
762
+ prompt_embeds=prompt_embeds,
763
+ negative_prompt_embeds=negative_prompt_embeds,
764
+ max_sequence_length=max_sequence_length,
765
+ device=device,
766
+ )
767
+ if do_classifier_free_guidance:
768
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
769
+
770
+ # 4. Prepare timesteps
771
+ timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
772
+ self._num_timesteps = len(timesteps)
773
+
774
+ # 5. Prepare latents
775
+ num_latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
776
+
777
+ # For CogVideoX 1.5, the latent frames should be padded to make it divisible by patch_size_t
778
+ patch_size_t = self.transformer.config.patch_size_t
779
+ additional_frames = 0
780
+ if patch_size_t is not None and num_latent_frames % patch_size_t != 0:
781
+ additional_frames = patch_size_t - num_latent_frames % patch_size_t
782
+ num_frames += additional_frames * self.vae_scale_factor_temporal
783
+
784
+ image = self.video_processor.preprocess(image, height=height, width=width).to(
785
+ device, dtype=prompt_embeds.dtype
786
+ )
787
+
788
+ latent_channels = 16 # self.transformer.config.in_channels // 2
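+ # NOTE: hard-coded because `in_channels // 2` no longer equals the noisy-latent channel count once the
+ # trajectory latents are also concatenated (presumably in_channels is 48 = 16 noisy + 16 first-frame + 16 traj).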
789
+ latents, image_latents = self.prepare_latents(
790
+ image,
791
+ batch_size * num_videos_per_prompt,
792
+ latent_channels,
793
+ num_frames,
794
+ height,
795
+ width,
796
+ prompt_embeds.dtype,
797
+ device,
798
+ generator,
799
+ latents,
800
+ )
801
+
802
+
803
+ # 5.1. Traj Preprocess
804
+ traj_tensor = traj_tensor.to(device, dtype = self.vae.dtype)[None] # add the batch dimension (same as .unsqueeze(0))
805
+ traj_tensor = traj_tensor.permute(0, 2, 1, 3, 4)
806
+ traj_latents = self.vae.encode(traj_tensor).latent_dist
807
+
808
+ # Scale, Permute, and other conversion
809
+ traj_latents = traj_latents.sample() * self.vae.config.scaling_factor
810
+ traj_latents = traj_latents.permute(0, 2, 1, 3, 4)
811
+ traj_latents = traj_latents.to(memory_format = torch.contiguous_format).float().to(dtype = prompt_embeds.dtype) # [B, F, C, H, W]
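+ # The trajectory video is VAE-encoded, scaled by the VAE scaling factor, and permuted to [B, F, C, H, W]
+ # so it can be channel-concatenated with the noisy latents and first-frame latents in the denoising loop.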
812
+
813
+
814
+ # 5.2. ID Reference Preprocess
815
+ if ID_tensor is not None:
816
+ from train_code.train_cogvideox_motion_FrameINO import img_tensor_to_vae_latent # Put it here to avoid circular import
817
+
818
+ # TODO: verify whether augment noise should also be added at test time
819
+ ID_latent = img_tensor_to_vae_latent(ID_tensor.unsqueeze(0), self.vae, traj_latents.device, add_augment_noise = add_ID_reference_augment_noise)
820
+ ID_latent = ID_latent.unsqueeze(1).to(dtype = prompt_embeds.dtype)
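+ # The ID reference becomes a single extra latent frame ([B, 1, C, H, W]); it is appended along the frame
+ # dimension during denoising as conditioning only, and its prediction is discarded afterwards.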
821
+
822
+
823
+
824
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
825
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
826
+
827
+ # 7. Create rotary embeds if required
828
+ image_rotary_emb = (
829
+ self._prepare_rotary_positional_embeddings(height, width, latents.size(1), device)
830
+ if self.transformer.config.use_rotary_positional_embeddings
831
+ else None
832
+ )
833
+
834
+ # Append rotary PE for a 14th latent frame (the ID reference) by copying the first frame's PE
835
+ freqs_cos, freqs_sin = image_rotary_emb
836
+ first_frame_token_num = freqs_cos.shape[0] // num_latent_frames
837
+ freqs_cos = torch.cat([freqs_cos, freqs_cos[:first_frame_token_num]], dim=0) # Hard Code
838
+ freqs_sin = torch.cat([freqs_sin, freqs_sin[:first_frame_token_num]], dim=0)
839
+ image_rotary_emb = (freqs_cos, freqs_sin)
840
+
841
+
842
+ # 8. Create ofs embeds if required
843
+ ofs_emb = None if self.transformer.config.ofs_embed_dim is None else latents.new_full((1,), fill_value=2.0)
844
+
845
+ # 8. Denoising loop
846
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
847
+
848
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
849
+ # for DPM-solver++
850
+ old_pred_original_sample = None
851
+ for i, t in enumerate(timesteps):
852
+ if self.interrupt:
853
+ continue
854
+
855
+ # Noisy latents prepare
856
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
857
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
858
+
859
+ # First Frame latents prepare
860
+ latent_image_input = torch.cat([image_latents] * 2) if do_classifier_free_guidance else image_latents
861
+
862
+ # Traj latents prepare
863
+ latent_traj = torch.cat([traj_latents] * 2) if do_classifier_free_guidance else traj_latents
864
+
865
+ # ID Reference prepare
866
+ if ID_tensor is not None:
867
+
868
+ # CFG Double Batch Size
869
+ latent_ID = torch.cat([ID_latent] * 2) if do_classifier_free_guidance else ID_latent
870
+
871
+ # Frame-Wise Token Increase
872
+ latent_model_input = torch.cat([latent_model_input, latent_ID], dim = 1)
873
+
874
+ # Increase the frame dimension of the Traj latents and the first frame latent
875
+ latent_ID_padding = latent_model_input.new_zeros(latent_ID.shape) # Zero latent values
876
+ latent_image_input = torch.cat([latent_image_input, latent_ID_padding], dim=1)
877
+ latent_traj = torch.cat([latent_traj, latent_ID_padding], dim=1)
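+ # The first-frame and trajectory conditions are zero-padded on the extra ID frame so all three tensors
+ # keep the same number of latent frames before the channel-wise concatenation below.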
878
+
879
+
880
+ # Dimension-Wise Concatenation
881
+ latent_model_input = torch.cat([latent_model_input, latent_image_input, latent_traj], dim=2) # the third (channel) dim grows from 16 to 48
882
+
883
+
884
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
885
+ timestep = t.expand(latent_model_input.shape[0])
886
+
887
+ # predict noise model_output
888
+ noise_pred = self.transformer(
889
+ hidden_states=latent_model_input,
890
+ encoder_hidden_states=prompt_embeds,
891
+ timestep=timestep,
892
+ ofs=ofs_emb,
893
+ image_rotary_emb=image_rotary_emb,
894
+ attention_kwargs=attention_kwargs,
895
+ return_dict=False,
896
+ )[0]
897
+ noise_pred = noise_pred.float()
898
+
899
+
900
+ # Discard the Extra ID tokens in the Noise Prediction
901
+ if ID_tensor is not None:
902
+ noise_pred = noise_pred[:, :num_latent_frames, :, :, :]
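+ # Keep only the first num_latent_frames frames; the appended ID-reference tokens are conditioning only.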
903
+
904
+
905
+ # perform guidance
906
+ if use_dynamic_cfg:
907
+ self._guidance_scale = 1 + guidance_scale * (
908
+ (1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2
909
+ )
910
+ if do_classifier_free_guidance:
911
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
912
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
913
+
914
+ # compute the previous noisy sample x_t -> x_t-1
915
+ if not isinstance(self.scheduler, CogVideoXDPMScheduler):
916
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
917
+ else:
918
+ latents, old_pred_original_sample = self.scheduler.step(
919
+ noise_pred,
920
+ old_pred_original_sample,
921
+ t,
922
+ timesteps[i - 1] if i > 0 else None,
923
+ latents,
924
+ **extra_step_kwargs,
925
+ return_dict=False,
926
+ )
927
+ latents = latents.to(prompt_embeds.dtype)
928
+
929
+ # call the callback, if provided
930
+ if callback_on_step_end is not None:
931
+ callback_kwargs = {}
932
+ for k in callback_on_step_end_tensor_inputs:
933
+ callback_kwargs[k] = locals()[k]
934
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
935
+
936
+ latents = callback_outputs.pop("latents", latents)
937
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
938
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
939
+
940
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
941
+ progress_bar.update()
942
+
943
+ if XLA_AVAILABLE:
944
+ xm.mark_step()
945
+
946
+ if not output_type == "latent":
947
+ # Discard any padding frames that were added for CogVideoX 1.5
948
+ latents = latents[:, additional_frames:]
949
+ video = self.decode_latents(latents)
950
+ video = self.video_processor.postprocess_video(video=video, output_type=output_type)
951
+ else:
952
+ video = latents
953
+
954
+ # Offload all models
955
+ self.maybe_free_model_hooks()
956
+
957
+ if not return_dict:
958
+ return (video,)
959
+
960
+ return CogVideoXPipelineOutput(frames=video)
pipelines/pipeline_wan_i2v_motion.py ADDED
@@ -0,0 +1,861 @@
1
+ # Copyright 2025 The Wan Team and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import html
16
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
17
+
18
+ import os, sys, shutil
19
+ import PIL
20
+ import regex as re
21
+ import torch
22
+ from transformers import AutoTokenizer, CLIPImageProcessor, CLIPVisionModel, UMT5EncoderModel
23
+
24
+ from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
25
+ from diffusers.image_processor import PipelineImageInput
26
+ from diffusers.loaders import WanLoraLoaderMixin
27
+ from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
28
+ from diffusers.utils import is_ftfy_available, is_torch_xla_available, logging, replace_example_docstring
29
+ from diffusers.utils.torch_utils import randn_tensor
30
+ from diffusers.video_processor import VideoProcessor
31
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
32
+ from diffusers.pipelines.wan.pipeline_output import WanPipelineOutput
33
+
34
+
35
+ # Import files from the local folder
36
+ root_path = os.path.abspath('.')
37
+ sys.path.append(root_path)
38
+ from architecture.transformer_wan import WanTransformer3DModel
39
+ from architecture.autoencoder_kl_wan import AutoencoderKLWan
40
+
41
+
42
+ if is_torch_xla_available():
43
+ import torch_xla.core.xla_model as xm
44
+
45
+ XLA_AVAILABLE = True
46
+ else:
47
+ XLA_AVAILABLE = False
48
+
49
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
50
+
51
+ if is_ftfy_available():
52
+ import ftfy
53
+
54
+ EXAMPLE_DOC_STRING = """
55
+ Examples:
56
+ ```python
57
+ >>> import torch
58
+ >>> import numpy as np
59
+ >>> from diffusers import AutoencoderKLWan, WanImageToVideoPipeline
60
+ >>> from diffusers.utils import export_to_video, load_image
61
+ >>> from transformers import CLIPVisionModel
62
+
63
+ >>> # Available models: Wan-AI/Wan2.1-I2V-14B-480P-Diffusers, Wan-AI/Wan2.1-I2V-14B-720P-Diffusers
64
+ >>> model_id = "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers"
65
+ >>> image_encoder = CLIPVisionModel.from_pretrained(
66
+ ... model_id, subfolder="image_encoder", torch_dtype=torch.float32
67
+ ... )
68
+ >>> vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
69
+ >>> pipe = WanImageToVideoPipeline.from_pretrained(
70
+ ... model_id, vae=vae, image_encoder=image_encoder, torch_dtype=torch.bfloat16
71
+ ... )
72
+ >>> pipe.to("cuda")
73
+
74
+ >>> image = load_image(
75
+ ... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"
76
+ ... )
77
+ >>> max_area = 480 * 832
78
+ >>> aspect_ratio = image.height / image.width
79
+ >>> mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
80
+ >>> height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
81
+ >>> width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
82
+ >>> image = image.resize((width, height))
83
+ >>> prompt = (
84
+ ... "An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in "
85
+ ... "the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot."
86
+ ... )
87
+ >>> negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
88
+
89
+ >>> output = pipe(
90
+ ... image=image,
91
+ ... prompt=prompt,
92
+ ... negative_prompt=negative_prompt,
93
+ ... height=height,
94
+ ... width=width,
95
+ ... num_frames=81,
96
+ ... guidance_scale=5.0,
97
+ ... ).frames[0]
98
+ >>> export_to_video(output, "output.mp4", fps=16)
99
+ ```
100
+ """
101
+
102
+
103
+ def basic_clean(text):
104
+ text = ftfy.fix_text(text)
105
+ text = html.unescape(html.unescape(text))
106
+ return text.strip()
107
+
108
+
109
+ def whitespace_clean(text):
110
+ text = re.sub(r"\s+", " ", text)
111
+ text = text.strip()
112
+ return text
113
+
114
+
115
+ def prompt_clean(text):
116
+ text = whitespace_clean(basic_clean(text))
117
+ return text
118
+
119
+
120
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
121
+ def retrieve_latents(
122
+ encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
123
+ ):
124
+ if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
125
+ return encoder_output.latent_dist.sample(generator)
126
+ elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
127
+ return encoder_output.latent_dist.mode()
128
+ elif hasattr(encoder_output, "latents"):
129
+ return encoder_output.latents
130
+ else:
131
+ raise AttributeError("Could not access latents of provided encoder_output")
132
+
133
+
134
+ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
135
+ r"""
136
+ Pipeline for image-to-video generation using Wan.
137
+
138
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
139
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
140
+
141
+ Args:
142
+ tokenizer ([`T5Tokenizer`]):
143
+ Tokenizer from [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5Tokenizer),
144
+ specifically the [google/umt5-xxl](https://huggingface.co/google/umt5-xxl) variant.
145
+ text_encoder ([`T5EncoderModel`]):
146
+ [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
147
+ the [google/umt5-xxl](https://huggingface.co/google/umt5-xxl) variant.
148
+ image_encoder ([`CLIPVisionModel`]):
149
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPVisionModel), specifically
150
+ the
151
+ [clip-vit-huge-patch14](https://github.com/mlfoundations/open_clip/blob/main/docs/PRETRAINED.md#vit-h14-xlm-roberta-large)
152
+ variant.
153
+ transformer ([`WanTransformer3DModel`]):
154
+ Conditional Transformer to denoise the input latents.
155
+ scheduler ([`UniPCMultistepScheduler`]):
156
+ A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
157
+ vae ([`AutoencoderKLWan`]):
158
+ Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
159
+ transformer_2 ([`WanTransformer3DModel`], *optional*):
160
+ Conditional Transformer to denoise the input latents during the low-noise stage. In two-stage denoising,
161
+ `transformer` handles high-noise stages and `transformer_2` handles low-noise stages. If not provided, only
162
+ `transformer` is used.
163
+ boundary_ratio (`float`, *optional*, defaults to `None`):
164
+ Ratio of total timesteps to use as the boundary for switching between transformers in two-stage denoising.
165
+ The actual boundary timestep is calculated as `boundary_ratio * num_train_timesteps`. When provided,
166
+ `transformer` handles timesteps >= boundary_timestep and `transformer_2` handles timesteps <
167
+ boundary_timestep. If `None`, only `transformer` is used for the entire denoising process.
168
+ """
169
+
170
+ model_cpu_offload_seq = "text_encoder->image_encoder->transformer->transformer_2->vae"
171
+ _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
172
+ _optional_components = ["transformer", "transformer_2", "image_encoder", "image_processor"]
173
+
174
+ def __init__(
175
+ self,
176
+ tokenizer: AutoTokenizer,
177
+ text_encoder: UMT5EncoderModel,
178
+ vae: AutoencoderKLWan,
179
+ scheduler: FlowMatchEulerDiscreteScheduler,
180
+ image_processor: CLIPImageProcessor = None,
181
+ image_encoder: CLIPVisionModel = None,
182
+ transformer: WanTransformer3DModel = None,
183
+ transformer_2: WanTransformer3DModel = None,
184
+ boundary_ratio: Optional[float] = None,
185
+ expand_timesteps: bool = False,
186
+ ):
187
+ super().__init__()
188
+
189
+ self.register_modules(
190
+ vae=vae,
191
+ text_encoder=text_encoder,
192
+ tokenizer=tokenizer,
193
+ image_encoder=image_encoder,
194
+ transformer=transformer,
195
+ scheduler=scheduler,
196
+ image_processor=image_processor,
197
+ transformer_2=transformer_2,
198
+ )
199
+ self.register_to_config(boundary_ratio=boundary_ratio, expand_timesteps=expand_timesteps)
200
+
201
+ self.vae_scale_factor_temporal = self.vae.config.scale_factor_temporal if getattr(self, "vae", None) else 4
202
+ self.vae_scale_factor_spatial = self.vae.config.scale_factor_spatial if getattr(self, "vae", None) else 8
203
+ self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
204
+ self.image_processor = image_processor
205
+
206
+ def _get_t5_prompt_embeds(
207
+ self,
208
+ prompt: Union[str, List[str]] = None,
209
+ num_videos_per_prompt: int = 1,
210
+ max_sequence_length: int = 512,
211
+ device: Optional[torch.device] = None,
212
+ dtype: Optional[torch.dtype] = None,
213
+ ):
214
+ device = device or self._execution_device
215
+ dtype = dtype or self.text_encoder.dtype
216
+
217
+ prompt = [prompt] if isinstance(prompt, str) else prompt
218
+ prompt = [prompt_clean(u) for u in prompt]
219
+ batch_size = len(prompt)
220
+
221
+ text_inputs = self.tokenizer(
222
+ prompt,
223
+ padding="max_length",
224
+ max_length=max_sequence_length,
225
+ truncation=True,
226
+ add_special_tokens=True,
227
+ return_attention_mask=True,
228
+ return_tensors="pt",
229
+ )
230
+ text_input_ids, mask = text_inputs.input_ids, text_inputs.attention_mask
231
+ seq_lens = mask.gt(0).sum(dim=1).long()
232
+
233
+ prompt_embeds = self.text_encoder(text_input_ids.to(device), mask.to(device)).last_hidden_state
234
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
235
+ prompt_embeds = [u[:v] for u, v in zip(prompt_embeds, seq_lens)]
236
+ prompt_embeds = torch.stack(
237
+ [torch.cat([u, u.new_zeros(max_sequence_length - u.size(0), u.size(1))]) for u in prompt_embeds], dim=0
238
+ )
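+ # Embeddings are trimmed to each prompt's true token length and then zero-padded back to max_sequence_length.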
239
+
240
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
241
+ _, seq_len, _ = prompt_embeds.shape
242
+ prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
243
+ prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
244
+
245
+ return prompt_embeds
246
+
247
+ def encode_image(
248
+ self,
249
+ image: PipelineImageInput,
250
+ device: Optional[torch.device] = None,
251
+ ):
252
+ device = device or self._execution_device
253
+ image = self.image_processor(images=image, return_tensors="pt").to(device)
254
+ image_embeds = self.image_encoder(**image, output_hidden_states=True)
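+ # The penultimate hidden state is used as the CLIP image conditioning.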
255
+ return image_embeds.hidden_states[-2]
256
+
257
+ # Copied from diffusers.pipelines.wan.pipeline_wan.WanPipeline.encode_prompt
258
+ def encode_prompt(
259
+ self,
260
+ prompt: Union[str, List[str]],
261
+ negative_prompt: Optional[Union[str, List[str]]] = None,
262
+ do_classifier_free_guidance: bool = True,
263
+ num_videos_per_prompt: int = 1,
264
+ prompt_embeds: Optional[torch.Tensor] = None,
265
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
266
+ max_sequence_length: int = 226,
267
+ device: Optional[torch.device] = None,
268
+ dtype: Optional[torch.dtype] = None,
269
+ ):
270
+ r"""
271
+ Encodes the prompt into text encoder hidden states.
272
+
273
+ Args:
274
+ prompt (`str` or `List[str]`, *optional*):
275
+ prompt to be encoded
276
+ negative_prompt (`str` or `List[str]`, *optional*):
277
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
278
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
279
+ less than `1`).
280
+ do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
281
+ Whether to use classifier free guidance or not.
282
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
283
+ Number of videos that should be generated per prompt.
284
+ prompt_embeds (`torch.Tensor`, *optional*):
285
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
286
+ provided, text embeddings will be generated from `prompt` input argument.
287
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
288
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
289
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
290
+ argument.
291
+ device: (`torch.device`, *optional*):
292
+ torch device
293
+ dtype: (`torch.dtype`, *optional*):
294
+ torch dtype
295
+ """
296
+ device = device or self._execution_device
297
+
298
+ prompt = [prompt] if isinstance(prompt, str) else prompt
299
+ if prompt is not None:
300
+ batch_size = len(prompt)
301
+ else:
302
+ batch_size = prompt_embeds.shape[0]
303
+
304
+ if prompt_embeds is None:
305
+ prompt_embeds = self._get_t5_prompt_embeds(
306
+ prompt=prompt,
307
+ num_videos_per_prompt=num_videos_per_prompt,
308
+ max_sequence_length=max_sequence_length,
309
+ device=device,
310
+ dtype=dtype,
311
+ )
312
+
313
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
314
+ negative_prompt = negative_prompt or ""
315
+ negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
316
+
317
+ if prompt is not None and type(prompt) is not type(negative_prompt):
318
+ raise TypeError(
319
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
320
+ f" {type(prompt)}."
321
+ )
322
+ elif batch_size != len(negative_prompt):
323
+ raise ValueError(
324
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
325
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
326
+ " the batch size of `prompt`."
327
+ )
328
+
329
+ negative_prompt_embeds = self._get_t5_prompt_embeds(
330
+ prompt=negative_prompt,
331
+ num_videos_per_prompt=num_videos_per_prompt,
332
+ max_sequence_length=max_sequence_length,
333
+ device=device,
334
+ dtype=dtype,
335
+ )
336
+
337
+ return prompt_embeds, negative_prompt_embeds
338
+
339
+ def check_inputs(
340
+ self,
341
+ prompt,
342
+ negative_prompt,
343
+ image,
344
+ height,
345
+ width,
346
+ prompt_embeds=None,
347
+ negative_prompt_embeds=None,
348
+ image_embeds=None,
349
+ callback_on_step_end_tensor_inputs=None,
350
+ guidance_scale_2=None,
351
+ ):
352
+ if image is not None and image_embeds is not None:
353
+ raise ValueError(
354
+ f"Cannot forward both `image`: {image} and `image_embeds`: {image_embeds}. Please make sure to"
355
+ " only forward one of the two."
356
+ )
357
+ if image is None and image_embeds is None:
358
+ raise ValueError(
359
+ "Provide either `image` or `image_embeds`. Cannot leave both `image` and `image_embeds` undefined."
360
+ )
361
+ if image is not None and not isinstance(image, torch.Tensor) and not isinstance(image, PIL.Image.Image):
362
+ raise ValueError(f"`image` has to be of type `torch.Tensor` or `PIL.Image.Image` but is {type(image)}")
363
+ if height % 16 != 0 or width % 16 != 0:
364
+ raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")
365
+
366
+ if callback_on_step_end_tensor_inputs is not None and not all(
367
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
368
+ ):
369
+ raise ValueError(
370
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
371
+ )
372
+
373
+ if prompt is not None and prompt_embeds is not None:
374
+ raise ValueError(
375
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
376
+ " only forward one of the two."
377
+ )
378
+ elif negative_prompt is not None and negative_prompt_embeds is not None:
379
+ raise ValueError(
380
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`: {negative_prompt_embeds}. Please make sure to"
381
+ " only forward one of the two."
382
+ )
383
+ elif prompt is None and prompt_embeds is None:
384
+ raise ValueError(
385
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
386
+ )
387
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
388
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
389
+ elif negative_prompt is not None and (
390
+ not isinstance(negative_prompt, str) and not isinstance(negative_prompt, list)
391
+ ):
392
+ raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}")
393
+
394
+ if self.config.boundary_ratio is None and guidance_scale_2 is not None:
395
+ raise ValueError("`guidance_scale_2` is only supported when the pipeline's `boundary_ratio` is not None.")
396
+
397
+ if self.config.boundary_ratio is not None and image_embeds is not None:
398
+ raise ValueError("Cannot forward `image_embeds` when the pipeline's `boundary_ratio` is configured (not None).")
399
+
400
+ def prepare_latents(
401
+ self,
402
+ image: PipelineImageInput,
403
+ traj_tensor,
404
+ batch_size: int,
405
+ num_channels_latents: int = 16,
406
+ height: int = 480,
407
+ width: int = 832,
408
+ num_frames: int = 81,
409
+ dtype: Optional[torch.dtype] = None,
410
+ device: Optional[torch.device] = None,
411
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
412
+ latents: Optional[torch.Tensor] = None,
413
+ last_image: Optional[torch.Tensor] = None,
414
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
415
+ num_latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
416
+ latent_height = height // self.vae_scale_factor_spatial
417
+ latent_width = width // self.vae_scale_factor_spatial
418
+
419
+ shape = (batch_size, num_channels_latents, num_latent_frames, latent_height, latent_width)
420
+ if isinstance(generator, list) and len(generator) != batch_size:
421
+ raise ValueError(
422
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
423
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
424
+ )
425
+
426
+ if latents is None:
427
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
428
+ else:
429
+ latents = latents.to(device=device, dtype=dtype)
430
+
431
+ image = image.unsqueeze(2) # [batch_size, channels, 1, height, width]
432
+
433
+ if self.config.expand_timesteps:
434
+ video_condition = image
435
+
436
+ elif last_image is None:
437
+ video_condition = torch.cat(
438
+ [image, image.new_zeros(image.shape[0], image.shape[1], num_frames - 1, height, width)], dim=2
439
+ )
440
+ else:
441
+ last_image = last_image.unsqueeze(2)
442
+ video_condition = torch.cat(
443
+ [image, image.new_zeros(image.shape[0], image.shape[1], num_frames - 2, height, width), last_image],
444
+ dim=2,
445
+ )
446
+ video_condition = video_condition.to(device=device, dtype=self.vae.dtype)
447
+
448
+ latents_mean = (
449
+ torch.tensor(self.vae.config.latents_mean)
450
+ .view(1, self.vae.config.z_dim, 1, 1, 1)
451
+ .to(latents.device, latents.dtype)
452
+ )
453
+ latents_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).to(
454
+ latents.device, latents.dtype
455
+ )
456
+
457
+ if isinstance(generator, list):
458
+ latent_condition = [
459
+ retrieve_latents(self.vae.encode(video_condition), sample_mode="argmax") for _ in generator
460
+ ]
461
+ latent_condition = torch.cat(latent_condition)
462
+ else:
463
+ latent_condition = retrieve_latents(self.vae.encode(video_condition), sample_mode="argmax")
464
+ latent_condition = latent_condition.repeat(batch_size, 1, 1, 1, 1)
465
+
466
+ latent_condition = latent_condition.to(dtype)
467
+ latent_condition = (latent_condition - latents_mean) * latents_std
468
+
469
+
470
+
471
+ # Prepare the traj latent
472
+ traj_tensor = traj_tensor.to(device, dtype=self.vae.dtype) #.unsqueeze(0)
473
+ traj_tensor = traj_tensor.unsqueeze(0)
474
+ traj_tensor = traj_tensor.permute(0, 2, 1, 3, 4) # [B, C, F, H, W]
475
+
476
+ # VAE encode
477
+ traj_latents = retrieve_latents(self.vae.encode(traj_tensor), sample_mode="argmax")
478
+
479
+ # Normalize with the VAE latent mean and std
480
+ traj_latents = (traj_latents - latents_mean) * latents_std
481
+
482
+ # Final Convert
483
+ traj_latents = traj_latents.to(memory_format = torch.contiguous_format).float()
484
+
485
+
486
+
487
+ if self.config.expand_timesteps:
488
+ first_frame_mask = torch.ones(
489
+ 1, 1, num_latent_frames, latent_height, latent_width, dtype=dtype, device=device
490
+ )
491
+ first_frame_mask[:, :, 0] = 0
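+ # The mask is 0 only at the first latent frame; later, (1 - first_frame_mask) * condition + first_frame_mask * latents keeps the clean first-frame latent fixed while the remaining frames are denoised (expand_timesteps / Wan2.2-5B path)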
492
+ return latents, latent_condition, traj_latents, first_frame_mask
493
+
494
+
495
+
496
+ mask_lat_size = torch.ones(batch_size, 1, num_frames, latent_height, latent_width)
497
+
498
+ if last_image is None:
499
+ mask_lat_size[:, :, list(range(1, num_frames))] = 0
500
+ else:
501
+ mask_lat_size[:, :, list(range(1, num_frames - 1))] = 0
502
+ first_frame_mask = mask_lat_size[:, :, 0:1]
503
+ first_frame_mask = torch.repeat_interleave(first_frame_mask, dim=2, repeats=self.vae_scale_factor_temporal)
504
+ mask_lat_size = torch.concat([first_frame_mask, mask_lat_size[:, :, 1:, :]], dim=2)
505
+ mask_lat_size = mask_lat_size.view(batch_size, -1, self.vae_scale_factor_temporal, latent_height, latent_width)
506
+ mask_lat_size = mask_lat_size.transpose(1, 2)
507
+ mask_lat_size = mask_lat_size.to(latent_condition.device)
508
+
509
+ return latents, torch.concat([mask_lat_size, latent_condition], dim=1)
510
+
511
+ @property
512
+ def guidance_scale(self):
513
+ return self._guidance_scale
514
+
515
+ @property
516
+ def do_classifier_free_guidance(self):
517
+ return self._guidance_scale > 1
518
+
519
+ @property
520
+ def num_timesteps(self):
521
+ return self._num_timesteps
522
+
523
+ @property
524
+ def current_timestep(self):
525
+ return self._current_timestep
526
+
527
+ @property
528
+ def interrupt(self):
529
+ return self._interrupt
530
+
531
+ @property
532
+ def attention_kwargs(self):
533
+ return self._attention_kwargs
534
+
535
+ @torch.no_grad()
536
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
537
+ def __call__(
538
+ self,
539
+ image: PipelineImageInput,
540
+ prompt: Union[str, List[str]] = None,
541
+ negative_prompt: Union[str, List[str]] = None,
542
+ traj_tensor = None,
543
+ height: int = 480,
544
+ width: int = 832,
545
+ num_frames: int = 81,
546
+ num_inference_steps: int = 50,
547
+ guidance_scale: float = 5.0,
548
+ guidance_scale_2: Optional[float] = None,
549
+ num_videos_per_prompt: Optional[int] = 1,
550
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
551
+ latents: Optional[torch.Tensor] = None,
552
+ prompt_embeds: Optional[torch.Tensor] = None,
553
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
554
+ image_embeds: Optional[torch.Tensor] = None,
555
+ last_image: Optional[torch.Tensor] = None,
556
+ output_type: Optional[str] = "np",
557
+ return_dict: bool = True,
558
+ attention_kwargs: Optional[Dict[str, Any]] = None,
559
+ callback_on_step_end: Optional[
560
+ Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
561
+ ] = None,
562
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
563
+ max_sequence_length: int = 512,
564
+ ):
565
+ r"""
566
+ The call function to the pipeline for generation.
567
+
568
+ Args:
569
+ image (`PipelineImageInput`):
570
+ The input image to condition the generation on. Must be an image, a list of images or a `torch.Tensor`.
571
+ prompt (`str` or `List[str]`, *optional*):
572
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
573
+ instead.
574
+ negative_prompt (`str` or `List[str]`, *optional*):
575
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
576
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
577
+ less than `1`).
578
+ height (`int`, defaults to `480`):
579
+ The height of the generated video.
580
+ width (`int`, defaults to `832`):
581
+ The width of the generated video.
582
+ num_frames (`int`, defaults to `81`):
583
+ The number of frames in the generated video.
584
+ num_inference_steps (`int`, defaults to `50`):
585
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
586
+ expense of slower inference.
587
+ guidance_scale (`float`, defaults to `5.0`):
588
+ Guidance scale as defined in [Classifier-Free Diffusion
589
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
590
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
591
+ `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
592
+ the text `prompt`, usually at the expense of lower image quality.
593
+ guidance_scale_2 (`float`, *optional*, defaults to `None`):
594
+ Guidance scale for the low-noise stage transformer (`transformer_2`). If `None` and the pipeline's
595
+ `boundary_ratio` is not None, uses the same value as `guidance_scale`. Only used when `transformer_2`
596
+ and the pipeline's `boundary_ratio` are not None.
597
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
598
+ The number of images to generate per prompt.
599
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
600
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
601
+ generation deterministic.
602
+ latents (`torch.Tensor`, *optional*):
603
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
604
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
605
+ tensor is generated by sampling using the supplied random `generator`.
606
+ prompt_embeds (`torch.Tensor`, *optional*):
607
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
608
+ provided, text embeddings are generated from the `prompt` input argument.
609
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
610
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
611
+ provided, text embeddings are generated from the `negative_prompt` input argument.
612
+ image_embeds (`torch.Tensor`, *optional*):
613
+ Pre-generated image embeddings. Can be used to easily tweak image inputs (weighting). If not provided,
614
+ image embeddings are generated from the `image` input argument.
615
+ output_type (`str`, *optional*, defaults to `"np"`):
616
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
617
+ return_dict (`bool`, *optional*, defaults to `True`):
618
+ Whether or not to return a [`WanPipelineOutput`] instead of a plain tuple.
619
+ attention_kwargs (`dict`, *optional*):
620
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
621
+ `self.processor` in
622
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
623
+ callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
624
+ A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
625
+ each denoising step during inference with the following arguments: `callback_on_step_end(self:
626
+ DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
627
+ list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
628
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
629
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
630
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
631
+ `._callback_tensor_inputs` attribute of your pipeline class.
632
+ max_sequence_length (`int`, defaults to `512`):
633
+ The maximum sequence length of the text encoder. If the prompt is longer than this, it will be
634
+ truncated. If the prompt is shorter, it will be padded to this length.
635
+
636
+ Examples:
637
+
638
+ Returns:
639
+ [`~WanPipelineOutput`] or `tuple`:
640
+ If `return_dict` is `True`, [`WanPipelineOutput`] is returned, otherwise a `tuple` is returned where
641
+ the first element is a list with the generated images and the second element is a list of `bool`s
642
+ indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content.
643
+ """
644
+
645
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
646
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
647
+
648
+ # 1. Check inputs. Raise error if not correct
649
+ self.check_inputs(
650
+ prompt,
651
+ negative_prompt,
652
+ image,
653
+ height,
654
+ width,
655
+ prompt_embeds,
656
+ negative_prompt_embeds,
657
+ image_embeds,
658
+ callback_on_step_end_tensor_inputs,
659
+ guidance_scale_2,
660
+ )
661
+
662
+ if num_frames % self.vae_scale_factor_temporal != 1:
663
+ logger.warning(
664
+ f"`num_frames - 1` has to be divisible by {self.vae_scale_factor_temporal}. Rounding to the nearest number."
665
+ )
666
+ num_frames = num_frames // self.vae_scale_factor_temporal * self.vae_scale_factor_temporal + 1
667
+ num_frames = max(num_frames, 1)
668
+
669
+ if self.config.boundary_ratio is not None and guidance_scale_2 is None:
670
+ guidance_scale_2 = guidance_scale
671
+
672
+ self._guidance_scale = guidance_scale
673
+ self._guidance_scale_2 = guidance_scale_2
674
+ self._attention_kwargs = attention_kwargs
675
+ self._current_timestep = None
676
+ self._interrupt = False
677
+
678
+ device = self._execution_device
679
+
680
+ # 2. Define call parameters
681
+ if prompt is not None and isinstance(prompt, str):
682
+ batch_size = 1
683
+ elif prompt is not None and isinstance(prompt, list):
684
+ batch_size = len(prompt)
685
+ else:
686
+ batch_size = prompt_embeds.shape[0]
687
+
688
+ # 3. Encode input prompt
689
+ prompt_embeds, negative_prompt_embeds = self.encode_prompt(
690
+ prompt=prompt,
691
+ negative_prompt=negative_prompt,
692
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
693
+ num_videos_per_prompt=num_videos_per_prompt,
694
+ prompt_embeds=prompt_embeds,
695
+ negative_prompt_embeds=negative_prompt_embeds,
696
+ max_sequence_length=max_sequence_length,
697
+ device=device,
698
+ )
699
+
700
+ # Encode image embedding
701
+ transformer_dtype = self.transformer.dtype if self.transformer is not None else self.transformer_2.dtype
702
+ prompt_embeds = prompt_embeds.to(transformer_dtype)
703
+ if negative_prompt_embeds is not None:
704
+ negative_prompt_embeds = negative_prompt_embeds.to(transformer_dtype)
705
+
706
+ # only wan 2.1 i2v transformer accepts image_embeds
707
+ if self.transformer is not None and self.transformer.config.image_dim is not None:
708
+ if image_embeds is None:
709
+ if last_image is None:
710
+ image_embeds = self.encode_image(image, device)
711
+ else:
712
+ image_embeds = self.encode_image([image, last_image], device)
713
+ image_embeds = image_embeds.repeat(batch_size, 1, 1)
714
+ image_embeds = image_embeds.to(transformer_dtype)
715
+
716
+ # 4. Prepare timesteps
717
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
718
+ timesteps = self.scheduler.timesteps
719
+
720
+ # 5. Prepare latent variables
721
+ num_channels_latents = self.vae.config.z_dim
722
+ image = self.video_processor.preprocess(image, height=height, width=width).to(device, dtype=torch.float32)
723
+ if last_image is not None:
724
+ last_image = self.video_processor.preprocess(last_image, height=height, width=width).to(
725
+ device, dtype=torch.float32
726
+ )
727
+
728
+ latents_outputs = self.prepare_latents(
729
+ image,
730
+ traj_tensor,
731
+ batch_size * num_videos_per_prompt,
732
+ num_channels_latents,
733
+ height,
734
+ width,
735
+ num_frames,
736
+ torch.float32,
737
+ device,
738
+ generator,
739
+ latents,
740
+ last_image,
741
+ )
742
+ if self.config.expand_timesteps:
743
+ # wan 2.2 5b i2v uses first_frame_mask to mask timesteps
744
+ latents, condition, traj_latents, first_frame_mask = latents_outputs
745
+ else:
746
+ latents, condition = latents_outputs
747
+
748
+
749
+ # 6. Denoising loop
750
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
751
+ self._num_timesteps = len(timesteps)
752
+
753
+ if self.config.boundary_ratio is not None:
754
+ boundary_timestep = self.config.boundary_ratio * self.scheduler.config.num_train_timesteps
755
+ else:
756
+ boundary_timestep = None
757
+
758
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
759
+ for i, t in enumerate(timesteps):
760
+ if self.interrupt:
761
+ continue
762
+
763
+ self._current_timestep = t
764
+
765
+ if boundary_timestep is None or t >= boundary_timestep:
766
+ # wan2.1 or high-noise stage in wan2.2
767
+ current_model = self.transformer
768
+ current_guidance_scale = guidance_scale
769
+ else:
770
+ # low-noise stage in wan2.2
771
+ current_model = self.transformer_2
772
+ current_guidance_scale = guidance_scale_2
773
+
774
+ if self.config.expand_timesteps:
775
+ latent_model_input = (1 - first_frame_mask) * condition + first_frame_mask * latents
776
+ latent_model_input = latent_model_input.to(transformer_dtype)
777
+
778
+ # seq_len: num_latent_frames * (latent_height // patch_size) * (latent_width // patch_size)
779
+ temp_ts = (first_frame_mask[0][0][:, ::2, ::2] * t).flatten()
780
+ # batch_size, seq_len
781
+ timestep = temp_ts.unsqueeze(0).expand(latents.shape[0], -1)
782
+ else:
783
+ latent_model_input = torch.cat([latents, condition], dim=1).to(transformer_dtype)
784
+ timestep = t.expand(latents.shape[0])
785
+
786
+
787
+ # Concat the traj latents in channel dimension
788
+ latent_model_input = torch.cat([latent_model_input, traj_latents], dim=1).to(transformer_dtype)
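+ # The trajectory latents ride along as extra input channels on top of the noisy latents and the image condition, so the transformer's input projection must be sized for the larger channel count (presumably handled by the modified WanTransformer3DModel in architecture/transformer_wan.py)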
789
+
790
+
791
+ # Predict the noise according to the timestep
792
+ with current_model.cache_context("cond"):
793
+ noise_pred = current_model(
794
+ hidden_states=latent_model_input,
795
+ timestep=timestep,
796
+ encoder_hidden_states=prompt_embeds,
797
+ encoder_hidden_states_image=image_embeds,
798
+ attention_kwargs=attention_kwargs,
799
+ return_dict=False,
800
+ )[0]
801
+
802
+ if self.do_classifier_free_guidance:
803
+ with current_model.cache_context("uncond"):
804
+ noise_uncond = current_model(
805
+ hidden_states=latent_model_input,
806
+ timestep=timestep,
807
+ encoder_hidden_states=negative_prompt_embeds,
808
+ encoder_hidden_states_image=image_embeds,
809
+ attention_kwargs=attention_kwargs,
810
+ return_dict=False,
811
+ )[0]
812
+ noise_pred = noise_uncond + current_guidance_scale * (noise_pred - noise_uncond)
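+ # Standard classifier-free guidance: start from the unconditional prediction and move along the (conditional - unconditional) direction scaled by the guidance weight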
813
+
814
+ # compute the previous noisy sample x_t -> x_t-1
815
+ latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
816
+
817
+ if callback_on_step_end is not None:
818
+ callback_kwargs = {}
819
+ for k in callback_on_step_end_tensor_inputs:
820
+ callback_kwargs[k] = locals()[k]
821
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
822
+
823
+ latents = callback_outputs.pop("latents", latents)
824
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
825
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
826
+
827
+ # update the progress bar
828
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
829
+ progress_bar.update()
830
+
831
+ if XLA_AVAILABLE:
832
+ xm.mark_step()
833
+
834
+ self._current_timestep = None
835
+
836
+ if self.config.expand_timesteps:
837
+ latents = (1 - first_frame_mask) * condition + first_frame_mask * latents
838
+
839
+ if not output_type == "latent":
840
+ latents = latents.to(self.vae.dtype)
841
+ latents_mean = (
842
+ torch.tensor(self.vae.config.latents_mean)
843
+ .view(1, self.vae.config.z_dim, 1, 1, 1)
844
+ .to(latents.device, latents.dtype)
845
+ )
846
+ latents_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).to(
847
+ latents.device, latents.dtype
848
+ )
849
+ latents = latents / latents_std + latents_mean
850
+ video = self.vae.decode(latents, return_dict=False)[0]
851
+ video = self.video_processor.postprocess_video(video, output_type=output_type)
852
+ else:
853
+ video = latents
854
+
855
+ # Offload all models
856
+ self.maybe_free_model_hooks()
857
+
858
+ if not return_dict:
859
+ return (video,)
860
+
861
+ return WanPipelineOutput(frames=video)
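
The file above only defines the trajectory-conditioned pipeline; nothing in the commit shows it being driven. Below is a minimal usage sketch, not part of the commit. The module path, checkpoint id, file names, and the `traj_tensor` contents are assumptions for illustration: the pipeline expects `traj_tensor` shaped `[F, C, H, W]` (it adds the batch dimension and permutes to `[B, C, F, H, W]` itself), and the traj-aware transformer would need its own finetuned weights rather than the stock Wan checkpoint.

```python
# Hypothetical usage sketch -- module path, checkpoint id, and inputs are assumptions, not part of this commit.
import torch
from diffusers.utils import export_to_video, load_image

# Assumed import path for the pipeline class defined above.
from pipelines.pipeline_wan_i2v_motion import WanImageToVideoPipeline

model_id = "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers"  # stand-in; the modified transformer needs matching finetuned weights
pipe = WanImageToVideoPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16).to("cuda")

image = load_image("first_frame.png")  # placeholder input image
num_frames, height, width = 81, 480, 832

# Trajectory conditioning video, one frame per output frame, in [F, C, H, W];
# zeros here only illustrate the expected shape and dtype.
traj_tensor = torch.zeros(num_frames, 3, height, width)

video = pipe(
    image=image,
    prompt="a subject following the drawn trajectory",
    traj_tensor=traj_tensor,
    height=height,
    width=width,
    num_frames=num_frames,
    guidance_scale=5.0,
).frames[0]
export_to_video(video, "output.mp4", fps=16)
```
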
pipelines/pipeline_wan_i2v_motion_FrameINO.py ADDED
@@ -0,0 +1,937 @@
1
+ # Copyright 2025 The Wan Team and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import html
16
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
17
+
18
+ import os, sys, shutil
19
+ import PIL
20
+ import regex as re
21
+ import torch
22
+ from transformers import AutoTokenizer, CLIPImageProcessor, CLIPVisionModel, UMT5EncoderModel
23
+
24
+ from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
25
+ from diffusers.image_processor import PipelineImageInput
26
+ from diffusers.loaders import WanLoraLoaderMixin
27
+ from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
28
+ from diffusers.utils import is_ftfy_available, is_torch_xla_available, logging, replace_example_docstring
29
+ from diffusers.utils.torch_utils import randn_tensor
30
+ from diffusers.video_processor import VideoProcessor
31
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
32
+ from diffusers.pipelines.wan.pipeline_output import WanPipelineOutput
33
+
34
+
35
+ # Import files from the local folder
36
+ root_path = os.path.abspath('.')
37
+ sys.path.append(root_path)
38
+ from architecture.transformer_wan import WanTransformer3DModel
39
+ from architecture.autoencoder_kl_wan import AutoencoderKLWan
40
+
41
+
42
+ if is_torch_xla_available():
43
+ import torch_xla.core.xla_model as xm
44
+
45
+ XLA_AVAILABLE = True
46
+ else:
47
+ XLA_AVAILABLE = False
48
+
49
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
50
+
51
+ if is_ftfy_available():
52
+ import ftfy
53
+
54
+ EXAMPLE_DOC_STRING = """
55
+ Examples:
56
+ ```python
57
+ >>> import torch
58
+ >>> import numpy as np
59
+ >>> from diffusers import AutoencoderKLWan, WanImageToVideoPipeline
60
+ >>> from diffusers.utils import export_to_video, load_image
61
+ >>> from transformers import CLIPVisionModel
62
+
63
+ >>> # Available models: Wan-AI/Wan2.1-I2V-14B-480P-Diffusers, Wan-AI/Wan2.1-I2V-14B-720P-Diffusers
64
+ >>> model_id = "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers"
65
+ >>> image_encoder = CLIPVisionModel.from_pretrained(
66
+ ... model_id, subfolder="image_encoder", torch_dtype=torch.float32
67
+ ... )
68
+ >>> vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
69
+ >>> pipe = WanImageToVideoPipeline.from_pretrained(
70
+ ... model_id, vae=vae, image_encoder=image_encoder, torch_dtype=torch.bfloat16
71
+ ... )
72
+ >>> pipe.to("cuda")
73
+
74
+ >>> image = load_image(
75
+ ... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"
76
+ ... )
77
+ >>> max_area = 480 * 832
78
+ >>> aspect_ratio = image.height / image.width
79
+ >>> mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
80
+ >>> height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
81
+ >>> width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
82
+ >>> image = image.resize((width, height))
83
+ >>> prompt = (
84
+ ... "An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in "
85
+ ... "the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot."
86
+ ... )
87
+ >>> negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
88
+
89
+ >>> output = pipe(
90
+ ... image=image,
91
+ ... prompt=prompt,
92
+ ... negative_prompt=negative_prompt,
93
+ ... height=height,
94
+ ... width=width,
95
+ ... num_frames=81,
96
+ ... guidance_scale=5.0,
97
+ ... ).frames[0]
98
+ >>> export_to_video(output, "output.mp4", fps=16)
99
+ ```
100
+ """
101
+
102
+
103
+ def basic_clean(text):
104
+ text = ftfy.fix_text(text)
105
+ text = html.unescape(html.unescape(text))
106
+ return text.strip()
107
+
108
+
109
+ def whitespace_clean(text):
110
+ text = re.sub(r"\s+", " ", text)
111
+ text = text.strip()
112
+ return text
113
+
114
+
115
+ def prompt_clean(text):
116
+ text = whitespace_clean(basic_clean(text))
117
+ return text
118
+
119
+
120
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
121
+ def retrieve_latents(
122
+ encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
123
+ ):
124
+ if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
125
+ return encoder_output.latent_dist.sample(generator)
126
+ elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
127
+ return encoder_output.latent_dist.mode()
128
+ elif hasattr(encoder_output, "latents"):
129
+ return encoder_output.latents
130
+ else:
131
+ raise AttributeError("Could not access latents of provided encoder_output")
132
+
133
+
134
+ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
135
+ r"""
136
+ Pipeline for image-to-video generation using Wan.
137
+
138
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
139
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
140
+
141
+ Args:
142
+ tokenizer ([`T5Tokenizer`]):
143
+ Tokenizer from [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5Tokenizer),
144
+ specifically the [google/umt5-xxl](https://huggingface.co/google/umt5-xxl) variant.
145
+ text_encoder ([`T5EncoderModel`]):
146
+ [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
147
+ the [google/umt5-xxl](https://huggingface.co/google/umt5-xxl) variant.
148
+ image_encoder ([`CLIPVisionModel`]):
149
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPVisionModel), specifically
150
+ the
151
+ [clip-vit-huge-patch14](https://github.com/mlfoundations/open_clip/blob/main/docs/PRETRAINED.md#vit-h14-xlm-roberta-large)
152
+ variant.
153
+ transformer ([`WanTransformer3DModel`]):
154
+ Conditional Transformer to denoise the input latents.
155
+ scheduler ([`UniPCMultistepScheduler`]):
156
+ A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
157
+ vae ([`AutoencoderKLWan`]):
158
+ Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
159
+ transformer_2 ([`WanTransformer3DModel`], *optional*):
160
+ Conditional Transformer to denoise the input latents during the low-noise stage. In two-stage denoising,
161
+ `transformer` handles high-noise stages and `transformer_2` handles low-noise stages. If not provided, only
162
+ `transformer` is used.
163
+ boundary_ratio (`float`, *optional*, defaults to `None`):
164
+ Ratio of total timesteps to use as the boundary for switching between transformers in two-stage denoising.
165
+ The actual boundary timestep is calculated as `boundary_ratio * num_train_timesteps`. When provided,
166
+ `transformer` handles timesteps >= boundary_timestep and `transformer_2` handles timesteps <
167
+ boundary_timestep. If `None`, only `transformer` is used for the entire denoising process.
168
+ """
169
+
170
+ model_cpu_offload_seq = "text_encoder->image_encoder->transformer->transformer_2->vae"
171
+ _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
172
+ _optional_components = ["transformer", "transformer_2", "image_encoder", "image_processor"]
173
+
174
+ def __init__(
175
+ self,
176
+ tokenizer: AutoTokenizer,
177
+ text_encoder: UMT5EncoderModel,
178
+ vae: AutoencoderKLWan,
179
+ scheduler: FlowMatchEulerDiscreteScheduler,
180
+ image_processor: CLIPImageProcessor = None,
181
+ image_encoder: CLIPVisionModel = None,
182
+ transformer: WanTransformer3DModel = None,
183
+ transformer_2: WanTransformer3DModel = None,
184
+ boundary_ratio: Optional[float] = None,
185
+ expand_timesteps: bool = False,
186
+ ):
187
+ super().__init__()
188
+
189
+ self.register_modules(
190
+ vae=vae,
191
+ text_encoder=text_encoder,
192
+ tokenizer=tokenizer,
193
+ image_encoder=image_encoder,
194
+ transformer=transformer,
195
+ scheduler=scheduler,
196
+ image_processor=image_processor,
197
+ transformer_2=transformer_2,
198
+ )
199
+ self.register_to_config(boundary_ratio=boundary_ratio, expand_timesteps=expand_timesteps)
200
+
201
+ self.vae_scale_factor_temporal = self.vae.config.scale_factor_temporal if getattr(self, "vae", None) else 4
202
+ self.vae_scale_factor_spatial = self.vae.config.scale_factor_spatial if getattr(self, "vae", None) else 8
203
+ self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
204
+ self.image_processor = image_processor
205
+
206
+ def _get_t5_prompt_embeds(
207
+ self,
208
+ prompt: Union[str, List[str]] = None,
209
+ num_videos_per_prompt: int = 1,
210
+ max_sequence_length: int = 512,
211
+ device: Optional[torch.device] = None,
212
+ dtype: Optional[torch.dtype] = None,
213
+ ):
214
+ device = device or self._execution_device
215
+ dtype = dtype or self.text_encoder.dtype
216
+
217
+ prompt = [prompt] if isinstance(prompt, str) else prompt
218
+ prompt = [prompt_clean(u) for u in prompt]
219
+ batch_size = len(prompt)
220
+
221
+ text_inputs = self.tokenizer(
222
+ prompt,
223
+ padding="max_length",
224
+ max_length=max_sequence_length,
225
+ truncation=True,
226
+ add_special_tokens=True,
227
+ return_attention_mask=True,
228
+ return_tensors="pt",
229
+ )
230
+ text_input_ids, mask = text_inputs.input_ids, text_inputs.attention_mask
231
+ seq_lens = mask.gt(0).sum(dim=1).long()
232
+
233
+ prompt_embeds = self.text_encoder(text_input_ids.to(device), mask.to(device)).last_hidden_state
234
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
235
+ prompt_embeds = [u[:v] for u, v in zip(prompt_embeds, seq_lens)]
236
+ prompt_embeds = torch.stack(
237
+ [torch.cat([u, u.new_zeros(max_sequence_length - u.size(0), u.size(1))]) for u in prompt_embeds], dim=0
238
+ )
239
+
240
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
241
+ _, seq_len, _ = prompt_embeds.shape
242
+ prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
243
+ prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
244
+
245
+ return prompt_embeds
246
+
247
+ def encode_image(
248
+ self,
249
+ image: PipelineImageInput,
250
+ device: Optional[torch.device] = None,
251
+ ):
252
+ device = device or self._execution_device
253
+ image = self.image_processor(images=image, return_tensors="pt").to(device)
254
+ image_embeds = self.image_encoder(**image, output_hidden_states=True)
255
+ return image_embeds.hidden_states[-2]
256
+
257
+ # Copied from diffusers.pipelines.wan.pipeline_wan.WanPipeline.encode_prompt
258
+ def encode_prompt(
259
+ self,
260
+ prompt: Union[str, List[str]],
261
+ negative_prompt: Optional[Union[str, List[str]]] = None,
262
+ do_classifier_free_guidance: bool = True,
263
+ num_videos_per_prompt: int = 1,
264
+ prompt_embeds: Optional[torch.Tensor] = None,
265
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
266
+ max_sequence_length: int = 226,
267
+ device: Optional[torch.device] = None,
268
+ dtype: Optional[torch.dtype] = None,
269
+ ):
270
+ r"""
271
+ Encodes the prompt into text encoder hidden states.
272
+
273
+ Args:
274
+ prompt (`str` or `List[str]`, *optional*):
275
+ prompt to be encoded
276
+ negative_prompt (`str` or `List[str]`, *optional*):
277
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
278
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
279
+ less than `1`).
280
+ do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
281
+ Whether to use classifier free guidance or not.
282
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
283
+ Number of videos that should be generated per prompt.
284
+ prompt_embeds (`torch.Tensor`, *optional*):
285
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
286
+ provided, text embeddings will be generated from `prompt` input argument.
287
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
288
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
289
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
290
+ argument.
291
+ device: (`torch.device`, *optional*):
292
+ torch device
293
+ dtype: (`torch.dtype`, *optional*):
294
+ torch dtype
295
+ """
296
+ device = device or self._execution_device
297
+
298
+ prompt = [prompt] if isinstance(prompt, str) else prompt
299
+ if prompt is not None:
300
+ batch_size = len(prompt)
301
+ else:
302
+ batch_size = prompt_embeds.shape[0]
303
+
304
+ if prompt_embeds is None:
305
+ prompt_embeds = self._get_t5_prompt_embeds(
306
+ prompt=prompt,
307
+ num_videos_per_prompt=num_videos_per_prompt,
308
+ max_sequence_length=max_sequence_length,
309
+ device=device,
310
+ dtype=dtype,
311
+ )
312
+
313
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
314
+ negative_prompt = negative_prompt or ""
315
+ negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
316
+
317
+ if prompt is not None and type(prompt) is not type(negative_prompt):
318
+ raise TypeError(
319
+ f"`negative_prompt` should be the same type as `prompt`, but got {type(negative_prompt)} !="
320
+ f" {type(prompt)}."
321
+ )
322
+ elif batch_size != len(negative_prompt):
323
+ raise ValueError(
324
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
325
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
326
+ " the batch size of `prompt`."
327
+ )
328
+
329
+ negative_prompt_embeds = self._get_t5_prompt_embeds(
330
+ prompt=negative_prompt,
331
+ num_videos_per_prompt=num_videos_per_prompt,
332
+ max_sequence_length=max_sequence_length,
333
+ device=device,
334
+ dtype=dtype,
335
+ )
336
+
337
+ return prompt_embeds, negative_prompt_embeds
338
+
339
+ def check_inputs(
340
+ self,
341
+ prompt,
342
+ negative_prompt,
343
+ image,
344
+ height,
345
+ width,
346
+ prompt_embeds=None,
347
+ negative_prompt_embeds=None,
348
+ image_embeds=None,
349
+ callback_on_step_end_tensor_inputs=None,
350
+ guidance_scale_2=None,
351
+ ):
352
+ if image is not None and image_embeds is not None:
353
+ raise ValueError(
354
+ f"Cannot forward both `image`: {image} and `image_embeds`: {image_embeds}. Please make sure to"
355
+ " only forward one of the two."
356
+ )
357
+ if image is None and image_embeds is None:
358
+ raise ValueError(
359
+ "Provide either `image` or `image_embeds`. Cannot leave both `image` and `image_embeds` undefined."
360
+ )
361
+ if image is not None and not isinstance(image, torch.Tensor) and not isinstance(image, PIL.Image.Image):
362
+ raise ValueError(f"`image` has to be of type `torch.Tensor` or `PIL.Image.Image` but is {type(image)}")
363
+ if height % 16 != 0 or width % 16 != 0:
364
+ raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")
365
+
366
+ if callback_on_step_end_tensor_inputs is not None and not all(
367
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
368
+ ):
369
+ raise ValueError(
370
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
371
+ )
372
+
373
+ if prompt is not None and prompt_embeds is not None:
374
+ raise ValueError(
375
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
376
+ " only forward one of the two."
377
+ )
378
+ elif negative_prompt is not None and negative_prompt_embeds is not None:
379
+ raise ValueError(
380
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`: {negative_prompt_embeds}. Please make sure to"
381
+ " only forward one of the two."
382
+ )
383
+ elif prompt is None and prompt_embeds is None:
384
+ raise ValueError(
385
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
386
+ )
387
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
388
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
389
+ elif negative_prompt is not None and (
390
+ not isinstance(negative_prompt, str) and not isinstance(negative_prompt, list)
391
+ ):
392
+ raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}")
393
+
394
+ if self.config.boundary_ratio is None and guidance_scale_2 is not None:
395
+ raise ValueError("`guidance_scale_2` is only supported when the pipeline's `boundary_ratio` is not None.")
396
+
397
+ if self.config.boundary_ratio is not None and image_embeds is not None:
398
+ raise ValueError("Cannot forward `image_embeds` when the pipeline's `boundary_ratio` is configured (not None).")
399
+
400
+ def prepare_latents(
401
+ self,
402
+ image: PipelineImageInput,
403
+ traj_tensor,
404
+ ID_tensor,
405
+ batch_size: int,
406
+ num_channels_latents: int = 16,
407
+ height: int = 480,
408
+ width: int = 832,
409
+ num_frames: int = 81,
410
+ dtype: Optional[torch.dtype] = None,
411
+ device: Optional[torch.device] = None,
412
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
413
+ latents: Optional[torch.Tensor] = None,
414
+ last_image: Optional[torch.Tensor] = None,
415
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
416
+ num_latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
417
+ latent_height = height // self.vae_scale_factor_spatial
418
+ latent_width = width // self.vae_scale_factor_spatial
419
+
420
+ shape = (batch_size, num_channels_latents, num_latent_frames, latent_height, latent_width)
421
+ if isinstance(generator, list) and len(generator) != batch_size:
422
+ raise ValueError(
423
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
424
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
425
+ )
426
+
427
+ if latents is None:
428
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
429
+ else:
430
+ latents = latents.to(device=device, dtype=dtype)
431
+
432
+ image = image.unsqueeze(2) # [batch_size, channels, 1, height, width]
433
+
434
+ if self.config.expand_timesteps:
435
+ video_condition = image
436
+
437
+ elif last_image is None:
438
+ video_condition = torch.cat(
439
+ [image, image.new_zeros(image.shape[0], image.shape[1], num_frames - 1, height, width)], dim=2
440
+ )
441
+ else:
442
+ last_image = last_image.unsqueeze(2)
443
+ video_condition = torch.cat(
444
+ [image, image.new_zeros(image.shape[0], image.shape[1], num_frames - 2, height, width), last_image],
445
+ dim=2,
446
+ )
447
+ video_condition = video_condition.to(device=device, dtype=self.vae.dtype)
448
+
449
+ latents_mean = (
450
+ torch.tensor(self.vae.config.latents_mean)
451
+ .view(1, self.vae.config.z_dim, 1, 1, 1)
452
+ .to(latents.device, latents.dtype)
453
+ )
454
+ latents_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).to(
455
+ latents.device, latents.dtype
456
+ )
457
+
458
+ if isinstance(generator, list):
459
+ latent_condition = [
460
+ retrieve_latents(self.vae.encode(video_condition), sample_mode="argmax") for _ in generator
461
+ ]
462
+ latent_condition = torch.cat(latent_condition)
463
+ else:
464
+ latent_condition = retrieve_latents(self.vae.encode(video_condition), sample_mode="argmax")
465
+ latent_condition = latent_condition.repeat(batch_size, 1, 1, 1, 1)
466
+
467
+ latent_condition = latent_condition.to(dtype)
468
+ latent_condition = (latent_condition - latents_mean) * latents_std
469
+
470
+
471
+
472
+ # Prepare the traj latent
473
+ traj_tensor = traj_tensor.to(device, dtype=self.vae.dtype) #.unsqueeze(0)
474
+ traj_tensor = traj_tensor.unsqueeze(0)
475
+ traj_tensor = traj_tensor.permute(0, 2, 1, 3, 4) # [B, C, F, H, W]
476
+
477
+ # VAE Encode
478
+ traj_latents = retrieve_latents(self.vae.encode(traj_tensor), sample_mode="argmax")
479
+
480
+ # Normalize with the VAE latent mean and std
481
+ traj_latents = (traj_latents - latents_mean) * latents_std
482
+
483
+ # Final Convert
484
+ traj_latents = traj_latents.to(memory_format = torch.contiguous_format).float()
485
+
486
+
487
+
488
+ # Prepare the ID latents
489
+ if ID_tensor.shape[2] != 0: # Must have at least one ID frame; the tensor can sometimes be empty
490
+
491
+ # Transform
492
+ ID_tensor = ID_tensor.to(device=device, dtype=self.vae.dtype)
493
+
494
+ # VAE-encode each ID frame one by one
495
+ ID_latents = []
496
+ for frame_idx in range(ID_tensor.shape[2]):
497
+
498
+ # Fetch
499
+ ID_frame = ID_tensor[:, :, frame_idx].unsqueeze(2)  # take one frame without overwriting ID_tensor inside the loop
500
+
501
+ # Encode the single frame, which becomes one frame of latent tokens
502
+ ID_latent = retrieve_latents(self.vae.encode(ID_frame), sample_mode="argmax")
503
+ ID_latent = ID_latent.repeat(batch_size, 1, 1, 1, 1)
504
+
505
+ # Convert
506
+ ID_latent = ID_latent.to(dtype)
507
+ ID_latent = (ID_latent - latents_mean) * latents_std
508
+
509
+ # Append
510
+ ID_latents.append(ID_latent)
511
+
512
+ # Final Convert
513
+ ID_latent_condition = torch.cat(ID_latents, dim = 2)
514
+
515
+ # Add padding to the traj latents
516
+ ID_latent_padding = torch.zeros_like(ID_latent_condition)
517
+ traj_latents = torch.cat([traj_latents, ID_latent_padding], dim=2)
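+ # Zero-padding extends traj_latents along the frame axis by the number of ID latent frames, presumably so it keeps the same temporal length as the latent sequence once the ID reference latents are appended during denoising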
518
+
519
+ # Update the number of latents frames for the first frame mask
520
+ # num_latent_frames = num_latent_frames + len(ID_latents)
521
+
522
+ else:
523
+ # Return an empty one
524
+ ID_latent_condition = None
525
+
526
+
527
+
528
+ if self.config.expand_timesteps: # For Wan2.2
529
+ first_frame_mask = torch.ones(
530
+ 1, 1, num_latent_frames, latent_height, latent_width, dtype=dtype, device=device
531
+ )
532
+ first_frame_mask[:, :, 0] = 0
533
+
534
+ # Return all condition information needed
535
+ return latents, latent_condition, traj_latents, ID_latent_condition, first_frame_mask
536
+
537
+
538
+
539
+ # The rest is for Wan2.1
540
+ mask_lat_size = torch.ones(batch_size, 1, num_frames, latent_height, latent_width)
541
+
542
+ if last_image is None:
543
+ mask_lat_size[:, :, list(range(1, num_frames))] = 0
544
+ else:
545
+ mask_lat_size[:, :, list(range(1, num_frames - 1))] = 0
546
+ first_frame_mask = mask_lat_size[:, :, 0:1]
547
+ first_frame_mask = torch.repeat_interleave(first_frame_mask, dim=2, repeats=self.vae_scale_factor_temporal)
548
+ mask_lat_size = torch.concat([first_frame_mask, mask_lat_size[:, :, 1:, :]], dim=2)
549
+ mask_lat_size = mask_lat_size.view(batch_size, -1, self.vae_scale_factor_temporal, latent_height, latent_width)
550
+ mask_lat_size = mask_lat_size.transpose(1, 2)
551
+ mask_lat_size = mask_lat_size.to(latent_condition.device)
552
+
553
+ return latents, torch.concat([mask_lat_size, latent_condition], dim=1)
554
+
555
+ @property
556
+ def guidance_scale(self):
557
+ return self._guidance_scale
558
+
559
+ @property
560
+ def do_classifier_free_guidance(self):
561
+ return self._guidance_scale > 1
562
+
563
+ @property
564
+ def num_timesteps(self):
565
+ return self._num_timesteps
566
+
567
+ @property
568
+ def current_timestep(self):
569
+ return self._current_timestep
570
+
571
+ @property
572
+ def interrupt(self):
573
+ return self._interrupt
574
+
575
+ @property
576
+ def attention_kwargs(self):
577
+ return self._attention_kwargs
578
+
579
+ @torch.no_grad()
580
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
581
+ def __call__(
582
+ self,
583
+ image: PipelineImageInput,
584
+ prompt: Union[str, List[str]] = None,
585
+ negative_prompt: Union[str, List[str]] = None,
586
+ traj_tensor = None,
587
+ ID_tensor = None,
588
+ height: int = 480,
589
+ width: int = 832,
590
+ num_frames: int = 81,
591
+ num_inference_steps: int = 50,
592
+ guidance_scale: float = 5.0,
593
+ guidance_scale_2: Optional[float] = None,
594
+ num_videos_per_prompt: Optional[int] = 1,
595
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
596
+ latents: Optional[torch.Tensor] = None,
597
+ prompt_embeds: Optional[torch.Tensor] = None,
598
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
599
+ image_embeds: Optional[torch.Tensor] = None,
600
+ last_image: Optional[torch.Tensor] = None,
601
+ output_type: Optional[str] = "np",
602
+ return_dict: bool = True,
603
+ attention_kwargs: Optional[Dict[str, Any]] = None,
604
+ callback_on_step_end: Optional[
605
+ Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
606
+ ] = None,
607
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
608
+ max_sequence_length: int = 512,
609
+ ):
610
+ r"""
611
+ The call function to the pipeline for generation.
612
+
613
+ Args:
614
+ image (`PipelineImageInput`):
615
+ The input image to condition the generation on. Must be an image, a list of images or a `torch.Tensor`.
616
+ prompt (`str` or `List[str]`, *optional*):
617
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
618
+ instead.
619
+ negative_prompt (`str` or `List[str]`, *optional*):
620
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
621
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
622
+ less than `1`).
623
+ height (`int`, defaults to `480`):
624
+ The height of the generated video.
625
+ width (`int`, defaults to `832`):
626
+ The width of the generated video.
627
+ num_frames (`int`, defaults to `81`):
628
+ The number of frames in the generated video.
629
+ num_inference_steps (`int`, defaults to `50`):
630
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
631
+ expense of slower inference.
632
+ guidance_scale (`float`, defaults to `5.0`):
633
+ Guidance scale as defined in [Classifier-Free Diffusion
634
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
635
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
636
+ `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
637
+ the text `prompt`, usually at the expense of lower image quality.
638
+ guidance_scale_2 (`float`, *optional*, defaults to `None`):
639
+ Guidance scale for the low-noise stage transformer (`transformer_2`). If `None` and the pipeline's
640
+ `boundary_ratio` is not None, uses the same value as `guidance_scale`. Only used when `transformer_2`
641
+ and the pipeline's `boundary_ratio` are not None.
642
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
643
+ The number of images to generate per prompt.
644
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
645
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
646
+ generation deterministic.
647
+ latents (`torch.Tensor`, *optional*):
648
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
649
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
650
+ tensor is generated by sampling using the supplied random `generator`.
651
+ prompt_embeds (`torch.Tensor`, *optional*):
652
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
653
+ provided, text embeddings are generated from the `prompt` input argument.
654
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
655
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
656
+ provided, text embeddings are generated from the `negative_prompt` input argument.
657
+ image_embeds (`torch.Tensor`, *optional*):
658
+ Pre-generated image embeddings. Can be used to easily tweak image inputs (weighting). If not provided,
659
+ image embeddings are generated from the `image` input argument.
660
+ output_type (`str`, *optional*, defaults to `"np"`):
661
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
662
+ return_dict (`bool`, *optional*, defaults to `True`):
663
+ Whether or not to return a [`WanPipelineOutput`] instead of a plain tuple.
664
+ attention_kwargs (`dict`, *optional*):
665
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
666
+ `self.processor` in
667
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
668
+ callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
669
+ A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
670
+ each denoising step during inference with the following arguments: `callback_on_step_end(self:
671
+ DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
672
+ list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
673
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
674
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
675
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
676
+ `._callback_tensor_inputs` attribute of your pipeline class.
677
+ max_sequence_length (`int`, defaults to `512`):
678
+ The maximum sequence length of the text encoder. If the prompt is longer than this, it will be
679
+ truncated. If the prompt is shorter, it will be padded to this length.
680
+
681
+ Examples:
682
+
683
+ Returns:
684
+ [`~WanPipelineOutput`] or `tuple`:
685
+ If `return_dict` is `True`, [`WanPipelineOutput`] is returned, otherwise a `tuple` is returned where
686
+ the first element is a list with the generated images and the second element is a list of `bool`s
687
+ indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content.
688
+ """
689
+
+         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+         # 1. Check inputs. Raise error if not correct
+         self.check_inputs(
+             prompt,
+             negative_prompt,
+             image,
+             height,
+             width,
+             prompt_embeds,
+             negative_prompt_embeds,
+             image_embeds,
+             callback_on_step_end_tensor_inputs,
+             guidance_scale_2,
+         )
+
+         if num_frames % self.vae_scale_factor_temporal != 1:
+             logger.warning(
+                 f"`num_frames - 1` has to be divisible by {self.vae_scale_factor_temporal}. Rounding to the nearest number."
+             )
+             num_frames = num_frames // self.vae_scale_factor_temporal * self.vae_scale_factor_temporal + 1
+         num_frames = max(num_frames, 1)
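+         # Illustrative note (added for clarity, not part of the original commit): with a temporal VAE
+         # scale factor of 4, num_frames = 49 already satisfies (49 - 1) % 4 == 0 and is kept, while
+         # num_frames = 50 is rounded down to 49 before generation.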
+
+         if self.config.boundary_ratio is not None and guidance_scale_2 is None:
+             guidance_scale_2 = guidance_scale
+
+         self._guidance_scale = guidance_scale
+         self._guidance_scale_2 = guidance_scale_2
+         self._attention_kwargs = attention_kwargs
+         self._current_timestep = None
+         self._interrupt = False
+
+         device = self._execution_device
+
+         # 2. Define call parameters
+         if prompt is not None and isinstance(prompt, str):
+             batch_size = 1
+         elif prompt is not None and isinstance(prompt, list):
+             batch_size = len(prompt)
+         else:
+             batch_size = prompt_embeds.shape[0]
+
+         # 3. Encode input prompt
+         prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+             prompt=prompt,
+             negative_prompt=negative_prompt,
+             do_classifier_free_guidance=self.do_classifier_free_guidance,
+             num_videos_per_prompt=num_videos_per_prompt,
+             prompt_embeds=prompt_embeds,
+             negative_prompt_embeds=negative_prompt_embeds,
+             max_sequence_length=max_sequence_length,
+             device=device,
+         )
+
+         # Encode image embedding
+         transformer_dtype = self.transformer.dtype if self.transformer is not None else self.transformer_2.dtype
+         prompt_embeds = prompt_embeds.to(transformer_dtype)
+         if negative_prompt_embeds is not None:
+             negative_prompt_embeds = negative_prompt_embeds.to(transformer_dtype)
+
+         # Only the Wan 2.1 i2v transformer accepts image_embeds
+         if self.transformer is not None and self.transformer.config.image_dim is not None:
+             if image_embeds is None:
+                 if last_image is None:
+                     image_embeds = self.encode_image(image, device)
+                 else:
+                     image_embeds = self.encode_image([image, last_image], device)
+                 image_embeds = image_embeds.repeat(batch_size, 1, 1)
+             image_embeds = image_embeds.to(transformer_dtype)
+
+         # 4. Prepare timesteps
+         self.scheduler.set_timesteps(num_inference_steps, device=device)
+         timesteps = self.scheduler.timesteps
+
+         # 5. Prepare latent variables
+         num_channels_latents = self.vae.config.z_dim
+         image = self.video_processor.preprocess(image, height=height, width=width).to(device, dtype=torch.float32)
+         if last_image is not None:
+             last_image = self.video_processor.preprocess(last_image, height=height, width=width).to(
+                 device, dtype=torch.float32
+             )
+
+         latents_outputs = self.prepare_latents(
+             image,
+             traj_tensor,
+             ID_tensor,
+             batch_size * num_videos_per_prompt,
+             num_channels_latents,
+             height,
+             width,
+             num_frames,
+             torch.float32,
+             device,
+             generator,
+             latents,
+             last_image,
+         )
+         if self.config.expand_timesteps:
+             # Wan 2.2 5B i2v uses first_frame_mask to mask the timesteps
+             latents, condition, traj_latents, ID_latent_condition, first_frame_mask = latents_outputs
+         else:
+             latents, condition = latents_outputs
+
+
+         # 5.5. For the ID reference, record the latent shape so padding can be added at each step
+         _, channel_num, num_gen_frames, latent_height, latent_width = latents.shape
+
+
+
+         # 6. Denoising loop
+         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+         self._num_timesteps = len(timesteps)
+
+         if self.config.boundary_ratio is not None:
+             boundary_timestep = self.config.boundary_ratio * self.scheduler.config.num_train_timesteps
+         else:
+             boundary_timestep = None
+
+         with self.progress_bar(total=num_inference_steps) as progress_bar:
+             for i, t in enumerate(timesteps):
+                 if self.interrupt:
+                     continue
+
+                 self._current_timestep = t
+
+                 if boundary_timestep is None or t >= boundary_timestep:
+                     # Wan 2.1, or the high-noise stage in Wan 2.2
+                     current_model = self.transformer
+                     current_guidance_scale = guidance_scale
+                 else:
+                     # Low-noise stage in Wan 2.2
+                     current_model = self.transformer_2
+                     current_guidance_scale = guidance_scale_2
+
+
+                 if self.config.expand_timesteps:
+
+                     # Blend with the mask so the first-frame latent of the model input is the clean latent of the first-frame condition (here, for Frame INO, the first frame is the masked, outpainting-style condition)
+                     latent_model_input = (1 - first_frame_mask) * condition + first_frame_mask * latents  # NOTE: the first frame should be set to the masked first frame (outpainting-style)
+                     latent_model_input = latent_model_input.to(transformer_dtype)
+
+                     # Pad first_frame_mask with the length of the ID tokens
+                     if ID_latent_condition is not None:
+                         mask_padding = torch.ones(
+                             1, 1, ID_latent_condition.shape[2], latent_height, latent_width, dtype=transformer_dtype, device=device
+                         )
+                         first_frame_mask_adjust = torch.cat([first_frame_mask, mask_padding], dim = 2)
+                     else:
+                         first_frame_mask_adjust = first_frame_mask
+
+                     # Reshape to num_latent_frames * (latent_height // patch_size) * (latent_width // patch_size)
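+                     # Descriptive note (added for clarity, not part of the original commit): first_frame_mask_adjust
+                     # is 0 over the clean first-frame condition and 1 everywhere else (the padding appended for the
+                     # ID reference frames is all ones), so each token below receives timestep 0 if it comes from the
+                     # clean condition and the current timestep t otherwise.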
+                     temp_ts = (first_frame_mask_adjust[0][0][:, ::2, ::2] * t).flatten()
+                     timestep = temp_ts.unsqueeze(0).expand(latents.shape[0], -1)
+
+                 else:
+
+                     latent_model_input = torch.cat([latents, condition], dim=1).to(transformer_dtype)
+                     timestep = t.expand(latents.shape[0])
+                     # TODO: not yet fully sure whether this timestep is aligned with the one used in training
+
+
+                 # Frame-wise concatenation of the ID tokens
+                 if ID_latent_condition is not None:
+                     latent_model_input = torch.cat([latent_model_input, ID_latent_condition], dim = 2)
+
+
+                 # Concatenate the trajectory latents along the channel dimension
+                 latent_model_input = torch.cat([latent_model_input, traj_latents], dim = 1).to(transformer_dtype)
+
+
+                 # Predict the noise according to the timestep
+                 with current_model.cache_context("cond"):
+                     noise_pred = current_model(
+                         hidden_states = latent_model_input,
+                         timestep = timestep,
+                         encoder_hidden_states = prompt_embeds,
+                         encoder_hidden_states_image = image_embeds,
+                         attention_kwargs = attention_kwargs,
+                         return_dict = False,
+                     )[0]
+
+                 if self.do_classifier_free_guidance:
+                     with current_model.cache_context("uncond"):
+                         noise_uncond = current_model(
+                             hidden_states = latent_model_input,
+                             timestep = timestep,
+                             encoder_hidden_states = negative_prompt_embeds,
+                             encoder_hidden_states_image = image_embeds,
+                             attention_kwargs = attention_kwargs,
+                             return_dict = False,
+                         )[0]
+                     noise_pred = noise_uncond + current_guidance_scale * (noise_pred - noise_uncond)
+
+
+                 # Discard the extra ID tokens from the noise prediction
+                 noise_pred = noise_pred[:, :, :num_gen_frames]
+
+
+
+                 # compute the previous noisy sample x_t -> x_t-1
+                 latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+                 if callback_on_step_end is not None:
+                     callback_kwargs = {}
+                     for k in callback_on_step_end_tensor_inputs:
+                         callback_kwargs[k] = locals()[k]
+                     callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                     latents = callback_outputs.pop("latents", latents)
+                     prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                     negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                 # call the callback, if provided
+                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                     progress_bar.update()
+
+                 if XLA_AVAILABLE:
+                     xm.mark_step()
+
+         self._current_timestep = None
+
+         if self.config.expand_timesteps:
+             latents = (1 - first_frame_mask) * condition + first_frame_mask * latents
+
+         if not output_type == "latent":
+             latents = latents.to(self.vae.dtype)
+             latents_mean = (
+                 torch.tensor(self.vae.config.latents_mean)
+                 .view(1, self.vae.config.z_dim, 1, 1, 1)
+                 .to(latents.device, latents.dtype)
+             )
+             latents_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).to(
+                 latents.device, latents.dtype
+             )
+             latents = latents / latents_std + latents_mean
+             video = self.vae.decode(latents, return_dict=False)[0]
+             video = self.video_processor.postprocess_video(video, output_type=output_type)
+         else:
+             video = latents
+
+         # Offload all models
+         self.maybe_free_model_hooks()
+
+         if not return_dict:
+             return (video,)
+
+         return WanPipelineOutput(frames=video)
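The expand_timesteps branch above is the densest part of the denoising loop. The following standalone sketch, with made-up latent sizes, reproduces only the first-frame masking and the per-token timestep construction from that branch so the shapes are easier to follow; everything except the two expressions copied from the code above is illustrative:

    import torch

    # Assumed toy latent shape: batch, channels, latent frames, latent height, latent width
    B, C, F, H, W = 1, 48, 13, 30, 52
    latents = torch.randn(B, C, F, H, W)            # noisy latents being denoised
    condition = torch.zeros(B, C, F, H, W)          # clean (masked, outpainting-style) first-frame condition
    first_frame_mask = torch.ones(B, 1, F, H, W)
    first_frame_mask[:, :, 0] = 0                   # 0 = keep the clean condition, 1 = denoise

    t = torch.tensor(981.0)                         # current scheduler timestep

    # Same blend as in the pipeline: condition where the mask is 0, noisy latents elsewhere
    latent_model_input = (1 - first_frame_mask) * condition + first_frame_mask * latents

    # One timestep per transformer token: latent frames x (H // 2) x (W // 2) spatial patches
    temp_ts = (first_frame_mask[0, 0][:, ::2, ::2] * t).flatten()
    timestep = temp_ts.unsqueeze(0).expand(B, -1)
    print(latent_model_input.shape, timestep.shape)  # (1, 48, 13, 30, 52) and (1, 13 * 15 * 26)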
requirements.txt ADDED
@@ -0,0 +1,23 @@
+ pandas
+ tqdm
+ opencv-python
+ pyiqa
+ numpy==1.26.0
+ ffmpeg-python
+ bitsandbytes
+ pyarrow
+ omegaconf
+ peft>=0.15.0
+ transformers>=4.56.2  # install a recent release
+ git+https://github.com/huggingface/diffusers.git
+ sentencepiece
+ qwen-vl-utils[decord]==0.0.8
+ scikit-learn
+ matplotlib
+ gradio
+ imageio-ffmpeg
+ git+https://github.com/facebookresearch/segment-anything.git
+ git+https://github.com/facebookresearch/sam2.git
+ accelerate
+ hf-transfer
+
utils/optical_flow_utils.py ADDED
@@ -0,0 +1,219 @@
+ import numpy as np
+
+
+ def make_colorwheel():
+     """
+     Generates a color wheel for optical flow visualization as presented in:
+     Baker et al. "A Database and Evaluation Methodology for Optical Flow" (ICCV, 2007)
+     URL: http://vision.middlebury.edu/flow/flowEval-iccv07.pdf
+
+     Code follows the original C++ source code of Daniel Scharstein.
+     Code follows the Matlab source code of Deqing Sun.
+
+     Returns:
+         np.ndarray: Color wheel
+     """
+
+     RY = 15
+     YG = 6
+     GC = 4
+     CB = 11
+     BM = 13
+     MR = 6
+
+     ncols = RY + YG + GC + CB + BM + MR
+     colorwheel = np.zeros((ncols, 3))
+     col = 0
+
+     # RY
+     colorwheel[0:RY, 0] = 255
+     colorwheel[0:RY, 1] = np.floor(255*np.arange(0,RY)/RY)
+     col = col+RY
+     # YG
+     colorwheel[col:col+YG, 0] = 255 - np.floor(255*np.arange(0,YG)/YG)
+     colorwheel[col:col+YG, 1] = 255
+     col = col+YG
+     # GC
+     colorwheel[col:col+GC, 1] = 255
+     colorwheel[col:col+GC, 2] = np.floor(255*np.arange(0,GC)/GC)
+     col = col+GC
+     # CB
+     colorwheel[col:col+CB, 1] = 255 - np.floor(255*np.arange(CB)/CB)
+     colorwheel[col:col+CB, 2] = 255
+     col = col+CB
+     # BM
+     colorwheel[col:col+BM, 2] = 255
+     colorwheel[col:col+BM, 0] = np.floor(255*np.arange(0,BM)/BM)
+     col = col+BM
+     # MR
+     colorwheel[col:col+MR, 2] = 255 - np.floor(255*np.arange(MR)/MR)
+     colorwheel[col:col+MR, 0] = 255
+     return colorwheel
+
+
+ def flow_uv_to_colors(u, v, convert_to_bgr=False):
+     """
+     Applies the flow color wheel to (possibly clipped) flow components u and v.
+
+     According to the C++ source code of Daniel Scharstein
+     According to the Matlab source code of Deqing Sun
+
+     Args:
+         u (np.ndarray): Input horizontal flow of shape [H,W]
+         v (np.ndarray): Input vertical flow of shape [H,W]
+         convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False.
+
+     Returns:
+         np.ndarray: Flow visualization image of shape [H,W,3] in range [0, 255]
+     """
+     flow_image = np.zeros((u.shape[0], u.shape[1], 3), np.uint8)
+     colorwheel = make_colorwheel()  # shape [55x3]
+     ncols = colorwheel.shape[0]
+     rad = np.sqrt(np.square(u) + np.square(v))
+     a = np.arctan2(-v, -u)/np.pi
+     fk = (a+1) / 2*(ncols-1)
+     k0 = np.floor(fk).astype(np.int32)
+     k1 = k0 + 1
+     k1[k1 == ncols] = 0
+     f = fk - k0
+     for i in range(colorwheel.shape[1]):
+         tmp = colorwheel[:,i]
+         col0 = tmp[k0] / 255.0
+         col1 = tmp[k1] / 255.0
+         col = (1-f)*col0 + f*col1
+         idx = (rad <= 1)
+         col[idx] = 1 - rad[idx] * (1-col[idx])
+         col[~idx] = col[~idx] * 0.75  # out of range
+         # Note the 2-i => BGR instead of RGB
+         ch_idx = 2-i if convert_to_bgr else i
+         flow_image[:,:,ch_idx] = np.floor(255 * col)
+     return flow_image
+
+
+ def flow_to_image(flow_uv, clip_flow=None, convert_to_bgr=False):
+     """
+     Expects a two-dimensional flow field of shape [H,W,2].
+
+     Args:
+         flow_uv (np.ndarray): Flow UV image of shape [H,W,2]
+         clip_flow (float, optional): Clip maximum of flow values. Defaults to None.
+         convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False.
+
+     Returns:
+         np.ndarray: Flow visualization image of shape [H,W,3]
+     """
+     assert flow_uv.ndim == 3, 'input flow must have three dimensions'
+     assert flow_uv.shape[2] == 2, 'input flow must have shape [H,W,2]'
+
+     if clip_flow is not None:
+         flow_uv = np.clip(flow_uv, 0, clip_flow)
+
+     u = flow_uv[:,:,0]
+     v = flow_uv[:,:,1]
+     rad = np.sqrt(np.square(u) + np.square(v))
+     rad_max = np.max(rad)
+     epsilon = 1e-5
+     u = u / (rad_max + epsilon)
+     v = v / (rad_max + epsilon)
+     return flow_uv_to_colors(u, v, convert_to_bgr)
+
+
+
+ def filter_uv(flow, threshold_factor = 0.1, sample_prob = 1.0):
+     '''
+     Args:
+         flow (np.ndarray): Array of shape [H,W,2] storing the x and y flow components
+         threshold_factor (float): Fraction of the maximum flow magnitude below which vectors are zeroed out
+         sample_prob (float): Proportion of the remaining flow vectors to keep
+     '''
+     u = flow[:,:,0]
+     v = flow[:,:,1]
+
+     # Zero out vectors whose magnitude is below the threshold
+     rad = np.sqrt(np.square(u) + np.square(v))
+     rad_max = np.max(rad)
+
+     threshold = threshold_factor * rad_max
+     flow[:,:,0][rad < threshold] = 0
+     flow[:,:,1][rad < threshold] = 0
+
+
+     # Randomly sample based on sample_prob
+     zero_prob = 1 - sample_prob
+     random_array = np.random.rand(*flow.shape)  # uniform in [0, 1) so the keep rate matches sample_prob
+     random_array[random_array < zero_prob] = 0
+     random_array[random_array >= zero_prob] = 1
+     flow = flow * random_array
+
+
+     return flow
+
+
+
+ ############################################# The following is for the dilation method in optical flow ######################################
+ def sigma_matrix2(sig_x, sig_y, theta):
+     """Calculate the rotated sigma matrix (two dimensional matrix).
+     Args:
+         sig_x (float):
+         sig_y (float):
+         theta (float): Radian measurement.
+     Returns:
+         ndarray: Rotated sigma matrix.
+     """
+     d_matrix = np.array([[sig_x**2, 0], [0, sig_y**2]])
+     u_matrix = np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]])
+     return np.dot(u_matrix, np.dot(d_matrix, u_matrix.T))
+
+
+ def mesh_grid(kernel_size):
+     """Generate the mesh grid, centering at zero.
+     Args:
+         kernel_size (int):
+     Returns:
+         xy (ndarray): with the shape (kernel_size, kernel_size, 2)
+         xx (ndarray): with the shape (kernel_size, kernel_size)
+         yy (ndarray): with the shape (kernel_size, kernel_size)
+     """
+     ax = np.arange(-kernel_size // 2 + 1., kernel_size // 2 + 1.)
+     xx, yy = np.meshgrid(ax, ax)
+     xy = np.hstack((xx.reshape((kernel_size * kernel_size, 1)), yy.reshape(kernel_size * kernel_size,
+                     1))).reshape(kernel_size, kernel_size, 2)
+     return xy, xx, yy
+
+
+ def pdf2(sigma_matrix, grid):
+     """Calculate PDF of the bivariate Gaussian distribution.
+     Args:
+         sigma_matrix (ndarray): with the shape (2, 2)
+         grid (ndarray): generated by :func:`mesh_grid`,
+             with the shape (K, K, 2), K is the kernel size.
+     Returns:
+         kernel (ndarray): un-normalized kernel.
+     """
+     inverse_sigma = np.linalg.inv(sigma_matrix)
+     kernel = np.exp(-0.5 * np.sum(np.dot(grid, inverse_sigma) * grid, 2))
+     return kernel
+
+ def bivariate_Gaussian(kernel_size, sig_x, sig_y, theta, grid=None, isotropic=True):
+     """Generate a bivariate isotropic or anisotropic Gaussian kernel.
+     In the isotropic mode, only `sig_x` is used. `sig_y` and `theta` are ignored.
+     Args:
+         kernel_size (int):
+         sig_x (float):
+         sig_y (float):
+         theta (float): Radian measurement.
+         grid (ndarray, optional): generated by :func:`mesh_grid`,
+             with the shape (K, K, 2), K is the kernel size. Default: None
+         isotropic (bool):
+     Returns:
+         kernel (ndarray): normalized kernel.
+     """
+     if grid is None:
+         grid, _, _ = mesh_grid(kernel_size)
+     if isotropic:
+         sigma_matrix = np.array([[sig_x**2, 0], [0, sig_x**2]])
+     else:
+         sigma_matrix = sigma_matrix2(sig_x, sig_y, theta)
+     kernel = pdf2(sigma_matrix, grid)
+     kernel = kernel / np.sum(kernel)
+     return kernel
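As a usage sketch for these helpers, the snippet below computes a dense flow field with OpenCV's Farneback method, filters and visualizes it, and builds a Gaussian kernel of the kind used to dilate sparse flow or trajectory maps. The input file names, the Farneback parameters, and the sigma values are placeholders chosen for illustration, not values taken from this repository:

    import cv2
    import numpy as np

    from utils.optical_flow_utils import filter_uv, flow_to_image, bivariate_Gaussian

    # Placeholder frames: any two grayscale frames of equal size will do.
    frame0 = cv2.imread("frame_000.png", cv2.IMREAD_GRAYSCALE)
    frame1 = cv2.imread("frame_001.png", cv2.IMREAD_GRAYSCALE)

    # Dense optical flow of shape [H, W, 2] (x and y displacement per pixel).
    flow = cv2.calcOpticalFlowFarneback(frame0, frame1, None, 0.5, 3, 15, 3, 5, 1.2, 0)

    # Zero out weak vectors, randomly keep about 30% of the rest, then render the color-wheel visualization.
    flow = filter_uv(flow, threshold_factor=0.1, sample_prob=0.3)
    vis = flow_to_image(flow, convert_to_bgr=True)
    cv2.imwrite("flow_vis.png", vis)

    # A 21x21 isotropic Gaussian kernel, e.g. for dilating sparse flow/trajectory maps.
    kernel = bivariate_Gaussian(kernel_size=21, sig_x=5.0, sig_y=5.0, theta=0.0, isotropic=True)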