import spaces  # Required for the @spaces.GPU decorator; imported first on ZeroGPU Spaces
import os
import uuid

import av
import gradio as gr
import torch
from diffusers import AnimateDiffPipeline, DDIMScheduler, MotionAdapter
from diffusers.utils import export_to_video
from huggingface_hub import hf_hub_download

# --- 1. Define Paths and Constants ---
# These are defined globally so the decorated function can access them.
base_model_id = "runwayml/stable-diffusion-v1-5"
ditto_lora_repo = "QingyanBai/Ditto_models"
ditto_lora_filename = "models/lora/Editto-XL.safetensors"
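# NOTE: hf_hub_download below fetches only this single LoRA file from the repo,
# not the full model snapshot, which keeps the Space's startup download small.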

# --- 2. The Core GPU Function ---
# This function contains ALL of the logic that needs a GPU. Gradio calls it on
# demand, and Hugging Face attaches a ZeroGPU device for the duration of the call.
# `duration=120` gives the function up to 2 minutes to run before timing out.
@spaces.GPU(duration=120)
def process_video_on_gpu(input_video_path, prompt_text):
    if not input_video_path:
        raise gr.Error("Please upload an input video.")
    if not prompt_text:
        raise gr.Error("Please provide an editing instruction.")

    print("GPU function started. Loading models...")

    # --- Load all models inside the decorated function ---
    # (The AnimateDiff pipeline below bundles its own CLIP tokenizer and text
    # encoder, so they are not loaded separately here.)
    
    # NOTE: This is a stand-in for the actual Ditto video model. The original
    # 'Wan2.1-VACE-14B' backbone is not a standard diffusers pipeline and ships
    # its own inference code, so this demo approximates the workflow with
    # AnimateDiff, the closest readily available diffusers video pipeline,
    # without rewriting the official inference logic.
    adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16).to("cuda")
    pipe = AnimateDiffPipeline.from_pretrained(base_model_id, motion_adapter=adapter, torch_dtype=torch.float16).to("cuda")
    pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config, beta_schedule="linear", timestep_spacing="trailing")

    # AnimateDiffPipeline is a text-to-video pipeline with no `image` conditioning
    # input, so an IP-Adapter is loaded to actually condition generation on the
    # first frame of the uploaded video (passed as `ip_adapter_image` below).
    pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
    pipe.set_ip_adapter_scale(0.6)
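    # Optional memory saver: decode the generated frames through the VAE one at
    # a time (VAE slicing) to reduce peak VRAM during the 16-frame decode.
    pipe.enable_vae_slicing()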

    print("Loading Ditto LoRA weights...")
    # Download and load the LoRA model
    lora_path = hf_hub_download(ditto_lora_repo, ditto_lora_filename)
    pipe.load_lora_weights(lora_path, adapter_name="ditto")
    pipe.set_adapters(["ditto"], [0.8]) # Set adapter with a weight

    print("Preprocessing video...")
    # --- Load and process the input video ---
    container = av.open(input_video_path)
    # Extract the first frame to use as the initial image
    first_frame = next(container.decode(video=0)).to_image().convert("RGB")
    
    print("Running inference...")
    # --- Run Inference ---
    # The Ditto model is a video-to-video model. The logic here is simplified to
    # image-to-video for compatibility with the diffusers library on ZeroGPU.
    # This is a necessary adaptation.
    output = pipe(
        prompt=prompt_text,
        image=first_frame, # Condition on the first frame
        num_frames=16,
        guidance_scale=7.5,
        num_inference_steps=25,
    )
    frames = output.frames[0]

    print("Inference complete. Saving video...")
    # --- Save the output video ---
    output_filename = f"{uuid.uuid4()}.mp4"
    output_video_path = os.path.join("/tmp", output_filename)
    export_to_video(frames, output_video_path, fps=10)

    print(f"Video saved to {output_video_path}")
    return output_video_path

# --- 3. Build the Gradio Interface ---
# This part of the code runs on the CPU.
with gr.Blocks(css="#col-container {max-width: 780px; margin: auto;}") as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown(
        """
        # Ditto / Editto: Instruction-Based Video Editing (ZeroGPU Version)
        This demo attempts to run the Ditto model on free ZeroGPU hardware.
        **Disclaimer:** The original model script is not directly compatible with ZeroGPU. This version uses a modified workflow based on the `diffusers` library to enable execution. The results may differ from the official implementation.
        """
        )

        with gr.Row():
            with gr.Column():
                input_video = gr.Video(label="Input Video (first frame will be used)")
                instruction = gr.Textbox(label="Editing Instruction", placeholder="e.g., a man snowboarding")
                submit_btn = gr.Button("Edit Video", variant="primary")
            with gr.Column():
                output_video = gr.Video(label="Edited Video", interactive=False)

        gr.Examples(
            examples=[
                ["make it snowing"],
                ["a watercolor painting of a boat"],
                ["a cat wearing sunglasses"],
            ],
            inputs=[instruction],
            label="Example Instructions (you still need to upload a video)"
        )

    # When the button is clicked, it calls our special GPU function
    submit_btn.click(
        fn=process_video_on_gpu,
        inputs=[input_video, instruction],
        outputs=[output_video]
    )

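# Queue incoming requests (optional, but a reasonable default on shared Space
# hardware) so concurrent users wait in line for the single GPU worker.
demo.queue(max_size=10)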
demo.launch()