import spaces  # Required for the @spaces.GPU decorator; imported first on ZeroGPU Spaces
import os
import uuid

import av
import gradio as gr
import torch
from diffusers import AnimateDiffPipeline, DDIMScheduler, MotionAdapter
from diffusers.utils import export_to_video
from huggingface_hub import hf_hub_download

# --- 1. Define Paths and Constants ---
# These are defined globally so the decorated function can access them.
base_model_id = "runwayml/stable-diffusion-v1-5"
ditto_lora_repo = "QingyanBai/Ditto_models"
ditto_lora_filename = "models/lora/Editto-XL.safetensors"
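# NOTE: hf_hub_download below fetches only this single LoRA file from the repo,
# not the full model snapshot, which keeps the Space's startup download small.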

# --- 2. The Core GPU Function ---
# This function contains ALL of the logic that needs a GPU. Gradio calls it on
# demand, and Hugging Face attaches a ZeroGPU device for the duration of the call.
# `duration=120` gives the function up to 2 minutes to run before timing out.
@spaces.GPU(duration=120)
def process_video_on_gpu(input_video_path, prompt_text):
    if not input_video_path:
        raise gr.Error("Please upload an input video.")
    if not prompt_text:
        raise gr.Error("Please provide an editing instruction.")

    print("GPU function started. Loading models...")

    # --- Load all models inside the decorated function ---
    # (The AnimateDiff pipeline below bundles its own CLIP tokenizer and text
    # encoder, so they are not loaded separately here.)
    
    # NOTE: This is a stand-in for the actual Ditto video model. The original
    # 'Wan2.1-VACE-14B' backbone is not a standard diffusers pipeline and ships
    # its own inference code, so this demo approximates the workflow with
    # AnimateDiff, the closest readily available diffusers video pipeline,
    # without rewriting the official inference logic.
    adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16).to("cuda")
    pipe = AnimateDiffPipeline.from_pretrained(base_model_id, motion_adapter=adapter, torch_dtype=torch.float16).to("cuda")
    pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config, beta_schedule="linear", timestep_spacing="trailing")

    # AnimateDiffPipeline is a text-to-video pipeline with no `image` conditioning
    # input, so an IP-Adapter is loaded to actually condition generation on the
    # first frame of the uploaded video (passed as `ip_adapter_image` below).
    pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
    pipe.set_ip_adapter_scale(0.6)
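    # Optional memory saver: decode the generated frames through the VAE one at
    # a time (VAE slicing) to reduce peak VRAM during the 16-frame decode.
    pipe.enable_vae_slicing()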

    print("Loading Ditto LoRA weights...")
    # Download and load the LoRA model
    lora_path = hf_hub_download(ditto_lora_repo, ditto_lora_filename)
    pipe.load_lora_weights(lora_path, adapter_name="ditto")
    pipe.set_adapters(["ditto"], [0.8]) # Set adapter with a weight

    print("Preprocessing video...")
    # --- Load and process the input video ---
    container = av.open(input_video_path)
    # Extract the first frame to use as the initial image
    first_frame = next(container.decode(video=0)).to_image().convert("RGB")
    
    print("Running inference...")
    # --- Run Inference ---
    # The Ditto model is a video-to-video model. The logic here is simplified to
    # image-to-video for compatibility with the diffusers library on ZeroGPU.
    # This is a necessary adaptation.
    output = pipe(
        prompt=prompt_text,
        image=first_frame, # Condition on the first frame
        num_frames=16,
        guidance_scale=7.5,
        num_inference_steps=25,
    )
    frames = output.frames[0]

    print("Inference complete. Saving video...")
    # --- Save the output video ---
    output_filename = f"{uuid.uuid4()}.mp4"
    output_video_path = os.path.join("/tmp", output_filename)
    export_to_video(frames, output_video_path, fps=10)

    print(f"Video saved to {output_video_path}")
    return output_video_path

# --- 3. Build the Gradio Interface ---
# This part of the code runs on the CPU.
with gr.Blocks(css="#col-container {max-width: 780px; margin: auto;}") as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown(
        """
        # Ditto / Editto: Instruction-Based Video Editing (ZeroGPU Version)
        This demo attempts to run the Ditto model on free ZeroGPU hardware.
        **Disclaimer:** The original model script is not directly compatible with ZeroGPU. This version uses a modified workflow based on the `diffusers` library to enable execution. The results may differ from the official implementation.
        """
        )

        with gr.Row():
            with gr.Column():
                input_video = gr.Video(label="Input Video (first frame will be used)")
                instruction = gr.Textbox(label="Editing Instruction", placeholder="e.g., a man snowboarding")
                submit_btn = gr.Button("Edit Video", variant="primary")
            with gr.Column():
                output_video = gr.Video(label="Edited Video", interactive=False)

        gr.Examples(
            examples=[
                ["make it snowing"],
                ["a watercolor painting of a boat"],
                ["a cat wearing sunglasses"],
            ],
            inputs=[instruction],
            label="Example Instructions (you still need to upload a video)"
        )

    # When the button is clicked, it calls our special GPU function
    submit_btn.click(
        fn=process_video_on_gpu,
        inputs=[input_video, instruction],
        outputs=[output_video]
    )

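# Queue incoming requests (optional, but a reasonable default on shared Space
# hardware) so concurrent users wait in line for the single GPU worker.
demo.queue(max_size=10)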
demo.launch()