# Ditto / app.py
import gradio as gr
import torch
from diffusers import AnimateDiffPipeline, DDIMScheduler, MotionAdapter
from diffusers.utils import export_to_video
import os
import av
import uuid
from huggingface_hub import hf_hub_download
import spaces # Required for the @spaces.GPU decorator
# --- 1. Define Paths and Constants ---
# These are defined globally so the decorated function can access them.
base_model_id = "runwayml/stable-diffusion-v1-5"
ditto_lora_repo = "QingyanBai/Ditto_models"
ditto_lora_filename = "models/lora/Editto-XL.safetensors"
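
# --- Optional helper: decode frames with PyAV (illustrative sketch) ---
# Not used by the first-frame-only flow below; something like this would be needed to feed
# the whole clip into a true video-to-video pipeline (for example, diffusers'
# AnimateDiffVideoToVideoPipeline). The function name and the max_frames default are
# placeholders, not part of the original script.
def read_video_frames(video_path, max_frames=16):
    frames = []
    with av.open(video_path) as container:
        for frame in container.decode(video=0):
            frames.append(frame.to_image().convert("RGB"))
            if len(frames) >= max_frames:
                break
    return frames
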
# --- 2. The Core GPU Function ---
# This function contains ALL the logic that needs a GPU.
# It will be called by Gradio, and Hugging Face will attach a ZeroGPU device for its duration.
# `duration=120` gives the function up to 2 minutes to run before timing out.
@spaces.GPU(duration=120)
def process_video_on_gpu(input_video_path, prompt_text):
    if not input_video_path:
        raise gr.Error("Please upload an input video.")
    if not prompt_text:
        raise gr.Error("Please provide an editing instruction.")
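    # Both gr.Error exceptions above are surfaced to the user as error popups in the Gradio UI.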
print("GPU function started. Loading models...")
# --- Load all models inside the decorated function ---
tokenizer = CLIPTokenizer.from_pretrained(base_model_id, subfolder="tokenizer")
text_encoder = CLIPTextModel.from_pretrained(base_model_id, subfolder="text_encoder", torch_dtype=torch.float16).to("cuda")
    # NOTE: This is a simplification. The original 'Wan2.1-VACE-14B' backbone is not a standard
    # diffusers pipeline and requires its own custom inference code, so a known-good
    # AnimateDiff + Stable Diffusion 1.5 setup is used here as the closest stand-in.
    adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16)
    pipe = AnimateDiffPipeline.from_pretrained(base_model_id, motion_adapter=adapter, torch_dtype=torch.float16).to("cuda")
    pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config, beta_schedule="linear", timestep_spacing="trailing")
    # AnimateDiffPipeline is text-to-video and does not accept an `image` argument, so the first
    # frame is conditioned in via an IP-Adapter (diffusers' standard image-prompting mechanism).
    pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
print("Loading Ditto LoRA weights...")
# Download and load the LoRA model
lora_path = hf_hub_download(ditto_lora_repo, ditto_lora_filename)
pipe.load_lora_weights(lora_path, adapter_name="ditto")
pipe.set_adapters(["ditto"], [0.8]) # Set adapter with a weight
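    # The 0.8 weight scales how strongly the "ditto" LoRA modifies the base UNet; 1.0 would apply it at full strength.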
print("Preprocessing video...")
# --- Load and process the input video ---
container = av.open(input_video_path)
# Extract the first frame to use as the initial image
first_frame = next(container.decode(video=0)).to_image().convert("RGB")
print("Running inference...")
# --- Run Inference ---
# The Ditto model is a video-to-video model. The logic here is simplified to
# image-to-video for compatibility with the diffusers library on ZeroGPU.
# This is a necessary adaptation.
output = pipe(
prompt=prompt_text,
image=first_frame, # Condition on the first frame
num_frames=16,
guidance_scale=7.5,
num_inference_steps=25,
)
    frames = output.frames[0]
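    # `output.frames` holds one list of PIL frames per prompt in the batch; index 0 is our single prompt.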
print("Inference complete. Saving video...")
# --- Save the output video ---
output_filename = f"{uuid.uuid4()}.mp4"
output_video_path = os.path.join("/tmp", output_filename)
export_to_video(frames, output_video_path, fps=10)
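    # 16 frames at 10 fps yields a clip of roughly 1.6 seconds.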
print(f"Video saved to {output_video_path}")
return output_video_path
# --- 3. Build the Gradio Interface ---
# This part of the code runs on the CPU.
with gr.Blocks(css="#col-container {max-width: 780px; margin: auto;}") as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown(
            """
            # Ditto / Editto: Instruction-Based Video Editing (ZeroGPU Version)
            This demo attempts to run the Ditto model on free ZeroGPU hardware.
            **Disclaimer:** The original model script is not directly compatible with ZeroGPU. This version uses a modified workflow based on the `diffusers` library to enable execution. The results may differ from the official implementation.
            """
        )
        with gr.Row():
            with gr.Column():
                input_video = gr.Video(label="Input Video (first frame will be used)")
                instruction = gr.Textbox(label="Editing Instruction", placeholder="e.g., a man snowboarding")
                submit_btn = gr.Button("Edit Video", variant="primary")
            with gr.Column():
                output_video = gr.Video(label="Edited Video", interactive=False)
        gr.Examples(
            examples=[
                ["make it snowing"],
                ["a watercolor painting of a boat"],
                ["a cat wearing sunglasses"],
            ],
            inputs=[instruction],
            label="Example Instructions (you still need to upload a video)",
        )
    # When the button is clicked, it calls our GPU-decorated function
    submit_btn.click(
        fn=process_video_on_gpu,
        inputs=[input_video, instruction],
        outputs=[output_video],
    )
demo.launch()