# Ditto / app.py
import gradio as gr
import torch
from diffusers import AnimateDiffPipeline, DDIMScheduler, MotionAdapter
from diffusers.utils import export_to_video
import os
import av
import uuid
from huggingface_hub import hf_hub_download
import spaces # Required for the @spaces.GPU decorator
# --- 1. Define Paths and Constants ---
# These are defined globally so the decorated function can access them.
base_model_id = "runwayml/stable-diffusion-v1-5"
ditto_lora_repo = "QingyanBai/Ditto_models"
ditto_lora_filename = "models/lora/Editto-XL.safetensors"
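
# --- Optional helper: decode frames with PyAV (illustrative sketch) ---
# Not used by the first-frame-only flow below; something like this would be needed to feed
# the whole clip into a true video-to-video pipeline (for example, diffusers'
# AnimateDiffVideoToVideoPipeline). The function name and the max_frames default are
# placeholders, not part of the original script.
def read_video_frames(video_path, max_frames=16):
    frames = []
    with av.open(video_path) as container:
        for frame in container.decode(video=0):
            frames.append(frame.to_image().convert("RGB"))
            if len(frames) >= max_frames:
                break
    return frames
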
# --- 2. The Core GPU Function ---
# This function contains ALL the logic that needs a GPU.
# It will be called by Gradio, and Hugging Face will attach a ZeroGPU device for its duration.
# `duration=120` gives the function up to 2 minutes to run before timing out.
@spaces.GPU(duration=120)
def process_video_on_gpu(input_video_path, prompt_text):
    if not input_video_path:
        raise gr.Error("Please upload an input video.")
    if not prompt_text:
        raise gr.Error("Please provide an editing instruction.")
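    # Both gr.Error exceptions above are surfaced to the user as error popups in the Gradio UI.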
print("GPU function started. Loading models...")
# --- Load all models inside the decorated function ---
tokenizer = CLIPTokenizer.from_pretrained(base_model_id, subfolder="tokenizer")
text_encoder = CLIPTextModel.from_pretrained(base_model_id, subfolder="text_encoder", torch_dtype=torch.float16).to("cuda")
    # NOTE: This is a simplification. The original 'Wan2.1-VACE-14B' backbone is not a standard
    # diffusers pipeline and requires its own custom inference code, so a known-good
    # AnimateDiff + Stable Diffusion 1.5 setup is used here as the closest stand-in.
    adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16)
    pipe = AnimateDiffPipeline.from_pretrained(base_model_id, motion_adapter=adapter, torch_dtype=torch.float16).to("cuda")
    pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config, beta_schedule="linear", timestep_spacing="trailing")
    # AnimateDiffPipeline is text-to-video and does not accept an `image` argument, so the first
    # frame is conditioned in via an IP-Adapter (diffusers' standard image-prompting mechanism).
    pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
print("Loading Ditto LoRA weights...")
# Download and load the LoRA model
lora_path = hf_hub_download(ditto_lora_repo, ditto_lora_filename)
pipe.load_lora_weights(lora_path, adapter_name="ditto")
pipe.set_adapters(["ditto"], [0.8]) # Set adapter with a weight
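    # The 0.8 weight scales how strongly the "ditto" LoRA modifies the base UNet; 1.0 would apply it at full strength.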
print("Preprocessing video...")
# --- Load and process the input video ---
container = av.open(input_video_path)
# Extract the first frame to use as the initial image
first_frame = next(container.decode(video=0)).to_image().convert("RGB")
print("Running inference...")
# --- Run Inference ---
# The Ditto model is a video-to-video model. The logic here is simplified to
# image-to-video for compatibility with the diffusers library on ZeroGPU.
# This is a necessary adaptation.
output = pipe(
prompt=prompt_text,
image=first_frame, # Condition on the first frame
num_frames=16,
guidance_scale=7.5,
num_inference_steps=25,
)
    frames = output.frames[0]
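    # `output.frames` holds one list of PIL frames per prompt in the batch; index 0 is our single prompt.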
print("Inference complete. Saving video...")
# --- Save the output video ---
output_filename = f"{uuid.uuid4()}.mp4"
output_video_path = os.path.join("/tmp", output_filename)
export_to_video(frames, output_video_path, fps=10)
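    # 16 frames at 10 fps yields a clip of roughly 1.6 seconds.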
print(f"Video saved to {output_video_path}")
return output_video_path
# --- 3. Build the Gradio Interface ---
# This part of the code runs on the CPU.
with gr.Blocks(css="#col-container {max-width: 780px; margin: auto;}") as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown(
            """
            # Ditto / Editto: Instruction-Based Video Editing (ZeroGPU Version)
            This demo attempts to run the Ditto model on free ZeroGPU hardware.
            **Disclaimer:** The original model script is not directly compatible with ZeroGPU. This version uses a modified workflow based on the `diffusers` library to enable execution. The results may differ from the official implementation.
            """
        )
        with gr.Row():
            with gr.Column():
                input_video = gr.Video(label="Input Video (first frame will be used)")
                instruction = gr.Textbox(label="Editing Instruction", placeholder="e.g., a man snowboarding")
                submit_btn = gr.Button("Edit Video", variant="primary")
            with gr.Column():
                output_video = gr.Video(label="Edited Video", interactive=False)
        gr.Examples(
            examples=[
                ["make it snowing"],
                ["a watercolor painting of a boat"],
                ["a cat wearing sunglasses"],
            ],
            inputs=[instruction],
            label="Example Instructions (you still need to upload a video)",
        )
    # When the button is clicked, it calls our GPU-decorated function
    submit_btn.click(
        fn=process_video_on_gpu,
        inputs=[input_video, instruction],
        outputs=[output_video],
    )
demo.launch()