|
|
import gradio as gr |
|
|
import os |
|
|
from huggingface_hub import InferenceClient |
|
|
from pathlib import Path |
|
|
import tempfile |
|
|
|
|
|
|
|
|
# Module-wide inference client, created once at import time and used by
# generate_video(). Requests go through the "fal-ai" provider and are billed
# to the "huggingface" account. The token is read from the HF_TOKEN
# environment variable; when it is unset, None is passed as the API key.
client = InferenceClient(
    provider="fal-ai",
    api_key=os.getenv("HF_TOKEN"),
    bill_to="huggingface",
)
|
|
|
|
|
def _read_image_bytes(image):
    """Return the raw bytes of *image*, given a file path or an image object."""
    if isinstance(image, (str, os.PathLike)):
        with open(image, "rb") as image_file:
            return image_file.read()

    # Anything that is not a path is assumed to expose a PIL-style
    # ``save(fp, format=...)`` method -- TODO confirm against callers.
    # The anonymous temp file is removed as soon as it is closed, so no stray
    # .png is left behind on disk.
    with tempfile.TemporaryFile(suffix=".png") as buffer:
        image.save(buffer, format="PNG")
        buffer.seek(0)
        return buffer.read()


def _write_video_file(video):
    """Store *video* in a new ``.mp4`` temp file and return that file's path."""
    if isinstance(video, str):
        # A string result is treated as the path of an already-written file;
        # a missing file surfaces as FileNotFoundError.
        with open(video, "rb") as source:
            video = source.read()
    elif not isinstance(video, (bytes, bytearray, memoryview)):
        # Fail before creating the output file so no empty .mp4 is left over.
        raise TypeError(
            f"Unexpected video result type: {type(video).__name__}"
        )

    # delete=False because the path is handed back to the caller and must
    # outlive this function; the handle itself is closed by the with-block.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as output_file:
        output_file.write(video)
    return output_file.name


def generate_video(image, prompt, progress=gr.Progress()):
    """
    Generate a video from an image using the Ovi model.

    Args:
        image: Input image (PIL Image or file path)
        prompt: Text prompt describing the desired motion/animation
        progress: Gradio progress tracker

    Returns:
        Path to the generated video file

    Raises:
        gr.Error: If no image or an empty prompt is supplied, or if any step
            of reading the image, calling the model, or saving the result
            fails (the underlying exception is chained as the cause).
    """
    if image is None:
        raise gr.Error("Please upload an image first!")

    if not prompt or not prompt.strip():
        raise gr.Error("Please enter a prompt describing the desired motion!")

    try:
        progress(0.2, desc="Processing image...")
        input_image = _read_image_bytes(image)

        progress(0.4, desc="Generating video with AI...")
        video = client.image_to_video(
            input_image,
            prompt=prompt,
            model="chetwinlow1/Ovi",
        )

        progress(0.9, desc="Finalizing video...")
        output_path = _write_video_file(video)

        progress(1.0, desc="Complete!")
        return output_path

    except Exception as e:
        # Surface every failure in the UI while keeping the original
        # traceback available through exception chaining.
        raise gr.Error(f"Error generating video: {e}") from e
|
|
|
|
|
|
|
|
# Prompt shown when the page loads and restored by the "Clear" button.
DEFAULT_PROMPT = "The subject starts to move naturally"

# NOTE(review): the emoji and the multiplication sign in the user-facing
# strings below were mis-decoded (UTF-8 read through a Greek single-byte
# codepage) and have been restored. The glyphs for "Motion Prompt",
# "Key Features", "Flexible Input", "Multiple Aspect Ratios", "How it works"
# and "Resources" could not be recovered unambiguously -- confirm the chosen
# emoji against the original design.
with gr.Blocks(
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="indigo",
    ),
    css="""
    .header-link {
        font-size: 0.9em;
        color: #666;
        text-decoration: none;
        margin-bottom: 1em;
        display: inline-block;
    }
    .header-link:hover {
        color: #333;
        text-decoration: underline;
    }
    .main-header {
        text-align: center;
        margin-bottom: 2em;
    }
    .info-box {
        background-color: #f0f7ff;
        border-left: 4px solid #4285f4;
        padding: 1em;
        margin: 1em 0;
        border-radius: 4px;
    }
    """,
    title="Image to Video Generator",
) as demo:

    # Attribution link at the top of the page.
    gr.HTML(
        """
        <div class="main-header">
            <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" class="header-link">
                Built with anycoder ✨
            </a>
        </div>
        """
    )

    # Title and short description.
    gr.Markdown(
        """
        # 🎬 Image to Video Generator with Ovi

        Transform your static images into dynamic videos with synchronized audio using AI! Upload an image and describe the motion you want to see.

        Powered by **Ovi: Twin Backbone Cross-Modal Fusion for Audio-Video Generation** via HuggingFace Inference API.
        """
    )

    # Usage tips.
    gr.HTML(
        """
        <div class="info-box">
            <strong>💡 Tips for best results:</strong>
            <ul>
                <li>Use clear, well-lit images with a single main subject</li>
                <li>Write specific prompts describing the desired motion or action</li>
                <li>Keep prompts concise and focused on movement and audio elements</li>
                <li>Processing generates 5-second videos at 24 FPS with synchronized audio</li>
                <li>Processing may take 30-60 seconds depending on server load</li>
            </ul>
        </div>
        """
    )

    # Inputs on the left, generated video on the right.
    with gr.Row():
        with gr.Column(scale=1):
            image_input = gr.Image(
                label="📸 Upload Image",
                type="filepath",
                sources=["upload", "clipboard"],
                height=400,
            )

            prompt_input = gr.Textbox(
                label="✏️ Motion Prompt",
                placeholder="Describe the motion or animation you want to see...",
                lines=3,
                value=DEFAULT_PROMPT,
            )

            generate_btn = gr.Button(
                "🎬 Generate Video",
                variant="primary",
                size="lg",
            )

            clear_btn = gr.Button(
                "🗑️ Clear",
                variant="secondary",
            )

        with gr.Column(scale=1):
            video_output = gr.Video(
                label="🎥 Generated Video",
                height=400,
                autoplay=True,
            )

    # Background information about the model.
    gr.Markdown(
        """
        ### About Ovi Model

        **Ovi: Twin Backbone Cross-Modal Fusion for Audio-Video Generation**

        Developed by Chetwin Low, Weimin Wang (Character AI) & Calder Katyal (Yale University)

        🌟 **Key Features:**
        - 🎬 **Video+Audio Generation**: Generates synchronized video and audio content simultaneously
        - 📝 **Flexible Input**: Supports text-only or text+image conditioning
        - ⏱️ **5-second Videos**: Generates 5-second videos at 24 FPS
        - 📐 **Multiple Aspect Ratios**: Supports 720×720 area at various ratios (9:16, 16:9, 1:1, etc)

        Ovi is a veo-3 like model that uses twin backbone cross-modal fusion for high-quality audio-video generation.
        """
    )

    # Event wiring: run the model on "Generate"; this handler is also exposed
    # under the API name "generate_video".
    generate_btn.click(
        fn=generate_video,
        inputs=[image_input, prompt_input],
        outputs=[video_output],
        api_name="generate_video",
    )

    # "Clear" empties the image and video and restores the default prompt.
    clear_btn.click(
        fn=lambda: (None, DEFAULT_PROMPT, None),
        inputs=None,
        outputs=[image_input, prompt_input, video_output],
    )

    # Footer: how-to, caveats and links.
    gr.Markdown(
        """
        ---

        ### 🚀 How it works

        1. **Upload** your image - any photo or illustration
        2. **Describe** the motion you want to see in the prompt
        3. **Generate** and watch your image come to life!

        ### ⚠️ Notes

        - Video generation may take 30-60 seconds
        - Generates 5-second videos at 24 FPS with synchronized audio
        - Supports multiple aspect ratios (9:16, 16:9, 1:1, etc) at 720×720 area
        - Requires a valid HuggingFace token with Inference API access
        - Best results with clear, high-quality images
        - The model works best with realistic subjects and natural motions

        ### 🔗 Resources

        - [Ovi Model Card](https://huggingface.co/chetwinlow1/Ovi)
        - [HuggingFace Inference API](https://huggingface.co/docs/huggingface_hub/guides/inference)
        - [Character AI](https://character.ai)
        """
    )
|
|
|
|
|
|
|
|
# Launch the Gradio app only when this file is run as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    demo.launch()