Spaces:
Running
Running
| import gradio as gr | |
| import os | |
| from huggingface_hub import InferenceClient | |
| from pathlib import Path | |
| import tempfile | |
| # DO NOT create a global client for paid apps | |
| # Each user's client will be created using their OAuth token | |
def generate_video_with_auth(image, prompt, token: gr.OAuthToken | None, progress=gr.Progress()):
    """
    Generate a video from an image using the Ovi model with authentication check.

    The inference request is made with the signed-in user's own OAuth token,
    so the call is attributed to that user rather than to a shared client.

    Args:
        image: Input image, either a file path or an object exposing
            ``save(path)`` (e.g. a PIL Image).
        prompt: Text prompt describing the desired motion/animation.
        token: OAuth token for authentication. It is not listed among the
            event inputs; Gradio supplies it for the signed-in session.
        progress: Gradio progress tracker.

    Returns:
        Path to the generated video file (a temporary ``.mp4``).

    Raises:
        gr.Error: If the user is not signed in, if the image or prompt is
            missing, or if any step of the generation fails.
    """
    if token is None or not getattr(token, "token", None):
        raise gr.Error("Please sign in with Hugging Face to use this paid app")
    if image is None:
        raise gr.Error("Please upload an image first!")
    if not prompt or not prompt.strip():
        raise gr.Error("Please enter a prompt describing the desired motion!")

    try:
        progress(0.2, desc="Processing image...")
        input_image = _read_image_bytes(image)

        progress(0.4, desc="Generating video with AI...")
        # CRITICAL FOR PAID APPS: Create client with user's OAuth token
        # Do NOT use bill_to parameter - this makes the USER pay, not HuggingFace
        client = InferenceClient(
            provider="fal-ai",
            api_key=token.token,  # Use token.token
        )
        # Generate video using the inference client
        video = client.image_to_video(
            input_image,
            prompt=prompt,
            model="chetwinlow1/Ovi",
        )

        progress(0.9, desc="Finalizing video...")
        output_path = _save_video_to_temp_file(video)

        progress(1.0, desc="Complete!")
        return output_path
    except Exception as e:
        # Surface every failure in the UI, keeping the original exception
        # chained so the server-side traceback still shows the root cause.
        raise gr.Error(f"Error generating video: {str(e)}") from e


def _read_image_bytes(image):
    """Return the encoded bytes of *image* (a file path or a PIL-style object)."""
    if isinstance(image, str):
        with open(image, "rb") as image_file:
            return image_file.read()

    # Not a path: let the object write itself to a scratch PNG, read the
    # bytes back, and always remove the scratch file afterwards.
    fd, temp_path = tempfile.mkstemp(suffix=".png")
    os.close(fd)  # only the path is needed; do not leak the descriptor
    try:
        image.save(temp_path)
        with open(temp_path, "rb") as image_file:
            return image_file.read()
    finally:
        os.remove(temp_path)


def _save_video_to_temp_file(video):
    """Store *video* (raw bytes or an existing file path) in a temp ``.mp4`` and return its path."""
    import shutil

    fd, output_path = tempfile.mkstemp(suffix=".mp4")
    os.close(fd)  # the file is reopened by name below
    try:
        if isinstance(video, str) and os.path.exists(video):
            # The client handed back a path: copy the file.
            shutil.copy(video, output_path)
        else:
            # Raw bytes (or any bytes-like payload): write them directly.
            with open(output_path, "wb") as f:
                f.write(video)
    except Exception:
        # Do not leave an empty or partial file behind on failure.
        try:
            os.remove(output_path)
        except OSError:
            pass  # best-effort cleanup; the original error is re-raised below
        raise
    return output_path
| # Create the Gradio interface | |
# Build the whole UI tree: header, billing notice, login button, usage tips,
# the input/output row, event wiring and the footer documentation.
# NOTE(review): decorative emoji below were restored from mis-encoded text;
# a few of them are best-effort choices - confirm against the original design.
with gr.Blocks(
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="indigo",
    ),
    css="""
    .header-link {
        font-size: 0.9em;
        color: #666;
        text-decoration: none;
        margin-bottom: 1em;
        display: inline-block;
    }
    .header-link:hover {
        color: #333;
        text-decoration: underline;
    }
    .main-header {
        text-align: center;
        margin-bottom: 2em;
    }
    .info-box {
        background-color: #f0f7ff;
        border-left: 4px solid #4285f4;
        padding: 1em;
        margin: 1em 0;
        border-radius: 4px;
    }
    .auth-warning {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        padding: 14px 16px;
        border-radius: 12px;
        margin: 18px auto 6px;
        max-width: 860px;
        text-align: center;
        font-size: 0.98rem;
        font-weight: bold;
    }
    """,
    title="Image to Video Generator with Ovi (Paid)",
) as demo:
    gr.HTML(
        """
        <div class="main-header">
        <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" class="header-link">
        Built with anycoder ✨
        </a>
        </div>
        """
    )
    gr.Markdown(
        """
        # 🎬 Image to Video Generator with Ovi

        Transform your static images into dynamic videos with synchronized audio using AI! Upload an image and describe the motion you want to see.
        Powered by Ovi: Twin Backbone Cross-Modal Fusion for Audio-Video Generation via [HuggingFace Inference Providers](https://huggingface.co/docs/huggingface_hub/guides/inference).
        """
    )
    gr.HTML(
        """
        <div class="auth-warning">
        💳 <b>PAID APP:</b> This app uses <b>YOUR</b> inference provider credits.
        Free users get $0.10 in included credits. PRO users get $2 in credits and can continue with billing.
        <a href='https://huggingface.co/subscribe/pro?source=ovi' target='_blank' style='color:#fff; text-decoration:underline;'>Subscribe to PRO</a> for more credits.
        Please sign in below to continue.
        <br><a href='https://huggingface.co/settings/inference-providers/overview' target='_blank' style='color:#fff; text-decoration:underline; font-weight:bold;'>Check your billing usage here</a>
        </div>
        """
    )
    # Add login button - required for OAuth
    gr.LoginButton()
    gr.HTML(
        """
        <div class="info-box">
        <strong>💡 Tips for best results:</strong>
        <ul>
        <li>Use clear, well-lit images with a single main subject</li>
        <li>Write specific prompts describing the desired motion or action</li>
        <li>Keep prompts concise and focused on movement and audio elements</li>
        <li>Processing generates 5-second videos at 24 FPS with synchronized audio</li>
        <li>Processing may take 30-60 seconds depending on server load</li>
        </ul>
        </div>
        """
    )
    # The special tokens are written with &lt; / &gt; entities: as raw angle
    # brackets the browser would parse them as HTML tags and hide them.
    gr.HTML(
        """
        <div class="info-box">
        <strong>✨ Special Tokens for Enhanced Control:</strong>
        <ul>
        <li><strong>Speech:</strong> <code>&lt;S&gt;Your speech content here&lt;E&gt;</code> - Text enclosed in these tags will be converted to speech</li>
        <li><strong>Audio Description:</strong> <code>&lt;AUDCAP&gt;Audio description here&lt;ENDAUDCAP&gt;</code> - Describes the audio or sound effects present in the video</li>
        </ul>
        <br>
        <strong>📝 Example Prompt:</strong><br>
        <code>Dogs bark loudly at a man wearing a red shirt. The man says &lt;S&gt;Please stop barking at me!&lt;E&gt;. &lt;AUDCAP&gt;Dogs barking, angry man yelling in stern voice&lt;ENDAUDCAP&gt;.</code>
        </div>
        """
    )
    with gr.Row():
        # Left column: inputs, action buttons and a ready-made example.
        with gr.Column(scale=1):
            image_input = gr.Image(
                label="📸 Upload Image",
                type="filepath",
                sources=["upload", "clipboard"],
                height=400,
            )
            prompt_input = gr.Textbox(
                label="✏️ Text Prompt",
                lines=3,
                placeholder="Describe the motion and audio you want to see..."
            )
            generate_btn = gr.Button(
                "🎬 Generate Video",
                variant="primary",
                size="lg",
            )
            clear_btn = gr.Button(
                "🗑️ Clear",
                variant="secondary",
            )
            # The example prompt keeps the raw <S>/<AUDCAP> tokens: it is
            # textbox input for the model, not HTML.
            gr.Examples(
                examples=[
                    [
                        "5.png",
                        'A bearded man wearing large dark sunglasses and a blue patterned cardigan sits in a studio, actively speaking into a large, suspended microphone. He has headphones on and gestures with his hands, displaying rings on his fingers. Behind him, a wall is covered with red, textured sound-dampening foam on the left, and a white banner on the right features the "CHOICE FM" logo and various social media handles like "@ilovechoicefm" with "RALEIGH" below it. The man intently addresses the microphone, articulating, <S>is talent. It\'s all about authenticity. You gotta be who you really are, especially if you\'re working<E>. He leans forward slightly as he speaks, maintaining a serious expression behind his sunglasses.. <AUDCAP>Clear male voice speaking into a microphone, a low background hum.<ENDAUDCAP>'
                    ]
                ],
                inputs=[image_input, prompt_input],
                label="Example",
            )
        # Right column: the generated video and background on the model.
        with gr.Column(scale=1):
            video_output = gr.Video(
                label="🎥 Generated Video",
                height=400,
                autoplay=True,
            )
            gr.Markdown(
                """
                ### About Ovi Model

                **Ovi: Twin Backbone Cross-Modal Fusion for Audio-Video Generation**

                Developed by Chetwin Low, Weimin Wang (Character AI) & Calder Katyal (Yale University)

                🌟 **Key Features:**

                - 🎬 **Video+Audio Generation**: Generates synchronized video and audio content simultaneously
                - 📝 **Flexible Input**: Supports text-only or text+image conditioning
                - ⏱️ **5-second Videos**: Generates 5-second videos at 24 FPS
                - 📐 **Multiple Aspect Ratios**: Supports 720×720 area at various ratios (9:16, 16:9, 1:1, etc)

                Ovi is a veo-3 like model that simultaneously generates both video and audio content from text or text+image inputs.

                ---

                ### 💳 Pricing & Credits

                This is a **paid app** that charges your HuggingFace inference provider account:

                - **Free users**: $0.10 in included credits
                - **PRO users**: $2 in included credits + ability to continue with billing
                - Each video generation consumes credits based on processing time

                [Subscribe to PRO](https://huggingface.co/subscribe/pro?source=ovi) for more credits!
                """
            )
    # Event handlers with authentication
    # NOTE: Do NOT pass the OAuth token as an input - Gradio injects it
    # automatically for the handler's `token` parameter.
    # NOTE(review): the handler reports progress through gr.Progress while this
    # event sets queue=False - confirm progress updates are still shown.
    generate_btn.click(
        fn=generate_video_with_auth,
        inputs=[image_input, prompt_input],
        outputs=[video_output],
        queue=False,
        api_name=False,
        show_api=False,
    )
    # Reset the image, the prompt and the video in one step.
    clear_btn.click(
        fn=lambda: (None, "", None),
        inputs=None,
        outputs=[image_input, prompt_input, video_output],
        queue=False,
    )
    gr.Markdown(
        """
        ---

        ### 🚀 How it works

        1. **Sign in** with your Hugging Face account (required for paid app)
        2. **Upload** your image - any photo or illustration
        3. **Describe** the motion you want to see in the prompt
        4. **Generate** and watch your image come to life with synchronized audio!
        5. **Credits are deducted** from your HuggingFace inference provider account

        ### ⚠️ Notes

        - **This is a PAID app** - uses your inference provider credits
        - Video generation may take 30-60 seconds
        - Generates 5-second videos at 24 FPS with synchronized audio
        - Supports multiple aspect ratios (9:16, 16:9, 1:1, etc) at 720×720 area
        - Best results with clear, high-quality images
        - The model works best with realistic subjects and natural motions
        - Free accounts have limited credits - upgrade to PRO for more

        ### 🔗 Resources

        - [Ovi Model Card](https://huggingface.co/chetwinlow1/Ovi)
        - [Character AI](https://character.ai)
        - [Subscribe to PRO](https://huggingface.co/subscribe/pro?source=ovi)
        - [Inference API Documentation](https://huggingface.co/docs/huggingface_hub/guides/inference)
        """
    )
| # Launch the app | |
if __name__ == "__main__":
    # Launch only when run as a script, with show_api and enable_monitoring
    # turned off and quiet mode on.
    launch_options = {
        "show_api": False,
        "enable_monitoring": False,
        "quiet": True,
    }
    demo.launch(**launch_options)