Ovi

Running

App Files Files Community

akhaliq HF Staff committed on Oct 22

Commit

3be93ed

verified ·

1 Parent(s): 3ae4fdd

Update app.py

Browse files

Files changed (1) hide show

app.py +275 -145

app.py CHANGED Viewed

@@ -1,156 +1,213 @@
 import gradio as gr
 import os
 from huggingface_hub import InferenceClient
-from pathlib import Path
 import tempfile
-# Initialize the inference client
-client = InferenceClient(
-    provider="fal-ai",
-    api_key=os.environ.get("HF_TOKEN"),
-    bill_to="huggingface",
-)
-def generate_video_with_auth(image, prompt, profile: gr.OAuthProfile | None, progress=gr.Progress()):
     """
     Generate a video from an image using the Ovi model with authentication check.
     Args:
         image: Input image (PIL Image or file path)
         prompt: Text prompt describing the desired motion/animation
-        profile: OAuth profile for authentication
         progress: Gradio progress tracker
     Returns:
-        Path to the generated video file
     """
-    if profile is None:
-        raise gr.Error("Click Sign in with Hugging Face button to use this app for free")
-    if image is None:
-        raise gr.Error("Please upload an image first!")
-    if not prompt or prompt.strip() == "":
-        raise gr.Error("Please enter a prompt describing the desired motion!")
     try:
         progress(0.2, desc="Processing image...")
         # Read the image file
         if isinstance(image, str):
             with open(image, "rb") as image_file:
                 input_image = image_file.read()
         else:
-            # If image is a PIL Image, save it temporarily
-            temp_image = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
-            image.save(temp_image.name)
-            with open(temp_image.name, "rb") as image_file:
-                input_image = image_file.read()
         progress(0.4, desc="Generating video with AI...")
         # Generate video using the inference client
-        video = client.image_to_video(
-            input_image,
-            prompt=prompt,
-            model="chetwinlow1/Ovi",
-        )
         progress(0.9, desc="Finalizing video...")
         # Save the video to a temporary file
-        output_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
-        # Check if video is bytes or a file path
-        if isinstance(video, bytes):
-            with open(output_path.name, "wb") as f:
-                f.write(video)
-        elif isinstance(video, str) and os.path.exists(video):
-            # If it's a path, copy it
-            import shutil
-            shutil.copy(video, output_path.name)
-        else:
-            # Try to write it directly
-            with open(output_path.name, "wb") as f:
-                f.write(video)
         progress(1.0, desc="Complete!")
-        return output_path.name
     except Exception as e:
-        raise gr.Error(f"Error generating video: {str(e)}")
 # Create the Gradio interface
-with gr.Blocks(
-    theme=gr.themes.Soft(
-        primary_hue="blue",
-        secondary_hue="indigo",
-    ),
-    css="""
-        .header-link {
-            font-size: 0.9em;
-            color: #666;
-            text-decoration: none;
-            margin-bottom: 1em;
-            display: inline-block;
-        }
-        .header-link:hover {
-            color: #333;
-            text-decoration: underline;
-        }
-        .main-header {
-            text-align: center;
-            margin-bottom: 2em;
-        }
-        .info-box {
-            background-color: #f0f7ff;
-            border-left: 4px solid #4285f4;
-            padding: 1em;
-            margin: 1em 0;
-            border-radius: 4px;
-        }
-        .auth-warning {
-            color: #ff6b00;
-            font-weight: bold;
-            text-align: center;
-            margin: 1em 0;
-        }
-    """,
-    title="Image to Video Generator with Ovi",
-) as demo:
     gr.HTML(
         """
-        <div class="main-header">
-            <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" class="header-link">
-                Built with anycoder ✨
-            </a>
         </div>
         """
     )
     gr.Markdown(
         """
-        # 🎬 Image to Video Generator with Ovi
-        Transform your static images into dynamic videos with synchronized audio using AI! Upload an image and describe the motion you want to see.
-        Powered by Ovi: Twin Backbone Cross-Modal Fusion for Audio-Video Generation via [HuggingFace Inference Providers](https://huggingface.co/docs/huggingface_hub/guides/inference)
-.
-        """
-    )
-    gr.HTML(
-        """
-        <div class="auth-warning">
-            ⚠️ You must Sign in with Hugging Face using the button below to use this app.
-        </div>
         """
     )
     # Add login button - required for OAuth
-    gr.LoginButton()
     gr.HTML(
         """
@@ -169,7 +226,7 @@ with gr.Blocks(
     gr.HTML(
         """
-        <div class="info-box">
             <strong>✨ Special Tokens for Enhanced Control:</strong>
             <ul>
                 <li><strong>Speech:</strong> <code>&lt;S&gt;Your speech content here&lt;E&gt;</code> - Text enclosed in these tags will be converted to speech</li>
@@ -186,36 +243,55 @@ with gr.Blocks(
         with gr.Column(scale=1):
             image_input = gr.Image(
                 label="📸 Upload Image",
-                type="filepath",
                 sources=["upload", "clipboard"],
                 height=400,
             )
             prompt_input = gr.Textbox(
                 label="✍️ Text Prompt",
-                lines=3,
             )
-            generate_btn = gr.Button(
-                "🎬 Generate Video",
-                variant="primary",
-                size="lg",
-            )
-            clear_btn = gr.Button(
-                "🗑️ Clear",
-                variant="secondary",
             )
             gr.Examples(
                 examples=[
                     [
-                        "5.png",
                         'A bearded man wearing large dark sunglasses and a blue patterned cardigan sits in a studio, actively speaking into a large, suspended microphone. He has headphones on and gestures with his hands, displaying rings on his fingers. Behind him, a wall is covered with red, textured sound-dampening foam on the left, and a white banner on the right features the "CHOICE FM" logo and various social media handles like "@ilovechoicefm" with "RALEIGH" below it. The man intently addresses the microphone, articulating, <S>is talent. It\'s all about authenticity. You gotta be who you really are, especially if you\'re working<E>. He leans forward slightly as he speaks, maintaining a serious expression behind his sunglasses.. <AUDCAP>Clear male voice speaking into a microphone, a low background hum.<ENDAUDCAP>'
                     ]
                 ],
                 inputs=[image_input, prompt_input],
-                label="Example",
             )
         with gr.Column(scale=1):
@@ -223,6 +299,7 @@ with gr.Blocks(
                 label="🎥 Generated Video",
                 height=400,
                 autoplay=True,
             )
             gr.Markdown(
@@ -240,57 +317,110 @@ with gr.Blocks(
                 - 📐 **Multiple Aspect Ratios**: Supports 720×720 area at various ratios (9:16, 16:9, 1:1, etc)
                 Ovi is a veo-3 like model that simultaneously generates both video and audio content from text or text+image inputs.
                 """
             )
-    # Event handlers with authentication
-    generate_btn.click(
-        fn=generate_video_with_auth,
-        inputs=[image_input, prompt_input],
-        outputs=[video_output],
-        queue=False,
-        api_name=False,
-        show_api=False,
-    )
-    clear_btn.click(
-        fn=lambda: (None, "", None),
-        inputs=None,
-        outputs=[image_input, prompt_input, video_output],
-        queue=False,
-    )
     gr.Markdown(
         """
         ---
-        ### 🚀 How it works
-        1. **Sign in** with your Hugging Face account
-        2. **Upload** your image - any photo or illustration
-        3. **Describe** the motion you want to see in the prompt
-        4. **Generate** and watch your image come to life!
-        ### ⚠️ Notes
-        - Video generation may take 30-60 seconds
-        - Generates 5-second videos at 24 FPS with synchronized audio
-        - Supports multiple aspect ratios (9:16, 16:9, 1:1, etc) at 720×720 area
-        - Requires a valid HuggingFace token with Inference API access
-        - Best results with clear, high-quality images
-        - The model works best with realistic subjects and natural motions
         ### 🔗 Resources
         - [Ovi Model Card](https://huggingface.co/chetwinlow1/Ovi)
         - [Character AI](https://character.ai)
         """
     )
 # Launch the app
 if __name__ == "__main__":
     demo.launch(
         show_api=False,
         enable_monitoring=False,
         quiet=True,
     )

 import gradio as gr
 import os
 from huggingface_hub import InferenceClient
 import tempfile
+import shutil
+from pathlib import Path
+from typing import Optional
+import time
+# -------------------------
+# Utilities
+# -------------------------
+def cleanup_temp_files():
+    """Delete stale ``*.mp4`` files from the system temporary directory.
+    Every entry directly inside ``tempfile.gettempdir()`` whose name matches
+    ``*.mp4`` and whose modification time is more than 300 seconds in the
+    past is removed. The sweep is best-effort: a failure on one file is
+    ignored, and any other failure is printed rather than raised.
+    """
+    try:
+        temp_dir = tempfile.gettempdir()
+        # Non-recursive glob: only top-level entries of the temp dir match.
+        # NOTE(review): this matches every *.mp4 in that directory, not only
+        # files written by this app (those carry no distinguishing prefix).
+        for file_path in Path(temp_dir).glob("*.mp4"):
+            try:
+                # 300 s = 5 minutes; files modified more recently are kept.
+                if file_path.stat().st_mtime < (time.time() - 300):
+                    file_path.unlink(missing_ok=True)
+            except Exception:
+                # Swallowed so that one failing file does not abort the sweep.
+                pass
+    except Exception as e:
+        print(f"Cleanup error: {e}")
+def _client_from_token(token: Optional[str]) -> InferenceClient:
+    """Create InferenceClient from user's OAuth token
+    Args:
+        token: Access token string of the signed-in user, or None/empty
+            when nobody is signed in.
+    Returns:
+        An InferenceClient built with provider "fal-ai" and ``token`` as
+        its API key.
+    Raises:
+        gr.Error: If ``token`` is None or empty.
+    """
+    if not token:
+        raise gr.Error("Please sign in first. This app requires your Hugging Face login.")
+    # IMPORTANT: do not set bill_to when using user OAuth tokens
+    # This ensures the user is billed, not Hugging Face
+    return InferenceClient(
+        provider="fal-ai",
+        api_key=token,
+    )
+def _save_bytes_as_temp_mp4(data: bytes) -> str:
+    """Save video bytes to temporary MP4 file
+    Args:
+        data: Raw bytes to write to the file.
+    Returns:
+        Filesystem path of the newly created ``.mp4`` file.
+    """
+    # delete=False keeps the file on disk after it is closed, so the returned
+    # path stays usable; nothing in this function removes it again.
+    temp_file = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
+    try:
+        temp_file.write(data)
+        temp_file.flush()
+        return temp_file.name
+    finally:
+        # Runs on both the success and the error path, so the handle never leaks.
+        temp_file.close()
+def generate_video_with_auth(image, prompt, token: gr.OAuthToken | None, progress=gr.Progress()):
     """
     Generate a video from an image using the Ovi model with authentication check.
     Args:
         image: Input image (PIL Image or file path)
+            A value that is neither a str nor a PIL Image is handed to
+            PIL's Image.fromarray before encoding.
         prompt: Text prompt describing the desired motion/animation
+        token: OAuth token for authentication
         progress: Gradio progress tracker
     Returns:
+        Tuple of (video_path, status_message)
+        On every failure path video_path is None and status_message
+        starts with "❌"; no exception is raised to the caller.
     """
     try:
+        # Check authentication
+        if token is None or not getattr(token, "token", None):
+            return None, "❌ Sign in with Hugging Face to continue. This app uses your inference provider credits."
+        if image is None:
+            return None, "❌ Please upload an image first!"
+        if not prompt or prompt.strip() == "":
+            return None, "❌ Please enter a prompt describing the desired motion!"
         progress(0.2, desc="Processing image...")
+        # Sweep stale *.mp4 temp files on every request, before creating a new one.
+        cleanup_temp_files()
         # Read the image file
         if isinstance(image, str):
+            # If image is a file path
             with open(image, "rb") as image_file:
                 input_image = image_file.read()
         else:
+            # If image is PIL Image or array
+            # Either way the image is re-encoded in memory as PNG bytes.
+            import io
+            from PIL import Image as PILImage
+            if isinstance(image, PILImage.Image):
+                buffer = io.BytesIO()
+                image.save(buffer, format='PNG')
+                input_image = buffer.getvalue()
+            else:
+                # Assume it's a numpy array
+                pil_image = PILImage.fromarray(image)
+                buffer = io.BytesIO()
+                pil_image.save(buffer, format='PNG')
+                input_image = buffer.getvalue()
         progress(0.4, desc="Generating video with AI...")
+        # Create client with user's token
+        client = _client_from_token(token.token)
         # Generate video using the inference client
+        try:
+            video = client.image_to_video(
+                input_image,
+                prompt=prompt,
+                model="chetwinlow1/Ovi",
+            )
+        except Exception as e:
+            import requests
+            # Only an HTTP 403 gets a dedicated message; every other error is
+            # re-raised and handled by the outer except clauses below.
+            if isinstance(e, requests.HTTPError) and getattr(e.response, "status_code", None) == 403:
+                return None, "❌ Access denied by provider (403). Make sure your HF account has credits/permission for provider 'fal-ai' and model 'chetwinlow1/Ovi'."
+            raise
         progress(0.9, desc="Finalizing video...")
         # Save the video to a temporary file
+        # assumes image_to_video returned the raw video as bytes — TODO confirm
+        video_path = _save_bytes_as_temp_mp4(video)
         progress(1.0, desc="Complete!")
+        # NOTE(review): "..." is appended even when the prompt is 60 characters or shorter.
+        return video_path, f"✅ Video generated successfully! Prompt: '{prompt[:60]}...'"
+    except gr.Error as e:
+        # Report a gr.Error (e.g. one raised by _client_from_token) as a
+        # status message instead of letting it propagate.
+        return None, f"❌ {str(e)}"
     except Exception as e:
+        return None, f"❌ Generation failed. If this keeps happening, check your provider quota or try again later. Error: {str(e)}"
+def clear_all():
+    """Clear all inputs and outputs
+    Returns:
+        A 4-tuple of reset values, in the order of the clear button's
+        outputs: image input (None), prompt text (""), video output (None)
+        and status text ("").
+    """
+    return None, "", None, ""
+# Custom CSS for better styling
+# Passed to gr.Blocks(css=...). In the markup visible in this diff, only
+# .notice, .special-tokens-box and .status-box are referenced; .container,
+# .header-link and .info-box may be used by lines the diff elides — verify
+# before removing them.
+custom_css = """
+.container {
+    max-width: 1200px;
+    margin: auto;
+}
+.header-link {
+    text-decoration: none;
+    color: #2196F3;
+    font-weight: bold;
+}
+.header-link:hover {
+    text-decoration: underline;
+}
+.status-box {
+    padding: 10px;
+    border-radius: 5px;
+    margin-top: 10px;
+}
+.notice {
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+    color: white;
+    padding: 14px 16px;
+    border-radius: 12px;
+    margin: 18px auto 6px;
+    max-width: 860px;
+    text-align: center;
+    font-size: 0.98rem;
+}
+.info-box {
+    background-color: #f0f7ff;
+    border-left: 4px solid #4285f4;
+    padding: 1em;
+    margin: 1em 0;
+    border-radius: 4px;
+}
+.special-tokens-box {
+    background: linear-gradient(135deg, #ffeaa7 0%, #fdcb6e 100%);
+    padding: 1em;
+    margin: 1em 0;
+    border-radius: 8px;
+    border-left: 4px solid #e17055;
+}
+"""
 # Create the Gradio interface
+with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="Ovi Image-to-Video Generator (Paid)") as demo:
+    # Header with payment notice
     gr.HTML(
         """
+        <div style="text-align:center; padding:2em 1em 1em;">
+            <h1 style="font-size:2.2em; margin-bottom:6px;">🎬 Ovi: Image-to-Video with Audio</h1>
+            <p style="color:#777; margin:0 0 8px;">Generate synchronized video and audio from images</p>
+            <div class="notice">
+                <b>Heads up:</b> This is a paid app that uses <b>your</b> inference provider credits when you run generations.
+                Free users get <b>$0.10 in included credits</b>. <b>PRO users</b> get <b>$2 in included credits</b>
+                and can continue using beyond that (with billing).
+                <a href='http://huggingface.co/subscribe/pro?source=ovi' target='_blank' style='color:#fff; text-decoration:underline; font-weight:bold;'>Subscribe to PRO</a>
+                for more credits. Please sign in with your Hugging Face account to continue.
+            </div>
+            <p style="font-size: 0.9em; color: #999; margin-top: 10px;">
+                Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color:#667eea; text-decoration:underline;">anycoder</a>
+            </p>
         </div>
         """
     )
     gr.Markdown(
         """
+        ### Transform your static images into dynamic videos with synchronized audio using AI!
+        Powered by **Ovi: Twin Backbone Cross-Modal Fusion for Audio-Video Generation** via [HuggingFace Inference Providers](https://huggingface.co/docs/huggingface_hub/guides/inference)
         """
     )
     # Add login button - required for OAuth
+    login_btn = gr.LoginButton("Sign in with Hugging Face")
     gr.HTML(
         """
     gr.HTML(
         """
+        <div class="special-tokens-box">
             <strong>✨ Special Tokens for Enhanced Control:</strong>
             <ul>
                 <li><strong>Speech:</strong> <code>&lt;S&gt;Your speech content here&lt;E&gt;</code> - Text enclosed in these tags will be converted to speech</li>
         with gr.Column(scale=1):
             image_input = gr.Image(
                 label="📸 Upload Image",
+                type="pil",
                 sources=["upload", "clipboard"],
                 height=400,
             )
             prompt_input = gr.Textbox(
                 label="✍️ Text Prompt",
+                placeholder="Describe the motion and audio you want... (e.g., 'A person walking forward while talking')",
+                lines=4,
+                max_lines=6
             )
+            with gr.Row():
+                generate_btn = gr.Button(
+                    "🎬 Generate Video",
+                    variant="primary",
+                    scale=2
+                )
+                clear_btn = gr.Button(
+                    "🗑️ Clear",
+                    variant="secondary",
+                    scale=1
+                )
+            status_output = gr.Textbox(
+                label="Status",
+                interactive=False,
+                visible=True,
+                elem_classes=["status-box"]
             )
             gr.Examples(
                 examples=[
                     [
+                        "example1.png",
                         'A bearded man wearing large dark sunglasses and a blue patterned cardigan sits in a studio, actively speaking into a large, suspended microphone. He has headphones on and gestures with his hands, displaying rings on his fingers. Behind him, a wall is covered with red, textured sound-dampening foam on the left, and a white banner on the right features the "CHOICE FM" logo and various social media handles like "@ilovechoicefm" with "RALEIGH" below it. The man intently addresses the microphone, articulating, <S>is talent. It\'s all about authenticity. You gotta be who you really are, especially if you\'re working<E>. He leans forward slightly as he speaks, maintaining a serious expression behind his sunglasses.. <AUDCAP>Clear male voice speaking into a microphone, a low background hum.<ENDAUDCAP>'
+                    ],
+                    [
+                        None,
+                        "A cat sitting on a windowsill starts to meow loudly. <S>Meow meow meow!<E> <AUDCAP>Cat meowing, traffic sounds in the background<ENDAUDCAP>"
+                    ],
+                    [
+                        None,
+                        "A musician playing guitar on stage. The crowd cheers. <AUDCAP>Electric guitar playing, crowd cheering and applause<ENDAUDCAP>"
                     ]
                 ],
                 inputs=[image_input, prompt_input],
+                label="Example Prompts",
             )
         with gr.Column(scale=1):
                 label="🎥 Generated Video",
                 height=400,
                 autoplay=True,
+                show_download_button=True
             )
             gr.Markdown(
                 - 📐 **Multiple Aspect Ratios**: Supports 720×720 area at various ratios (9:16, 16:9, 1:1, etc)
                 Ovi is a veo-3 like model that simultaneously generates both video and audio content from text or text+image inputs.
+                ---
+                ### 💳 Pricing Information
+                This app uses the Hugging Face Inference API (provider: fal-ai) which charges based on usage:
+                - **Free users**: $0.10 in included credits
+                - **PRO users**: $2 in included credits + ability to continue with billing
+                [Subscribe to PRO](http://huggingface.co/subscribe/pro?source=ovi) for more credits and features!
                 """
             )
+    # How to Use section
+    with gr.Accordion("📖 How to Use", open=False):
+        gr.Markdown(
+            """
+            ### Getting Started:
+            1. **Sign in** with your Hugging Face account using the button above
+            2. **Upload** your image - any photo or illustration
+            3. **Describe** the motion and audio you want in the prompt
+            4. **Use special tokens** for speech and audio descriptions (optional but recommended)
+            5. **Generate** and watch your image come to life with synchronized audio!
+            ### Special Tokens Guide:
+            **Speech Token**: `<S>text<E>`
+            - Use this to add spoken dialogue to your video
+            - Example: `The person says <S>Hello, how are you?<E>`
+            **Audio Description Token**: `<AUDCAP>description<ENDAUDCAP>`
+            - Use this to describe background sounds and audio effects
+            - Example: `<AUDCAP>Birds chirping, gentle wind blowing<ENDAUDCAP>`
+            ### Tips for Better Results:
+            - Be specific and descriptive in your prompts
+            - Combine visual motion descriptions with audio elements
+            - Use high-quality input images for better results
+            - Experiment with different prompts and special tokens
+            - Processing takes 30-60 seconds per generation
+            ### ⚠️ Important Notes:
+            - This is a **paid app** that uses your inference provider credits
+            - Each generation consumes credits based on processing time
+            - Free accounts have limited credits ($0.10)
+            - PRO accounts get more credits ($2) and can continue with billing
+            - Videos are 5 seconds long at 24 FPS
+            - Supports multiple aspect ratios (9:16, 16:9, 1:1, etc)
+            """
+        )
     gr.Markdown(
         """
         ---
         ### 🔗 Resources
         - [Ovi Model Card](https://huggingface.co/chetwinlow1/Ovi)
         - [Character AI](https://character.ai)
+        - [Hugging Face Inference API Docs](https://huggingface.co/docs/huggingface_hub/guides/inference)
+        - [Subscribe to PRO](http://huggingface.co/subscribe/pro?source=ovi)
+        ### 📊 Model Specifications
+        - **Provider**: fal-ai
+        - **Model**: chetwinlow1/Ovi
+        - **Output**: 5-second videos at 24 FPS with audio
+        - **Input**: Image + Text prompt
+        - **Resolution**: 720×720 area (various aspect ratios)
         """
     )
+    # Event handlers with authentication
+    generate_btn.click(
+        fn=generate_video_with_auth,
+        inputs=[image_input, prompt_input, login_btn],
+        outputs=[video_output, status_output],
+        show_progress="full",
+        queue=False,
+        api_name=False,
+        show_api=False,
+    )
+    clear_btn.click(
+        fn=clear_all,
+        inputs=[],
+        outputs=[image_input, prompt_input, video_output, status_output],
+        queue=False,
+    )
 # Launch the app
 if __name__ == "__main__":
+    # Startup housekeeping: purge stale temp videos and remove the
+    # "gradio_cached_examples" directory (relative to the current working
+    # directory) if it exists. Failures are printed and do not stop the launch.
+    try:
+        cleanup_temp_files()
+        if os.path.exists("gradio_cached_examples"):
+            shutil.rmtree("gradio_cached_examples", ignore_errors=True)
+    except Exception as e:
+        print(f"Initial cleanup error: {e}")
+    # NOTE(review): both click handlers are registered with queue=False —
+    # confirm this queue configuration has the intended effect on them.
+    demo.queue(status_update_rate="auto", api_open=False, default_concurrency_limit=None)
     demo.launch(
         show_api=False,
+        share=False,
+        show_error=True,
         enable_monitoring=False,
         quiet=True,
     )