akhaliq HF Staff committed on
Commit
3ae4fdd
·
verified ·
1 Parent(s): 1f6e6f1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +224 -367
app.py CHANGED
@@ -1,439 +1,296 @@
1
  import gradio as gr
2
  import os
3
  from huggingface_hub import InferenceClient
4
- import tempfile
5
- import shutil
6
  from pathlib import Path
7
- from typing import Optional, Union
8
- import time
9
-
10
- # -------------------------
11
- # Utilities
12
- # -------------------------
13
-
14
- def cleanup_temp_files():
15
- try:
16
- temp_dir = tempfile.gettempdir()
17
- for file_path in Path(temp_dir).glob("*.mp4"):
18
- try:
19
- if file_path.stat().st_mtime < (time.time() - 300):
20
- file_path.unlink(missing_ok=True)
21
- except Exception:
22
- pass
23
- except Exception as e:
24
- print(f"Cleanup error: {e}")
25
-
26
- def _client_from_token(token: Optional[str]) -> InferenceClient:
27
- if not token:
28
- raise gr.Error("Please sign in first. This app requires your Hugging Face login.")
29
- # IMPORTANT: do not set bill_to when using user OAuth tokens
30
- return InferenceClient(
31
- provider="fal-ai",
32
- api_key=token,
33
- )
34
 
35
- def _save_bytes_as_temp_mp4(data: bytes) -> str:
36
- temp_file = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
37
- try:
38
- temp_file.write(data)
39
- temp_file.flush()
40
- return temp_file.name
41
- finally:
42
- temp_file.close()
43
 
44
- def text_to_video(prompt, token: gr.OAuthToken | None, duration=5, aspect_ratio="16:9", resolution="720p", *_):
45
- """Generate video from text prompt"""
46
- try:
47
- if token is None or not getattr(token, "token", None):
48
- return None, "❌ Sign in with Hugging Face to continue. This app uses your inference provider credits."
49
-
50
- if not prompt or prompt.strip() == "":
51
- return None, "Please enter a text prompt"
52
-
53
- cleanup_temp_files()
54
-
55
- # Create client with user's token
56
- client = _client_from_token(token.token)
57
-
58
- # Generate video from text
59
- try:
60
- video = client.text_to_video(
61
- prompt,
62
- model="akhaliq/veo3.1-fast",
63
- )
64
- except Exception as e:
65
- import requests
66
- if isinstance(e, requests.HTTPError) and getattr(e.response, "status_code", None) == 403:
67
- return None, "❌ Access denied by provider (403). Make sure your HF account has credits/permission for provider 'fal-ai' and model 'akhaliq/veo3.1-fast'."
68
- raise
69
-
70
- # Save the video to a temporary file
71
- video_path = _save_bytes_as_temp_mp4(video)
72
-
73
- return video_path, f"βœ… Video generated successfully from prompt: '{prompt[:50]}...'"
74
 
75
- except gr.Error as e:
76
- return None, f"❌ {str(e)}"
77
- except Exception as e:
78
- return None, f"❌ Generation failed. If this keeps happening, check your provider quota or try again later."
79
-
80
- def image_to_video(image, prompt, token: gr.OAuthToken | None, duration=5, aspect_ratio="16:9", resolution="720p", *_):
81
- """Generate video from image and prompt"""
82
  try:
83
- if token is None or not getattr(token, "token", None):
84
- return None, "❌ Sign in with Hugging Face to continue. This app uses your inference provider credits."
85
-
86
- if image is None:
87
- return None, "Please upload an image"
88
-
89
- if not prompt or prompt.strip() == "":
90
- return None, "Please enter a prompt describing the motion"
91
-
92
- cleanup_temp_files()
93
 
94
  # Read the image file
95
  if isinstance(image, str):
96
- # If image is a file path
97
  with open(image, "rb") as image_file:
98
  input_image = image_file.read()
99
  else:
100
- # If image is already bytes or similar
101
- import io
102
- from PIL import Image as PILImage
103
-
104
- # Convert to bytes if necessary
105
- if isinstance(image, PILImage.Image):
106
- buffer = io.BytesIO()
107
- image.save(buffer, format='PNG')
108
- input_image = buffer.getvalue()
109
- else:
110
- # Assume it's a numpy array or similar
111
- pil_image = PILImage.fromarray(image)
112
- buffer = io.BytesIO()
113
- pil_image.save(buffer, format='PNG')
114
- input_image = buffer.getvalue()
115
 
116
- # Create client with user's token
117
- client = _client_from_token(token.token)
118
 
119
- # Generate video from image
120
- try:
121
- video = client.image_to_video(
122
- input_image,
123
- prompt=prompt,
124
- model="akhaliq/veo3.1-fast-image-to-video",
125
- )
126
- except Exception as e:
127
- import requests
128
- if isinstance(e, requests.HTTPError) and getattr(e.response, "status_code", None) == 403:
129
- return None, "❌ Access denied by provider (403). Make sure your HF account has credits/permission for provider 'fal-ai' and model 'akhaliq/veo3.1-fast-image-to-video'."
130
- raise
131
 
132
  # Save the video to a temporary file
133
- video_path = _save_bytes_as_temp_mp4(video)
 
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
- return video_path, f"βœ… Video generated successfully with motion: '{prompt[:50]}...'"
 
 
136
 
137
- except gr.Error as e:
138
- return None, f"❌ {str(e)}"
139
  except Exception as e:
140
- return None, f"❌ Generation failed. If this keeps happening, check your provider quota or try again later."
141
-
142
- def clear_text_tab():
143
- """Clear text-to-video tab"""
144
- return "", None, ""
145
-
146
- def clear_image_tab():
147
- """Clear image-to-video tab"""
148
- return None, "", None, ""
149
-
150
- # Custom CSS for better styling
151
- custom_css = """
152
- .container {
153
- max-width: 1200px;
154
- margin: auto;
155
- }
156
- .header-link {
157
- text-decoration: none;
158
- color: #2196F3;
159
- font-weight: bold;
160
- }
161
- .header-link:hover {
162
- text-decoration: underline;
163
- }
164
- .status-box {
165
- padding: 10px;
166
- border-radius: 5px;
167
- margin-top: 10px;
168
- }
169
- .notice {
170
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
171
- color: white;
172
- padding: 14px 16px;
173
- border-radius: 12px;
174
- margin: 18px auto 6px;
175
- max-width: 860px;
176
- text-align: center;
177
- font-size: 0.98rem;
178
- }
179
- .mobile-link-container {
180
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
181
- padding: 1.5em;
182
- border-radius: 10px;
183
- text-align: center;
184
- margin: 1em 0;
185
- box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
186
- }
187
- .mobile-link {
188
- color: white !important;
189
- font-size: 1.2em;
190
- font-weight: bold;
191
- text-decoration: none;
192
- display: inline-block;
193
- padding: 0.5em 1.5em;
194
- background: rgba(255, 255, 255, 0.2);
195
- border-radius: 25px;
196
- transition: all 0.3s ease;
197
- }
198
- .mobile-link:hover {
199
- background: rgba(255, 255, 255, 0.3);
200
- transform: translateY(-2px);
201
- box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
202
- }
203
- .mobile-text {
204
- color: white;
205
- margin-bottom: 0.5em;
206
- font-size: 1.1em;
207
- }
208
- """
209
 
210
  # Create the Gradio interface
211
- with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="AI Video Generator (Paid)") as demo:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
  gr.Markdown(
213
  """
214
- # 🎬 AI Video Generator
215
- ### Generate stunning videos from text or animate your images with AI
216
- #### Powered by VEO 3.1 Fast Model via Hugging Face Inference API (provider: fal-ai)
 
 
 
217
  """
218
  )
219
 
220
  gr.HTML(
221
  """
222
- <div style="text-align:center; max-width:900px; margin:0 auto;">
223
- <h1 style="font-size:2.2em; margin-bottom:6px;">🎬 Sora-2</h1>
224
- <p style="color:#777; margin:0 0 8px;">Generate videos via the Hugging Face Inference API (provider: fal-ai)</p>
225
- <div class="notice">
226
- <b>Heads up:</b> This is a paid app that uses <b>your</b> inference provider credits when you run generations.
227
- Free users get <b>$0.10 in included credits</b>. <b>PRO users</b> get <b>$2 in included credits</b>
228
- and can continue using beyond that (with billing).
229
- <a href='http://huggingface.co/subscribe/pro?source=veo3' target='_blank' style='color:#fff; text-decoration:underline; font-weight:bold;'>Subscribe to PRO</a>
230
- for more credits. Please sign in with your Hugging Face account to continue.
231
- </div>
232
- <p style="font-size: 0.9em; color: #999; margin-top: 10px;">
233
- Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color:#fff; text-decoration:underline;">anycoder</a>
234
- </p>
235
  </div>
236
  """
237
  )
238
 
239
- # Add mobile link section
 
 
240
  gr.HTML(
241
  """
242
- <div class="mobile-link-container">
243
- <div class="mobile-text">πŸ“± On mobile? Use the optimized version:</div>
244
- <a href="https://akhaliq-veo3-1-fast.hf.space" target="_blank" class="mobile-link">
245
- πŸš€ Open Mobile Version
246
- </a>
 
 
 
 
247
  </div>
248
  """
249
  )
250
 
251
  gr.HTML(
252
  """
253
- <p style="text-align: center; font-size: 0.9em; color: #999; margin-top: 10px;">
254
- Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color:#667eea; text-decoration:underline;">anycoder</a>
255
- </p>
 
 
 
 
 
 
 
256
  """
257
  )
258
 
259
- # Add login button - required for OAuth
260
- login_btn = gr.LoginButton("Sign in with Hugging Face")
261
-
262
- with gr.Tabs() as tabs:
263
- # Text-to-Video Tab
264
- with gr.Tab("πŸ“ Text to Video", id=0):
265
- gr.Markdown("### Transform your text descriptions into dynamic videos")
 
266
 
267
- with gr.Row():
268
- with gr.Column(scale=1):
269
- text_prompt = gr.Textbox(
270
- label="Text Prompt",
271
- placeholder="Describe the video you want to create... (e.g., 'A young man walking on the street during sunset')",
272
- lines=4,
273
- max_lines=6
274
- )
275
-
276
- with gr.Row():
277
- text_generate_btn = gr.Button("🎬 Generate Video", variant="primary", scale=2)
278
- text_clear_btn = gr.ClearButton(value="πŸ—‘οΈ Clear", scale=1)
279
-
280
- text_status = gr.Textbox(
281
- label="Status",
282
- interactive=False,
283
- visible=True,
284
- elem_classes=["status-box"]
285
- )
286
-
287
- with gr.Column(scale=1):
288
- text_video_output = gr.Video(
289
- label="Generated Video",
290
- autoplay=True,
291
- show_download_button=True,
292
- height=400
293
- )
294
 
295
- # Examples for text-to-video
296
- gr.Examples(
297
- examples=[
298
- ["A serene beach at sunset with gentle waves"],
299
- ["A bustling city street with neon lights at night"],
300
- ["A majestic eagle soaring through mountain peaks"],
301
- ["An astronaut floating in space near the International Space Station"],
302
- ["Cherry blossoms falling in slow motion in a Japanese garden"],
303
- ],
304
- inputs=text_prompt,
305
- label="Example Prompts"
306
  )
307
-
308
- # Image-to-Video Tab
309
- with gr.Tab("πŸ–ΌοΈ Image to Video", id=1):
310
- gr.Markdown("### Bring your static images to life with motion")
311
 
312
- with gr.Row():
313
- with gr.Column(scale=1):
314
- image_input = gr.Image(
315
- label="Upload Image",
316
- type="pil",
317
- height=300
318
- )
319
-
320
- image_prompt = gr.Textbox(
321
- label="Motion Prompt",
322
- placeholder="Describe how the image should move... (e.g., 'The cat starts to dance')",
323
- lines=3,
324
- max_lines=5
325
- )
326
-
327
- with gr.Row():
328
- image_generate_btn = gr.Button("🎬 Animate Image", variant="primary", scale=2)
329
- image_clear_btn = gr.ClearButton(value="πŸ—‘οΈ Clear", scale=1)
330
-
331
- image_status = gr.Textbox(
332
- label="Status",
333
- interactive=False,
334
- visible=True,
335
- elem_classes=["status-box"]
336
- )
337
-
338
- with gr.Column(scale=1):
339
- image_video_output = gr.Video(
340
- label="Generated Video",
341
- autoplay=True,
342
- show_download_button=True,
343
- height=400
344
- )
345
 
346
- # Examples for image-to-video
347
  gr.Examples(
348
  examples=[
349
- [None, "The person starts walking forward"],
350
- [None, "The animal begins to run"],
351
- [None, "Camera slowly zooms in while the subject smiles"],
352
- [None, "The flowers sway gently in the breeze"],
353
- [None, "The clouds move across the sky in time-lapse"],
354
  ],
355
- inputs=[image_input, image_prompt],
356
- label="Example Motion Prompts"
 
 
 
 
 
 
 
357
  )
358
-
359
- # How to Use section
360
- with gr.Accordion("πŸ“– How to Use", open=False):
361
- gr.Markdown(
362
- """
363
- ### Text to Video:
364
- 1. Enter a detailed description of the video you want to create
365
- 2. Optionally adjust advanced settings (duration, aspect ratio, resolution)
366
- 3. Click "Generate Video" and wait for the AI to create your video
367
- 4. Download or preview your generated video
368
-
369
- ### Image to Video:
370
- 1. Upload an image you want to animate
371
- 2. Describe the motion or action you want to add to the image
372
- 3. Optionally adjust advanced settings
373
- 4. Click "Animate Image" to bring your image to life
374
- 5. Download or preview your animated video
375
-
376
- ### Tips for Better Results:
377
- - Be specific and descriptive in your prompts
378
- - For image-to-video, describe natural motions that fit the image
379
- - Use high-quality input images for better results
380
- - Experiment with different prompts to get the desired effect
381
 
382
- ### Mobile Users:
383
- - For the best mobile experience, use the optimized version at: https://akhaliq-veo3-1-fast.hf.space
384
- """
385
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
386
 
387
- # Event handlers - FIXED: removed login_btn from inputs
388
- text_generate_btn.click(
389
- fn=text_to_video,
390
- inputs=[text_prompt],
391
- outputs=[text_video_output, text_status],
392
- show_progress="full",
393
  queue=False,
394
  api_name=False,
395
- show_api=False
396
- )
397
-
398
- text_clear_btn.click(
399
- fn=clear_text_tab,
400
- inputs=[],
401
- outputs=[text_prompt, text_video_output, text_status],
402
- queue=False
403
  )
404
 
405
- image_generate_btn.click(
406
- fn=image_to_video,
407
- inputs=[image_input, image_prompt],
408
- outputs=[image_video_output, image_status],
409
- show_progress="full",
410
  queue=False,
411
- api_name=False,
412
- show_api=False
413
  )
414
 
415
- image_clear_btn.click(
416
- fn=clear_image_tab,
417
- inputs=[],
418
- outputs=[image_input, image_prompt, image_video_output, image_status],
419
- queue=False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
420
  )
421
 
422
  # Launch the app
423
  if __name__ == "__main__":
424
- try:
425
- cleanup_temp_files()
426
- if os.path.exists("gradio_cached_examples"):
427
- shutil.rmtree("gradio_cached_examples", ignore_errors=True)
428
- except Exception as e:
429
- print(f"Initial cleanup error: {e}")
430
-
431
- demo.queue(status_update_rate="auto", api_open=False, default_concurrency_limit=None)
432
  demo.launch(
433
  show_api=False,
434
- share=False,
435
- show_error=True,
436
  enable_monitoring=False,
437
  quiet=True,
438
- ssr_mode=True
439
  )
 
1
  import gradio as gr
2
  import os
3
  from huggingface_hub import InferenceClient
 
 
4
  from pathlib import Path
5
+ import tempfile
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
+ # Initialize the inference client
8
+ client = InferenceClient(
9
+ provider="fal-ai",
10
+ api_key=os.environ.get("HF_TOKEN"),
11
+ bill_to="huggingface",
12
+ )
 
 
13
 
14
def generate_video_with_auth(image, prompt, profile: gr.OAuthProfile | None, progress=gr.Progress()):
    """
    Generate a video from an image using the Ovi model, gated behind HF OAuth.

    Args:
        image: Input image — either a filesystem path (str) or a PIL-style
            object exposing ``.save(fp, format=...)``.
        prompt: Text prompt describing the desired motion/animation.
        profile: OAuth profile injected by Gradio; ``None`` means the user
            is not signed in.
        progress: Gradio progress tracker (injected by Gradio).

    Returns:
        Path to the generated ``.mp4`` file.

    Raises:
        gr.Error: If the user is not signed in, inputs are missing, or the
            provider call fails.
    """
    # Guard clauses: fail fast with user-facing errors before doing any work.
    if profile is None:
        raise gr.Error("Click Sign in with Hugging Face button to use this app for free")

    if image is None:
        raise gr.Error("Please upload an image first!")

    if not prompt or prompt.strip() == "":
        raise gr.Error("Please enter a prompt describing the desired motion!")

    try:
        progress(0.2, desc="Processing image...")

        # Obtain the raw image bytes. For non-path inputs, serialize in
        # memory instead of round-tripping through an undeleted temp file
        # (the previous approach leaked a .png per call and reopened an
        # unclosed NamedTemporaryFile, which fails on Windows).
        if isinstance(image, str):
            with open(image, "rb") as image_file:
                input_image = image_file.read()
        else:
            import io
            buffer = io.BytesIO()
            image.save(buffer, format="PNG")
            input_image = buffer.getvalue()

        progress(0.4, desc="Generating video with AI...")

        # Generate the video via the shared inference client.
        video = client.image_to_video(
            input_image,
            prompt=prompt,
            model="chetwinlow1/Ovi",
        )

        progress(0.9, desc="Finalizing video...")

        # Persist the result to a temp .mp4. Close the handle immediately so
        # the path can be safely re-opened for writing on all platforms.
        output_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
        output_file.close()

        if isinstance(video, str) and os.path.exists(video):
            # Provider returned a local file path — copy it across.
            import shutil
            shutil.copy(video, output_file.name)
        else:
            # Bytes (or bytes-like) payload — write it out directly.
            with open(output_file.name, "wb") as f:
                f.write(video)

        progress(1.0, desc="Complete!")

        return output_file.name

    except gr.Error:
        # User-facing errors pass through untouched.
        raise
    except Exception as e:
        # Chain the original exception so server logs keep the traceback.
        raise gr.Error(f"Error generating video: {str(e)}") from e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
 
84
  # Create the Gradio interface
85
+ with gr.Blocks(
86
+ theme=gr.themes.Soft(
87
+ primary_hue="blue",
88
+ secondary_hue="indigo",
89
+ ),
90
+ css="""
91
+ .header-link {
92
+ font-size: 0.9em;
93
+ color: #666;
94
+ text-decoration: none;
95
+ margin-bottom: 1em;
96
+ display: inline-block;
97
+ }
98
+ .header-link:hover {
99
+ color: #333;
100
+ text-decoration: underline;
101
+ }
102
+ .main-header {
103
+ text-align: center;
104
+ margin-bottom: 2em;
105
+ }
106
+ .info-box {
107
+ background-color: #f0f7ff;
108
+ border-left: 4px solid #4285f4;
109
+ padding: 1em;
110
+ margin: 1em 0;
111
+ border-radius: 4px;
112
+ }
113
+ .auth-warning {
114
+ color: #ff6b00;
115
+ font-weight: bold;
116
+ text-align: center;
117
+ margin: 1em 0;
118
+ }
119
+ """,
120
+ title="Image to Video Generator with Ovi",
121
+ ) as demo:
122
+
123
+ gr.HTML(
124
+ """
125
+ <div class="main-header">
126
+ <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" class="header-link">
127
+ Built with anycoder ✨
128
+ </a>
129
+ </div>
130
+ """
131
+ )
132
+
133
  gr.Markdown(
134
  """
135
+ # 🎬 Image to Video Generator with Ovi
136
+
137
+ Transform your static images into dynamic videos with synchronized audio using AI! Upload an image and describe the motion you want to see.
138
+
139
+ Powered by Ovi: Twin Backbone Cross-Modal Fusion for Audio-Video Generation via [HuggingFace Inference Providers](https://huggingface.co/docs/huggingface_hub/guides/inference).
140
+
141
  """
142
  )
143
 
144
  gr.HTML(
145
  """
146
+ <div class="auth-warning">
147
+ ⚠️ You must Sign in with Hugging Face using the button below to use this app.
 
 
 
 
 
 
 
 
 
 
 
148
  </div>
149
  """
150
  )
151
 
152
+ # Add login button - required for OAuth
153
+ gr.LoginButton()
154
+
155
  gr.HTML(
156
  """
157
+ <div class="info-box">
158
+ <strong>πŸ’‘ Tips for best results:</strong>
159
+ <ul>
160
+ <li>Use clear, well-lit images with a single main subject</li>
161
+ <li>Write specific prompts describing the desired motion or action</li>
162
+ <li>Keep prompts concise and focused on movement and audio elements</li>
163
+ <li>Processing generates 5-second videos at 24 FPS with synchronized audio</li>
164
+ <li>Processing may take 30-60 seconds depending on server load</li>
165
+ </ul>
166
  </div>
167
  """
168
  )
169
 
170
  gr.HTML(
171
  """
172
+ <div class="info-box">
173
+ <strong>✨ Special Tokens for Enhanced Control:</strong>
174
+ <ul>
175
+ <li><strong>Speech:</strong> <code>&lt;S&gt;Your speech content here&lt;E&gt;</code> - Text enclosed in these tags will be converted to speech</li>
176
+ <li><strong>Audio Description:</strong> <code>&lt;AUDCAP&gt;Audio description here&lt;ENDAUDCAP&gt;</code> - Describes the audio or sound effects present in the video</li>
177
+ </ul>
178
+ <br>
179
+ <strong>πŸ“ Example Prompt:</strong><br>
180
+ <code>Dogs bark loudly at a man wearing a red shirt. The man says &lt;S&gt;Please stop barking at me!&lt;E&gt;. &lt;AUDCAP&gt;Dogs barking, angry man yelling in stern voice&lt;ENDAUDCAP&gt;.</code>
181
+ </div>
182
  """
183
  )
184
 
185
+ with gr.Row():
186
+ with gr.Column(scale=1):
187
+ image_input = gr.Image(
188
+ label="πŸ“Έ Upload Image",
189
+ type="filepath",
190
+ sources=["upload", "clipboard"],
191
+ height=400,
192
+ )
193
 
194
+ prompt_input = gr.Textbox(
195
+ label="✍️ Text Prompt",
196
+ lines=3,
197
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
 
199
+ generate_btn = gr.Button(
200
+ "🎬 Generate Video",
201
+ variant="primary",
202
+ size="lg",
 
 
 
 
 
 
 
203
  )
 
 
 
 
204
 
205
+ clear_btn = gr.Button(
206
+ "πŸ—‘οΈ Clear",
207
+ variant="secondary",
208
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
 
 
210
  gr.Examples(
211
  examples=[
212
+ [
213
+ "5.png",
214
+ 'A bearded man wearing large dark sunglasses and a blue patterned cardigan sits in a studio, actively speaking into a large, suspended microphone. He has headphones on and gestures with his hands, displaying rings on his fingers. Behind him, a wall is covered with red, textured sound-dampening foam on the left, and a white banner on the right features the "CHOICE FM" logo and various social media handles like "@ilovechoicefm" with "RALEIGH" below it. The man intently addresses the microphone, articulating, <S>is talent. It\'s all about authenticity. You gotta be who you really are, especially if you\'re working<E>. He leans forward slightly as he speaks, maintaining a serious expression behind his sunglasses.. <AUDCAP>Clear male voice speaking into a microphone, a low background hum.<ENDAUDCAP>'
215
+ ]
 
216
  ],
217
+ inputs=[image_input, prompt_input],
218
+ label="Example",
219
+ )
220
+
221
+ with gr.Column(scale=1):
222
+ video_output = gr.Video(
223
+ label="πŸŽ₯ Generated Video",
224
+ height=400,
225
+ autoplay=True,
226
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
 
228
+ gr.Markdown(
229
+ """
230
+ ### About Ovi Model
231
+
232
+ **Ovi: Twin Backbone Cross-Modal Fusion for Audio-Video Generation**
233
+
234
+ Developed by Chetwin Low, Weimin Wang (Character AI) & Calder Katyal (Yale University)
235
+
236
+ 🌟 **Key Features:**
237
+ - 🎬 **Video+Audio Generation**: Generates synchronized video and audio content simultaneously
238
+ - πŸ“ **Flexible Input**: Supports text-only or text+image conditioning
239
+ - ⏱️ **5-second Videos**: Generates 5-second videos at 24 FPS
240
+ - πŸ“ **Multiple Aspect Ratios**: Supports 720Γ—720 area at various ratios (9:16, 16:9, 1:1, etc)
241
+
242
+ Ovi is a veo-3 like model that simultaneously generates both video and audio content from text or text+image inputs.
243
+ """
244
+ )
245
 
246
+ # Event handlers with authentication
247
+ generate_btn.click(
248
+ fn=generate_video_with_auth,
249
+ inputs=[image_input, prompt_input],
250
+ outputs=[video_output],
 
251
  queue=False,
252
  api_name=False,
253
+ show_api=False,
 
 
 
 
 
 
 
254
  )
255
 
256
+ clear_btn.click(
257
+ fn=lambda: (None, "", None),
258
+ inputs=None,
259
+ outputs=[image_input, prompt_input, video_output],
 
260
  queue=False,
 
 
261
  )
262
 
263
+ gr.Markdown(
264
+ """
265
+ ---
266
+
267
+ ### πŸš€ How it works
268
+
269
+ 1. **Sign in** with your Hugging Face account
270
+ 2. **Upload** your image - any photo or illustration
271
+ 3. **Describe** the motion you want to see in the prompt
272
+ 4. **Generate** and watch your image come to life!
273
+
274
+ ### ⚠️ Notes
275
+
276
+ - Video generation may take 30-60 seconds
277
+ - Generates 5-second videos at 24 FPS with synchronized audio
278
+ - Supports multiple aspect ratios (9:16, 16:9, 1:1, etc) at 720Γ—720 area
279
+ - Requires a valid HuggingFace token with Inference API access
280
+ - Best results with clear, high-quality images
281
+ - The model works best with realistic subjects and natural motions
282
+
283
+ ### πŸ”— Resources
284
+
285
+ - [Ovi Model Card](https://huggingface.co/chetwinlow1/Ovi)
286
+ - [Character AI](https://character.ai)
287
+ """
288
  )
289
 
290
  # Launch the app
291
  if __name__ == "__main__":
 
 
 
 
 
 
 
 
292
  demo.launch(
293
  show_api=False,
 
 
294
  enable_monitoring=False,
295
  quiet=True,
 
296
  )