akhaliq (HF Staff) committed
Commit 1f6e6f1 · verified · 1 Parent(s): 509ccc4

Update app.py

Files changed (1):
  1. app.py +255 -183
app.py CHANGED
@@ -1,11 +1,11 @@
 import gradio as gr
 import os
 from huggingface_hub import InferenceClient
-from pathlib import Path
 import tempfile
-import time
-from typing import Optional
 import shutil
+from pathlib import Path
+from typing import Optional, Union
+import time

 # -------------------------
 # Utilities
@@ -23,6 +23,15 @@ def cleanup_temp_files():
     except Exception as e:
         print(f"Cleanup error: {e}")

+def _client_from_token(token: Optional[str]) -> InferenceClient:
+    if not token:
+        raise gr.Error("Please sign in first. This app requires your Hugging Face login.")
+    # IMPORTANT: do not set bill_to when using user OAuth tokens
+    return InferenceClient(
+        provider="fal-ai",
+        api_key=token,
+    )
+
 def _save_bytes_as_temp_mp4(data: bytes) -> str:
     temp_file = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
     try:
@@ -32,39 +41,63 @@ def _save_bytes_as_temp_mp4(data: bytes) -> str:
     finally:
         temp_file.close()

-def generate_video_with_auth(image, prompt, profile: gr.OAuthProfile | None, progress=gr.Progress()):
-    """
-    Generate a video from an image using the Ovi model with authentication check.
-
-    Args:
-        image: Input image (PIL Image or file path)
-        prompt: Text prompt describing the desired motion/animation
-        profile: OAuth profile for authentication
-        progress: Gradio progress tracker
-
-    Returns:
-        Path to the generated video file
-    """
+def text_to_video(prompt, token: gr.OAuthToken | None, duration=5, aspect_ratio="16:9", resolution="720p", *_):
+    """Generate video from text prompt"""
+    try:
+        if token is None or not getattr(token, "token", None):
+            return None, "❌ Sign in with Hugging Face to continue. This app uses your inference provider credits."
+
+        if not prompt or prompt.strip() == "":
+            return None, "Please enter a text prompt"
+
+        cleanup_temp_files()
+
+        # Create client with user's token
+        client = _client_from_token(token.token)
+
+        # Generate video from text
+        try:
+            video = client.text_to_video(
+                prompt,
+                model="akhaliq/veo3.1-fast",
+            )
+        except Exception as e:
+            import requests
+            if isinstance(e, requests.HTTPError) and getattr(e.response, "status_code", None) == 403:
+                return None, "❌ Access denied by provider (403). Make sure your HF account has credits/permission for provider 'fal-ai' and model 'akhaliq/veo3.1-fast'."
+            raise
+
+        # Save the video to a temporary file
+        video_path = _save_bytes_as_temp_mp4(video)
+
+        return video_path, f"✅ Video generated successfully from prompt: '{prompt[:50]}...'"
+
+    except gr.Error as e:
+        return None, f"❌ {str(e)}"
+    except Exception as e:
+        return None, "❌ Generation failed. If this keeps happening, check your provider quota or try again later."
+
+def image_to_video(image, prompt, token: gr.OAuthToken | None, duration=5, aspect_ratio="16:9", resolution="720p", *_):
+    """Generate video from image and prompt"""
     try:
-        if profile is None:
-            raise gr.Error("Please sign in with Hugging Face to continue. This app uses your inference provider credits.")
+        if token is None or not getattr(token, "token", None):
+            return None, "❌ Sign in with Hugging Face to continue. This app uses your inference provider credits."

         if image is None:
-            raise gr.Error("Please upload an image first!")
+            return None, "Please upload an image"

         if not prompt or prompt.strip() == "":
-            raise gr.Error("Please enter a prompt describing the desired motion!")
+            return None, "Please enter a prompt describing the motion"

         cleanup_temp_files()

-        progress(0.2, desc="Processing image...")
-
         # Read the image file
         if isinstance(image, str):
+            # If image is a file path
             with open(image, "rb") as image_file:
                 input_image = image_file.read()
         else:
-            # If image is a PIL Image, save it temporarily
+            # If image is already bytes or similar
             import io
             from PIL import Image as PILImage

@@ -80,50 +113,39 @@ def generate_video_with_auth(image, prompt, profile: gr.OAuthProfile | None, progress=gr.Progress()):
             pil_image.save(buffer, format='PNG')
             input_image = buffer.getvalue()

-        progress(0.4, desc="Generating video with AI...")
-
-        # Create client - use environment token if available, otherwise user's OAuth
-        api_key = os.environ.get("HF_TOKEN")
-        if api_key:
-            client = InferenceClient(
-                provider="fal-ai",
-                api_key=api_key,
-                bill_to="huggingface",
-            )
-        else:
-            # This would require user's token from OAuth
-            raise gr.Error("Server configuration error: HF_TOKEN not available")
-
-        # Generate video using the inference client
+        # Create client with user's token
+        client = _client_from_token(token.token)
+
+        # Generate video from image
         try:
             video = client.image_to_video(
                 input_image,
                 prompt=prompt,
-                model="chetwinlow1/Ovi",
+                model="akhaliq/veo3.1-fast-image-to-video",
             )
         except Exception as e:
             import requests
             if isinstance(e, requests.HTTPError) and getattr(e.response, "status_code", None) == 403:
-                raise gr.Error("Access denied by provider (403). Make sure your HF account has credits/permission for provider 'fal-ai' and model 'chetwinlow1/Ovi'.")
+                return None, "❌ Access denied by provider (403). Make sure your HF account has credits/permission for provider 'fal-ai' and model 'akhaliq/veo3.1-fast-image-to-video'."
             raise

-        progress(0.9, desc="Finalizing video...")
-
         # Save the video to a temporary file
         video_path = _save_bytes_as_temp_mp4(video)

-        progress(1.0, desc="Complete!")
-
-        return video_path
+        return video_path, f"✅ Video generated successfully with motion: '{prompt[:50]}...'"

-    except gr.Error:
-        raise
+    except gr.Error as e:
+        return None, f"❌ {str(e)}"
     except Exception as e:
-        raise gr.Error(f"Generation failed: {str(e)}")
+        return None, "❌ Generation failed. If this keeps happening, check your provider quota or try again later."

-def clear_inputs():
-    """Clear all inputs and outputs"""
-    return None, "", None
+def clear_text_tab():
+    """Clear text-to-video tab"""
+    return "", None, ""
+
+def clear_image_tab():
+    """Clear image-to-video tab"""
+    return None, "", None, ""

 # Custom CSS for better styling
 custom_css = """
@@ -154,197 +176,247 @@ custom_css = """
     text-align: center;
     font-size: 0.98rem;
 }
-.info-box {
-    background-color: #f0f7ff;
-    border-left: 4px solid #4285f4;
-    padding: 1em;
+.mobile-link-container {
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+    padding: 1.5em;
+    border-radius: 10px;
+    text-align: center;
     margin: 1em 0;
-    border-radius: 4px;
+    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
 }
-.auth-warning {
-    color: #ff6b00;
+.mobile-link {
+    color: white !important;
+    font-size: 1.2em;
     font-weight: bold;
-    text-align: center;
-    margin: 1em 0;
+    text-decoration: none;
+    display: inline-block;
+    padding: 0.5em 1.5em;
+    background: rgba(255, 255, 255, 0.2);
+    border-radius: 25px;
+    transition: all 0.3s ease;
+}
+.mobile-link:hover {
+    background: rgba(255, 255, 255, 0.3);
+    transform: translateY(-2px);
+    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
+}
+.mobile-text {
+    color: white;
+    margin-bottom: 0.5em;
+    font-size: 1.1em;
 }
 """

 # Create the Gradio interface
-with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="Image to Video Generator with Ovi (Paid)") as demo:
+with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="AI Video Generator (Paid)") as demo:
+    gr.Markdown(
+        """
+        # 🎬 AI Video Generator
+        ### Generate stunning videos from text or animate your images with AI
+        #### Powered by VEO 3.1 Fast Model via Hugging Face Inference API (provider: fal-ai)
+        """
+    )

     gr.HTML(
         """
         <div style="text-align:center; max-width:900px; margin:0 auto;">
-            <h1 style="font-size:2.2em; margin-bottom:6px;">🎬 Image to Video Generator with Ovi</h1>
-            <p style="color:#777; margin:0 0 8px;">Transform your static images into dynamic videos with synchronized audio using AI!</p>
+            <h1 style="font-size:2.2em; margin-bottom:6px;">🎬 Sora-2</h1>
+            <p style="color:#777; margin:0 0 8px;">Generate videos via the Hugging Face Inference API (provider: fal-ai)</p>
             <div class="notice">
                 <b>Heads up:</b> This is a paid app that uses <b>your</b> inference provider credits when you run generations.
                 Free users get <b>$0.10 in included credits</b>. <b>PRO users</b> get <b>$2 in included credits</b>
                 and can continue using beyond that (with billing).
-                <a href='http://huggingface.co/subscribe/pro?source=ovi' target='_blank' style='color:#fff; text-decoration:underline; font-weight:bold;'>Subscribe to PRO</a>
+                <a href='http://huggingface.co/subscribe/pro?source=veo3' target='_blank' style='color:#fff; text-decoration:underline; font-weight:bold;'>Subscribe to PRO</a>
                 for more credits. Please sign in with your Hugging Face account to continue.
             </div>
             <p style="font-size: 0.9em; color: #999; margin-top: 10px;">
-                Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color:#667eea; text-decoration:underline;">anycoder</a>
+                Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color:#fff; text-decoration:underline;">anycoder</a>
             </p>
         </div>
         """
     )

-    gr.Markdown(
-        """
-        Powered by Ovi: Twin Backbone Cross-Modal Fusion for Audio-Video Generation via [HuggingFace Inference Providers](https://huggingface.co/docs/huggingface_hub/guides/inference).
-        """
-    )
-
+    # Add mobile link section
     gr.HTML(
         """
-        <div class="auth-warning">
-            ⚠️ You must Sign in with Hugging Face using the button below to use this app.
+        <div class="mobile-link-container">
+            <div class="mobile-text">📱 On mobile? Use the optimized version:</div>
+            <a href="https://akhaliq-veo3-1-fast.hf.space" target="_blank" class="mobile-link">
+                🚀 Open Mobile Version
+            </a>
         </div>
         """
     )

-    # Add login button - required for OAuth
-    gr.LoginButton()
-
     gr.HTML(
         """
-        <div class="info-box">
-            <strong>💡 Tips for best results:</strong>
-            <ul>
-                <li>Use clear, well-lit images with a single main subject</li>
-                <li>Write specific prompts describing the desired motion or action</li>
-                <li>Keep prompts concise and focused on movement and audio elements</li>
-                <li>Processing generates 5-second videos at 24 FPS with synchronized audio</li>
-                <li>Processing may take 30-60 seconds depending on server load</li>
-            </ul>
-        </div>
+        <p style="text-align: center; font-size: 0.9em; color: #999; margin-top: 10px;">
+            Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color:#667eea; text-decoration:underline;">anycoder</a>
+        </p>
         """
     )

-    gr.HTML(
-        """
-        <div class="info-box">
-            <strong>✨ Special Tokens for Enhanced Control:</strong>
-            <ul>
-                <li><strong>Speech:</strong> <code>&lt;S&gt;Your speech content here&lt;E&gt;</code> - Text enclosed in these tags will be converted to speech</li>
-                <li><strong>Audio Description:</strong> <code>&lt;AUDCAP&gt;Audio description here&lt;ENDAUDCAP&gt;</code> - Describes the audio or sound effects present in the video</li>
-            </ul>
-            <br>
-            <strong>📝 Example Prompt:</strong><br>
-            <code>Dogs bark loudly at a man wearing a red shirt. The man says &lt;S&gt;Please stop barking at me!&lt;E&gt;. &lt;AUDCAP&gt;Dogs barking, angry man yelling in stern voice&lt;ENDAUDCAP&gt;.</code>
-        </div>
-        """
-    )
+    # Add login button - required for OAuth
+    login_btn = gr.LoginButton("Sign in with Hugging Face")

-    with gr.Row():
-        with gr.Column(scale=1):
-            image_input = gr.Image(
-                label="📸 Upload Image",
-                type="pil",
-                height=400,
-            )
-
-            prompt_input = gr.Textbox(
-                label="✍️ Text Prompt",
-                lines=3,
-                placeholder="Describe the motion you want to see in the video..."
-            )
+    with gr.Tabs() as tabs:
+        # Text-to-Video Tab
+        with gr.Tab("📝 Text to Video", id=0):
+            gr.Markdown("### Transform your text descriptions into dynamic videos")

             with gr.Row():
-                generate_btn = gr.Button(
-                    "🎬 Generate Video",
-                    variant="primary",
-                    scale=2,
-                )
-
-                clear_btn = gr.Button(
-                    "🗑️ Clear",
-                    variant="secondary",
-                    scale=1,
-                )
+                with gr.Column(scale=1):
+                    text_prompt = gr.Textbox(
+                        label="Text Prompt",
+                        placeholder="Describe the video you want to create... (e.g., 'A young man walking on the street during sunset')",
+                        lines=4,
+                        max_lines=6
+                    )
+
+                    with gr.Row():
+                        text_generate_btn = gr.Button("🎬 Generate Video", variant="primary", scale=2)
+                        text_clear_btn = gr.ClearButton(value="🗑️ Clear", scale=1)
+
+                    text_status = gr.Textbox(
+                        label="Status",
+                        interactive=False,
+                        visible=True,
+                        elem_classes=["status-box"]
+                    )
+
+                with gr.Column(scale=1):
+                    text_video_output = gr.Video(
+                        label="Generated Video",
+                        autoplay=True,
+                        show_download_button=True,
+                        height=400
+                    )
+
+            # Examples for text-to-video
             gr.Examples(
                 examples=[
-                    [
-                        "5.png",
-                        'A bearded man wearing large dark sunglasses and a blue patterned cardigan sits in a studio, actively speaking into a large, suspended microphone. He has headphones on and gestures with his hands, displaying rings on his fingers. Behind him, a wall is covered with red, textured sound-dampening foam on the left, and a white banner on the right features the "CHOICE FM" logo and various social media handles like "@ilovechoicefm" with "RALEIGH" below it. The man intently addresses the microphone, articulating, <S>is talent. It\'s all about authenticity. You gotta be who you really are, especially if you\'re working<E>. He leans forward slightly as he speaks, maintaining a serious expression behind his sunglasses.. <AUDCAP>Clear male voice speaking into a microphone, a low background hum.<ENDAUDCAP>'
-                    ]
+                    ["A serene beach at sunset with gentle waves"],
+                    ["A bustling city street with neon lights at night"],
+                    ["A majestic eagle soaring through mountain peaks"],
+                    ["An astronaut floating in space near the International Space Station"],
+                    ["Cherry blossoms falling in slow motion in a Japanese garden"],
                 ],
-                inputs=[image_input, prompt_input],
-                label="Example",
+                inputs=text_prompt,
+                label="Example Prompts"
             )

-        with gr.Column(scale=1):
-            video_output = gr.Video(
-                label="🎥 Generated Video",
-                height=400,
-                autoplay=True,
-                show_download_button=True,
-            )
-
-    gr.Markdown(
-        """
-        ### About Ovi Model
-
-        **Ovi: Twin Backbone Cross-Modal Fusion for Audio-Video Generation**
-
-        Developed by Chetwin Low, Weimin Wang (Character AI) & Calder Katyal (Yale University)
-
-        🌟 **Key Features:**
-        - 🎬 **Video+Audio Generation**: Generates synchronized video and audio content simultaneously
-        - 📝 **Flexible Input**: Supports text-only or text+image conditioning
-        - ⏱️ **5-second Videos**: Generates 5-second videos at 24 FPS
-        - 📐 **Multiple Aspect Ratios**: Supports 720×720 area at various ratios (9:16, 16:9, 1:1, etc)
-
-        Ovi is a veo-3 like model that simultaneously generates both video and audio content from text or text+image inputs.
-        """
-    )
+        # Image-to-Video Tab
+        with gr.Tab("🖼️ Image to Video", id=1):
+            gr.Markdown("### Bring your static images to life with motion")
+
+            with gr.Row():
+                with gr.Column(scale=1):
+                    image_input = gr.Image(
+                        label="Upload Image",
+                        type="pil",
+                        height=300
+                    )
+
+                    image_prompt = gr.Textbox(
+                        label="Motion Prompt",
+                        placeholder="Describe how the image should move... (e.g., 'The cat starts to dance')",
+                        lines=3,
+                        max_lines=5
+                    )
+
+                    with gr.Row():
+                        image_generate_btn = gr.Button("🎬 Animate Image", variant="primary", scale=2)
+                        image_clear_btn = gr.ClearButton(value="🗑️ Clear", scale=1)
+
+                    image_status = gr.Textbox(
+                        label="Status",
+                        interactive=False,
+                        visible=True,
+                        elem_classes=["status-box"]
+                    )
+
+                with gr.Column(scale=1):
+                    image_video_output = gr.Video(
+                        label="Generated Video",
+                        autoplay=True,
+                        show_download_button=True,
+                        height=400
+                    )
+
+            # Examples for image-to-video
+            gr.Examples(
+                examples=[
+                    [None, "The person starts walking forward"],
+                    [None, "The animal begins to run"],
+                    [None, "Camera slowly zooms in while the subject smiles"],
+                    [None, "The flowers sway gently in the breeze"],
+                    [None, "The clouds move across the sky in time-lapse"],
+                ],
+                inputs=[image_input, image_prompt],
+                label="Example Motion Prompts"
+            )

-    # Event handlers with authentication - uses OAuthProfile automatically
-    generate_btn.click(
-        fn=generate_video_with_auth,
-        inputs=[image_input, prompt_input],
-        outputs=[video_output],
+    # How to Use section
+    with gr.Accordion("📖 How to Use", open=False):
+        gr.Markdown(
+            """
+            ### Text to Video:
+            1. Enter a detailed description of the video you want to create
+            2. Optionally adjust advanced settings (duration, aspect ratio, resolution)
+            3. Click "Generate Video" and wait for the AI to create your video
+            4. Download or preview your generated video
+
+            ### Image to Video:
+            1. Upload an image you want to animate
+            2. Describe the motion or action you want to add to the image
+            3. Optionally adjust advanced settings
+            4. Click "Animate Image" to bring your image to life
+            5. Download or preview your animated video
+
+            ### Tips for Better Results:
+            - Be specific and descriptive in your prompts
+            - For image-to-video, describe natural motions that fit the image
+            - Use high-quality input images for better results
+            - Experiment with different prompts to get the desired effect
+
+            ### Mobile Users:
+            - For the best mobile experience, use the optimized version at: https://akhaliq-veo3-1-fast.hf.space
+            """
+        )
+
+    # Event handlers - FIXED: removed login_btn from inputs
+    text_generate_btn.click(
+        fn=text_to_video,
+        inputs=[text_prompt],
+        outputs=[text_video_output, text_status],
         show_progress="full",
         queue=False,
         api_name=False,
-        show_api=False,
+        show_api=False
     )

-    clear_btn.click(
-        fn=clear_inputs,
+    text_clear_btn.click(
+        fn=clear_text_tab,
         inputs=[],
-        outputs=[image_input, prompt_input, video_output],
+        outputs=[text_prompt, text_video_output, text_status],
+        queue=False
+    )
+
+    image_generate_btn.click(
+        fn=image_to_video,
+        inputs=[image_input, image_prompt],
+        outputs=[image_video_output, image_status],
+        show_progress="full",
         queue=False,
+        api_name=False,
+        show_api=False
     )

-    gr.Markdown(
-        """
-        ---
-
-        ### 🚀 How it works
-
-        1. **Sign in** with your Hugging Face account
-        2. **Upload** your image - any photo or illustration
-        3. **Describe** the motion you want to see in the prompt
-        4. **Generate** and watch your image come to life!
-
-        ### ⚠️ Notes
-
-        - Video generation may take 30-60 seconds
-        - Generates 5-second videos at 24 FPS with synchronized audio
-        - Supports multiple aspect ratios (9:16, 16:9, 1:1, etc) at 720×720 area
-        - Requires a valid HuggingFace token with Inference API access
-        - Best results with clear, high-quality images
-        - The model works best with realistic subjects and natural motions
-
-        ### 🔗 Resources
-
-        - [Ovi Model Card](https://huggingface.co/chetwinlow1/Ovi)
-        - [Character AI](https://character.ai)
-        """
+    image_clear_btn.click(
+        fn=clear_image_tab,
+        inputs=[],
+        outputs=[image_input, image_prompt, image_video_output, image_status],
+        queue=False
    )

 # Launch the app
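The `click()` wiring above relies on Gradio's OAuth injection: a parameter annotated `gr.OAuthToken` is filled from the user's session, not from `inputs`, which is why `token` never appears in the `inputs` lists (and why `login_btn` was removed from them). Below is a minimal sketch of that pattern, assuming a Space with `hf_oauth: true` in its README metadata; the `whoami` handler is hypothetical and stands in for the real generators.

```python
# Minimal sketch of Gradio's OAuth injection (assumes a Space with
# `hf_oauth: true`; when run locally, Gradio mocks the login flow).
import gradio as gr

def whoami(prompt: str, token: gr.OAuthToken | None):
    # `token` is injected by type annotation and is NOT listed in `inputs`.
    if token is None or not getattr(token, "token", None):
        return "❌ Signed out - click the login button first."
    return f"✅ Token received; would generate for: {prompt!r}"

with gr.Blocks() as demo:
    gr.LoginButton()
    prompt = gr.Textbox(label="Prompt")
    status = gr.Textbox(label="Status", interactive=False)
    gr.Button("Run").click(whoami, inputs=[prompt], outputs=[status])

demo.launch()
```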
 
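Stripped of the UI, the two new handlers reduce to a pair of huggingface_hub calls that return raw MP4 bytes, which app.py then writes to a temporary file for the `gr.Video` component. A standalone sketch of those calls, assuming `HF_TOKEN` is set, the account has fal-ai credits, and the same model IDs as app.py; the file names are arbitrary.

```python
# Standalone sketch of the provider calls made by text_to_video / image_to_video.
# Assumes HF_TOKEN is set and the account has credits with provider "fal-ai".
import os
from huggingface_hub import InferenceClient

client = InferenceClient(provider="fal-ai", api_key=os.environ["HF_TOKEN"])

# Text to video: returns raw MP4 bytes.
video = client.text_to_video(
    "A serene beach at sunset with gentle waves",
    model="akhaliq/veo3.1-fast",
)
with open("text2video.mp4", "wb") as f:
    f.write(video)

# Image to video: pass the source image as bytes plus a motion prompt.
with open("input.png", "rb") as f:
    image_bytes = f.read()
video = client.image_to_video(
    image_bytes,
    prompt="The waves roll in as the camera pans right",
    model="akhaliq/veo3.1-fast-image-to-video",
)
with open("image2video.mp4", "wb") as f:
    f.write(video)
```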