Spaces:

Bils
/

Generate-Sound-Effects-from-Image

Running on Zero

App Files Files Community

Bils committed on Jan 31

Commit

ccdc62f

verified ·

1 Parent(s): 745586c

Update app.py

Browse files

Files changed (1) hide show

app.py +126 -107

app.py CHANGED Viewed

@@ -1,145 +1,164 @@
-import spaces
-import os
-import tempfile
 import gradio as gr
-from dotenv import load_dotenv
 import torch
-from scipy.io.wavfile import write
 from diffusers import DiffusionPipeline
 from transformers import pipeline
-from pathlib import Path
 load_dotenv()
 hf_token = os.getenv("HF_TKN")
-device_id = 0 if torch.cuda.is_available() else -1
-captioning_pipeline = pipeline(
-    "image-to-text",
-    model="nlpconnect/vit-gpt2-image-captioning",
-    device=device_id
-)
-pipe = DiffusionPipeline.from_pretrained(
-    "cvssp/audioldm2",
-    use_auth_token=hf_token
-)
-@spaces.GPU(duration=120)
-def analyze_image_with_free_model(image_file):
     try:
-        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
-            temp_file.write(image_file)
-            temp_image_path = temp_file.name
-        results = captioning_pipeline(temp_image_path)
-        if not results or not isinstance(results, list):
-            return "Error: Could not generate caption.", True
         caption = results[0].get("generated_text", "").strip()
-        if not caption:
-            return "No caption was generated.", True
-        return caption, False
     except Exception as e:
-        return f"Error analyzing image: {e}", True
 @spaces.GPU(duration=120)
-def get_audioldm_from_caption(caption):
     try:
-        pipe.to("cuda")
-        audio_output = pipe(
             prompt=caption,
             num_inference_steps=50,
-            guidance_scale=7.5
-        )
-        pipe.to("cpu")
-        audio = audio_output.audios[0]
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
-            write(temp_wav.name, 16000, audio)
-            return temp_wav.name
     except Exception as e:
-        print(f"Error generating audio from caption: {e}")
         return None
 css = """
-#col-container{
     margin: 0 auto;
-    max-width: 800px;
-    }
 """
 with gr.Blocks(css=css) as demo:
     with gr.Column(elem_id="col-container"):
         gr.HTML("""
-    <h1 style="text-align: center;">🎶 Generate Sound Effects from Image</h1>
-    <p style="text-align: center;">
-        ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
-    </p>
         """)
-    gr.Markdown("""
-    Welcome to this unique sound effect generator! This tool allows you to upload an image and generate a
-    descriptive caption and a corresponding sound effect, all using free, open-source models on Hugging Face.
-    **💡 How it works:**
-    1. **Upload an image**: Choose an image that you'd like to analyze.
-    2. **Generate Description**: Click on 'Generate Description' to get a textual description of your uploaded image.
-    3. **Generate Sound Effect**: Based on the image description, click on 'Generate Sound Effect' to create a
-       sound effect that matches the image context.
-    Enjoy the journey from visual to auditory sensation with just a few clicks!
-    """)
-    image_upload = gr.File(label="Upload Image", type="binary")
-    generate_description_button = gr.Button("Generate Description")
-    caption_display = gr.Textbox(label="Image Description", interactive=False)
-    generate_sound_button = gr.Button("Generate Sound Effect")
-    audio_output = gr.Audio(label="Generated Sound Effect")
-    gr.Markdown("""
-    ## 👥 How You Can Contribute
-    We welcome contributions and suggestions for improvements. Your feedback is invaluable
-    to the continuous enhancement of this application.
-    For support, questions, or to contribute, please contact us at
-    [contact@bilsimaging.com](mailto:contact@bilsimaging.com).
-    Support our work and get involved by donating through
-    [Ko-fi](https://ko-fi.com/bilsimaging). - Bilel Aroua
-    """)
-    gr.Markdown("""
-    ## 📢 Stay Connected
-    This app is a testament to the creative possibilities that emerge when technology meets art.
-    Enjoy exploring the auditory landscape of your images!
-    """)
-    def update_caption(image_file):
-        description, _ = analyze_image_with_free_model(image_file)
-        return description
-    def generate_sound(description):
-        if not description or description.startswith("Error"):
-            return None
-        audio_path = get_audioldm_from_caption(description)
-        return audio_path
-    generate_description_button.click(
-        fn=update_caption,
-        inputs=image_upload,
-        outputs=caption_display
     )
-    generate_sound_button.click(
-        fn=generate_sound,
-        inputs=caption_display,
-        outputs=audio_output
     )
-    gr.HTML('<a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image"><img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image&countColor=%23263759" /></a>')
-    html = gr.HTML()
-demo.launch(debug=True, share=True)

+import io
+import os
+from pathlib import Path
+from typing import Tuple, Optional
+import spaces
 import gradio as gr
+import numpy as np
 import torch
+from PIL import Image
+from dotenv import load_dotenv
 from diffusers import DiffusionPipeline
 from transformers import pipeline
+from huggingface_hub import login
+# Load environment variables
 load_dotenv()
 hf_token = os.getenv("HF_TKN")
+if hf_token:
+    login(token=hf_token)
+# Device configuration
+device = "cuda" if torch.cuda.is_available() else "cpu"
+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+# Load models
+def load_models():
+    """Build the captioning and audio pipelines once, at import time.
+
+    This loader is intentionally not decorated with ``@spaces.GPU``: it is
+    called at module import and returns the pipeline objects themselves, so
+    only the per-request inference functions should request a GPU.
+    NOTE(review): confirm against the ZeroGPU guidance for this Space.
+
+    Returns:
+        A ``(caption_pipe, audio_pipe)`` tuple: the image-to-text pipeline
+        created on ``device`` and the AudioLDM2 diffusion pipeline loaded
+        with ``torch_dtype``.
+    """
+    caption_pipe = pipeline(
+        "image-to-text",
+        model="nlpconnect/vit-gpt2-image-captioning",
+        device=device
+    )
+    audio_pipe = DiffusionPipeline.from_pretrained(
+        "cvssp/audioldm2",
+        token=hf_token,
+        torch_dtype=torch_dtype
+    )
+    return caption_pipe, audio_pipe
+caption_pipe, audio_pipe = load_models()
+@spaces.GPU(duration=120)
+def analyze_image(image_bytes: bytes) -> Tuple[str, bool]:
+    """Caption an image supplied as raw encoded bytes.
+
+    The GPU decorator is kept on this function because ``caption_pipe`` is
+    created on ``device`` and is invoked here.
+
+    Args:
+        image_bytes: Encoded image data (anything ``PIL.Image.open`` accepts).
+
+    Returns:
+        ``(text, is_error)``. ``text`` is the caption on success, otherwise a
+        human-readable message; ``is_error`` is True when no usable caption
+        was produced. Exceptions are reported through the return value and
+        are never raised to the caller.
+    """
     try:
+        image = Image.open(io.BytesIO(image_bytes))
+        # Normalize palette/alpha/grayscale inputs to RGB before captioning.
+        if image.mode != "RGB":
+            image = image.convert("RGB")
+        results = caption_pipe(image)
+        if not results or not isinstance(results, list):
+            return "Error: Invalid response from caption model", True
         caption = results[0].get("generated_text", "").strip()
+        # An empty caption is reported as an error with a placeholder text.
+        return caption or "No caption generated", not bool(caption)
     except Exception as e:
+        return f"Image processing error: {e}", True
 @spaces.GPU(duration=120)
+def generate_audio(caption: str) -> Optional[Tuple[int, np.ndarray]]:
+    """Generate a short sound effect from a text caption.
+
+    Args:
+        caption: Text prompt for the audio pipeline.
+
+    Returns:
+        ``(16000, samples)`` with ``samples`` squeezed and clipped to
+        ``[-1, 1]``, or ``None`` when the caption is empty or generation
+        fails (the error is printed, not raised).
+    """
+    # Nothing to synthesize from an empty or whitespace-only prompt.
+    if not caption or not caption.strip():
+        return None
+    # A DiffusionPipeline is not an nn.Module and has no ``parameters()``;
+    # read its ``device`` property instead. Capture it before ``try`` so the
+    # ``finally`` clause can always restore it.
+    original_device = audio_pipe.device
     try:
+        audio_pipe.to(device)
+        audio = audio_pipe(
             prompt=caption,
             num_inference_steps=50,
+            guidance_scale=7.5,
+            audio_length_in_s=5.0  # Keep audio generation short
+        ).audios[0]
+        # Post-processing
+        audio = audio.squeeze()  # Handle mono channel
+        audio = np.clip(audio, -1, 1)  # Ensure valid range
+        return (16000, audio)
     except Exception as e:
+        print(f"Audio generation error: {str(e)}")
         return None
+    finally:
+        # Put the pipeline back where it was and release cached GPU memory.
+        audio_pipe.to(original_device)
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+# UI Components
 css = """
+#col-container {
+    max-width: 800px;
     margin: 0 auto;
+}
+.disclaimer {
+    font-size: 0.9em;
+    color: #666;
+}
 """
 with gr.Blocks(css=css) as demo:
     with gr.Column(elem_id="col-container"):
         gr.HTML("""
+            <h1 style="text-align: center;">🎶 Image to Sound Effect Generator</h1>
+            <p style="text-align: center;">
+                ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
+            </p>
         """)
+        with gr.Row():
+            image_input = gr.Image(type="filepath", label="Upload Image")
+            caption_output = gr.Textbox(label="Generated Description", interactive=False)
+        with gr.Row():
+            generate_btn = gr.Button("Generate Description", variant="primary")
+            audio_output = gr.Audio(label="Generated Sound", interactive=False)
+            sound_btn = gr.Button("Generate Sound", variant="secondary")
+        gr.Examples(
+            examples=[str(Path(__file__).parent / "examples" / f) for f in ["storm.jpg", "city.jpg"]],
+            inputs=image_input,
+            outputs=[caption_output, audio_output],
+            fn=lambda x: (analyze_image(Path(x).read_bytes())[0], None),
+            cache_examples=True
+        )
+        gr.Markdown("### 🛠️ Usage Tips")
+        gr.Markdown("""
+            - Use clear, high-contrast images for best results
+            - Complex scenes may require multiple generations
+            - Keep sound generation under 10 seconds for quick results
+        """)
+        gr.Markdown("### ⚠️ Disclaimer", elem_classes="disclaimer")
+        gr.Markdown("""
+            Generated content may not always be accurate. Use at your own discretion.
+            [Privacy Policy](https://bilsimaging.com/privacy) |
+            [Terms of Service](https://bilsimaging.com/terms)
+        """)
+    # Event handling
+    def describe_image(image_path):
+        """Return the caption for the image at ``image_path``.
+
+        Returns an empty string when no image is selected, so clicking the
+        button with an empty input does not raise from ``Path(None)``.
+        """
+        if not image_path:
+            return ""
+        return analyze_image(Path(image_path).read_bytes())[0]
+    generate_btn.click(
+        fn=describe_image,
+        inputs=image_input,
+        outputs=caption_output,
+        api_name="describe"
+    )
+    sound_btn.click(
+        fn=generate_audio,
+        inputs=caption_output,
+        outputs=audio_output,
+        api_name="generate_sound"
     )
+    # Reset the caption and audio outputs whenever the image input changes,
+    # so stale results from a previous image are not shown.
+    image_input.change(
+        fn=lambda: [gr.update(value=""), gr.update(value=None)],
+        outputs=[caption_output, audio_output]
     )
+if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0" if os.getenv("SPACE_ID") else "127.0.0.1")