Spaces:

Bils
/

Generate-Sound-Effects-from-Image

Running on Zero

App Files Community

Bils committed on Jan 31

Commit

a9aa30e

verified ·

1 Parent(s): 041bd28

Update app.py

Browse files

Files changed (1) hide show

app.py +85 -22

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
-import spaces
 import os
 import tempfile
 import gradio as gr
 from dotenv import load_dotenv
@@ -8,54 +8,98 @@ from scipy.io.wavfile import write
 from diffusers import DiffusionPipeline
 from transformers import pipeline
 from pathlib import Path
 load_dotenv()
 hf_token = os.getenv("HF_TKN")
 device_id = 0 if torch.cuda.is_available() else -1
 captioning_pipeline = pipeline(
     "image-to-text",
     model="nlpconnect/vit-gpt2-image-captioning",
     device=device_id
 )
 pipe = DiffusionPipeline.from_pretrained(
     "cvssp/audioldm2",
     use_auth_token=hf_token
 )
 @spaces.GPU(duration=120)
-def analyze_image_with_free_model(image_file):
     try:
         with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
             temp_file.write(image_file)
             temp_image_path = temp_file.name
         results = captioning_pipeline(temp_image_path)
         if not results or not isinstance(results, list):
-            return "Error: Could not generate caption.", True
         caption = results[0].get("generated_text", "").strip()
         if not caption:
-            return "No caption was generated.", True
         return caption, False
     except Exception as e:
         return f"Error analyzing image: {e}", True
 @spaces.GPU(duration=120)
-def get_audioldm_from_caption(caption):
     try:
-        pipe.to("cuda")
         audio_output = pipe(
             prompt=caption,
             num_inference_steps=50,
             guidance_scale=7.5
         )
         pipe.to("cpu")
         audio = audio_output.audios[0]
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
             write(temp_wav.name, 16000, audio)
             return temp_wav.name
@@ -64,6 +108,8 @@ def get_audioldm_from_caption(caption):
         print(f"Error generating audio from caption: {e}")
         return None
 css = """
 #col-container{
     margin: 0 auto;
@@ -74,25 +120,28 @@ css = """
 with gr.Blocks(css=css) as demo:
     with gr.Column(elem_id="col-container"):
         gr.HTML("""
-    <h1 style="text-align: center;">🎶 Generate Sound Effects from Image</h1>
-    <p style="text-align: center;">
-        ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
-    </p>
         """)
     gr.Markdown("""
-    Welcome to this unique sound effect generator! This tool allows you to upload an image and generate a
-    descriptive caption and a corresponding sound effect, all using free, open-source models on Hugging Face.
     **💡 How it works:**
     1. **Upload an image**: Choose an image that you'd like to analyze.
-    2. **Generate Description**: Click on 'Generate Description' to get a textual description of your uploaded image.
-    3. **Generate Sound Effect**: Based on the image description, click on 'Generate Sound Effect' to create a
-       sound effect that matches the image context.
     Enjoy the journey from visual to auditory sensation with just a few clicks!
     """)
     image_upload = gr.File(label="Upload Image", type="binary")
     generate_description_button = gr.Button("Generate Description")
     caption_display = gr.Textbox(label="Image Description", interactive=False)
@@ -113,20 +162,26 @@ with gr.Blocks(css=css) as demo:
     gr.Markdown("""
     ## 📢 Stay Connected
-    This app is a testament to the creative possibilities that emerge when technology meets art.
-    Enjoy exploring the auditory landscape of your images!
     """)
     def update_caption(image_file):
-        description, _ = analyze_image_with_free_model(image_file)
         return description
     def generate_sound(description):
         if not description or description.startswith("Error"):
             return None
         audio_path = get_audioldm_from_caption(description)
         return audio_path
     generate_description_button.click(
         fn=update_caption,
         inputs=image_upload,
@@ -138,8 +193,16 @@ with gr.Blocks(css=css) as demo:
         inputs=caption_display,
         outputs=audio_output
     )
-    gr.HTML('<a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image"><img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image&countColor=%23263759" /></a>')
     html = gr.HTML()
-demo.launch(debug=True, share=True)

 import os
+import io
 import tempfile
 import gradio as gr
 from dotenv import load_dotenv
 from diffusers import DiffusionPipeline
 from transformers import pipeline
 from pathlib import Path
+from PIL import Image
+import spaces
 load_dotenv()
 hf_token = os.getenv("HF_TKN")
+# Determine if we have access to a GPU
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 device_id = 0 if torch.cuda.is_available() else -1
+# Initialize the image captioning pipeline
 captioning_pipeline = pipeline(
     "image-to-text",
     model="nlpconnect/vit-gpt2-image-captioning",
     device=device_id
 )
+# Initialize the text-to-audio pipeline
 pipe = DiffusionPipeline.from_pretrained(
     "cvssp/audioldm2",
     use_auth_token=hf_token
 )
+pipe.to(device)
 @spaces.GPU(duration=120)
+def analyze_image_with_free_model(image_file: bytes):
+    """
+    Analyze the uploaded image using the ViT-GPT2 image captioning pipeline.
+    :param image_file: Binary content of the uploaded image.
+    :return: A tuple (caption, error_flag).
+             caption (str) - The generated caption or error message.
+             error_flag (bool) - Indicates if an error occurred.
+    """
     try:
+        # Validate image input
+        if not image_file:
+            return "Error: No image data received.", True
+        # Check if the file is a valid image
+        try:
+            Image.open(io.BytesIO(image_file)).verify()
+        except Exception:
+            return "Error: Invalid image file. Please upload a valid image.", True
+        # Write the valid image to a temporary file for the pipeline
         with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
             temp_file.write(image_file)
             temp_image_path = temp_file.name
+        # Perform image captioning
         results = captioning_pipeline(temp_image_path)
         if not results or not isinstance(results, list):
+            return "Error: Captioning pipeline returned invalid results.", True
+        # Extract and clean up the generated caption
         caption = results[0].get("generated_text", "").strip()
         if not caption:
+            return "No caption was generated by the model.", True
         return caption, False
     except Exception as e:
         return f"Error analyzing image: {e}", True
 @spaces.GPU(duration=120)
+def get_audioldm_from_caption(caption: str):
+    """
+    Generate an audio file (WAV) from a text caption using the AudioLDM2 pipeline.
+    :param caption: The text prompt used to generate audio.
+    :return: The path to the generated .wav file, or None if an error occurred.
+    """
     try:
+        # Move pipeline to GPU (if available)
+        pipe.to(device)
+        # Generate audio from text prompt
         audio_output = pipe(
             prompt=caption,
             num_inference_steps=50,
             guidance_scale=7.5
         )
+        # Move pipeline back to CPU to free GPU memory
         pipe.to("cpu")
+        # Extract the first audio sample
         audio = audio_output.audios[0]
+        # Write the audio to a temporary WAV file
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
             write(temp_wav.name, 16000, audio)
             return temp_wav.name
         print(f"Error generating audio from caption: {e}")
         return None
+# Custom CSS for styling the Gradio Blocks
 css = """
 #col-container{
     margin: 0 auto;
 with gr.Blocks(css=css) as demo:
     with gr.Column(elem_id="col-container"):
         gr.HTML("""
+        <h1 style="text-align: center;">🎶 Generate Sound Effects from Image</h1>
+        <p style="text-align: center;">
+            ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
+        </p>
         """)
     gr.Markdown("""
+    Welcome to this unique sound effect generator! This tool allows you to upload an image
+    and generate a descriptive caption and a corresponding sound effect, all using free,
+    open-source models on Hugging Face.
     **💡 How it works:**
     1. **Upload an image**: Choose an image that you'd like to analyze.
+    2. **Generate Description**: Click on 'Generate Description' to get a textual
+       description of your uploaded image.
+    3. **Generate Sound Effect**: Based on the image description, click on
+       'Generate Sound Effect' to create a sound effect that matches the image context.
     Enjoy the journey from visual to auditory sensation with just a few clicks!
     """)
+    # Define Gradio interface elements
     image_upload = gr.File(label="Upload Image", type="binary")
     generate_description_button = gr.Button("Generate Description")
     caption_display = gr.Textbox(label="Image Description", interactive=False)
     gr.Markdown("""
     ## 📢 Stay Connected
+    This app is a testament to the creative possibilities that emerge when
+    technology meets art. Enjoy exploring the auditory landscape of your images!
     """)
+    # Define the helper functions for Gradio event handlers
     def update_caption(image_file):
+        description, error_flag = analyze_image_with_free_model(image_file)
+        if error_flag:
+            # In case of error, just return the error message
+            return description
         return description
     def generate_sound(description):
+        # Validate the description before generating audio
         if not description or description.startswith("Error"):
             return None
         audio_path = get_audioldm_from_caption(description)
         return audio_path
+    # Wire the Gradio events to the functions
     generate_description_button.click(
         fn=update_caption,
         inputs=image_upload,
         inputs=caption_display,
         outputs=audio_output
     )
+    gr.HTML(
+        '<a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image">'
+        '<img src="https://api.visitorbadge.io/api/visitors?path='
+        'https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image&countColor=%23263759" '
+        '/></a>'
+    )
+    # An extra placeholder if needed
     html = gr.HTML()
+# Enable debug and optional share. On Spaces, 'share=True' is typically ignored.
+demo.launch(debug=True, share=True)