Spaces:

Bils
/

Generate-Sound-Effects-from-Image

Running on Zero

App Files Community

Bils committed on Jan 31

Commit

63f345f

verified ·

1 Parent(s): a9aa30e

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -85

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import os
-import io
 import tempfile
 import gradio as gr
 from dotenv import load_dotenv
@@ -8,98 +8,54 @@ from scipy.io.wavfile import write
 from diffusers import DiffusionPipeline
 from transformers import pipeline
 from pathlib import Path
-from PIL import Image
-import spaces
 load_dotenv()
 hf_token = os.getenv("HF_TKN")
-# Determine if we have access to a GPU
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 device_id = 0 if torch.cuda.is_available() else -1
-# Initialize the image captioning pipeline
 captioning_pipeline = pipeline(
     "image-to-text",
     model="nlpconnect/vit-gpt2-image-captioning",
     device=device_id
 )
-# Initialize the text-to-audio pipeline
 pipe = DiffusionPipeline.from_pretrained(
     "cvssp/audioldm2",
     use_auth_token=hf_token
 )
-pipe.to(device)
 @spaces.GPU(duration=120)
-def analyze_image_with_free_model(image_file: bytes):
-    """
-    Analyze the uploaded image using the ViT-GPT2 image captioning pipeline.
-    :param image_file: Binary content of the uploaded image.
-    :return: A tuple (caption, error_flag).
-             caption (str) - The generated caption or error message.
-             error_flag (bool) - Indicates if an error occurred.
-    """
     try:
-        # Validate image input
-        if not image_file:
-            return "Error: No image data received.", True
-        # Check if the file is a valid image
-        try:
-            Image.open(io.BytesIO(image_file)).verify()
-        except Exception:
-            return "Error: Invalid image file. Please upload a valid image.", True
-        # Write the valid image to a temporary file for the pipeline
         with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
             temp_file.write(image_file)
             temp_image_path = temp_file.name
-        # Perform image captioning
         results = captioning_pipeline(temp_image_path)
         if not results or not isinstance(results, list):
-            return "Error: Captioning pipeline returned invalid results.", True
-        # Extract and clean up the generated caption
         caption = results[0].get("generated_text", "").strip()
         if not caption:
-            return "No caption was generated by the model.", True
         return caption, False
     except Exception as e:
         return f"Error analyzing image: {e}", True
 @spaces.GPU(duration=120)
-def get_audioldm_from_caption(caption: str):
-    """
-    Generate an audio file (WAV) from a text caption using the AudioLDM2 pipeline.
-    :param caption: The text prompt used to generate audio.
-    :return: The path to the generated .wav file, or None if an error occurred.
-    """
     try:
-        # Move pipeline to GPU (if available)
-        pipe.to(device)
-        # Generate audio from text prompt
         audio_output = pipe(
             prompt=caption,
             num_inference_steps=50,
             guidance_scale=7.5
         )
-        # Move pipeline back to CPU to free GPU memory
         pipe.to("cpu")
-        # Extract the first audio sample
         audio = audio_output.audios[0]
-        # Write the audio to a temporary WAV file
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
             write(temp_wav.name, 16000, audio)
             return temp_wav.name
@@ -108,8 +64,6 @@ def get_audioldm_from_caption(caption: str):
         print(f"Error generating audio from caption: {e}")
         return None
-# Custom CSS for styling the Gradio Blocks
 css = """
 #col-container{
     margin: 0 auto;
@@ -120,28 +74,25 @@ css = """
 with gr.Blocks(css=css) as demo:
     with gr.Column(elem_id="col-container"):
         gr.HTML("""
-        <h1 style="text-align: center;">🎶 Generate Sound Effects from Image</h1>
-        <p style="text-align: center;">
-            ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
-        </p>
         """)
     gr.Markdown("""
-    Welcome to this unique sound effect generator! This tool allows you to upload an image
-    and generate a descriptive caption and a corresponding sound effect, all using free,
-    open-source models on Hugging Face.
     **💡 How it works:**
     1. **Upload an image**: Choose an image that you'd like to analyze.
-    2. **Generate Description**: Click on 'Generate Description' to get a textual
-       description of your uploaded image.
-    3. **Generate Sound Effect**: Based on the image description, click on
-       'Generate Sound Effect' to create a sound effect that matches the image context.
     Enjoy the journey from visual to auditory sensation with just a few clicks!
     """)
-    # Define Gradio interface elements
     image_upload = gr.File(label="Upload Image", type="binary")
     generate_description_button = gr.Button("Generate Description")
     caption_display = gr.Textbox(label="Image Description", interactive=False)
@@ -162,26 +113,20 @@ with gr.Blocks(css=css) as demo:
     gr.Markdown("""
     ## 📢 Stay Connected
-    This app is a testament to the creative possibilities that emerge when
-    technology meets art. Enjoy exploring the auditory landscape of your images!
     """)
-    # Define the helper functions for Gradio event handlers
     def update_caption(image_file):
-        description, error_flag = analyze_image_with_free_model(image_file)
-        if error_flag:
-            # In case of error, just return the error message
-            return description
         return description
     def generate_sound(description):
-        # Validate the description before generating audio
         if not description or description.startswith("Error"):
             return None
         audio_path = get_audioldm_from_caption(description)
         return audio_path
-    # Wire the Gradio events to the functions
     generate_description_button.click(
         fn=update_caption,
         inputs=image_upload,
@@ -193,16 +138,8 @@ with gr.Blocks(css=css) as demo:
         inputs=caption_display,
         outputs=audio_output
     )
-    gr.HTML(
-        '<a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image">'
-        '<img src="https://api.visitorbadge.io/api/visitors?path='
-        'https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image&countColor=%23263759" '
-        '/></a>'
-    )
-    # An extra placeholder if needed
     html = gr.HTML()
-# Enable debug and optional share. On Spaces, 'share=True' is typically ignored.
-demo.launch(debug=True, share=True)

+import spaces
 import os
 import tempfile
 import gradio as gr
 from dotenv import load_dotenv
 from diffusers import DiffusionPipeline
 from transformers import pipeline
 from pathlib import Path
 load_dotenv()
 hf_token = os.getenv("HF_TKN")
 device_id = 0 if torch.cuda.is_available() else -1
 captioning_pipeline = pipeline(
     "image-to-text",
     model="nlpconnect/vit-gpt2-image-captioning",
     device=device_id
 )
 pipe = DiffusionPipeline.from_pretrained(
     "cvssp/audioldm2",
     use_auth_token=hf_token
 )
 @spaces.GPU(duration=120)
+def analyze_image_with_free_model(image_file):
     try:
         with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
             temp_file.write(image_file)
             temp_image_path = temp_file.name
         results = captioning_pipeline(temp_image_path)
         if not results or not isinstance(results, list):
+            return "Error: Could not generate caption.", True
         caption = results[0].get("generated_text", "").strip()
         if not caption:
+            return "No caption was generated.", True
         return caption, False
     except Exception as e:
         return f"Error analyzing image: {e}", True
 @spaces.GPU(duration=120)
+def get_audioldm_from_caption(caption):
     try:
+        pipe.to("cuda")
         audio_output = pipe(
             prompt=caption,
             num_inference_steps=50,
             guidance_scale=7.5
         )
         pipe.to("cpu")
         audio = audio_output.audios[0]
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
             write(temp_wav.name, 16000, audio)
             return temp_wav.name
         print(f"Error generating audio from caption: {e}")
         return None
 css = """
 #col-container{
     margin: 0 auto;
 with gr.Blocks(css=css) as demo:
     with gr.Column(elem_id="col-container"):
         gr.HTML("""
+    <h1 style="text-align: center;">🎶 Generate Sound Effects from Image</h1>
+    <p style="text-align: center;">
+        ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
+    </p>
         """)
     gr.Markdown("""
+    Welcome to this unique sound effect generator! This tool allows you to upload an image and generate a
+    descriptive caption and a corresponding sound effect, all using free, open-source models on Hugging Face.
     **💡 How it works:**
     1. **Upload an image**: Choose an image that you'd like to analyze.
+    2. **Generate Description**: Click on 'Generate Description' to get a textual description of your uploaded image.
+    3. **Generate Sound Effect**: Based on the image description, click on 'Generate Sound Effect' to create a
+       sound effect that matches the image context.
     Enjoy the journey from visual to auditory sensation with just a few clicks!
     """)
     image_upload = gr.File(label="Upload Image", type="binary")
     generate_description_button = gr.Button("Generate Description")
     caption_display = gr.Textbox(label="Image Description", interactive=False)
     gr.Markdown("""
     ## 📢 Stay Connected
+    This app is a testament to the creative possibilities that emerge when technology meets art.
+    Enjoy exploring the auditory landscape of your images!
     """)
     def update_caption(image_file):
+        description, _ = analyze_image_with_free_model(image_file)
         return description
     def generate_sound(description):
         if not description or description.startswith("Error"):
             return None
         audio_path = get_audioldm_from_caption(description)
         return audio_path
     generate_description_button.click(
         fn=update_caption,
         inputs=image_upload,
         inputs=caption_display,
         outputs=audio_output
     )
+    gr.HTML('<a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image"><img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image&countColor=%23263759" /></a>')
     html = gr.HTML()
+demo.launch(debug=True, share=True)