Spaces:

Bils
/

Generate-Sound-Effects-from-Image

Running on Zero

App Files Community

Bils committed on Aug 6

Commit

1ea1538

verified ·

1 Parent(s): b67339a

Update app.py

Browse files

Files changed (1) hide show

app.py +59 -53

app.py CHANGED Viewed

@@ -8,78 +8,94 @@ from scipy.io.wavfile import write
 from diffusers import DiffusionPipeline
 from transformers import pipeline
 from pathlib import Path
-from PIL import Image  # <-- ADDED THIS IMPORT
-import io               # <-- ADDED THIS IMPORT
 load_dotenv()
 hf_token = os.getenv("HF_TKN")
-device_id = 0 if torch.cuda.is_available() else -1
 # Correctly initialize the modern, reliable captioning pipeline
 captioning_pipeline = pipeline(
     "image-to-text",
     model="Salesforce/blip-image-captioning-large",
-    device=device_id
 )
-# Initialize the audio pipeline
 pipe = DiffusionPipeline.from_pretrained(
     "cvssp/audioldm2",
-    use_auth_token=hf_token
 )
-# === THIS IS THE CORRECTED FUNCTION ===
 @spaces.GPU(duration=120)
 def analyze_image_with_free_model(image_file_bytes):
     try:
-        # No more temp files!
         # Open the image data directly from memory using Pillow
-        image = Image.open(io.BytesIO(image_file_bytes))
-        # Pass the Pillow Image object directly to the pipeline. This is the robust method.
         results = captioning_pipeline(image)
         if not results or not isinstance(results, list):
             return "Error: Could not generate caption.", True
         caption = results[0].get("generated_text", "").strip()
         if not caption:
             return "No caption was generated.", True
         return caption, False
     except Exception as e:
-        print(f"ERROR in analyze_image_with_free_model: {e}") # Print error to logs
         return f"Error analyzing image: {e}", True
 @spaces.GPU(duration=120)
 def get_audioldm_from_caption(caption):
     try:
-        pipe.to("cuda")
         audio_output = pipe(
             prompt=caption,
-            num_inference_steps=50,
-            guidance_scale=7.5
-        )
-        pipe.to("cpu")
-        audio = audio_output.audios[0]
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
-            write(temp_wav.name, 16000, audio)
             return temp_wav.name
     except Exception as e:
-        print(f"Error generating audio from caption: {e}")
         return None
-# --- Gradio UI (No changes needed here) ---
 css = """
-#col-container{
-    margin: 0 auto;
-    max-width: 800px;
-    }
 """
 with gr.Blocks(css=css) as demo:
@@ -92,52 +108,42 @@ with gr.Blocks(css=css) as demo:
         """)
     gr.Markdown("""
-    Welcome to this unique sound effect generator! This tool allows you to upload an image and generate a
-    descriptive caption and a corresponding sound effect, all using free, open-source models on Hugging Face.
-    **💡 How it works:**
-    1. **Upload an image**: Choose an image that you'd like to analyze.
-    2. **Generate Description**: Click on 'Generate Description' to get a textual description of your uploaded image.
-    3. **Generate Sound Effect**: Based on the image description, click on 'Generate Sound Effect' to create a
-       sound effect that matches the image context.
-    Enjoy the journey from visual to auditory sensation with just a few clicks!
     """)
     image_upload = gr.File(label="Upload Image", type="binary")
-    generate_description_button = gr.Button("Generate Description")
     caption_display = gr.Textbox(label="Image Description", interactive=False)
     generate_sound_button = gr.Button("Generate Sound Effect")
     audio_output = gr.Audio(label="Generated Sound Effect")
     gr.Markdown("""
-    ## 👥 How You Can Contribute
-    We welcome contributions and suggestions for improvements. Your feedback is invaluable
-    to the continuous enhancement of this application.
     For support, questions, or to contribute, please contact us at
     [contact@bilsimaging.com](mailto:contact@bilsimaging.com).
     Support our work and get involved by donating through
     [Ko-fi](https://ko-fi.com/bilsimaging). - Bilel Aroua
     """)
-    gr.Markdown("""
-    ## 📢 Stay Connected
-    This app is a testament to the creative possibilities that emerge when technology meets art.
-    Enjoy exploring the auditory landscape of your images!
-    """)
-    # --- Gradio event handlers (I've updated the function called here) ---
-    def update_caption(image_file_bytes):
-        # We pass the bytes from the uploader directly to our corrected function
-        description, _ = analyze_image_with_free_model(image_file_bytes)
         return description
     def generate_sound(description):
         if not description or description.startswith("Error"):
             return None
         audio_path = get_audioldm_from_caption(description)
         return audio_path
     generate_description_button.click(
@@ -153,6 +159,6 @@ with gr.Blocks(css=css) as demo:
     )
     gr.HTML('<a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image"><img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image&countColor=%23263759" /></a>')
-    html = gr.HTML()
-demo.launch(debug=True, share=True)

 from diffusers import DiffusionPipeline
 from transformers import pipeline
 from pathlib import Path
+from PIL import Image  # <-- Required for new model
+import io               # <-- Required for new model
+# --- Setup Models and Device ---
 load_dotenv()
 hf_token = os.getenv("HF_TKN")
+# Use GPU if available, otherwise CPU
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using device: {device}")
 # Correctly initialize the modern, reliable captioning pipeline
 captioning_pipeline = pipeline(
     "image-to-text",
     model="Salesforce/blip-image-captioning-large",
+    device=device
 )
+print("Image captioning pipeline loaded.")
+# Initialize the audio pipeline. Use float16 for less VRAM on GPU.
 pipe = DiffusionPipeline.from_pretrained(
     "cvssp/audioldm2",
+    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
 )
+print("Audio generation pipeline loaded.")
+# --- Core Functions ---
 @spaces.GPU(duration=120)
 def analyze_image_with_free_model(image_file_bytes):
+    """Takes image bytes and returns a caption."""
     try:
+        print("Received image bytes, opening with Pillow...")
         # Open the image data directly from memory using Pillow
+        image = Image.open(io.BytesIO(image_file_bytes)).convert("RGB")
+        print("Generating caption...")
         results = captioning_pipeline(image)
         if not results or not isinstance(results, list):
+            print("ERROR: Caption generation returned invalid results.")
             return "Error: Could not generate caption.", True
         caption = results[0].get("generated_text", "").strip()
         if not caption:
+            print("ERROR: Generated caption is empty.")
             return "No caption was generated.", True
+        print(f"Successfully generated caption: {caption}")
         return caption, False
     except Exception as e:
+        print(f"!!!!!! EXCEPTION in analyze_image_with_free_model: {e}")
         return f"Error analyzing image: {e}", True
 @spaces.GPU(duration=120)
 def get_audioldm_from_caption(caption):
+    """Takes a text caption and returns a filepath to a generated WAV file."""
     try:
+        # Move the large audio pipeline to the GPU only when it's being used
+        pipe.to(device)
+        print(f"Generating audio for prompt: '{caption}'")
         audio_output = pipe(
             prompt=caption,
+            num_inference_steps=25, # Fewer steps for faster generation
+            guidance_scale=7.0
+        ).audios[0]
+        # Move the pipeline back to CPU to free up GPU memory for others
+        pipe.to("cpu")
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
+            print(f"Saving audio to temporary file: {temp_wav.name}")
+            # write(file, sample_rate, data)
+            write(temp_wav.name, 16000, audio_output)
             return temp_wav.name
     except Exception as e:
+        print(f"!!!!!! EXCEPTION in get_audioldm_from_caption: {e}")
         return None
+# --- Gradio Interface ---
 css = """
+#col-container{ margin: 0 auto; max-width: 800px; }
 """
 with gr.Blocks(css=css) as demo:
         """)
     gr.Markdown("""
+    1. **Upload an image**.
+    2. Click **Generate Description**.
+    3. Click **Generate Sound Effect**.
     """)
     image_upload = gr.File(label="Upload Image", type="binary")
+    generate_description_button = gr.Button("Generate Description", variant="primary")
     caption_display = gr.Textbox(label="Image Description", interactive=False)
     generate_sound_button = gr.Button("Generate Sound Effect")
     audio_output = gr.Audio(label="Generated Sound Effect")
     gr.Markdown("""
+    ## 👥 Contribute & Support
     For support, questions, or to contribute, please contact us at
     [contact@bilsimaging.com](mailto:contact@bilsimaging.com).
     Support our work and get involved by donating through
     [Ko-fi](https://ko-fi.com/bilsimaging). - Bilel Aroua
     """)
+    # --- Event Handlers ---
+    def update_caption(image_bytes):
+        """Wrapper function for the button click."""
+        if image_bytes is None:
+            return "Please upload an image first."
+        description, _ = analyze_image_with_free_model(image_bytes)
         return description
     def generate_sound(description):
+        """Wrapper function for the button click."""
         if not description or description.startswith("Error"):
+            gr.Warning("Cannot generate sound without a valid description!")
             return None
         audio_path = get_audioldm_from_caption(description)
+        if audio_path is None:
+            gr.Error("Failed to generate audio. Please check the logs.")
         return audio_path
     generate_description_button.click(
     )
     gr.HTML('<a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image"><img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image&countColor=%23263759" /></a>')
+# Launch the app. `share=True` is not needed on Spaces.
+demo.launch()