import spaces
import io
import os
import tempfile

import gradio as gr
import torch
from diffusers import DiffusionPipeline
from dotenv import load_dotenv
from PIL import Image
from scipy.io.wavfile import write
from transformers import pipeline
# --- Setup Models and Device ---
load_dotenv()
hf_token = os.getenv("HF_TKN")  # read from .env; not used directly by the pipelines below

# Use GPU if available, otherwise CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
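# Note (assumption): on a ZeroGPU Space, a CUDA device is typically only attached
# while a @spaces.GPU-decorated function is running, so the heavy inference below
# is wrapped with that decorator rather than relying on this module-level check alone.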
# Initialize the BLIP image-captioning pipeline
captioning_pipeline = pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-large",
    device=device,
)
print("Image captioning pipeline loaded.")
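# The image-to-text pipeline returns a list of dicts, e.g.
# [{"generated_text": "a dog running on the beach"}]; the helper below
# pulls the caption out of the first entry.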
# Initialize the audio pipeline. Use float16 for less VRAM on GPU.
pipe = DiffusionPipeline.from_pretrained(
    "cvssp/audioldm2",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
)
print("Audio generation pipeline loaded.")
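# The AudioLDM2 pipeline is left on the CPU at startup and only moved to the
# GPU inside get_audioldm_from_caption(), so device memory is held just for
# the duration of a generation call.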
# --- Core Functions ---
@spaces.GPU  # request a ZeroGPU device while captioning runs (the pipeline may live on CUDA)
def analyze_image_with_free_model(image_file_bytes):
    """Takes image bytes and returns a (caption, error_flag) tuple."""
    try:
        print("Received image bytes, opening with Pillow...")
        # Open the image data directly from memory using Pillow
        image = Image.open(io.BytesIO(image_file_bytes)).convert("RGB")
        print("Generating caption...")
        results = captioning_pipeline(image)
        if not results or not isinstance(results, list):
            print("ERROR: Caption generation returned invalid results.")
            return "Error: Could not generate caption.", True
        caption = results[0].get("generated_text", "").strip()
        if not caption:
            print("ERROR: Generated caption is empty.")
            return "No caption was generated.", True
        print(f"Successfully generated caption: {caption}")
        return caption, False
    except Exception as e:
        print(f"!!!!!! EXCEPTION in analyze_image_with_free_model: {e}")
        return f"Error analyzing image: {e}", True
@spaces.GPU  # run audio generation inside a ZeroGPU-allocated context
def get_audioldm_from_caption(caption):
    """Takes a text caption and returns a filepath to a generated WAV file."""
    try:
        # Move the large audio pipeline to the GPU only when it's being used
        pipe.to(device)
        print(f"Generating audio for prompt: '{caption}'")
        audio_output = pipe(
            prompt=caption,
            num_inference_steps=25,  # fewer steps for faster generation
            guidance_scale=7.0,
        ).audios[0]
        # Move the pipeline back to CPU to free up GPU memory for others
        pipe.to("cpu")
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
            print(f"Saving audio to temporary file: {temp_wav.name}")
            # AudioLDM2 generates 16 kHz audio; scipy's write takes (filename, rate, data)
            write(temp_wav.name, 16000, audio_output)
        return temp_wav.name
    except Exception as e:
        print(f"!!!!!! EXCEPTION in get_audioldm_from_caption: {e}")
        return None
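# Design note: shuttling the pipeline between CPU and GPU on every call trades a
# little per-request latency for not pinning VRAM between requests, which suits
# a shared/ZeroGPU Space.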
# --- Gradio Interface ---
css = """
#col-container { margin: 0 auto; max-width: 800px; }
"""
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML("""
            <h1 style="text-align: center;">🎶 Generate Sound Effects from Image</h1>
            <p style="text-align: center;">
                ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
            </p>
        """)
        gr.Markdown("""
            1. **Upload an image**.
            2. Click **Generate Description**.
            3. Click **Generate Sound Effect**.
        """)
        image_upload = gr.File(label="Upload Image", type="binary")
        generate_description_button = gr.Button("Generate Description", variant="primary")
        caption_display = gr.Textbox(label="Image Description", interactive=False)
        generate_sound_button = gr.Button("Generate Sound Effect")
        audio_output = gr.Audio(label="Generated Sound Effect")
        gr.Markdown("""
            ## 👥 Contribute & Support
            For support, questions, or to contribute, please contact us at
            [contact@bilsimaging.com](mailto:contact@bilsimaging.com).
            Support our work and get involved by donating through
            [Ko-fi](https://ko-fi.com/bilsimaging). - Bilel Aroua
        """)
    # --- Event Handlers ---
    def update_caption(image_bytes):
        """Wrapper function for the description button click."""
        if image_bytes is None:
            return "Please upload an image first."
        description, _ = analyze_image_with_free_model(image_bytes)
        return description

    def generate_sound(description):
        """Wrapper function for the sound button click."""
        if not description or description.startswith("Error"):
            gr.Warning("Cannot generate sound without a valid description!")
            return None
        audio_path = get_audioldm_from_caption(description)
        if audio_path is None:
            # gr.Error must be raised (not just constructed) to surface in the UI
            raise gr.Error("Failed to generate audio. Please check the logs.")
        return audio_path
    generate_description_button.click(
        fn=update_caption,
        inputs=image_upload,
        outputs=caption_display,
    )
    generate_sound_button.click(
        fn=generate_sound,
        inputs=caption_display,
        outputs=audio_output,
    )
    gr.HTML('<a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image"><img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image&countColor=%23263759" /></a>')

# Launch the app. `share=True` is not needed on Spaces.
demo.launch()