Spaces:
Running
on
Zero
Running
on
Zero
| import os | |
| import io | |
| import tempfile | |
| import gradio as gr | |
| from dotenv import load_dotenv | |
| import torch | |
| from scipy.io.wavfile import write | |
| from diffusers import DiffusionPipeline | |
| from transformers import pipeline | |
| from pathlib import Path | |
| from PIL import Image | |
| import spaces | |
# Load variables from a local .env file (if present) before reading the
# Hugging Face token from the environment.
load_dotenv()
hf_token = os.getenv("HF_TKN")

# Select the compute target: CUDA when torch reports it, CPU otherwise.
# Two representations are kept: a torch.device for the diffusion pipeline's
# .to(), and an integer index (0 for CUDA, -1 for CPU) handed to the
# transformers pipeline below.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device_id = -1 if device.type == "cpu" else 0

# Image captioning: ViT-GPT2 model wrapped in an "image-to-text" pipeline.
captioning_pipeline = pipeline(
    task="image-to-text",
    model="nlpconnect/vit-gpt2-image-captioning",
    device=device_id,
)

# Text-to-audio: AudioLDM2 diffusion pipeline, authenticated with the token
# read above, then placed on the selected device.
pipe = DiffusionPipeline.from_pretrained(
    "cvssp/audioldm2",
    use_auth_token=hf_token,
)
pipe.to(device)
def analyze_image_with_free_model(image_file: bytes):
    """
    Analyze the uploaded image using the ViT-GPT2 image captioning pipeline.

    The bytes are first checked with PIL, then written to a temporary file
    whose path is handed to the module-level ``captioning_pipeline``. The
    temporary file is removed before returning, on success and on failure.

    :param image_file: Binary content of the uploaded image.
    :return: A tuple (caption, error_flag).
        caption (str) - The generated caption or error message.
        error_flag (bool) - Indicates if an error occurred.
    """
    temp_image_path = None
    try:
        # Validate image input
        if not image_file:
            return "Error: No image data received.", True

        # Check if the file is a valid image
        try:
            Image.open(io.BytesIO(image_file)).verify()
        except Exception:
            return "Error: Invalid image file. Please upload a valid image.", True

        # Write the valid image to a temporary file for the pipeline.
        # delete=False keeps the file on disk after the handle is closed so
        # the pipeline can open it by path; it is removed in ``finally``.
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
            # Record the path before writing so a failed write is cleaned up too.
            temp_image_path = temp_file.name
            temp_file.write(image_file)

        # Perform image captioning
        results = captioning_pipeline(temp_image_path)
        if not results or not isinstance(results, list):
            return "Error: Captioning pipeline returned invalid results.", True

        # Extract and clean up the generated caption
        caption = results[0].get("generated_text", "").strip()
        if not caption:
            return "No caption was generated by the model.", True
        return caption, False
    except Exception as e:
        # Errors are reported through the return value rather than raised,
        # so the caller can display the message.
        return f"Error analyzing image: {e}", True
    finally:
        # Best-effort cleanup: a leftover temp file must not mask the result.
        if temp_image_path is not None:
            try:
                os.remove(temp_image_path)
            except OSError:
                pass
def get_audioldm_from_caption(caption: str):
    """
    Generate an audio file (WAV) from a text caption using the AudioLDM2 pipeline.

    The module-level ``pipe`` is moved to ``device`` for the inference call
    and moved back to the CPU afterwards, whether or not inference succeeded.

    :param caption: The text prompt used to generate audio.
    :return: The path to the generated .wav file, or None if the caption is
        empty/blank or an error occurred. The file is created with
        ``delete=False`` and is not removed by this function.
    """
    # An empty or whitespace-only prompt carries no description to render;
    # report it through the same None contract as any other failure.
    if not caption or (isinstance(caption, str) and not caption.strip()):
        print("Error generating audio from caption: empty caption")
        return None

    try:
        try:
            # Move pipeline to GPU (if available)
            pipe.to(device)
            # Generate audio from text prompt
            audio_output = pipe(
                prompt=caption,
                num_inference_steps=50,
                guidance_scale=7.5,
            )
        finally:
            # Move pipeline back to CPU to free GPU memory, including when
            # inference raised.
            pipe.to("cpu")

        # Extract the first audio sample
        audio = audio_output.audios[0]

        # Reserve a temporary .wav path, then close the handle before
        # writing so the file is not held open twice.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
            wav_path = temp_wav.name
        try:
            # 16000 Hz is the sample rate this app assumes for AudioLDM2
            # output — NOTE(review): confirm against the pipeline's vocoder config.
            write(wav_path, 16000, audio)
        except Exception:
            # Do not leave an empty or partial file behind on a failed write.
            try:
                os.remove(wav_path)
            except OSError:
                pass
            raise
        return wav_path
    except Exception as e:
        print(f"Error generating audio from caption: {e}")
        return None
# Custom CSS for styling the Gradio Blocks: centers the main column and caps
# its width.
css = """
#col-container{
    margin: 0 auto;
    max-width: 800px;
}
"""

# NOTE(review): the nesting below was reconstructed from a copy of this file
# that had lost its indentation. Confirm which components belong inside the
# "col-container" column versus directly under the Blocks root.
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML("""
            <h1 style="text-align: center;">🎶 Generate Sound Effects from Image</h1>
            <p style="text-align: center;">
            ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
            </p>
        """)
        gr.Markdown("""
            Welcome to this unique sound effect generator! This tool allows you to upload an image
            and generate a descriptive caption and a corresponding sound effect, all using free,
            open-source models on Hugging Face.
            **💡 How it works:**
            1. **Upload an image**: Choose an image that you'd like to analyze.
            2. **Generate Description**: Click on 'Generate Description' to get a textual
            description of your uploaded image.
            3. **Generate Sound Effect**: Based on the image description, click on
            'Generate Sound Effect' to create a sound effect that matches the image context.
            Enjoy the journey from visual to auditory sensation with just a few clicks!
        """)

        # Define Gradio interface elements
        image_upload = gr.File(label="Upload Image", type="binary")
        generate_description_button = gr.Button("Generate Description")
        caption_display = gr.Textbox(label="Image Description", interactive=False)
        generate_sound_button = gr.Button("Generate Sound Effect")
        audio_output = gr.Audio(label="Generated Sound Effect")

        gr.Markdown("""
            ## 👥 How You Can Contribute
            We welcome contributions and suggestions for improvements. Your feedback is invaluable
            to the continuous enhancement of this application.
            For support, questions, or to contribute, please contact us at
            [contact@bilsimaging.com](mailto:contact@bilsimaging.com).
            Support our work and get involved by donating through
            [Ko-fi](https://ko-fi.com/bilsimaging). - Bilel Aroua
        """)
        gr.Markdown("""
            ## 📢 Stay Connected
            This app is a testament to the creative possibilities that emerge when
            technology meets art. Enjoy exploring the auditory landscape of your images!
        """)

    # Define the helper functions for Gradio event handlers
    def update_caption(image_file):
        """Return the caption for the uploaded image, or the error message.

        Both outcomes are shown in the description textbox, so the error
        flag from the analyzer is not needed here.
        """
        description, _error_flag = analyze_image_with_free_model(image_file)
        return description

    def generate_sound(description):
        """Return the path of a generated WAV for ``description``, or None.

        None is returned without calling the audio model when the textbox is
        empty or holds one of the analyzer's failure messages.
        """
        # The textbox may contain a failure message instead of a caption.
        # Not every failure message starts with "Error": the analyzer also
        # reports "No caption was generated by the model.".
        failure_prefixes = ("Error", "No caption was generated")
        if not description or description.startswith(failure_prefixes):
            return None
        return get_audioldm_from_caption(description)

    # Wire the Gradio events to the functions
    generate_description_button.click(
        fn=update_caption,
        inputs=image_upload,
        outputs=caption_display,
    )
    generate_sound_button.click(
        fn=generate_sound,
        inputs=caption_display,
        outputs=audio_output,
    )

    # Visitor counter badge linking to the Space's visitorbadge.io status page.
    gr.HTML(
        '<a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image">'
        '<img src="https://api.visitorbadge.io/api/visitors?path='
        'https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image&countColor=%23263759" '
        '/></a>'
    )

    # An extra placeholder if needed
    html = gr.HTML()
# Start the server only when this file is executed as a script, so importing
# the module (e.g. to reuse `demo` or the helper functions) does not launch it.
if __name__ == "__main__":
    # Enable debug and optional share. On Spaces, 'share=True' is typically ignored.
    demo.launch(debug=True, share=True)