Spaces: Running on Zero
import gradio as gr
import os
import tempfile
import torch
import numpy as np
from scipy.io.wavfile import write
from dotenv import load_dotenv
from diffusers import DiffusionPipeline
from transformers import pipeline
from PIL import Image
from pydub import AudioSegment
from typing import List
import spaces
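
# NOTE: this Space runs on ZeroGPU ("Running on Zero"), where GPU-bound
# functions are wrapped with the @spaces.GPU decorator so a GPU is attached
# for the duration of the call (see analyze_image and generate_audio below).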
# Load environment variables (expects an HF_TKN secret on the Space)
load_dotenv()
HF_TOKEN = os.getenv("HF_TKN")

# Device configuration
device = "cuda" if torch.cuda.is_available() else "cpu"

# Fixed number of prompt boxes / individual track players in the UI
MAX_PROMPTS = 5
# Initialize models
def load_caption_model():
    return pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-base",
        device=device
    )

def load_audio_model():
    pipe = DiffusionPipeline.from_pretrained(
        "cvssp/audioldm2",
        token=HF_TOKEN  # `use_auth_token` is deprecated in recent diffusers
    )
    return pipe

caption_pipe = load_caption_model()
audio_pipe = load_audio_model().to(device)
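
# Both pipelines are created once at startup and shared across requests;
# the AudioLDM2 weights are pulled from the Hub on the first launch.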
@spaces.GPU
def analyze_image(image_file):
    """Generate a caption from an image, with validation."""
    try:
        # gr.Image(type="filepath") hands us a path string, not raw bytes
        try:
            image = Image.open(image_file)
            image.verify()
            # verify() leaves the file handle unusable, so reopen for inference
            image = Image.open(image_file)
        except Exception as e:
            raise ValueError(f"Invalid image file: {str(e)}")

        results = caption_pipe(image)
        if not results or not isinstance(results, list):
            raise RuntimeError("No caption generated")

        caption = results[0].get("generated_text", "").strip()
        if not caption:
            raise RuntimeError("Empty caption generated")
        return caption
    except Exception as e:
        raise gr.Error(f"Image processing error: {str(e)}")
@spaces.GPU
def generate_audio(prompt: str, num_steps=100, guidance_scale=7.5):
    """Generate a 10-second audio clip from a single prompt."""
    try:
        if not prompt or len(prompt) < 10:
            raise ValueError("Prompt must be at least 10 characters")

        with torch.inference_mode():
            audio = audio_pipe(
                prompt=prompt,
                num_inference_steps=int(num_steps),
                guidance_scale=guidance_scale,
                audio_length_in_s=10
            ).audios[0]

        # AudioLDM2 returns float32 samples in [-1, 1]; convert to 16-bit PCM
        # so pydub's wav reader can open the file later
        pcm_audio = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpfile:
            write(tmpfile.name, 16000, pcm_audio)
        return tmpfile.name
    except Exception as e:
        raise gr.Error(f"Audio generation error: {str(e)}")
def blend_audios(audio_files: List[str]) -> str:
    """Overlay multiple audio files into a single mix."""
    try:
        if not audio_files:
            raise ValueError("No audio files to blend")

        # pydub's overlay() keeps the length of the segment it is called on,
        # so use the longest track as the base to avoid truncating anything
        segments = [AudioSegment.from_wav(f) for f in audio_files]
        segments.sort(key=len, reverse=True)
        mixed = segments[0]
        for track in segments[1:]:
            mixed = mixed.overlay(track)

        # Export the mix to a temporary wav file
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpfile:
            mixed.export(tmpfile.name, format="wav")
        return tmpfile.name
    except Exception as e:
        raise gr.Error(f"Audio mixing error: {str(e)}")
def process_inputs(input_choice, image_file, num_steps, guidance_scale, *prompts):
    """Handle both image and text input modes."""
    try:
        # Drop empty prompt boxes
        valid_prompts = [p.strip() for p in prompts if p and p.strip()]

        if input_choice == "Image":
            if not image_file:
                raise gr.Error("Please upload an image")
            main_prompt = analyze_image(image_file)
            valid_prompts = [main_prompt] + valid_prompts
        elif not valid_prompts:
            raise gr.Error("Please enter at least one text prompt")

        # Generate one track per prompt, then blend them all
        audio_files = [generate_audio(p, num_steps, guidance_scale) for p in valid_prompts]
        final_audio = blend_audios(audio_files)

        # Fill the fixed set of track players and hide the unused ones
        track_updates = [
            gr.update(value=audio_files[i], visible=True)
            if i < len(audio_files)
            else gr.update(value=None, visible=False)
            for i in range(MAX_PROMPTS)
        ]
        return [valid_prompts, final_audio] + track_updates
    except gr.Error:
        raise
    except Exception as e:
        raise gr.Error(str(e))
# Gradio interface
css = """
#main-container { max-width: 800px; margin: 0 auto; }
.dark { background: #1a1a1a; }
.prompt-box { margin-bottom: 10px; }
.audio-track { margin: 5px 0; }
"""
with gr.Blocks(css=css, theme=gr.themes.Default(primary_hue="emerald")) as app:
    with gr.Column(elem_id="main-container"):
        gr.Markdown("""
        # 🎨 Image to Sound Generator
        Transform visual content or text prompts into mixed sound effects!
        """)

        # Input mode selector
        input_choice = gr.Radio(
            choices=["Image", "Text"],
            value="Image",
            label="Input Mode",
            interactive=True
        )

        # Image input section
        with gr.Row(visible=True) as image_row:
            image_input = gr.Image(type="filepath", label="Upload Image")

        # Text input section: create MAX_PROMPTS boxes up front, show the
        # first three, and reveal the rest on demand
        with gr.Column(visible=False) as text_inputs_col:
            prompt_components = [
                gr.Textbox(label=f"Sound Effect {i+1}", lines=2, visible=(i < 3))
                for i in range(MAX_PROMPTS)
            ]
            add_prompt_btn = gr.Button("Add Another Prompt", variant="secondary")
        # Dynamic prompt management: Gradio cannot create new components after
        # launch, so "adding" a prompt reveals the next hidden textbox instead
        current_prompts = gr.State(value=3)

        def add_prompt(current_count):
            new_count = min(current_count + 1, MAX_PROMPTS)
            updates = [gr.update(visible=(i < new_count)) for i in range(MAX_PROMPTS)]
            return [new_count] + updates

        add_prompt_btn.click(
            fn=add_prompt,
            inputs=current_prompts,
            outputs=[current_prompts] + prompt_components
        )
        # Toggle between image/text inputs
        def toggle_inputs(choice):
            if choice == "Image":
                return [gr.update(visible=True), gr.update(visible=False)]
            return [gr.update(visible=False), gr.update(visible=True)]

        input_choice.change(
            fn=toggle_inputs,
            inputs=input_choice,
            outputs=[image_row, text_inputs_col]
        )
        # Generation controls
        with gr.Accordion("Advanced Settings", open=False):
            steps_slider = gr.Slider(10, 200, 100, label="Generation Steps")
            guidance_slider = gr.Slider(1.0, 15.0, 7.5, label="Guidance Scale")

        generate_btn = gr.Button("Generate Mixed Sound", variant="primary")
        # Outputs
        with gr.Column():
            gr.Markdown("### Generation Results")
            prompt_display = gr.JSON(label="Used Prompts")
            final_audio = gr.Audio(label="Blended Sound Effect", interactive=False)

            with gr.Accordion("Individual Tracks", open=False):
                track_components = [gr.Audio(visible=False) for _ in range(MAX_PROMPTS)]
        # Examples: the lambda infers the input mode from whether an image is
        # present and falls back to the default generation settings
        gr.Examples(
            examples=[
                ["examples/storm.jpg", "A dramatic thunderstorm", "Heavy rain pouring", "Distant rumble"],
                [None, "Clock ticking", "Crowd murmuring", "Footsteps on concrete"]
            ],
            inputs=[image_input] + prompt_components[:3],
            outputs=[prompt_display, final_audio, *track_components],
            fn=lambda img, *p: process_inputs("Image" if img else "Text", img, 100, 7.5, *p),
            cache_examples=False  # generating audio at build time is expensive
        )
        # Contribution section
        with gr.Column():
            gr.Markdown("""
            ## 👥 How You Can Contribute
            We welcome contributions! Contact us at [contact@bilsimaging.com](mailto:contact@bilsimaging.com).
            Support us on [Ko-fi](https://ko-fi.com/bilsimaging). - Bilel Aroua
            """)

            gr.HTML("""
            <div style="text-align: center;">
                <a href="https://visitorbadge.io/status?path=https://huggingface.co/spaces/Bils/Generate-Sound-Effects-from-Image">
                    <img src="https://api.visitorbadge.io/api/visitors?path=https://huggingface.co/spaces/Bils/Generate-Sound-Effects-from-Image&countColor=%23263759" />
                </a>
            </div>
            """)
        # Footer
        gr.Markdown("""
        ---
        [GitHub Repository](https://github.com/bilsimaging/Imaginesound)
        """)
    # Event handling: wire the advanced-settings sliders into generation
    generate_btn.click(
        fn=process_inputs,
        inputs=[input_choice, image_input, steps_slider, guidance_slider] + prompt_components,
        outputs=[prompt_display, final_audio, *track_components]
    )
if __name__ == "__main__":
    # share=True only matters for local runs; Spaces ignores it
    app.launch(debug=True, share=True)
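
# Assumed dependencies (a sketch of requirements.txt; exact pins are not part
# of the original source): gradio, spaces, torch, diffusers, transformers,
# scipy, numpy, pydub, python-dotenv, Pillow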