# Hugging Face Space: Generate Sound Effects from Image or Text (runs on ZeroGPU)
import spaces  # Hugging Face ZeroGPU helper (provides the @spaces.GPU decorator)
import os
import tempfile

import gradio as gr
import numpy as np
import torch
from diffusers import DiffusionPipeline
from dotenv import load_dotenv
from scipy.io.wavfile import write
from transformers import pipeline
# Load environment variables (HF_TKN should hold a Hugging Face access token)
load_dotenv()
hf_token = os.getenv("HF_TKN")

# Device configuration: fp16 on GPU to halve memory, fp32 on CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if device == "cuda" else torch.float32
# Initialize models once at startup with automatic device detection
def load_models():
    global captioning_pipeline, pipe
    # Image captioning: ViT encoder + GPT-2 decoder
    captioning_pipeline = pipeline(
        "image-to-text",
        model="nlpconnect/vit-gpt2-image-captioning",
        device=0 if torch.cuda.is_available() else -1
    )
    # Text-to-audio latent diffusion model
    pipe = DiffusionPipeline.from_pretrained(
        "cvssp/audioldm2",
        token=hf_token,  # `use_auth_token` is deprecated in recent diffusers
        torch_dtype=torch_dtype
    ).to(device)

load_models()
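
# ZeroGPU pattern: models load once at module scope, and the inference
# functions below are decorated with @spaces.GPU so a GPU is attached only
# for the duration of each call.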
@spaces.GPU  # acquire a ZeroGPU slot for captioning
def analyze_image(image_file):
    """Generate a caption for the uploaded image, with error handling."""
    try:
        results = captioning_pipeline(image_file)
        if results and isinstance(results, list):
            return results[0].get("generated_text", "").strip()
        return "Could not generate caption"
    except Exception as e:
        return f"Error: {str(e)}"
@spaces.GPU  # acquire a ZeroGPU slot for diffusion inference
def generate_audio(prompt):
    """Generate an audio waveform from a text prompt."""
    try:
        return pipe(
            prompt=prompt,
            num_inference_steps=50,
            guidance_scale=7.5
        ).audios[0]
    except Exception as e:
        print(f"Audio generation error: {str(e)}")
        return None
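
# The call above returns a batch of waveforms; .audios[0] is a 1-D float NumPy
# array. AudioLDM2 generates 16 kHz audio, matching the 16000 Hz rate that
# blend_audios() writes below.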
def blend_audios(audio_list):
    """Mix multiple mono audio arrays into a single normalized track."""
    try:
        valid_audios = [arr for arr in audio_list if arr is not None]
        if not valid_audios:
            return None
        # Pad every track with trailing silence to the longest length, then sum
        max_length = max(arr.shape[0] for arr in valid_audios)
        mixed = np.zeros(max_length)
        for arr in valid_audios:
            mixed += np.pad(arr, (0, max_length - arr.shape[0]))
        # Peak-normalize to avoid clipping; guard against an all-silent mix
        peak = np.max(np.abs(mixed))
        if peak > 0:
            mixed = mixed / peak
        fd, tmp_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)  # mkstemp returns an open descriptor; close it before writing
        write(tmp_path, 16000, mixed.astype(np.float32))
        return tmp_path
    except Exception as e:
        print(f"Blending error: {str(e)}")
        return None
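
# Illustrative example (hypothetical lengths): blending a 2 s track with a 3 s
# track pads the shorter one with trailing silence, sums the two, and
# peak-normalizes so the mix stays within [-1.0, 1.0].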
| css = """ | |
| #col-container { max-width: 800px; margin: 0 auto; } | |
| .toggle-row { margin: 1rem 0; } | |
| .prompt-box { margin-bottom: 0.5rem; } | |
| .danger { color: #ff4444; font-weight: bold; } | |
| """ | |
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        # Header section
        gr.HTML("""
            <h1 style="text-align: center;">🎶 Generate Sound Effects from Image or Text</h1>
            <p style="text-align: center;">
                ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
            </p>
        """)
        # Input mode toggle
        input_mode = gr.Radio(
            choices=["Image Input", "Text Input"],
            value="Image Input",
            label="Select Input Mode",
            elem_classes="toggle-row"
        )

        # Image input section
        with gr.Column(visible=True) as image_col:
            image_upload = gr.Image(type="filepath", label="Upload Image")
            generate_desc_btn = gr.Button("Generate Description from Image", variant="primary")
            caption_display = gr.Textbox(label="Generated Description", interactive=False)

        # Text input section. Gradio event handlers cannot reliably create
        # brand-new components, so prompts 3-5 are pre-created hidden and
        # revealed one at a time by the "Add Another Prompt" button.
        with gr.Column(visible=False) as text_col:
            with gr.Row():
                prompt1 = gr.Textbox(label="Sound Prompt 1", lines=2, placeholder="Enter sound description...")
                prompt2 = gr.Textbox(label="Sound Prompt 2", lines=2, placeholder="Enter sound description...")
            extra_prompts = [
                gr.Textbox(label=f"Sound Prompt {i}", lines=2, visible=False,
                           placeholder="Enter sound description...")
                for i in range(3, 6)
            ]
            add_prompt_btn = gr.Button("➕ Add Another Prompt", variant="secondary")
            gr.Markdown("<div class='danger'>Max 5 prompts for stability</div>")
        # Generation controls
        generate_sound_btn = gr.Button("Generate Sound Effect", variant="primary")
        audio_output = gr.Audio(label="Generated Sound Effect", interactive=False)

        # Documentation section
        gr.Markdown("""
        ## 👥 How You Can Contribute
        We welcome contributions! Contact us at [contact@bilsimaging.com](mailto:contact@bilsimaging.com).
        Support us on [Ko-fi](https://ko-fi.com/bilsimaging) - Bilel Aroua
        """)

        # Visitor badge
        gr.HTML("""
        <div style="text-align: center;">
            <a href="https://visitorbadge.io/status?path=https://huggingface.co/spaces/Bils/Generate-Sound-Effects-from-Image">
                <img src="https://api.visitorbadge.io/api/visitors?path=https://huggingface.co/spaces/Bils/Generate-Sound-Effects-from-Image&countColor=%23263759"/>
            </a>
        </div>
        """)
    # Input mode toggle handler: show one input column, hide the other
    input_mode.change(
        lambda mode: (gr.update(visible=mode == "Image Input"), gr.update(visible=mode == "Text Input")),
        inputs=input_mode,
        outputs=[image_col, text_col],
        concurrency_limit=1
    )

    # Image description generation
    generate_desc_btn.click(
        analyze_image,
        inputs=image_upload,
        outputs=caption_display,
        concurrency_limit=2
    )
    # Dynamic prompt addition: reveal the next hidden textbox, up to 5 prompts
    def add_prompt(current_count):
        new_count = min(current_count + 1, 5)
        # Textbox i (labels 3..5) becomes visible once the counter reaches it
        updates = [gr.update(visible=(i + 3 <= new_count)) for i in range(len(extra_prompts))]
        return (new_count, *updates)

    prompt_count = gr.State(2)
    add_prompt_btn.click(
        add_prompt,
        inputs=prompt_count,
        outputs=[prompt_count] + extra_prompts,
        concurrency_limit=1
    )
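
    # Note: recent Gradio versions also offer @gr.render for truly dynamic
    # component creation; pre-created hidden textboxes are used here as the
    # simpler, version-tolerant approach.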
    # Sound generation handler
    def process_inputs(mode, image_file, caption, *prompts):
        try:
            if mode == "Image Input":
                if not image_file:
                    raise gr.Error("Please upload an image")
                # Reuse the displayed caption if present; otherwise re-caption
                caption = (caption or "").strip() or analyze_image(image_file)
                prompts = [caption]
            else:
                prompts = [p.strip() for p in prompts if p and p.strip()]
                if not prompts:
                    raise gr.Error("Please enter at least one valid prompt")

            # Generate one audio track per prompt, skipping failed generations
            audio_tracks = []
            for prompt in prompts:
                audio = generate_audio(prompt)
                if audio is not None:
                    audio_tracks.append(audio)

            # Blend the tracks into a single WAV file
            if not audio_tracks:
                return None
            return blend_audios(audio_tracks)
        except gr.Error:
            raise  # let user-facing errors through without re-wrapping
        except Exception as e:
            raise gr.Error(f"Processing error: {str(e)}")
    generate_sound_btn.click(
        process_inputs,
        inputs=[input_mode, image_upload, caption_display, prompt1, prompt2] + extra_prompts,
        outputs=audio_output,
        concurrency_limit=2
    )
if __name__ == "__main__":
    demo.launch(max_threads=4)
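
# A minimal requirements.txt sketch inferred from the imports above (package
# names only; version pins are left to the deployer as an assumption):
#
#   spaces
#   gradio
#   torch
#   diffusers
#   transformers
#   pillow          # needed by the transformers image-to-text pipeline
#   scipy
#   numpy
#   python-dotenv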