Spaces:

Bils
/

Generate-Sound-Effects-from-Image

Running on Zero

App Files Files Community

Bils committed on Jan 30

Commit

e18ae6e

verified ·

1 Parent(s): 4a36f0d

Update app.py

Browse files

Files changed (1) hide show

app.py +132 -88

app.py CHANGED Viewed

@@ -10,169 +10,213 @@ from transformers import pipeline
 from pydub import AudioSegment
 import numpy as np
 load_dotenv()
 hf_token = os.getenv("HF_TKN")
-device_id = 0 if torch.cuda.is_available() else -1
-# Initialize models
-captioning_pipeline = pipeline(
-    "image-to-text",
-    model="nlpconnect/vit-gpt2-image-captioning",
-    device=device_id
-)
-pipe = DiffusionPipeline.from_pretrained(
-    "cvssp/audioldm2",
-    use_auth_token=hf_token
-)
-@spaces.GPU(duration=120)
 def analyze_image(image_file):
     try:
         results = captioning_pipeline(image_file)
-        if not results or not isinstance(results, list):
-            return "Error: Could not generate caption.", True
-        caption = results[0].get("generated_text", "").strip()
-        return caption if caption else "No caption generated.", not bool(caption)
     except Exception as e:
-        return f"Error analyzing image: {e}", True
 @spaces.GPU(duration=120)
 def generate_audio(prompt):
     try:
-        pipe.to("cuda")
-        audio_output = pipe(
             prompt=prompt,
             num_inference_steps=50,
             guidance_scale=7.5
-        )
-        pipe.to("cpu")
-        return audio_output.audios[0]
     except Exception as e:
-        print(f"Error generating audio: {e}")
         return None
 def blend_audios(audio_list):
     try:
-        # Find the longest audio duration
-        max_length = max([arr.shape[0] for arr in audio_list])
-        # Mix all audios
         mixed = np.zeros(max_length)
-        for arr in audio_list:
             if arr.shape[0] < max_length:
                 padded = np.pad(arr, (0, max_length - arr.shape[0]))
             else:
                 padded = arr[:max_length]
             mixed += padded
-        # Normalize the audio
         mixed = mixed / np.max(np.abs(mixed))
-        # Save to temporary file
         _, tmp_path = tempfile.mkstemp(suffix=".wav")
         write(tmp_path, 16000, mixed)
         return tmp_path
     except Exception as e:
-        print(f"Error blending audio: {e}")
         return None
 css = """
 #col-container { max-width: 800px; margin: 0 auto; }
 .toggle-row { margin: 1rem 0; }
 .prompt-box { margin-bottom: 0.5rem; }
 """
 with gr.Blocks(css=css) as demo:
     with gr.Column(elem_id="col-container"):
         gr.HTML("""
-        <h1 style="text-align: center;">🎶 Advanced Sound Generator</h1>
-        <p style="text-align: center;">⚡ Powered by Bilsimaging</p>
         """)
-        # Input mode toggle
         input_mode = gr.Radio(
-            choices=["Image Input", "Text Prompts"],
             value="Image Input",
             label="Select Input Mode",
             elem_classes="toggle-row"
         )
-        # Image input section
         with gr.Column(visible=True) as image_col:
             image_upload = gr.Image(type="filepath", label="Upload Image")
-            generate_desc_btn = gr.Button("Generate Description from Image")
             caption_display = gr.Textbox(label="Generated Description", interactive=False)
-        # Text input section
         with gr.Column(visible=False) as text_col:
             with gr.Row():
-                prompt1 = gr.Textbox(label="Sound Prompt 1", lines=2)
-                prompt2 = gr.Textbox(label="Sound Prompt 2", lines=2)
             additional_prompts = gr.Column()
             add_prompt_btn = gr.Button("➕ Add Another Prompt", variant="secondary")
-            generate_sound_btn = gr.Button("Generate Blended Sound", variant="primary")
-        # Audio output
-        audio_output = gr.Audio(label="Final Sound Composition", interactive=False)
-        # Documentation section
         gr.Markdown("""
-        ## 🎚️ How to Use
-        1. **Choose Input Mode** above
-        2. For images: Upload + Generate Description → Generate Sound
-        3. For text: Enter multiple sound prompts → Generate Blended Sound
-        [Support on Ko-fi](https://ko-fi.com/bilsimaging)
         """)
-        # Visitor badge
         gr.HTML("""
-        <div style="text-align: center; margin-top: 2rem;">
-            <a href="https://visitorbadge.io/status?path=YOUR_SPACE_URL">
-                <img src="https://api.visitorbadge.io/api/visitors?path=YOUR_SPACE_URL&countColor=%23263759"/>
             </a>
         </div>
         """)
-    # Toggle visibility based on input mode
-    def toggle_input(mode):
-        if mode == "Image Input":
-            return [gr.update(visible=True), gr.update(visible=False)]
-        return [gr.update(visible=False), gr.update(visible=True)]
     input_mode.change(
-        fn=toggle_input,
         inputs=input_mode,
-        outputs=[image_col, text_col]
     )
-    # Image processing chain
     generate_desc_btn.click(
-        fn=analyze_image,
         inputs=image_upload,
-        outputs=caption_display
-    ).then(
-        fn=lambda: gr.update(interactive=True),
-        outputs=generate_sound_btn
     )
-    # Text processing chain
-    generate_sound_btn.click(
-        fn=lambda *prompts: [p for p in prompts if p.strip()],
-        inputs=[prompt1, prompt2],
-        outputs=[]
-    ).then(
-        fn=lambda prompts: [generate_audio(p) for p in prompts],
-        outputs=[]
-    ).then(
-        fn=blend_audios,
-        outputs=audio_output
     )
-# Queue management
-demo.queue(concurrency_count=2)
 if __name__ == "__main__":
-    demo.launch()

 from pydub import AudioSegment
 import numpy as np
+# Load environment variables
 load_dotenv()
 hf_token = os.getenv("HF_TKN")
+# Device configuration
+# Evaluated once at import time: "cuda" with half precision when torch reports
+# CUDA, otherwise "cpu" with full precision.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+torch_dtype = torch.float16 if device == "cuda" else torch.float32
+# Initialize models with automatic device detection
+@spaces.GPU(duration=120)
+def load_models():
+    """Build both models and publish them as module-level globals.
+
+    Assigns ``captioning_pipeline`` (transformers "image-to-text" pipeline for
+    nlpconnect/vit-gpt2-image-captioning) and ``pipe`` (the cvssp/audioldm2
+    diffusion pipeline, moved to ``device``). Takes no arguments and returns
+    nothing.
+    """
+    # NOTE(review): this function runs under @spaces.GPU; confirm that globals
+    # assigned inside a ZeroGPU-decorated call are visible to the handlers that
+    # read them later -- TODO verify against the ZeroGPU documentation.
+    global captioning_pipeline, pipe
+    captioning_pipeline = pipeline(
+        "image-to-text",
+        model="nlpconnect/vit-gpt2-image-captioning",
+        # CUDA availability is re-checked here rather than reusing ``device``
+        device=0 if torch.cuda.is_available() else -1
+    )
+    # NOTE(review): ``use_auth_token`` is presumably deprecated in favour of
+    # ``token`` in newer diffusers releases -- confirm against the pinned version.
+    pipe = DiffusionPipeline.from_pretrained(
+        "cvssp/audioldm2",
+        use_auth_token=hf_token,
+        torch_dtype=torch_dtype
+    ).to(device)
+# Load eagerly at import so the handlers below find both globals defined.
+load_models()
+@spaces.GPU(duration=60)
 def analyze_image(image_file):
+    """Generate caption from image with error handling"""
     try:
         results = captioning_pipeline(image_file)
+        if results and isinstance(results, list):
+            return results[0].get("generated_text", "").strip()
+        return "Could not generate caption"
     except Exception as e:
+        return f"Error: {str(e)}"
 @spaces.GPU(duration=120)
 def generate_audio(prompt):
+    """Generate audio from text prompt"""
     try:
+        return pipe(
             prompt=prompt,
             num_inference_steps=50,
             guidance_scale=7.5
+        ).audios[0]
     except Exception as e:
+        print(f"Audio generation error: {str(e)}")
         return None
 def blend_audios(audio_list):
+    """Mix multiple audio arrays into one"""
     try:
+        valid_audios = [arr for arr in audio_list if arr is not None]
+        if not valid_audios:
+            return None
+        max_length = max(arr.shape[0] for arr in valid_audios)
         mixed = np.zeros(max_length)
+        for arr in valid_audios:
             if arr.shape[0] < max_length:
                 padded = np.pad(arr, (0, max_length - arr.shape[0]))
             else:
                 padded = arr[:max_length]
             mixed += padded
         mixed = mixed / np.max(np.abs(mixed))
         _, tmp_path = tempfile.mkstemp(suffix=".wav")
         write(tmp_path, 16000, mixed)
         return tmp_path
     except Exception as e:
+        print(f"Blending error: {str(e)}")
         return None
 css = """
 #col-container { max-width: 800px; margin: 0 auto; }
 .toggle-row { margin: 1rem 0; }
 .prompt-box { margin-bottom: 0.5rem; }
+.danger { color: #ff4444; font-weight: bold; }
 """
 with gr.Blocks(css=css) as demo:
     with gr.Column(elem_id="col-container"):
+        # Header Section
         gr.HTML("""
+        <h1 style="text-align: center;">🎶 Generate Sound Effects from Image or Text</h1>
+        <p style="text-align: center;">
+            ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
+        </p>
         """)
+        # Input Mode Toggle
         input_mode = gr.Radio(
+            choices=["Image Input", "Text Input"],
             value="Image Input",
             label="Select Input Mode",
             elem_classes="toggle-row"
         )
+        # Image Input Section
         with gr.Column(visible=True) as image_col:
             image_upload = gr.Image(type="filepath", label="Upload Image")
+            generate_desc_btn = gr.Button("Generate Description from Image", variant="primary")
             caption_display = gr.Textbox(label="Generated Description", interactive=False)
+        # Text Input Section
         with gr.Column(visible=False) as text_col:
             with gr.Row():
+                prompt1 = gr.Textbox(label="Sound Prompt 1", lines=2, placeholder="Enter sound description...")
+                prompt2 = gr.Textbox(label="Sound Prompt 2", lines=2, placeholder="Enter sound description...")
             additional_prompts = gr.Column()
             add_prompt_btn = gr.Button("➕ Add Another Prompt", variant="secondary")
+            gr.Markdown("<div class='danger'>Max 5 prompts for stability</div>")
+        # Generation Controls
+        generate_sound_btn = gr.Button("Generate Sound Effect", variant="primary")
+        audio_output = gr.Audio(label="Generated Sound Effect", interactive=False)
+        # Documentation Section
         gr.Markdown("""
+        ## 👥 How You Can Contribute
+        We welcome contributions! Contact us at [contact@bilsimaging.com](mailto:contact@bilsimaging.com).
+        Support us on [Ko-fi](https://ko-fi.com/bilsimaging) - Bilel Aroua
         """)
+        # Visitor Badge
         gr.HTML("""
+        <div style="text-align: center;">
+            <a href="https://visitorbadge.io/status?path=https://huggingface.co/spaces/Bils/Generate-Sound-Effects-from-Image">
+                <img src="https://api.visitorbadge.io/api/visitors?path=https://huggingface.co/spaces/Bils/Generate-Sound-Effects-from-Image&countColor=%23263759"/>
             </a>
         </div>
         """)
+    # Input Mode Toggle Handler
     input_mode.change(
+        lambda mode: (gr.update(visible=mode == "Image Input"), gr.update(visible=mode == "Text Input")),
         inputs=input_mode,
+        outputs=[image_col, text_col],
+        concurrency_limit=1
     )
+    # Image Description Generation
     generate_desc_btn.click(
+        analyze_image,
         inputs=image_upload,
+        outputs=caption_display,
+        concurrency_limit=2
     )
+    # Dynamic Prompt Addition
+    def add_prompt(current_count):
+        if current_count >= 5:
+            return current_count, gr.update()
+        new_count = current_count + 1
+        new_prompt = gr.Textbox(
+            label=f"Sound Prompt {new_count}",
+            lines=2,
+            visible=True,
+            placeholder="Enter sound description..."
+        )
+        return new_count, new_prompt
+    prompt_count = gr.State(2)
+    add_prompt_btn.click(
+        add_prompt,
+        inputs=prompt_count,
+        outputs=[prompt_count, additional_prompts],
+        concurrency_limit=1
     )
+    # Sound Generation Handler
+    def process_inputs(mode, image_file, caption, *prompts):
+        try:
+            if mode == "Image Input":
+                if not image_file:
+                    raise gr.Error("Please upload an image")
+                caption = analyze_image(image_file)
+                prompts = [caption]
+            else:
+                prompts = [p.strip() for p in prompts if p.strip()]
+                if not prompts:
+                    raise gr.Error("Please enter at least one valid prompt")
+            # Generate individual audio tracks
+            audio_tracks = []
+            for prompt in prompts:
+                if not prompt:
+                    continue
+                audio = generate_audio(prompt)
+                if audio is not None:
+                    audio_tracks.append(audio)
+            # Blend audio tracks
+            if not audio_tracks:
+                return None
+            return blend_audios(audio_tracks)
+        except Exception as e:
+            raise gr.Error(f"Processing error: {str(e)}")
+    generate_sound_btn.click(
+        process_inputs,
+        inputs=[input_mode, image_upload, caption_display, prompt1, prompt2],
+        outputs=audio_output,
+        concurrency_limit=2
+    )
 if __name__ == "__main__":
+    demo.launch(max_threads=4)