Spaces:

Bils
/

Generate-Sound-Effects-from-Image

Running on Zero

App Files Files Community

Bils committed on Jan 11

Commit

213e5d3

verified ·

1 Parent(s): 2019ee0

Update app.py

Browse files

Files changed (1) hide show

app.py +113 -107

app.py CHANGED Viewed

@@ -1,120 +1,126 @@
-import gradio as gr
 import os
 import torch
-from transformers import (
-    AutoTokenizer,
-    AutoModelForCausalLM,
-    pipeline,
-    AutoProcessor,
-    MusicgenForConditionalGeneration
 )
-import scipy.io.wavfile as wav
-# ---------------------------------------------------------------------
-# Load Llama 3 Model with Zero GPU
-# ---------------------------------------------------------------------
-def load_llama_pipeline_zero_gpu(model_id: str, token: str):
     try:
-        if not torch.cuda.is_available():
-            raise RuntimeError("ZeroGPU is not properly initialized or GPU is unavailable.")
-        tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=token)
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            use_auth_token=token,
-            torch_dtype=torch.float16,
-            device_map="auto",  # Use device map to offload computations
-            trust_remote_code=True  # Enables execution of remote code for Zero GPU
-        )
-        return pipeline("text-generation", model=model, tokenizer=tokenizer)
     except Exception as e:
-        return str(e)
-# ---------------------------------------------------------------------
-# Generate Radio Script
-# ---------------------------------------------------------------------
-def generate_script(user_input: str, pipeline_llama):
     try:
-        system_prompt = (
-            "You are a top-tier radio imaging producer using Llama 3. "
-            "Take the user's concept and craft a short, creative promo script."
         )
-        combined_prompt = f"{system_prompt}\nUser concept: {user_input}\nRefined script:"
-        result = pipeline_llama(combined_prompt, max_new_tokens=200, do_sample=True, temperature=0.9)
-        return result[0]['generated_text'].split("Refined script:")[-1].strip()
-    except Exception as e:
-        return f"Error generating script: {e}"
-# ---------------------------------------------------------------------
-# Load MusicGen Model
-# ---------------------------------------------------------------------
-def load_musicgen_model():
-    try:
-        model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
-        processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
-        return model, processor
-    except Exception as e:
-        return None, str(e)
-# ---------------------------------------------------------------------
-# Generate Audio
-# ---------------------------------------------------------------------
-def generate_audio(prompt: str, audio_length: int, mg_model, mg_processor):
-    try:
-        inputs = mg_processor(text=[prompt], padding=True, return_tensors="pt")
-        outputs = mg_model.generate(**inputs, max_new_tokens=audio_length)
-        sr = mg_model.config.audio_encoder.sampling_rate
-        audio_data = outputs[0, 0].cpu().numpy()
-        normalized_audio = (audio_data / max(abs(audio_data)) * 32767).astype("int16")
-        output_file = "radio_jingle.wav"
-        wav.write(output_file, rate=sr, data=normalized_audio)
-        return sr, normalized_audio
     except Exception as e:
-        return str(e)
-# ---------------------------------------------------------------------
-# Gradio Interface
-# ---------------------------------------------------------------------
-def radio_imaging_app(user_prompt, llama_model_id, hf_token, audio_length):
-    # Load Llama 3 Pipeline with Zero GPU
-    pipeline_llama = load_llama_pipeline_zero_gpu(llama_model_id, hf_token)
-    if isinstance(pipeline_llama, str):
-        return pipeline_llama, None
-    # Generate Script
-    script = generate_script(user_prompt, pipeline_llama)
-    # Load MusicGen
-    mg_model, mg_processor = load_musicgen_model()
-    if isinstance(mg_processor, str):
-        return script, mg_processor
-    # Generate Audio
-    audio_data = generate_audio(script, audio_length, mg_model, mg_processor)
-    if isinstance(audio_data, str):
-        return script, audio_data
-    return script, audio_data
-# ---------------------------------------------------------------------
-# Interface
-# ---------------------------------------------------------------------
-with gr.Blocks() as demo:
-    gr.Markdown("# 🎧 AI Radio Imaging with Llama 3 + MusicGen (Zero GPU)")
     with gr.Row():
-        user_prompt = gr.Textbox(label="Enter your promo idea", placeholder="E.g., A 15-second hype jingle for a morning talk show, fun and energetic.")
-        llama_model_id = gr.Textbox(label="Llama 3 Model ID", value="meta-llama/Meta-Llama-3-70B")
-        hf_token = gr.Textbox(label="Hugging Face Token", type="password")
-        audio_length = gr.Slider(label="Audio Length (tokens)", minimum=128, maximum=1024, step=64, value=512)
-    generate_button = gr.Button("Generate Promo Script and Audio")
-    script_output = gr.Textbox(label="Generated Script")
-    audio_output = gr.Audio(label="Generated Audio", type="numpy")
-    generate_button.click(radio_imaging_app,
-                          inputs=[user_prompt, llama_model_id, hf_token, audio_length],
-                          outputs=[script_output, audio_output])
-# ---------------------------------------------------------------------
-# Launch App
-# ---------------------------------------------------------------------
-demo.launch()

+import spaces
 import os
+import tempfile
+import gradio as gr
+from dotenv import load_dotenv
 import torch
+from scipy.io.wavfile import write
+from diffusers import DiffusionPipeline
+from transformers import pipeline
+from pathlib import Path
+load_dotenv()
+hf_token = os.getenv("HF_TKN")
+device_id = 0 if torch.cuda.is_available() else -1
+captioning_pipeline = pipeline(
+    "image-to-text",
+    model="nlpconnect/vit-gpt2-image-captioning",
+    device=device_id
+)
+pipe = DiffusionPipeline.from_pretrained(
+    "cvssp/audioldm2",
+    use_auth_token=hf_token
 )
+@spaces.GPU(duration=120)
+def analyze_image_with_free_model(image_file):
     try:
+        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
+            temp_file.write(image_file)
+            temp_image_path = temp_file.name
+        results = captioning_pipeline(temp_image_path)
+        if not results or not isinstance(results, list):
+            return "Error: Could not generate caption.", True
+        caption = results[0].get("generated_text", "").strip()
+        if not caption:
+            return "No caption was generated.", True
+        return caption, False
     except Exception as e:
+        return f"Error analyzing image: {e}", True
+@spaces.GPU(duration=120)
+def get_audioldm_from_caption(caption):
     try:
+        pipe.to("cuda")
+        audio_output = pipe(
+            prompt=caption,
+            num_inference_steps=50,
+            guidance_scale=7.5
         )
+        pipe.to("cpu")
+        audio = audio_output.audios[0]
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
+            write(temp_wav.name, 16000, audio)
+            return temp_wav.name
     except Exception as e:
+        print(f"Error generating audio from caption: {e}")
+        return None
+with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
+    with gr.Row():
+        with gr.Column(scale=1):
+            gr.Image(value="https://via.placeholder.com/150", interactive=False, label="App Logo", elem_id="app-logo")
+        with gr.Column(scale=5):
+            gr.HTML("""
+            <div style="text-align: center; font-size: 32px; font-weight: bold; margin-bottom: 10px;">🎶 Image-to-Sound Generator</div>
+            <div style="text-align: center; font-size: 16px; color: #6c757d;">Transform your images into descriptive captions and immersive soundscapes.</div>
+            """)
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown("""
+            ### How It Works
+            1. **Upload an Image**: Select an image to analyze.
+            2. **Generate Description**: Get a detailed caption describing your image.
+            3. **Generate Sound**: Create an audio representation based on the caption.
+            """)
+    with gr.Row():
+        with gr.Column(scale=1):
+            image_upload = gr.File(label="Upload Image", type="binary")
+            generate_description_button = gr.Button("Generate Description", variant="primary")
+        with gr.Column(scale=2):
+            caption_display = gr.Textbox(label="Generated Caption", interactive=False, placeholder="Your image caption will appear here.")
+            generate_sound_button = gr.Button("Generate Sound", variant="primary")
+        with gr.Column(scale=1):
+            audio_output = gr.Audio(label="Generated Sound Effect", interactive=False)
     with gr.Row():
+        gr.Markdown("""
+        ## About This App
+        This application uses advanced machine learning models to transform images into text captions and generate matching sound effects. It's a unique blend of visual and auditory creativity, powered by state-of-the-art AI technology.
+        For inquiries, contact us at [contact@bilsimaging.com](mailto:contact@bilsimaging.com).
+        """)
+    def update_caption(image_file):
+        description, _ = analyze_image_with_free_model(image_file)
+        return description
+    def generate_sound(description):
+        if not description or description.startswith("Error"):
+            return None
+        audio_path = get_audioldm_from_caption(description)
+        return audio_path
+    generate_description_button.click(
+        fn=update_caption,
+        inputs=image_upload,
+        outputs=caption_display
+    )
+    generate_sound_button.click(
+        fn=generate_sound,
+        inputs=caption_display,
+        outputs=audio_output
+    )
+demo.launch(debug=True, share=True)