Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -8,31 +8,19 @@ from diffusers import DiffusionPipeline
 from transformers import pipeline
 from pathlib import Path
 
-# Load environment variables from .env file if needed
 load_dotenv()
 
-# If you have any Hugging Face tokens for private models (AudioLDM2 requires HF_TKN)
 hf_token = os.getenv("HF_TKN")
 
-# ------------------------------------------------
-# 1) INITIALIZE FREE IMAGE CAPTIONING PIPELINE
-# ------------------------------------------------
-# Replace "nlpconnect/vit-gpt2-image-captioning" with any other free image captioning model you prefer.
 captioning_pipeline = pipeline(
     "image-to-text",
     model="nlpconnect/vit-gpt2-image-captioning",
-    # If the model is private or requires auth, pass the token here: use_auth_token=hf_token,
 )
 
-# ------------------------------------------------
-# 2) INITIALIZE AUDIO LDM-2 PIPELINE
-# ------------------------------------------------
-# AudioLDM2 is also from Hugging Face. If it’s a private model, pass your token via use_auth_token.
-# If you’re using the public version, you may not need the token at all.
 device = "cuda" if torch.cuda.is_available() else "cpu"
 pipe = DiffusionPipeline.from_pretrained(
     "cvssp/audioldm2",
-    use_auth_token=hf_token
+    use_auth_token=hf_token
 )
 pipe = pipe.to(device)
 
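Side note on the use_auth_token argument kept above: recent diffusers releases deprecate use_auth_token in favor of token, and cvssp/audioldm2 is a public checkpoint, so a token may not be needed at all. A minimal sketch of the equivalent call under that assumption (not part of this commit):

# Sketch only; assumes a diffusers version where `token` replaces the
# deprecated `use_auth_token`. hf_token may be None for public models.
import os
import torch
from diffusers import DiffusionPipeline

hf_token = os.getenv("HF_TKN")
device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = DiffusionPipeline.from_pretrained("cvssp/audioldm2", token=hf_token)
pipe = pipe.to(device)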
@@ -42,17 +30,14 @@ def analyze_image_with_free_model(image_file):
     Returns: (caption_text, is_error_flag)
     """
     try:
-        # Save uploaded image to a temporary file
         with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
             temp_file.write(image_file)
             temp_image_path = temp_file.name
 
-        # Run the image captioning pipeline
         results = captioning_pipeline(temp_image_path)
         if not results or not isinstance(results, list):
             return "Error: Could not generate caption.", True
 
-        # Typically, pipeline returns a list of dicts with a "generated_text" key
        caption = results[0].get("generated_text", "").strip()
         if not caption:
             return "No caption was generated.", True
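The parsing above depends on the transformers image-to-text pipeline returning a list of dicts keyed by "generated_text". A self-contained sketch to confirm that shape outside the app (the image path below is a placeholder, not from app.py):

# Standalone check of the captioning pipeline's output format.
from transformers import pipeline

captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
results = captioner("example.jpg")  # placeholder path to any local image
print(results)  # e.g. [{'generated_text': 'a cat sitting on a couch'}]
caption = results[0].get("generated_text", "").strip()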
@@ -68,7 +53,6 @@ def get_audioldm_from_caption(caption):
     Returns the filename (path) of the generated .wav file.
     """
     try:
-        # Generate audio from the caption
         audio_output = pipe(
             prompt=caption,
             num_inference_steps=50,
@@ -76,7 +60,6 @@ def get_audioldm_from_caption(caption):
         )
         audio = audio_output.audios[0]
 
-        # Write the audio to a temporary .wav file
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
             write(temp_wav.name, 16000, audio)
             return temp_wav.name
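On the write(temp_wav.name, 16000, audio) call retained above: write is presumably scipy.io.wavfile.write (its import sits outside the hunks shown), and 16000 matches AudioLDM2's 16 kHz output rate. A minimal sketch of that save step under those assumptions:

# Sketch of the save step; assumes `write` is scipy.io.wavfile.write
# and that the pipeline returns 16 kHz float audio in [-1.0, 1.0].
import tempfile
import numpy as np
from scipy.io.wavfile import write

def save_wav(audio: np.ndarray, sample_rate: int = 16000) -> str:
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
        # float32 data is written as a 32-bit float WAV file
        write(temp_wav.name, sample_rate, audio.astype(np.float32))
        return temp_wav.name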
@@ -85,9 +68,6 @@ def get_audioldm_from_caption(caption):
         print(f"Error generating audio from caption: {e}")
         return None
 
-# ------------------------------------------------
-# 3) GRADIO INTERFACE
-# ------------------------------------------------
 css = """
 #col-container{
     margin: 0 auto;
@@ -96,7 +76,6 @@ css = """
 """
 
 with gr.Blocks(css=css) as demo:
-    # Main Title and App Description
     with gr.Column(elem_id="col-container"):
         gr.HTML("""
             <h1 style="text-align: center;">
@@ -145,15 +124,13 @@ with gr.Blocks(css=css) as demo:
     Enjoy exploring the auditory landscape of your images!
     """)
 
-    # Function to update the caption display based on the uploaded image
     def update_caption(image_file):
         description, error_flag = analyze_image_with_free_model(image_file)
         return description
 
-    # Function to generate sound from the description
     def generate_sound(description):
         if not description or description.startswith("Error"):
-            return None
+            return None
         audio_path = get_audioldm_from_caption(description)
         return audio_path
 
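The hunks end before the components and event listeners that call update_caption and generate_sound. For orientation, a hypothetical sketch of how such callbacks are typically wired inside the gr.Blocks context (every component name below is illustrative, not taken from app.py):

# Hypothetical wiring inside `with gr.Blocks(css=css) as demo:`;
# assumes `import gradio as gr` from app.py. Component names are invented.
image_upload = gr.File(label="Upload image", type="binary")  # yields raw bytes
caption_display = gr.Textbox(label="Generated caption")
generate_button = gr.Button("Generate sound")
audio_output = gr.Audio(label="Generated audio")

image_upload.change(update_caption, inputs=image_upload, outputs=caption_display)
generate_button.click(generate_sound, inputs=caption_display, outputs=audio_output)

Passing bytes from gr.File(type="binary") matches analyze_image_with_free_model, which writes the raw upload into a temporary .jpg.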