Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -8,31 +8,19 @@ from diffusers import DiffusionPipeline
 from transformers import pipeline
 from pathlib import Path
 
-# Load environment variables from .env file if needed
 load_dotenv()
 
-# If you have any Hugging Face tokens for private models (AudioLDM2 requires HF_TKN)
 hf_token = os.getenv("HF_TKN")
 
-# ------------------------------------------------
-# 1) INITIALIZE FREE IMAGE CAPTIONING PIPELINE
-# ------------------------------------------------
-# Replace "nlpconnect/vit-gpt2-image-captioning" with any other free image captioning model you prefer.
 captioning_pipeline = pipeline(
     "image-to-text",
     model="nlpconnect/vit-gpt2-image-captioning",
-    # If the model is private or requires auth, pass the token here: use_auth_token=hf_token,
 )
 
-# ------------------------------------------------
-# 2) INITIALIZE AUDIO LDM-2 PIPELINE
-# ------------------------------------------------
-# AudioLDM2 is also from Hugging Face. If it’s a private model, pass your token via use_auth_token.
-# If you’re using the public version, you may not need the token at all.
 device = "cuda" if torch.cuda.is_available() else "cpu"
 pipe = DiffusionPipeline.from_pretrained(
     "cvssp/audioldm2",
-    use_auth_token=hf_token
+    use_auth_token=hf_token
 )
 pipe = pipe.to(device)
 
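Side note on the use_auth_token argument kept above: recent diffusers releases deprecate use_auth_token in favor of token, and cvssp/audioldm2 is a public checkpoint, so a token may not be needed at all. A minimal sketch of the equivalent call under that assumption (not part of this commit):

# Sketch only; assumes a diffusers version where `token` replaces the
# deprecated `use_auth_token`. hf_token may be None for public models.
import os
import torch
from diffusers import DiffusionPipeline

hf_token = os.getenv("HF_TKN")
device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = DiffusionPipeline.from_pretrained("cvssp/audioldm2", token=hf_token)
pipe = pipe.to(device)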
@@ -42,17 +30,14 @@ def analyze_image_with_free_model(image_file):
     Returns: (caption_text, is_error_flag)
     """
     try:
-        # Save uploaded image to a temporary file
         with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
             temp_file.write(image_file)
             temp_image_path = temp_file.name
 
-        # Run the image captioning pipeline
         results = captioning_pipeline(temp_image_path)
         if not results or not isinstance(results, list):
             return "Error: Could not generate caption.", True
 
-        # Typically, pipeline returns a list of dicts with a "generated_text" key
        caption = results[0].get("generated_text", "").strip()
         if not caption:
             return "No caption was generated.", True
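The parsing above depends on the transformers image-to-text pipeline returning a list of dicts keyed by "generated_text". A self-contained sketch to confirm that shape outside the app (the image path below is a placeholder, not from app.py):

# Standalone check of the captioning pipeline's output format.
from transformers import pipeline

captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
results = captioner("example.jpg")  # placeholder path to any local image
print(results)  # e.g. [{'generated_text': 'a cat sitting on a couch'}]
caption = results[0].get("generated_text", "").strip()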
@@ -68,7 +53,6 @@ def get_audioldm_from_caption(caption):
     Returns the filename (path) of the generated .wav file.
     """
     try:
-        # Generate audio from the caption
         audio_output = pipe(
             prompt=caption,
             num_inference_steps=50,
@@ -76,7 +60,6 @@ def get_audioldm_from_caption(caption):
         )
         audio = audio_output.audios[0]
 
-        # Write the audio to a temporary .wav file
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
             write(temp_wav.name, 16000, audio)
             return temp_wav.name
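On the write(temp_wav.name, 16000, audio) call retained above: write is presumably scipy.io.wavfile.write (its import sits outside the hunks shown), and 16000 matches AudioLDM2's 16 kHz output rate. A minimal sketch of that save step under those assumptions:

# Sketch of the save step; assumes `write` is scipy.io.wavfile.write
# and that the pipeline returns 16 kHz float audio in [-1.0, 1.0].
import tempfile
import numpy as np
from scipy.io.wavfile import write

def save_wav(audio: np.ndarray, sample_rate: int = 16000) -> str:
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
        # float32 data is written as a 32-bit float WAV file
        write(temp_wav.name, sample_rate, audio.astype(np.float32))
        return temp_wav.name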
@@ -85,9 +68,6 @@ def get_audioldm_from_caption(caption):
         print(f"Error generating audio from caption: {e}")
         return None
 
-# ------------------------------------------------
-# 3) GRADIO INTERFACE
-# ------------------------------------------------
 css = """
 #col-container{
     margin: 0 auto;
@@ -96,7 +76,6 @@ css = """
 """
 
 with gr.Blocks(css=css) as demo:
-    # Main Title and App Description
     with gr.Column(elem_id="col-container"):
         gr.HTML("""
             <h1 style="text-align: center;">
@@ -145,15 +124,13 @@ with gr.Blocks(css=css) as demo:
     Enjoy exploring the auditory landscape of your images!
     """)
 
-    # Function to update the caption display based on the uploaded image
     def update_caption(image_file):
         description, error_flag = analyze_image_with_free_model(image_file)
         return description
 
-    # Function to generate sound from the description
     def generate_sound(description):
         if not description or description.startswith("Error"):
-            return None
+            return None
         audio_path = get_audioldm_from_caption(description)
         return audio_path
 
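The hunks end before the components and event listeners that call update_caption and generate_sound. For orientation, a hypothetical sketch of how such callbacks are typically wired inside the gr.Blocks context (every component name below is illustrative, not taken from app.py):

# Hypothetical wiring inside `with gr.Blocks(css=css) as demo:`;
# assumes `import gradio as gr` from app.py. Component names are invented.
image_upload = gr.File(label="Upload image", type="binary")  # yields raw bytes
caption_display = gr.Textbox(label="Generated caption")
generate_button = gr.Button("Generate sound")
audio_output = gr.Audio(label="Generated audio")

image_upload.change(update_caption, inputs=image_upload, outputs=caption_display)
generate_button.click(generate_sound, inputs=caption_display, outputs=audio_output)

Passing bytes from gr.File(type="binary") matches analyze_image_with_free_model, which writes the raw upload into a temporary .jpg.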