Spaces: Running on Zero
Update app.py
app.py CHANGED

```diff
@@ -11,7 +11,7 @@ from PIL import Image
 import io
 from pydub import AudioSegment
 from typing import List
-import
+from functools import lru_cache
 
 # Load environment variables
 load_dotenv()
@@ -20,8 +20,8 @@ HF_TOKEN = os.getenv("HF_TKN")
 # Device configuration
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-# Initialize models
-@
+# Initialize models with caching
+@lru_cache(maxsize=None)
 def load_caption_model():
     return pipeline(
         "image-to-text",
@@ -29,7 +29,7 @@ def load_caption_model():
         device=device
     )
 
-@
+@lru_cache(maxsize=None)
 def load_audio_model():
     pipe = DiffusionPipeline.from_pretrained(
         "cvssp/audioldm2",
@@ -40,7 +40,6 @@ def load_audio_model():
 caption_pipe = load_caption_model()
 audio_pipe = load_audio_model().to(device)
 
-@spaces.GPU(duration=120)
 def analyze_image(image_file):
     """Generate caption from image with validation"""
     try:
@@ -65,7 +64,6 @@ def analyze_image(image_file):
     except Exception as e:
         raise gr.Error(f"Image processing error: {str(e)}")
 
-@spaces.GPU(duration=120)
 def generate_audio(prompt: str, num_steps=100, guidance_scale=7.5):
     """Generate audio from single prompt"""
     try:
@@ -87,7 +85,6 @@ def generate_audio(prompt: str, num_steps=100, guidance_scale=7.5):
     except Exception as e:
         raise gr.Error(f"Audio generation error: {str(e)}")
 
-@spaces.GPU(duration=120)
 def blend_audios(audio_files: List[str]) -> str:
     """Mix multiple audio files into one"""
     try:
@@ -246,6 +243,8 @@ with gr.Blocks(css=css, theme=gr.themes.Default(primary_hue="emerald")) as app:
     # Footer
     gr.Markdown("""
     ---
+    *Powered by [BLIP](https://huggingface.co/Salesforce/blip-image-captioning-base) and
+    [AudioLDM 2](https://huggingface.co/cvssp/audioldm2) •
     [GitHub Repository](https://github.com/bilsimaging/Imaginesound)*
     """)
 
@@ -256,5 +255,8 @@ with gr.Blocks(css=css, theme=gr.themes.Default(primary_hue="emerald")) as app:
         outputs=[prompt_display, final_audio, *track_components]
     )
 
+# Enable queuing for concurrent processing
+app.queue(concurrency_count=3)
+
 if __name__ == "__main__":
     app.launch(debug=True, share=True)
```
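The commit replaces the ZeroGPU `@spaces.GPU` decorators on the model loaders with `functools.lru_cache`, so each model is constructed once and reused across calls. A minimal sketch of that memoization pattern, with the heavy `pipeline(...)` call stubbed out so it runs anywhere (the stub object and prints are illustrative, not from the Space):

```python
from functools import lru_cache

@lru_cache(maxsize=None)
def load_caption_model():
    # The body runs only on the first call; later calls return the cached object.
    print("loading caption model ...")
    return object()  # stand-in for pipeline("image-to-text", ..., device=device)

first = load_caption_model()   # prints "loading caption model ..."
second = load_caption_model()  # served from the cache, no print
assert first is second
```

`lru_cache` requires the function's arguments to be hashable; with zero-argument loaders like these it simply acts as a run-once initializer.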
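The added `app.queue(concurrency_count=3)` enables Gradio's request queue before launch so several jobs can be processed at once. `concurrency_count` is the Gradio 3.x spelling; on Gradio 4+ the analogous parameter is `default_concurrency_limit`. A minimal sketch of the same setup with a placeholder UI:

```python
import gradio as gr

with gr.Blocks() as demo:
    gr.Markdown("placeholder UI")

# Enable the shared request queue; up to 3 events run concurrently (Gradio 3.x API).
demo.queue(concurrency_count=3)

if __name__ == "__main__":
    demo.launch()
```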