Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -21,7 +21,7 @@ from TTS.api import TTS
|
|
| 21 |
# Load Environment Variables
|
| 22 |
# ---------------------------------------------------------------------
|
| 23 |
load_dotenv()
|
| 24 |
-
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 25 |
|
| 26 |
# ---------------------------------------------------------------------
|
| 27 |
# Global Model Caches
|
|
@@ -66,7 +66,6 @@ def get_musicgen_model(model_key: str = "facebook/musicgen-large"):
|
|
| 66 |
|
| 67 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 68 |
model.to(device)
|
| 69 |
-
|
| 70 |
MUSICGEN_MODELS[model_key] = (model, processor)
|
| 71 |
return model, processor
|
| 72 |
|
|
@@ -175,7 +174,7 @@ def generate_voice(script: str, tts_model_name: str = "tts_models/en/ljspeech/ta
|
|
| 175 |
|
| 176 |
|
| 177 |
# ---------------------------------------------------------------------
|
| 178 |
-
# Music Generation Function
|
| 179 |
# ---------------------------------------------------------------------
|
| 180 |
@spaces.GPU(duration=100)
|
| 181 |
def generate_music(prompt: str, audio_length: int):
|
|
@@ -209,13 +208,15 @@ def generate_music(prompt: str, audio_length: int):
|
|
| 209 |
|
| 210 |
|
| 211 |
# ---------------------------------------------------------------------
|
| 212 |
-
# Audio Blending
|
| 213 |
# ---------------------------------------------------------------------
|
| 214 |
@spaces.GPU(duration=100)
|
| 215 |
def blend_audio(voice_path: str, music_path: str, ducking: bool, duck_level: int = 10):
|
| 216 |
"""
|
| 217 |
-
Blends two audio files (voice and music).
|
| 218 |
-
|
|
|
|
|
|
|
| 219 |
Returns the file path to the blended .wav file.
|
| 220 |
"""
|
| 221 |
try:
|
|
@@ -225,20 +226,27 @@ def blend_audio(voice_path: str, music_path: str, ducking: bool, duck_level: int
|
|
| 225 |
voice = AudioSegment.from_wav(voice_path)
|
| 226 |
music = AudioSegment.from_wav(music_path)
|
| 227 |
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
if ducking:
|
| 234 |
-
# Step 1: Reduce music
|
| 235 |
-
|
| 236 |
-
# Overlay voice on top of
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
# Step 2: Keep the rest of the music as-is
|
| 240 |
-
remainder = music[len(voice):]
|
| 241 |
-
final_audio = voice_overlaid + remainder
|
| 242 |
else:
|
| 243 |
# No ducking, just overlay
|
| 244 |
final_audio = music.overlay(voice)
|
|
@@ -256,16 +264,18 @@ def blend_audio(voice_path: str, music_path: str, ducking: bool, duck_level: int
|
|
| 256 |
# ---------------------------------------------------------------------
|
| 257 |
with gr.Blocks() as demo:
|
| 258 |
gr.Markdown("""
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
|
|
|
|
|
|
| 269 |
|
| 270 |
with gr.Tabs():
|
| 271 |
# Step 1: Generate Script
|
|
@@ -342,9 +352,9 @@ with gr.Blocks() as demo:
|
|
| 342 |
outputs=[music_output],
|
| 343 |
)
|
| 344 |
|
| 345 |
-
# Step 4: Blend Audio
|
| 346 |
with gr.Tab("Step 4: Blend Audio"):
|
| 347 |
-
gr.Markdown("
|
| 348 |
ducking_checkbox = gr.Checkbox(label="Enable Ducking?", value=True)
|
| 349 |
duck_level_slider = gr.Slider(
|
| 350 |
label="Ducking Level (dB attenuation)",
|
|
|
|
| 21 |
# Load Environment Variables
|
| 22 |
# ---------------------------------------------------------------------
|
| 23 |
load_dotenv()
|
| 24 |
+
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 25 |
|
| 26 |
# ---------------------------------------------------------------------
|
| 27 |
# Global Model Caches
|
|
|
|
| 66 |
|
| 67 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 68 |
model.to(device)
|
|
|
|
| 69 |
MUSICGEN_MODELS[model_key] = (model, processor)
|
| 70 |
return model, processor
|
| 71 |
|
|
|
|
| 174 |
|
| 175 |
|
| 176 |
# ---------------------------------------------------------------------
|
| 177 |
+
# Music Generation Function
|
| 178 |
# ---------------------------------------------------------------------
|
| 179 |
@spaces.GPU(duration=100)
|
| 180 |
def generate_music(prompt: str, audio_length: int):
|
|
|
|
| 208 |
|
| 209 |
|
| 210 |
# ---------------------------------------------------------------------
|
| 211 |
+
# Audio Blending with Duration Sync & Ducking
|
| 212 |
# ---------------------------------------------------------------------
|
| 213 |
@spaces.GPU(duration=100)
|
| 214 |
def blend_audio(voice_path: str, music_path: str, ducking: bool, duck_level: int = 10):
|
| 215 |
"""
|
| 216 |
+
Blends two audio files (voice and music).
|
| 217 |
+
1. If music < voice, loops the music until it meets/exceeds the voice duration.
|
| 218 |
+
2. If music > voice, trims music to the voice duration.
|
| 219 |
+
3. If ducking=True, the music is attenuated by 'duck_level' dB while the voice is playing.
|
| 220 |
Returns the file path to the blended .wav file.
|
| 221 |
"""
|
| 222 |
try:
|
|
|
|
| 226 |
voice = AudioSegment.from_wav(voice_path)
|
| 227 |
music = AudioSegment.from_wav(music_path)
|
| 228 |
|
| 229 |
+
voice_len = len(voice) # in milliseconds
|
| 230 |
+
music_len = len(music) # in milliseconds
|
| 231 |
+
|
| 232 |
+
# 1) If the music is shorter than the voice, loop it:
|
| 233 |
+
if music_len < voice_len:
|
| 234 |
+
looped_music = AudioSegment.empty()
|
| 235 |
+
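# Keep appending until we reach or exceed the voice length.
# NOTE(review): this loop never terminates if the music clip is empty (0 ms) - consider guarding against that.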
|
| 236 |
+
while len(looped_music) < voice_len:
|
| 237 |
+
looped_music += music
|
| 238 |
+
music = looped_music
|
| 239 |
|
| 240 |
+
# 2) If the music is longer than the voice, truncate it:
|
| 241 |
+
if len(music) > voice_len:
|
| 242 |
+
music = music[:voice_len]
|
| 243 |
+
|
| 244 |
+
# Now music and voice are the same length
|
| 245 |
if ducking:
|
| 246 |
+
# Step 1: Reduce music dB while voice is playing
|
| 247 |
+
ducked_music = music - duck_level
|
| 248 |
+
# Step 2: Overlay voice on top of ducked music
|
| 249 |
+
final_audio = ducked_music.overlay(voice)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
else:
|
| 251 |
# No ducking, just overlay
|
| 252 |
final_audio = music.overlay(voice)
|
|
|
|
| 264 |
# ---------------------------------------------------------------------
|
| 265 |
with gr.Blocks() as demo:
|
| 266 |
gr.Markdown("""
|
| 267 |
+
# 🎧 AI Promo Studio
|
| 268 |
+
Welcome to **AI Promo Studio**, your all-in-one solution for creating professional, engaging audio promos with minimal effort!
|
| 269 |
+
|
| 270 |
+
This next-generation platform uses powerful AI models to handle:
|
| 271 |
+
- **Script Generation**: Craft concise and impactful copy with LLaMA.
|
| 272 |
+
- **Voice Synthesis**: Convert text into natural-sounding voice-overs using Coqui TTS.
|
| 273 |
+
- **Music Production**: Generate custom music tracks with MusicGen Large for sound bed.
|
| 274 |
+
- **Seamless Blending**: Easily combine voice and music—loop or trim tracks to match your desired promo length, with optional ducking to keep the voice front and center.
|
| 275 |
+
|
| 276 |
+
Whether you’re a radio producer, podcaster, or content creator, **AI Promo Studio** streamlines your entire production pipeline—cutting hours of manual editing down to a few clicks.
|
| 277 |
+
""")
|
| 278 |
+
|
| 279 |
|
| 280 |
with gr.Tabs():
|
| 281 |
# Step 1: Generate Script
|
|
|
|
| 352 |
outputs=[music_output],
|
| 353 |
)
|
| 354 |
|
| 355 |
+
# Step 4: Blend Audio (Loop/Trim + Ducking)
|
| 356 |
with gr.Tab("Step 4: Blend Audio"):
|
| 357 |
+
gr.Markdown("**Music** will be looped or trimmed to match **Voice** duration, then optionally ducked.")
|
| 358 |
ducking_checkbox = gr.Checkbox(label="Enable Ducking?", value=True)
|
| 359 |
duck_level_slider = gr.Slider(
|
| 360 |
label="Ducking Level (dB attenuation)",
|