Error forced on audio normalization failure
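In `synthandreturn`, the resample-and-normalize step no longer swallows failures: the blanket try/except around the temp-file block is gone, and a failed `match_target_amplitude` call now raises a `gr.Error` instead of passing the un-normalized audio through. The sequential Zero-GPU path additionally caches each successful result via `cache_sample`, and the battle-mode code (`synthandreturn_battle`, `randomsent_battle`) is removed.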
app/synth.py  CHANGED  +22 -117
@@ -175,25 +175,24 @@ def synthandreturn(text, autoplay, request: gr.Request):
         print('Done with', model)
 
         # Resample to 24kHz
-        try:
-            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
-                audio = AudioSegment.from_file(result)
-                current_sr = audio.frame_rate
-                if current_sr > 24000:
-                    audio = audio.set_frame_rate(24000)
-                try:
-                    print('Trying to normalize audio')
-                    audio = match_target_amplitude(audio, -20)
-                except:
-                    print('[WARN] Unable to normalize audio')
-                audio.export(f.name, format="wav")
-                os.unlink(result)
-                result = f.name
-        except:
-            pass
+        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
+            audio = AudioSegment.from_file(result)
+            current_sr = audio.frame_rate
+            if current_sr > 24000:
+                print(f"{model}: Resampling")
+                audio = audio.set_frame_rate(24000)
+            try:
+                print(f"{model}: Trying to normalize audio")
+                audio = match_target_amplitude(audio, -20)
+            except:
+                print(f"{model}: [WARN] Unable to normalize audio")
+                raise gr.Error('Unable to normalize audio for output of space')
+
+            audio.export(f.name, format="wav")
+            os.unlink(result)
+            result = f.name
+            gr.Info('Audio from a TTS model received')
+
         # if model in AVAILABLE_MODELS.keys(): model = AVAILABLE_MODELS[model]
         result_storage[model] = result
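This hunk calls `match_target_amplitude`, which is defined elsewhere in app/synth.py and not shown in the diff. For reference, a minimal sketch along the lines of the standard pydub gain-matching recipe (the app's actual helper may differ):

from pydub import AudioSegment

def match_target_amplitude(sound, target_dBFS):
    # Apply a uniform gain so the clip's average loudness lands on
    # target_dBFS; sound.dBFS is -inf for pure silence, which makes the
    # gain infinite and is one way this call can fail, hence the
    # try/except around it in the diff.
    return sound.apply_gain(target_dBFS - sound.dBFS)

With the change above, that failure now surfaces to the user as a gr.Error instead of passing the un-normalized file through.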
@@ -281,10 +280,12 @@ def synthandreturn(text, autoplay, request: gr.Request):
         ):
             # run Zero-GPU spaces one at a time
             predict_and_update_result(text, mdl1k, results, request)
-
+            if results[mdl1k] != None:
+                cache_sample(results[mdl1k], text, mdl1k)
 
             predict_and_update_result(text, mdl2k, results, request)
-
+            if results[mdl2k] != None:
+                cache_sample(results[mdl2k], text, mdl2k)
         else:
             # use multithreading
             thread1 = threading.Thread(target=predict_and_update_result, args=(text, mdl1k, results, request))
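`cache_sample` is also defined outside this diff; here it receives the synthesized file path, the prompt text, and the model key, and only runs when the model actually produced a result. A hypothetical sketch of such a helper, purely for illustration (directory layout, hashing scheme, and body below are assumptions, not the app's code):

import hashlib
import os
import shutil

CACHE_DIR = 'cached_samples'  # assumed location

def cache_sample(path, text, model):
    # Keep a copy of a synthesized wav keyed by (model, text) so a repeated
    # prompt could be served without calling the model's space again.
    os.makedirs(CACHE_DIR, exist_ok=True)
    key = hashlib.sha1(f'{model}:{text}'.encode()).hexdigest()
    dest = os.path.join(CACHE_DIR, f'{key}.wav')
    shutil.copy(path, dest)
    return dest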
@@ -323,104 +324,8 @@ def synthandreturn(text, autoplay, request: gr.Request):
 
 # Battle Mode
 
-def synthandreturn_battle(text, mdl1, mdl2, autoplay):
-    if mdl1 == mdl2:
-        raise gr.Error('You can\'t pick two of the same models.')
-    text = text.strip()
-    if len(text) > MAX_SAMPLE_TXT_LENGTH:
-        raise gr.Error(f'You exceeded the limit of {MAX_SAMPLE_TXT_LENGTH} characters')
-    if len(text) < MIN_SAMPLE_TXT_LENGTH:
-        raise gr.Error(f'Please input a text longer than {MIN_SAMPLE_TXT_LENGTH} characters')
-    if (
-        # test toxicity if not prepared text
-        text not in sents
-        and check_toxicity(text)
-    ):
-        print(f'Detected toxic content! "{text}"')
-        raise gr.Error('Your text failed the toxicity test')
-    if not text:
-        raise gr.Error(f'You did not enter any text')
-    # Check language
-    try:
-        if not detect(text) == "en":
-            gr.Warning('Warning: The input text may not be in English')
-    except:
-        pass
-    # Get two random models
-    log_text(text)
-    print("[debug] Using", mdl1, mdl2)
-    def predict_and_update_result(text, model, result_storage):
-        try:
-            if model in AVAILABLE_MODELS:
-                result = router.predict(text, AVAILABLE_MODELS[model].lower(), api_name="/synthesize")
-            else:
-                result = router.predict(text, model.lower(), api_name="/synthesize")
-        except:
-            raise gr.Error('Unable to call API, please try again :)')
-        print('Done with', model)
-        # try:
-        #     doresample(result)
-        # except:
-        #     pass
-        try:
-            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
-                audio = AudioSegment.from_file(result)
-                current_sr = audio.frame_rate
-                if current_sr > 24000:
-                    audio = audio.set_frame_rate(24000)
-                try:
-                    print('Trying to normalize audio')
-                    audio = match_target_amplitude(audio, -20)
-                except:
-                    print('[WARN] Unable to normalize audio')
-                audio.export(f.name, format="wav")
-                os.unlink(result)
-                result = f.name
-        except:
-            pass
-        # if model in AVAILABLE_MODELS.keys(): model = AVAILABLE_MODELS[model]
-        print(model)
-        print(f"Running model {model}")
-        result_storage[model] = result
-        # try:
-        #     doloudnorm(result)
-        # except:
-        #     pass
-    mdl1k = mdl1
-    mdl2k = mdl2
-    print(mdl1k, mdl2k)
-    # if mdl1 in AVAILABLE_MODELS.keys(): mdl1k=AVAILABLE_MODELS[mdl1]
-    # if mdl2 in AVAILABLE_MODELS.keys(): mdl2k=AVAILABLE_MODELS[mdl2]
-    results = {}
-    print(f"Sending models {mdl1k} and {mdl2k} to API")
-    thread1 = threading.Thread(target=predict_and_update_result, args=(text, mdl1k, results))
-    thread2 = threading.Thread(target=predict_and_update_result, args=(text, mdl2k, results))
-
-    thread1.start()
-    thread2.start()
-    thread1.join()
-    thread2.join()
-
-    print(f"Retrieving models {mdl1k} and {mdl2k} from API")
-    return (
-        text,
-        "Synthesize 🐢",
-        gr.update(visible=True), # r2
-        mdl1, # model1
-        mdl2, # model2
-        gr.update(visible=True, value=results[mdl1k], autoplay=autoplay), # aud1
-        gr.update(visible=True, value=results[mdl2k], autoplay=False), # aud2
-        gr.update(visible=True, interactive=False), #abetter
-        gr.update(visible=True, interactive=False), #bbetter
-        gr.update(visible=False), #prevmodel1
-        gr.update(visible=False), #prevmodel2
-        gr.update(visible=False), #nxt round btn
-    )
-
 def randomsent():
     return '⚡', random.choice(sents), '🎲'
-def randomsent_battle():
-    return tuple(randomsent()) + tuple(random_m())
 def clear_stuff():
     return [
         gr.update(visible=True, value="", elem_classes=[]),
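Both the surviving multithreaded branch and the removed battle-mode code rely on the same pattern: two threads run `predict_and_update_result` concurrently and write into a shared `results` dict under their own model key, which stays collision-free because each thread touches a distinct key. A standalone sketch of the pattern with a stand-in worker (not the app's function):

import threading

def worker(text, model, result_storage):
    # Stand-in for predict_and_update_result: each thread writes only
    # under its own model key, so the two writers never collide.
    result_storage[model] = f"{model}: {text}"

results = {}
t1 = threading.Thread(target=worker, args=("hello", "model-a", results))
t2 = threading.Thread(target=worker, args=("hello", "model-b", results))
t1.start(); t2.start()
t1.join(); t2.join()
print(results)  # both entries present once both threads have joined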