Spaces:
Running
on
Zero
Running
on
Zero
Update the sound effect normalization and ras (#2)
Browse files
- Update the sound effect normalization (d842d2435a36da291401d91da69a72a04db6abba)
- Auto update ras (4eba77350bb174551f472e599ca12151bf313064)
Co-authored-by: Zach Zheng <zachzzc@users.noreply.huggingface.co>
app.py
CHANGED
|
@@ -89,7 +89,7 @@ PREDEFINED_EXAMPLES = {
|
|
| 89 |
},
|
| 90 |
"single-speaker-bgm": {
|
| 91 |
"system_prompt": DEFAULT_SYSTEM_PROMPT,
|
| 92 |
-
"input_text": "
|
| 93 |
"description": "Single speaker with BGM using music tag. This is an experimental feature and you may need to try multiple times to get the best result.",
|
| 94 |
},
|
| 95 |
}
|
|
@@ -184,6 +184,22 @@ def normalize_text(transcript: str):
|
|
| 184 |
transcript = transcript.replace(")", " ")
|
| 185 |
transcript = transcript.replace("°F", " degrees Fahrenheit")
|
| 186 |
transcript = transcript.replace("°C", " degrees Celsius")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
lines = transcript.split("\n")
|
| 188 |
transcript = "\n".join([" ".join(line.split()) for line in lines if line.strip()])
|
| 189 |
transcript = transcript.strip()
|
|
@@ -530,6 +546,8 @@ def create_ui():
|
|
| 530 |
# Enable voice preset and custom reference only for voice-clone template
|
| 531 |
is_voice_clone = template_name == "voice-clone"
|
| 532 |
voice_preset_value = "belinda" if is_voice_clone else "EMPTY"
|
|
|
|
|
|
|
| 533 |
description_text = f'<p style="font-size: 0.85em; color: var(--body-text-color-subdued); margin: 0; padding: 0;"> {template["description"]}</p>'
|
| 534 |
return (
|
| 535 |
template["system_prompt"], # system_prompt
|
|
@@ -540,6 +558,7 @@ def create_ui():
|
|
| 540 |
), # voice_preset (value and interactivity)
|
| 541 |
gr.update(visible=is_voice_clone), # custom reference accordion visibility
|
| 542 |
gr.update(visible=is_voice_clone), # voice samples section visibility
|
|
|
|
| 543 |
)
|
| 544 |
else:
|
| 545 |
return (
|
|
@@ -549,6 +568,7 @@ def create_ui():
|
|
| 549 |
gr.update(),
|
| 550 |
gr.update(),
|
| 551 |
gr.update(),
|
|
|
|
| 552 |
) # No change if template not found
|
| 553 |
|
| 554 |
# Set up event handlers
|
|
@@ -564,6 +584,7 @@ def create_ui():
|
|
| 564 |
voice_preset,
|
| 565 |
custom_reference_accordion,
|
| 566 |
voice_samples_section,
|
|
|
|
| 567 |
],
|
| 568 |
)
|
| 569 |
|
|
|
|
| 89 |
},
|
| 90 |
"single-speaker-bgm": {
|
| 91 |
"system_prompt": DEFAULT_SYSTEM_PROMPT,
|
| 92 |
+
"input_text": "[music start] I will remember this, thought Ender, when I am defeated. To keep dignity, and give honor where it's due, so that defeat is not disgrace. And I hope I don't have to do it often. [music end]",
|
| 93 |
"description": "Single speaker with BGM using music tag. This is an experimental feature and you may need to try multiple times to get the best result.",
|
| 94 |
},
|
| 95 |
}
|
|
|
|
| 184 |
transcript = transcript.replace(")", " ")
|
| 185 |
transcript = transcript.replace("°F", " degrees Fahrenheit")
|
| 186 |
transcript = transcript.replace("°C", " degrees Celsius")
|
| 187 |
+
|
| 188 |
+
for tag, replacement in [
|
| 189 |
+
("[laugh]", "<SE>[Laughter]</SE>"),
|
| 190 |
+
("[humming start]", "<SE>[Humming]</SE>"),
|
| 191 |
+
("[humming end]", "<SE_e>[Humming]</SE_e>"),
|
| 192 |
+
("[music start]", "<SE_s>[Music]</SE_s>"),
|
| 193 |
+
("[music end]", "<SE_e>[Music]</SE_e>"),
|
| 194 |
+
("[music]", "<SE>[Music]</SE>"),
|
| 195 |
+
("[sing start]", "<SE_s>[Singing]</SE_s>"),
|
| 196 |
+
("[sing end]", "<SE_e>[Singing]</SE_e>"),
|
| 197 |
+
("[applause]", "<SE>[Applause]</SE>"),
|
| 198 |
+
("[cheering]", "<SE>[Cheering]</SE>"),
|
| 199 |
+
("[cough]", "<SE>[Cough]</SE>"),
|
| 200 |
+
]:
|
| 201 |
+
transcript = transcript.replace(tag, replacement)
|
| 202 |
+
|
| 203 |
lines = transcript.split("\n")
|
| 204 |
transcript = "\n".join([" ".join(line.split()) for line in lines if line.strip()])
|
| 205 |
transcript = transcript.strip()
|
|
|
|
| 546 |
# Enable voice preset and custom reference only for voice-clone template
|
| 547 |
is_voice_clone = template_name == "voice-clone"
|
| 548 |
voice_preset_value = "belinda" if is_voice_clone else "EMPTY"
|
| 549 |
+
# Set ras_win_len to 0 for single-speaker-bgm, 7 for others
|
| 550 |
+
ras_win_len_value = 0 if template_name == "single-speaker-bgm" else 7
|
| 551 |
description_text = f'<p style="font-size: 0.85em; color: var(--body-text-color-subdued); margin: 0; padding: 0;"> {template["description"]}</p>'
|
| 552 |
return (
|
| 553 |
template["system_prompt"], # system_prompt
|
|
|
|
| 558 |
), # voice_preset (value and interactivity)
|
| 559 |
gr.update(visible=is_voice_clone), # custom reference accordion visibility
|
| 560 |
gr.update(visible=is_voice_clone), # voice samples section visibility
|
| 561 |
+
ras_win_len_value, # ras_win_len
|
| 562 |
)
|
| 563 |
else:
|
| 564 |
return (
|
|
|
|
| 568 |
gr.update(),
|
| 569 |
gr.update(),
|
| 570 |
gr.update(),
|
| 571 |
+
gr.update(),
|
| 572 |
) # No change if template not found
|
| 573 |
|
| 574 |
# Set up event handlers
|
|
|
|
| 584 |
voice_preset,
|
| 585 |
custom_reference_accordion,
|
| 586 |
voice_samples_section,
|
| 587 |
+
ras_win_len,
|
| 588 |
],
|
| 589 |
)
|
| 590 |
|