Spaces: Running on Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -12,7 +12,7 @@ from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
|
|
| 12 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
| 13 |
|
| 14 |
|
| 15 |
-
repo_id = "
|
| 16 |
|
| 17 |
model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
|
| 18 |
tokenizer = AutoTokenizer.from_pretrained(repo_id)
|
|
@@ -25,26 +25,54 @@ SEED = 42
|
|
| 25 |
default_text = "La voix humaine est un instrument de musique au-dessus de tous les autres."
|
| 26 |
default_description = "A male voice speaks slowly with a very noisy background, displaying a touch of expressiveness and animation. The sound is very distant, adding an air of intrigue."
|
| 27 |
examples = [
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
"
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
]
|
| 49 |
number_normalizer = EnglishNumberNormalizer()
|
| 50 |
|
|
@@ -134,16 +162,13 @@ with gr.Blocks(css=css) as block:
|
|
| 134 |
"""
|
| 135 |
)
|
| 136 |
gr.HTML(
|
| 137 |
-
|
| 138 |
<p><a href="https://github.com/huggingface/parler-tts">Parler-TTS</a> is a training and inference library for
|
| 139 |
high-fidelity text-to-speech (TTS) models.</p>
|
| 140 |
-
<p>
|
| 141 |
-
has been fine-tuned on a French dataset. It generates high-quality speech with features that can be controlled using a simple text prompt (e.g. gender, background noise, speaking rate, pitch and reverberation).
|
| 142 |
-
Due to limitations on the dataset, this model might underperform for female voices (we recommend using male voices only).</p>
|
| 143 |
|
| 144 |
-
<p>By default, Parler-TTS generates 🎲 random
|
| 145 |
-
<p><b>Note:</b> do
|
| 146 |
-
<p><b>Important note:</b> this model does NOT work in english, it will generate incoherent audios. But you can still use the original Parler TTS model for that. </p>
|
| 147 |
"""
|
| 148 |
)
|
| 149 |
with gr.Row():
|
|
|
|
| 12 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
| 13 |
|
| 14 |
|
| 15 |
+
repo_id = "ylacombe/p-m-e"
|
| 16 |
|
| 17 |
model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
|
| 18 |
tokenizer = AutoTokenizer.from_pretrained(repo_id)
|
|
|
|
| 25 |
default_text = "La voix humaine est un instrument de musique au-dessus de tous les autres."
|
| 26 |
default_description = "A male voice speaks slowly with a very noisy background, displaying a touch of expressiveness and animation. The sound is very distant, adding an air of intrigue."
|
| 27 |
examples = [
|
| 28 |
+
# French
|
| 29 |
+
[
|
| 30 |
+
"La voix humaine est un instrument de musique au-dessus de tous les autres.",
|
| 31 |
+
"A male voice speaks slowly with a very noisy background, displaying a touch of expressiveness and animation. The sound is very distant, adding an air of intrigue.",
|
| 32 |
+
None,
|
| 33 |
+
],
|
| 34 |
+
# Spanish
|
| 35 |
+
[
|
| 36 |
+
"La voz es el reflejo del alma en el espejo del tiempo.",
|
| 37 |
+
"A female voice speaks with moderate speed, showing warmth and clarity. The recording is clean with minimal background noise and has natural resonance.",
|
| 38 |
+
None,
|
| 39 |
+
],
|
| 40 |
+
# Italian
|
| 41 |
+
[
|
| 42 |
+
"La voce umana è la più bella musica che esista al mondo.",
|
| 43 |
+
"A male voice delivers the message with passion and depth. The recording has good clarity with slight room acoustics and a medium-distance perspective.",
|
| 44 |
+
None,
|
| 45 |
+
],
|
| 46 |
+
# Portuguese
|
| 47 |
+
[
|
| 48 |
+
"A voz é o espelho da alma e o som do coração.",
|
| 49 |
+
"A young female voice speaks with enthusiasm and energy. The recording is close-miked with crisp audio quality and subtle room ambiance.",
|
| 50 |
+
None,
|
| 51 |
+
],
|
| 52 |
+
# Polish
|
| 53 |
+
[
|
| 54 |
+
"Głos ludzki jest najpiękniejszym instrumentem świata.",
|
| 55 |
+
"An elderly male voice speaks with wisdom and gravitas. The recording has a vintage quality with some characteristic analog warmth.",
|
| 56 |
+
None,
|
| 57 |
+
],
|
| 58 |
+
# German
|
| 59 |
+
[
|
| 60 |
+
"Die menschliche Stimme ist das schönste Instrument der Welt.",
|
| 61 |
+
"A mature female voice speaks with authority and precision. The recording is studio-quality with perfect clarity and no background noise.",
|
| 62 |
+
None,
|
| 63 |
+
],
|
| 64 |
+
# Dutch
|
| 65 |
+
[
|
| 66 |
+
"De menselijke stem is het mooiste instrument dat er bestaat.",
|
| 67 |
+
"A middle-aged male voice speaks with gentle inflection and warmth. The recording has natural room acoustics and balanced frequency response.",
|
| 68 |
+
None,
|
| 69 |
+
],
|
| 70 |
+
# English
|
| 71 |
+
[
|
| 72 |
+
"The human voice is nature's most perfect instrument.",
|
| 73 |
+
"A young male voice speaks with dynamic expression and energy. The recording is professional quality with subtle environmental ambiance.",
|
| 74 |
+
None,
|
| 75 |
+
],
|
| 76 |
]
|
| 77 |
number_normalizer = EnglishNumberNormalizer()
|
| 78 |
|
|
|
|
| 162 |
"""
|
| 163 |
)
|
| 164 |
gr.HTML(
|
| 165 |
+
f"""
|
| 166 |
<p><a href="https://github.com/huggingface/parler-tts">Parler-TTS</a> is a training and inference library for
|
| 167 |
high-fidelity text-to-speech (TTS) models.</p>
|
| 168 |
+
<p>This multilingual model supports French, Spanish, Italian, Portuguese, Polish, German, Dutch, and English. It generates high-quality speech with features that can be controlled using a simple text prompt (e.g. gender, background noise, speaking rate, pitch and reverberation). </p>
|
|
|
|
|
|
|
| 169 |
|
| 170 |
+
<p>By default, Parler-TTS generates 🎲 random voice characteristics. To ensure 🎯 <b>speaker consistency</b> across generations, try to use consistent descriptions in your prompts.</p>
|
| 171 |
+
<p><b>Note:</b> you do not need to specify the nationality of the speaker in the description (do: "a male speaker", don't: "a french male speaker") </p>
|
|
|
|
| 172 |
"""
|
| 173 |
)
|
| 174 |
with gr.Row():
|