multi_parler_tts

Running on Zero

App Files Files Community

PHBJT committed on Sep 15, 2024

Commit

85185da

verified ·

1 Parent(s): 364343c

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -33

app.py CHANGED Viewed

@@ -12,7 +12,7 @@ from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
-repo_id =  "PHBJT/parler_french_tts_mini_v0.1"
 model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
 tokenizer = AutoTokenizer.from_pretrained(repo_id)
@@ -22,36 +22,30 @@ feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
 SAMPLE_RATE = feature_extractor.sampling_rate
 SEED = 42
-default_text = "All of the data, pre-processing, training code, and weights are released publicly under a permissive license, enabling the community to build on our work and develop their own powerful models."
-default_description = "Laura's voice is monotone yet slightly fast in delivery, with a very close recording that almost has no background noise."
 examples = [
     [
-        "This version introduces speaker consistency across generations, characterized by their name. For example, Jon, Lea, Gary, Jenna, Mike and Laura.",
-        "Gary's voice is monotone yet slightly fast in delivery, with a very close recording that has no background noise.",
         None,
     ],
     [
-        '''There's 34 speakers. To take advantage of this, simply adapt your text description to specify which speaker to use: "Mike speaks animatedly...".''',
-        "Gary speaks slightly animatedly and slightly slowly in delivery, with a very close recording that has no background noise.",
-        None
-    ],
-    [
-        "'This is the best time of my life, Bartley,' she said happily.",
-        "A female speaker delivers a slightly expressive and animated speech with a moderate speed. The recording features a low-pitch voice and slight background noise, creating a close-sounding audio experience.",
         None,
     ],
     [
-        "Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
-        "A man voice speaks slightly slowly with very noisy background, carrying a low-pitch tone and displaying a touch of expressiveness and animation. The sound is very distant, adding an air of intrigue.",
-        None
     ],
     [
-        "Once upon a time, in the depth of winter, when the flakes of snow fell like feathers from the clouds, a queen sat sewing at her pal-ace window, which had a carved frame of black wood.",
-        "In a very poor recording quality, a female speaker delivers her slightly expressive and animated words with a fast pace. There's high level of background noise and a very distant-sounding reverberation. Her voice is slightly higher pitched than average.",
         None,
     ],
 ]
 number_normalizer = EnglishNumberNormalizer()
 def preprocess(text):
@@ -133,7 +127,7 @@ with gr.Blocks(css=css) as block:
                 "
               >
                 <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
-                  Parler-TTS 🗣️
                 </h1>
               </div>
             </div>
@@ -141,15 +135,14 @@ with gr.Blocks(css=css) as block:
     )
     gr.HTML(
         f"""
-        <p><a href="https://github.com/huggingface/parler-tts"> Parler-TTS</a> is a training and inference library for
-        high-fidelity text-to-speech (TTS) models.</p>
-        <p>The models demonstrated here, Parler-TTS <a href="https://huggingface.co/parler-tts/parler-tts-mini-v1">Mini v1</a> and <a href="https://huggingface.co/parler-tts/parler-tts-large-v1">Large v1</a>,
-        are trained using 45k hours of narrated English audiobooks. It generates high-quality speech
-        with features that can be controlled using a simple text prompt (e.g. gender, background noise, speaking rate, pitch and reverberation).</p>
-        <p>By default, Parler-TTS generates 🎲 random voice. To ensure 🎯 <b> speaker consistency </b> across generations, these checkpoints were also trained on 34 speakers, characterized by name (e.g. Jon, Lea, Gary, Jenna, Mike, Laura).</p>
-        <p>To take advantage of this, simply adapt your text description to specify which speaker to use: `Jon's voice is monotone...`</p>
         """
     )
     with gr.Row():
@@ -173,13 +166,9 @@ with gr.Blocks(css=css) as block:
             <li>The remaining speech features (gender, speaking rate, pitch and reverberation) can be controlled directly through the prompt</li>
         </ul>
         </p>
-        <p>Parler-TTS can be much faster. We give some tips on how to generate much more quickly in this <a href="https://github.com/huggingface/parler-tts/blob/main/INFERENCE.md"> inference guide</a>. Think SDPA, torch.compile, batching and streaming!</p>
-        <p>If you want to find out more about how this model was trained and even fine-tune it yourself, check-out the
-        <a href="https://github.com/huggingface/parler-tts"> Parler-TTS</a> repository on GitHub.</p>
-        <p>The Parler-TTS codebase and its associated checkpoints are licensed under <a href='https://github.com/huggingface/parler-tts?tab=Apache-2.0-1-ov-file#readme'> Apache 2.0</a>.</p>
         """
     )

 device = "cuda:0" if torch.cuda.is_available() else "cpu"
+repo_id =  "PHBJT/french_parler_tts_mini_v0.1"
 model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
 tokenizer = AutoTokenizer.from_pretrained(repo_id)
 SAMPLE_RATE = feature_extractor.sampling_rate
 SEED = 42
+default_text = "La voix humaine est un instrument de musique au-dessus de tous les autres."
+default_description = "The voice speaks slowly with a very noisy background, carrying a low-pitch tone and displaying a touch of expressiveness and animation. The sound is very distant, adding an air of intrigue."
 examples = [
     [
+        "La voix humaine est un instrument de musique au-dessus de tous les autres.",
+        "The voice speaks slowly with a very noisy background, carrying a low-pitch tone and displaying a touch of expressiveness and animation. The sound is very distant, adding an air of intrigue.",
         None,
     ],
     [
+        "Tout ce qu'un homme est capable d'imaginer, d'autres hommes seront capables de le réaliser.",
+        "A slightly expressive and animated speech with a moderate speed. The recording features a low-pitch voice and slight background noise, creating a close-sounding audio experience.",
         None,
     ],
     [
+        "La machine elle-même, si perfectionnée qu'on la suppose, n'est qu'un outil.",
+        "A monotone yet slightly fast delivery, with a very close recording that almost has no background noise.",
+        None,
     ],
     [
+        "Le progrès fait naître plus de besoins qu'il n'en satisfait.",
+        "In a very poor recording quality, the voice delivers slightly expressive and animated words with a fast pace. There's a high level of background noise and a very distant-sounding reverberation. The voice is slightly higher pitched than average.",
         None,
     ],
 ]
 number_normalizer = EnglishNumberNormalizer()
 def preprocess(text):
                 "
               >
                 <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
+                  French Parler-TTS 🗣️
                 </h1>
               </div>
             </div>
     )
     gr.HTML(
         f"""
+       <p><a href="https://github.com/huggingface/parler-tts">Parler-TTS</a> is a training and inference library for
+high-fidelity text-to-speech (TTS) models.</p>
+<p>The model demonstrated here, French Parler-TTS <a href="https://huggingface.co/PHBJT/french_parler_tts_mini_v0.1">Mini v0.1 French</a>,
+has been fine-tuned on a French dataset. It generates high-quality male speech
+with features that can be controlled using a simple text prompt (e.g. background noise, speaking rate, pitch and reverberation). Please note that this model currently supports only male voices (due to limitations of the dataset).</p>
+<p>By default, Parler-TTS generates 🎲 random male voice characteristics. To ensure 🎯 <b>speaker consistency</b> across generations, try to use consistent descriptions in your prompts.</p>
         """
     )
     with gr.Row():
             <li>The remaining speech features (gender, speaking rate, pitch and reverberation) can be controlled directly through the prompt</li>
         </ul>
         </p>
+        <p>If you want to find out more about how this model was trained and even fine-tune Parler-TTS in any language, check out <a href="">this</a> post.</p>
         """
     )