Spaces:
				
			
			
	
			
			
		Running
		
			on 
			
			Zero
	
	
	
			
			
	
	
	
	
		
		
		Running
		
			on 
			
			Zero
	Update app.py
Browse files
    	
        app.py
    CHANGED
    
    | 
         @@ -12,7 +12,7 @@ from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed 
     | 
|
| 12 | 
         
             
            device = "cuda:0" if torch.cuda.is_available() else "cpu"
         
     | 
| 13 | 
         | 
| 14 | 
         | 
| 15 | 
         
            -
            repo_id =  "PHBJT/ 
     | 
| 16 | 
         | 
| 17 | 
         
             
            model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
         
     | 
| 18 | 
         
             
            tokenizer = AutoTokenizer.from_pretrained(repo_id)
         
     | 
| 
         @@ -22,36 +22,30 @@ feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id) 
     | 
|
| 22 | 
         
             
            SAMPLE_RATE = feature_extractor.sampling_rate
         
     | 
| 23 | 
         
             
            SEED = 42
         
     | 
| 24 | 
         | 
| 25 | 
         
            -
            default_text = " 
     | 
| 26 | 
         
            -
            default_description = " 
     | 
| 27 | 
         
             
            examples = [
         
     | 
| 28 | 
         
             
                [
         
     | 
| 29 | 
         
            -
                    " 
     | 
| 30 | 
         
            -
                    " 
     | 
| 31 | 
         
             
                    None,
         
     | 
| 32 | 
         
             
                ],
         
     | 
| 33 | 
         
             
                [
         
     | 
| 34 | 
         
            -
                     
     | 
| 35 | 
         
            -
                    " 
     | 
| 36 | 
         
            -
                    None
         
     | 
| 37 | 
         
            -
                ],
         
     | 
| 38 | 
         
            -
                [
         
     | 
| 39 | 
         
            -
                    "'This is the best time of my life, Bartley,' she said happily.",
         
     | 
| 40 | 
         
            -
                    "A female speaker delivers a slightly expressive and animated speech with a moderate speed. The recording features a low-pitch voice and slight background noise, creating a close-sounding audio experience.",
         
     | 
| 41 | 
         
             
                    None,
         
     | 
| 42 | 
         
             
                ],
         
     | 
| 43 | 
         
             
                [
         
     | 
| 44 | 
         
            -
                    " 
     | 
| 45 | 
         
            -
                    "A  
     | 
| 46 | 
         
            -
                    None
         
     | 
| 47 | 
         
             
                ],
         
     | 
| 48 | 
         
             
                [
         
     | 
| 49 | 
         
            -
                    " 
     | 
| 50 | 
         
            -
                    "In a very poor recording quality,  
     | 
| 51 | 
         
             
                    None,
         
     | 
| 52 | 
         
             
                ],
         
     | 
| 53 | 
         
             
            ]
         
     | 
| 54 | 
         
            -
             
     | 
| 55 | 
         
             
            number_normalizer = EnglishNumberNormalizer()
         
     | 
| 56 | 
         | 
| 57 | 
         
             
            def preprocess(text):
         
     | 
| 
         @@ -133,7 +127,7 @@ with gr.Blocks(css=css) as block: 
     | 
|
| 133 | 
         
             
                            "
         
     | 
| 134 | 
         
             
                          >
         
     | 
| 135 | 
         
             
                            <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
         
     | 
| 136 | 
         
            -
                              Parler-TTS 🗣️
         
     | 
| 137 | 
         
             
                            </h1>
         
     | 
| 138 | 
         
             
                          </div>
         
     | 
| 139 | 
         
             
                        </div>
         
     | 
| 
         @@ -141,15 +135,14 @@ with gr.Blocks(css=css) as block: 
     | 
|
| 141 | 
         
             
                )
         
     | 
| 142 | 
         
             
                gr.HTML(
         
     | 
| 143 | 
         
             
                    f"""
         
     | 
| 144 | 
         
            -
             
     | 
| 145 | 
         
            -
             
     | 
| 146 | 
         
            -
             
     | 
| 147 | 
         
            -
             
     | 
| 148 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 149 | 
         | 
| 150 | 
         
            -
                    <p>By default, Parler-TTS generates 🎲 random voice. To ensure 🎯 <b> speaker consistency </b> across generations, these checkpoints were also trained on 34 speakers, characterized by name (e.g. Jon, Lea, Gary, Jenna, Mike, Laura).</p>
         
     | 
| 151 | 
         
            -
                    
         
     | 
| 152 | 
         
            -
                    <p>To take advantage of this, simply adapt your text description to specify which speaker to use: `Jon's voice is monotone...`</p>
         
     | 
| 153 | 
         
             
                    """
         
     | 
| 154 | 
         
             
                )
         
     | 
| 155 | 
         
             
                with gr.Row():
         
     | 
| 
         @@ -173,13 +166,9 @@ with gr.Blocks(css=css) as block: 
     | 
|
| 173 | 
         
             
                        <li>The remaining speech features (gender, speaking rate, pitch and reverberation) can be controlled directly through the prompt</li>
         
     | 
| 174 | 
         
             
                    </ul>
         
     | 
| 175 | 
         
             
                    </p>
         
     | 
| 176 | 
         
            -
             
     | 
| 177 | 
         
            -
                    <p>Parler-TTS can be much faster. We give some tips on how to generate much more quickly in this <a href="https://github.com/huggingface/parler-tts/blob/main/INFERENCE.md"> inference guide</a>. Think SDPA, torch.compile, batching and streaming!</p>
         
     | 
| 178 | 
         | 
| 179 | 
         
            -
                    <p>If you want to find out more about how this model was trained and even fine 
     | 
| 180 | 
         
            -
             
     | 
| 181 | 
         
            -
                    
         
     | 
| 182 | 
         
            -
                    <p>The Parler-TTS codebase and its associated checkpoints are licensed under <a href='https://github.com/huggingface/parler-tts?tab=Apache-2.0-1-ov-file#readme'> Apache 2.0</a>.</p>
         
     | 
| 183 | 
         
             
                    """
         
     | 
| 184 | 
         
             
                )
         
     | 
| 185 | 
         | 
| 
         | 
|
| 12 | 
         
             
            device = "cuda:0" if torch.cuda.is_available() else "cpu"
         
     | 
| 13 | 
         | 
| 14 | 
         | 
| 15 | 
         
            +
            repo_id =  "PHBJT/french_parler_tts_mini_v0.1"
         
     | 
| 16 | 
         | 
| 17 | 
         
             
            model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
         
     | 
| 18 | 
         
             
            tokenizer = AutoTokenizer.from_pretrained(repo_id)
         
     | 
| 
         | 
|
| 22 | 
         
             
            SAMPLE_RATE = feature_extractor.sampling_rate
         
     | 
| 23 | 
         
             
            SEED = 42
         
     | 
| 24 | 
         | 
| 25 | 
         
            +
            default_text = "La voix humaine est un instrument de musique au-dessus de tous les autres."
         
     | 
| 26 | 
         
            +
            default_description = "The voice speaks slowly with a very noisy background, carrying a low-pitch tone and displaying a touch of expressiveness and animation. The sound is very distant, adding an air of intrigue."
         
     | 
| 27 | 
         
             
            examples = [
         
     | 
| 28 | 
         
             
                [
         
     | 
| 29 | 
         
            +
                    "La voix humaine est un instrument de musique au-dessus de tous les autres.",
         
     | 
| 30 | 
         
            +
                    "The voice speaks slowly with a very noisy background, carrying a low-pitch tone and displaying a touch of expressiveness and animation. The sound is very distant, adding an air of intrigue.",
         
     | 
| 31 | 
         
             
                    None,
         
     | 
| 32 | 
         
             
                ],
         
     | 
| 33 | 
         
             
                [
         
     | 
| 34 | 
         
            +
                    "Tout ce qu'un homme est capable d'imaginer, d'autres hommes seront capables de le réaliser.",
         
     | 
| 35 | 
         
            +
                    "A slightly expressive and animated speech with a moderate speed. The recording features a low-pitch voice and slight background noise, creating a close-sounding audio experience.",
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 36 | 
         
             
                    None,
         
     | 
| 37 | 
         
             
                ],
         
     | 
| 38 | 
         
             
                [
         
     | 
| 39 | 
         
            +
                    "La machine elle-même, si perfectionnée qu'on la suppose, n'est qu'un outil.",
         
     | 
| 40 | 
         
            +
                    "A monotone yet slightly fast delivery, with a very close recording that almost has no background noise.",
         
     | 
| 41 | 
         
            +
                    None,
         
     | 
| 42 | 
         
             
                ],
         
     | 
| 43 | 
         
             
                [
         
     | 
| 44 | 
         
            +
                    "Le progrès fait naître plus de besoins qu'il n'en satisfait.",
         
     | 
| 45 | 
         
            +
                    "In a very poor recording quality, the voice delivers slightly expressive and animated words with a fast pace. There's a high level of background noise and a very distant-sounding reverberation. The voice is slightly higher pitched than average.",
         
     | 
| 46 | 
         
             
                    None,
         
     | 
| 47 | 
         
             
                ],
         
     | 
| 48 | 
         
             
            ]
         
     | 
| 
         | 
|
| 49 | 
         
             
            number_normalizer = EnglishNumberNormalizer()
         
     | 
| 50 | 
         | 
| 51 | 
         
             
            def preprocess(text):
         
     | 
| 
         | 
|
| 127 | 
         
             
                            "
         
     | 
| 128 | 
         
             
                          >
         
     | 
| 129 | 
         
             
                            <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
         
     | 
| 130 | 
         
            +
                              French Parler-TTS 🗣️
         
     | 
| 131 | 
         
             
                            </h1>
         
     | 
| 132 | 
         
             
                          </div>
         
     | 
| 133 | 
         
             
                        </div>
         
     | 
| 
         | 
|
| 135 | 
         
             
                )
         
     | 
| 136 | 
         
             
                gr.HTML(
         
     | 
| 137 | 
         
             
                    f"""
         
     | 
| 138 | 
         
            +
                   <p><a href="https://github.com/huggingface/parler-tts">Parler-TTS</a> is a training and inference library for
         
     | 
| 139 | 
         
            +
            high-fidelity text-to-speech (TTS) models.</p> 
         
     | 
| 140 | 
         
            +
            <p>The model demonstrated here, French Parler-TTS <a href="https://huggingface.co/PHBJT/french_parler_tts_mini_v0.1">Mini v0.1 French</a>, 
         
     | 
| 141 | 
         
            +
            has been fine-tuned on a French dataset. It generates high-quality male speech 
         
     | 
| 142 | 
         
            +
            with features that can be controlled using a simple text prompt (e.g. background noise, speaking rate, pitch and reverberation). Please note that this model currently supports only male voices (due to limitations on the dataset).</p>
         
     | 
| 143 | 
         
            +
             
     | 
| 144 | 
         
            +
            <p>By default, Parler-TTS generates 🎲 random male voice characteristics. To ensure 🎯 <b>speaker consistency</b> across generations, try to use consistent descriptions in your prompts.</p>
         
     | 
| 145 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 146 | 
         
             
                    """
         
     | 
| 147 | 
         
             
                )
         
     | 
| 148 | 
         
             
                with gr.Row():
         
     | 
| 
         | 
|
| 166 | 
         
             
                        <li>The remaining speech features (gender, speaking rate, pitch and reverberation) can be controlled directly through the prompt</li>
         
     | 
| 167 | 
         
             
                    </ul>
         
     | 
| 168 | 
         
             
                    </p>
         
     | 
| 
         | 
|
| 
         | 
|
| 169 | 
         | 
| 170 | 
         
            +
                    <p>If you want to find out more about how this model was trained and even fine tune Parler TTS in any language, check-out <a href=">this</a> post 
         
     | 
| 171 | 
         
            +
             
     | 
| 
         | 
|
| 
         | 
|
| 172 | 
         
             
                    """
         
     | 
| 173 | 
         
             
                )
         
     | 
| 174 | 
         |