	Update app.py
app.py CHANGED
@@ -4,16 +4,14 @@ import torch
 import librosa
 from pathlib import Path
 import tempfile, torchaudio
-# from faster_whisper import WhisperModel
 from transformers import pipeline
 from uuid import uuid4
 
 # Load the MARS5 model
 mars5, config_class = torch.hub.load('Camb-ai/mars5-tts', 'mars5_english', trust_repo=True)
-# asr_model = WhisperModel("small", device="cpu", compute_type="int8")
 asr_model = pipeline(
     "automatic-speech-recognition",
-    model="openai/whisper-
+    model="openai/whisper-tiny",
     chunk_length_s=30,
     device=torch.device("cuda:0"),
 )
@@ -24,15 +22,16 @@ def transcribe_file(f: str) -> str:
     return " ".join([prediction["text"] for prediction in predictions])
 
 # Function to process the text and audio input and generate the synthesized output
-def synthesize(text, audio_file, transcript):
-    audio_file = Path(audio_file)
-    temp_file = f"{uuid4()}.{audio_file.suffix}"
+def synthesize(text, audio_file, transcript, kwargs_dict):
+    print(f">>>>>>> Kwargs dict: {kwargs_dict}")
+    # audio_file = Path(audio_file)
+    # temp_file = f"{uuid4()}.{audio_file.suffix}"
 
-    # copying the audio_file
-    with open(audio_file, 'rb') as src, open(temp_file, 'wb') as dst:
-        dst.write(src.read())
+    # # copying the audio_file
+    # with open(audio_file, 'rb') as src, open(temp_file, 'wb') as dst:
+    #     dst.write(src.read())
 
-    audio_file = temp_file
+    # audio_file = temp_file
 
     print(f">>>>> synthesizing! audio_file: {audio_file}")
     if not transcript:
@@ -43,11 +42,10 @@ def synthesize(text, audio_file, transcript):
     wav = torch.from_numpy(wav)
 
     # Define the configuration for the TTS model
-
-    cfg = config_class(deep_clone=deep_clone, rep_penalty_window=100, top_k=100, temperature=0.7, freq_penalty=3)
+    cfg = config_class(**kwargs_dict)
 
     # Generate the synthesized audio
-    ar_codes, wav_out = mars5.tts(text, wav, transcript, cfg=cfg)
+    ar_codes, wav_out = mars5.tts(text, wav, transcript.strip(), cfg=cfg)
 
     # Save the synthesized audio to a temporary file
     output_path = Path(tempfile.mktemp(suffix=".wav"))
@@ -73,7 +71,7 @@ with gr.Blocks() as demo:
     text = gr.Textbox(label="Text to synthesize")
     audio_file = gr.Audio(label="Audio file to clone from", type="filepath")
 
-    generate_btn = gr.Button(
+    generate_btn = gr.Button("Generate Synthesized Audio")
 
     with gr.Accordion("Advanced Settings", open=False):
         gr.Markdown("additional inference settings\nWARNING: changing these incorrectly may degrade quality.")
@@ -86,18 +84,77 @@ with gr.Blocks() as demo:
         presence_penalty = gr.Slider(minimum=0, maximum=5, step=0.05, label="presence_penalty", value=defaults['presence_penalty'])
         rep_penalty_window = gr.Slider(minimum=1, maximum=500, step=1, label="rep_penalty_window", value=defaults['rep_penalty_window'])
         nar_guidance_w = gr.Slider(minimum=1, maximum=8, step=0.1, label="nar_guidance_w", value=defaults['nar_guidance_w'])
-        meta_n = gr.Slider(minimum=1, maximum=10, step=1, label="meta_n", value=2, interactive=False)
         deep_clone = gr.Checkbox(value=defaults['deep_clone'], label='deep_clone')
-
-        dummy = gr.Number(label='Example number', visible=False)
-
+
     output = gr.Audio(label="Synthesized Audio", type="filepath")
-    def on_click(
+    def on_click(
+        text,
+        audio_file,
+        prompt_text,
+        temperature,
+        top_k,
+        top_p,
+        typical_p,
+        freq_penalty,
+        presence_penalty,
+        rep_penalty_window,
+        nar_guidance_w,
+        deep_clone
+    ):
         print(f">>>> transcript: {prompt_text}; audio_file = {audio_file}")
-        of = synthesize(
+        of = synthesize(
+            text,
+            audio_file,
+            prompt_text,
+            {
+                'temperature': temperature,
+                'top_k': top_k,
+                'top_p': top_p,
+                'typical_p': typical_p,
+                'freq_penalty': freq_penalty,
+                'presence_penalty': presence_penalty,
+                'rep_penalty_window': rep_penalty_window,
+                'nar_guidance_w': nar_guidance_w,
+                'deep_clone': deep_clone
+            }
+        )
         print(f">>>> output file: {of}")
         return of
 
-    generate_btn.click(
+    generate_btn.click(
+        on_click,
+        inputs=[
+            text,
+            audio_file,
+            prompt_text,
+            temperature,
+            top_k,
+            top_p,
+            typical_p,
+            freq_penalty,
+            presence_penalty,
+            rep_penalty_window,
+            nar_guidance_w,
+            deep_clone
+        ],
+        outputs=[output]
+    )
+
+    gr.Markdown("### Examples")
+
+    # Add examples
+    defaults = [0.8, -1, 0.2, 1.0, 2.6, 0.4, 100, 3, True]
+    examples = [
+        ["Today is a wonderful day!", "female_speaker_1.flac", "People look, but no one ever finds it.", *defaults],
+        ["You guys need to figure this out.", "male_speaker_1.flac", "Ask her to bring these things with her from the store.", *defaults]
+    ]
+
+    gr.Examples(
+        examples=examples,
+        inputs=[text, audio_file, prompt_text, temperature, top_k, top_p, typical_p, freq_penalty, presence_penalty, rep_penalty_window, nar_guidance_w, deep_clone],
+        outputs=[output],
+        cache_examples=False,
+        fn=on_click
+    )
 
 demo.launch(share=False)
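The main refactor in this commit is the kwargs-dict pattern: on_click gathers every advanced-settings control into one dict, and synthesize splats it straight into the config constructor with cfg = config_class(**kwargs_dict), so adding a new slider only means adding one key in two places. Below is a minimal sketch of that pattern; InferenceConfig is a hypothetical stand-in for the real MARS5 config_class (whose actual fields live in the Camb-ai/mars5-tts hub repo), with defaults mirroring the example row added at the bottom of the diff.

from dataclasses import dataclass

# Hypothetical stand-in for the config_class returned by torch.hub.load();
# field names mirror the sliders this commit wires through, and the defaults
# mirror the example row ([0.8, -1, 0.2, ...]) added in this diff.
@dataclass
class InferenceConfig:
    temperature: float = 0.8
    top_k: int = -1
    top_p: float = 0.2
    typical_p: float = 1.0
    freq_penalty: float = 2.6
    presence_penalty: float = 0.4
    rep_penalty_window: int = 100
    nar_guidance_w: float = 3.0
    deep_clone: bool = True

# The same splat synthesize() now performs: an unknown key fails fast with a
# TypeError, so a renamed slider can't silently stop affecting inference.
kwargs_dict = {'temperature': 0.7, 'top_k': 100, 'deep_clone': False}
cfg = InferenceConfig(**kwargs_dict)
print(cfg)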

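On the ASR side, the commit deletes the dormant faster_whisper path and commits to a transformers pipeline on openai/whisper-tiny. The diff shows only the return line of transcribe_file, so the sketch below assumes the pipeline is invoked on a list of file paths (which yields one prediction dict per input); it also drops the hard-coded cuda:0 device so it runs on CPU.

from transformers import pipeline

# Same construction as the committed code, minus device=torch.device("cuda:0")
# so the sketch also works without a GPU.
asr_model = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-tiny",
    chunk_length_s=30,
)

def transcribe_file(f: str) -> str:
    # Assumption: calling the pipeline on a list returns a list of
    # {"text": ...} dicts, which the committed return line then joins.
    predictions = asr_model([f])
    return " ".join([prediction["text"] for prediction in predictions])

# e.g. transcribe_file("female_speaker_1.flac"), one of the example clips
# referenced by the commit.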