Llasa-1b-multilingual

Runtime error

App Files Files Community

Steveeeeeeen HF Staff committed on Feb 10

Commit

15994b1

verified ·

1 Parent(s): 4f7f0f0

Update app.py

Browse files

Files changed (1) hide show

app.py +83 -4

app.py CHANGED Viewed

@@ -186,6 +186,61 @@ def infer(sample_audio_path, target_text, progress=gr.Progress()):
         return (16000, gen_wav[0, 0, :].cpu().numpy())
 with gr.Blocks() as app_tts:
     gr.Markdown("# Zero Shot Voice Clone TTS")
@@ -229,17 +284,41 @@ with gr.Blocks() as app_credits:
 * [mrfakename](https://huggingface.co/mrfakename) for the [gradio demo code](https://huggingface.co/spaces/mrfakename/E2-F5-TTS)
 """)
 with gr.Blocks() as app:
     gr.Markdown(
         """
 # Llasa 1b Multilingual TTS
-This is a local web UI for Llasa 1b multilingual Zero Shot Voice Cloning and TTS model that supports English, Chinese, French, German, Dutch, Spanish, Italian, Portuguese, Polish, Japanese and Korean!
 If you're having issues, try converting your reference audio to WAV or MP3, clipping it to 15s, and shortening your prompt.
 """
     )
-    gr.TabbedInterface([app_tts], ["TTS"])
-app.launch(ssr_mode=False, share=True)

         return (16000, gen_wav[0, 0, :].cpu().numpy())
+def text_only_infer(target_text, progress=gr.Progress()):
+    """Function to generate speech directly from text without a reference voice"""
+    if len(target_text) == 0:
+        return None
+    elif len(target_text) > 300:
+        gr.Warning("Text is too long. Please keep it under 300 characters.")
+        target_text = target_text[:300]
+    progress(0.2, 'Generating speech...')
+    with torch.no_grad():
+        formatted_text = f"<|TEXT_UNDERSTANDING_START|>{target_text}<|TEXT_UNDERSTANDING_END|>"
+        # Tokenize the text
+        chat = [
+            {"role": "user", "content": "Convert the text to speech:" + formatted_text},
+            {"role": "assistant", "content": "<|SPEECH_GENERATION_START|>"}
+        ]
+        input_ids = tokenizer.apply_chat_template(
+            chat,
+            tokenize=True,
+            return_tensors='pt',
+            continue_final_message=True
+        )
+        input_ids = input_ids.to('cuda')
+        speech_end_id = tokenizer.convert_tokens_to_ids('<|SPEECH_GENERATION_END|>')
+        # Generate the speech autoregressively
+        outputs = model.generate(
+            input_ids,
+            max_length=2048,
+            eos_token_id=speech_end_id,
+            do_sample=True,
+            top_p=1,
+            temperature=0.8
+        )
+        progress(0.6, 'Processing audio...')
+        # Extract the speech tokens
+        generated_ids = outputs[0][input_ids.shape[1]:-1]
+        speech_tokens = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
+        # Convert token <|s_23456|> to int 23456
+        speech_tokens = extract_speech_ids(speech_tokens)
+        speech_tokens = torch.tensor(speech_tokens).cuda().unsqueeze(0).unsqueeze(0)
+        # Decode the speech tokens to speech waveform
+        gen_wav = Codec_model.decode_code(speech_tokens)
+        progress(1, 'Done!')
+        return (16000, gen_wav[0, 0, :].cpu().numpy())
 with gr.Blocks() as app_tts:
     gr.Markdown("# Zero Shot Voice Clone TTS")
 * [mrfakename](https://huggingface.co/mrfakename) for the [gradio demo code](https://huggingface.co/spaces/mrfakename/E2-F5-TTS)
 """)
+with gr.Blocks() as app_direct_tts:
+    gr.Markdown("# Direct Text-to-Speech")
+    gr.Markdown("Generate speech directly from text without voice cloning")
+    text_input = gr.Textbox(
+        label="Text to Generate",
+        lines=10,
+        placeholder="Enter the text you want to convert to speech..."
+    )
+    generate_btn = gr.Button("Generate Speech", variant="primary")
+    audio_output = gr.Audio(label="Generated Audio")
+    generate_btn.click(
+        text_only_infer,
+        inputs=[text_input],
+        outputs=[audio_output],
+    )
 with gr.Blocks() as app:  # Top-level UI: intro text followed by the tabbed sub-apps
     gr.Markdown(
         """
 # Llasa 1b Multilingual TTS
+This is a local web UI for Llasa 1b multilingual TTS that supports:
+- Zero Shot Voice Cloning
+- Direct Text-to-Speech
+Supports multiple languages including English, Chinese, French, German, Dutch, Spanish, Italian, Portuguese, Polish, Japanese and Korean!
 If you're having issues, try converting your reference audio to WAV or MP3, clipping it to 15s, and shortening your prompt.
 """
     )
+    gr.TabbedInterface(  # One tab per sub-app
+        [app_direct_tts, app_tts],  # Sub-apps, in tab order
+        ["Direct TTS", "Voice Cloning"]  # Tab titles, listed in the same order as the sub-apps
+    )
+app.launch(ssr_mode=False)  # Start the Gradio app with ssr_mode turned off