StyleTTS2-Public-API-2

Running

App Files Files Community

Dupaja committed on Jan 6, 2024

Commit

c6292d7

1 Parent(s): 3f96c9f

Add ljspeech endpoint

Browse files

Files changed (1) hide show

app.py +43 -12

app.py CHANGED Viewed

@@ -1,17 +1,10 @@
 import gradio as gr
-import styletts2importable
 import numpy as np
 import re
-theme = gr.themes.Base(
-    font=[gr.themes.GoogleFont('Libre Franklin'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif'],
-)
-voicelist = ['f-us-1', 'f-us-2', 'f-us-3', 'f-us-4', 'm-us-1', 'm-us-2', 'm-us-3', 'm-us-4']
-voices = {}
-import phonemizer
-global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True,  with_stress=True)
-for v in voicelist:
-    voices[v] = styletts2importable.compute_style(f'voices/{v}.wav')
 def split_and_recombine_text(text, desired_length=200, max_length=400):
     """Split text into chunks of a desired length, trying to keep sentences intact."""
     # normalize text, remove redundant whitespace and convert non-ascii quotes to ascii
@@ -83,6 +76,10 @@ def split_and_recombine_text(text, desired_length=200, max_length=400):
     return rv
 def synthesize(text, voice, lngsteps, password, progress=gr.Progress()):
     if text.strip() == "":
         raise gr.Error("You must enter some text")
@@ -95,15 +92,49 @@ def synthesize(text, voice, lngsteps, password, progress=gr.Progress()):
     return (24000, np.concatenate(audios))
-with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column(scale=1):
             inp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
             voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value='m-us-2', interactive=True)
             multispeakersteps = gr.Slider(minimum=3, maximum=15, value=3, step=1, label="Diffusion Steps", info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster", interactive=True)
         with gr.Column(scale=1):
             btn = gr.Button("Synthesize", variant="primary")
             audio = gr.Audio(interactive=False, label="Synthesized Audio")
             btn.click(synthesize, inputs=[inp, voice, multispeakersteps], outputs=[audio], concurrency_limit=4)
-demo.queue(api_open=True, max_size=15).launch(show_api=True)

 import gradio as gr
+import ljspeechimportable
+import torch
 import numpy as np
+import styletts2importable
 import re
 def split_and_recombine_text(text, desired_length=200, max_length=400):
     """Split text into chunks of a desired length, trying to keep sentences intact."""
     # normalize text, remove redundant whitespace and convert non-ascii quotes to ascii
     return rv
+theme = gr.themes.Base(
+    font=[gr.themes.GoogleFont('Libre Franklin'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif'],
+)
 def synthesize(text, voice, lngsteps, password, progress=gr.Progress()):
     if text.strip() == "":
         raise gr.Error("You must enter some text")
     return (24000, np.concatenate(audios))
+def ljsynthesize(text, steps, progress=gr.Progress()):
+    noise = torch.randn(1,1,256).to('cuda' if torch.cuda.is_available() else 'cpu')
+    if text.strip() == "":
+        raise gr.Error("You must enter some text")
+    texts = split_and_recombine_text(text)
+    audios = []
+    for t in progress.tqdm(texts):
+        audios.append(ljspeechimportable.inference(t, noise, diffusion_steps=steps, embedding_scale=1))
+    return (24000, np.concatenate(audios))
+with gr.Blocks() as libritts:  # Multi-voice tab. NOTE: the underlying model is LibriTTS, not VCTK.
     with gr.Row():
         with gr.Column(scale=1):  # left column: text, voice and diffusion-step inputs
             inp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
             voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value='m-us-2', interactive=True)
             multispeakersteps = gr.Slider(minimum=3, maximum=15, value=3, step=1, label="Diffusion Steps", info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster", interactive=True)
+            # use_gruut = gr.Checkbox(label="Use alternate phonemizer (Gruut) - Experimental")
         with gr.Column(scale=1):  # right column: trigger button and audio output
             btn = gr.Button("Synthesize", variant="primary")
             audio = gr.Audio(interactive=False, label="Synthesized Audio")
             btn.click(synthesize, inputs=[inp, voice, multispeakersteps], outputs=[audio], concurrency_limit=4)  # NOTE(review): synthesize declares four required parameters (text, voice, lngsteps, password) but only three inputs are wired here - confirm how password is supplied
+with gr.Blocks() as lj:  # LJSpeech tab: text and diffusion-step inputs feeding ljsynthesize
+    with gr.Row():
+        with gr.Column(scale=1):  # left column: inputs
+            ljinp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
+            ljsteps = gr.Slider(minimum=3, maximum=20, value=3, step=1, label="Diffusion Steps", info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster", interactive=True)
+        with gr.Column(scale=1):  # right column: trigger button and audio output
+            ljbtn = gr.Button("Synthesize", variant="primary")
+            ljaudio = gr.Audio(interactive=False, label="Synthesized Audio")
+            ljbtn.click(ljsynthesize, inputs=[ljinp, ljsteps], outputs=[ljaudio], concurrency_limit=4)  # inputs map positionally to ljsynthesize(text, steps)
+with gr.Blocks(title="StyleTTS 2", css="", theme=theme) as demo:  # top-level app that wraps both tabs
+    gr.DuplicateButton("Duplicate Space")
+    gr.TabbedInterface([libritts, lj], ['Multi-Voice', 'LJSpeech'])  # tab labels are given in the same order as the interfaces
+    gr.Markdown("""
+Original Demo by [mrfakename](https://twitter.com/realmrfakename). I am not affiliated with the StyleTTS 2 authors.
+Run this demo locally using Docker:
+```bash
+docker run -it -p 7860:7860 --platform=linux/amd64 --gpus all registry.hf.space/styletts2-styletts2:latest python app.py
+```
+""") # Please do not remove this line.
+if __name__ == "__main__":  # launch only when executed as a script, not on import
+    # Closed-API variant, kept for reference: demo.queue(api_open=False, max_size=15).launch(show_api=False)
+    demo.queue(api_open=True, max_size=15).launch(show_api=True)  # queue with max_size=15; API access and API docs are enabled