import gradio as gr
import torch
import numpy as np
from soprano import SopranoTTS
from scipy.io.wavfile import write as wav_write
import tempfile
import os
import spaces

# The demo requires a GPU; fail fast if CUDA is unavailable.
assert torch.cuda.is_available(), "Demo requires a GPU."
DEVICE = "cuda"
print(f"Using device: {DEVICE}")

SAMPLE_RATE = 32000

model = None


def load_model():
    """Lazily load the Soprano model once and reuse it across requests."""
    global model
    if model is None:
        model = SopranoTTS(
            backend="auto",
            device=DEVICE,
            cache_size_mb=100,
            decoder_batch_size=1,
        )
    return model


@spaces.GPU
def tts_stream(text, temperature, top_p, repetition_penalty, state):
    """Synthesize speech for `text`, yielding (sample_rate, waveform) and the updated state."""
    model = load_model()
    if not text.strip():
        yield None, state
        return
    out = model.infer(
        text,
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
    )
    # Convert the generated waveform tensor to a NumPy array for Gradio.
    audio_np = out.cpu().numpy()
    yield (SAMPLE_RATE, audio_np), audio_np


def save_audio(state):
    """Write the last generated waveform to a temporary WAV file and return its path."""
    if state is None or len(state) == 0:
        return None
    fd, path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    wav_write(path, SAMPLE_RATE, state)
    return path


with gr.Blocks() as demo:
    state_audio = gr.State(None)

    with gr.Row():
        with gr.Column():
            gr.Markdown(
                "# Soprano Demo\n\n"
                "Soprano is an ultra-lightweight, open-source text-to-speech (TTS) model "
                "designed for real-time, high-fidelity speech synthesis at unprecedented speed. "
                "Soprano can achieve **<15 ms streaming latency** and up to **2000x real-time "
                "generation**, all while being easy to deploy with **<1 GB VRAM usage**.\n\n"
                "GitHub: https://github.com/ekwek1/soprano\n\n"
                "Model Weights: https://huggingface.co/ekwek/Soprano-80M"
            )
            text_in = gr.Textbox(
                label="Input Text",
                placeholder="Enter text to synthesize...",
                value=(
                    "Soprano is an extremely lightweight text to speech model designed "
                    "to produce highly realistic speech at unprecedented speed."
                ),
                lines=4,
            )
            with gr.Accordion("Advanced options", open=False):
                temperature = gr.Slider(
                    0.0, 1.0, value=0.3, step=0.05, label="Temperature"
                )
                top_p = gr.Slider(
                    0.0, 1.0, value=0.95, step=0.01, label="Top-p"
                )
                repetition_penalty = gr.Slider(
                    1.0, 2.0, value=1.2, step=0.05, label="Repetition penalty"
                )
            gen_btn = gr.Button("Generate")

        with gr.Column():
            audio_out = gr.Audio(
                label="Output Audio",
                autoplay=True,
                streaming=False,
            )
            # download_btn = gr.Button("Download")
            # file_out = gr.File(label="Download file")
            gr.Markdown(
                "Usage tips:\n\n"
                "- Soprano works best when each sentence is between 2 and 15 seconds long.\n"
                "- Although Soprano recognizes numbers and some special characters, it occasionally "
                "mispronounces them. For best results, convert them to their spoken form "
                "(e.g. 1+1 -> one plus one).\n"
                "- If a generation is unsatisfactory, simply regenerate for a new, potentially better "
                "result. You can also adjust the sampling settings for more varied output.\n"
                "- Avoid improper grammar, such as not using contractions or using multiple spaces."
            )

    gen_btn.click(
        fn=tts_stream,
        inputs=[text_in, temperature, top_p, repetition_penalty, state_audio],
        outputs=[audio_out, state_audio],
    )
    # download_btn.click(
    #     fn=save_audio,
    #     inputs=[state_audio],
    #     outputs=[file_out],
    # )

demo.queue()
demo.launch()