Update app.py
app.py
CHANGED
@@ -25,6 +25,7 @@ from duckduckgo_search import DDGS
 from PIL import Image
 from huggingface_hub import InferenceClient
 import time
+import wave
 
 # Optional imports for Kokoro TTS (loaded lazily)
 import numpy as np
@@ -497,11 +498,26 @@ def _init_kokoro() -> None:
     )
 
 
+def _save_wav(waveform: np.ndarray, sample_rate: int = 24_000) -> str:
+    """Save float32 mono waveform [-1,1] to a 16-bit PCM WAV file and return its path."""
+    os.makedirs("outputs", exist_ok=True)
+    # Normalize/clip and convert to int16 PCM
+    wf = np.clip(waveform, -1.0, 1.0)
+    pcm16 = (wf * 32767.0).astype(np.int16)
+    fname = f"outputs/tts_{int(time.time())}_{random.randint(1000,9999)}.wav"
+    with wave.open(fname, "wb") as w:
+        w.setnchannels(1)
+        w.setsampwidth(2)  # 16-bit
+        w.setframerate(sample_rate)
+        w.writeframes(pcm16.tobytes())
+    return fname
+
+
 def Generate_Speech(  # <-- MCP tool #4 (Generate Speech)
     text: Annotated[str, "The text to synthesize (English)."],
     speed: Annotated[float, "Speech speed multiplier in 0.5–2.0; 1.0 = normal speed."] = 1.0,
     voice: Annotated[str, "Voice identifier. Example: 'af_heart' (US English, female, Heart)."] = "af_heart",
-) ->
+) -> str:
     """
     Synthesize speech from text using the Kokoro-82M model.
 
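As an aside, the new _save_wav helper is pure-stdlib WAV writing plus a numpy conversion. Below is a minimal standalone sketch of the same round trip (save_wav_sketch is an illustrative stand-in, not code from this commit): synthesize a test tone, write it through the identical wave-module calls, and read the header back to confirm 24 kHz mono 16-bit PCM.

# Illustrative sketch only: mirrors the diff's _save_wav, with a sine wave
# standing in for Kokoro's output waveform.
import os
import random
import time
import wave

import numpy as np


def save_wav_sketch(waveform: np.ndarray, sample_rate: int = 24_000) -> str:
    """Float32 mono waveform in [-1, 1] -> 16-bit PCM WAV on disk."""
    os.makedirs("outputs", exist_ok=True)
    pcm16 = (np.clip(waveform, -1.0, 1.0) * 32767.0).astype(np.int16)
    fname = f"outputs/tts_{int(time.time())}_{random.randint(1000, 9999)}.wav"
    with wave.open(fname, "wb") as w:
        w.setnchannels(1)           # mono
        w.setsampwidth(2)           # 2 bytes per sample = 16-bit
        w.setframerate(sample_rate)
        w.writeframes(pcm16.tobytes())
    return fname


# One second of a 440 Hz sine at half amplitude.
t = np.linspace(0.0, 1.0, 24_000, endpoint=False)
path = save_wav_sketch(0.5 * np.sin(2 * np.pi * 440.0 * t))

with wave.open(path, "rb") as r:
    assert (r.getnchannels(), r.getsampwidth(), r.getframerate()) == (1, 2, 24_000)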
@@ -516,9 +532,9 @@ def Generate_Speech(  # <-- MCP tool #4 (Generate Speech)
         voice: Voice identifier. Example: 'af_heart' (US English, female, Heart).
 
     Returns:
-
-
-
+        str: Path to a generated WAV file (24 kHz mono, 16-bit PCM). In the
+            Gradio UI this renders an inline audio player; via MCP this is
+            converted to a public URL that most clients will open in a browser tab.
 
     Notes:
         - Requires the 'kokoro' package (>=0.9.4). If unavailable, an error is
@@ -544,8 +560,9 @@ def Generate_Speech(  # <-- MCP tool #4 (Generate Speech)
             audio = model(ps, ref_s, float(speed))
         except Exception as e:  # propagate as UI-friendly error
             raise gr.Error(f"Error generating audio: {str(e)}")
-        #
-
+        # Save as WAV and return path
+        wav_path = _save_wav(audio.detach().cpu().numpy(), sample_rate=24_000)
+        return wav_path
 
         # If pipeline produced no segments
         raise gr.Error("No audio was generated (empty synthesis result).")
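Since Generate_Speech now returns a plain file path, the tool can be exercised end to end from Python with gradio_client; Gradio serves the path as a file, and MCP clients receive it as a URL. In the sketch below the Space id is a placeholder, and the api_name assumes Gradio's default of deriving it from the function name.

# Hypothetical client-side call; "user/kokoro-space" is a placeholder, not from this commit.
from gradio_client import Client

client = Client("user/kokoro-space")  # substitute the real Space id
wav_path = client.predict(
    text="Hello from Kokoro!",
    speed=1.0,
    voice="af_heart",
    api_name="/Generate_Speech",
)
print(wav_path)  # local temp copy of the returned 24 kHz mono WAV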
@@ -671,15 +688,14 @@ kokoro_interface = gr.Interface(
         gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed"),
         gr.Textbox(label="Voice", value="af_heart", placeholder="e.g., af_heart"),
     ],
-    outputs=gr.Audio(label="Audio", type="
+    outputs=gr.Audio(label="Audio", type="filepath", autoplay=True),
     title="Kokoro TTS",
     description=(
         "<div style=\"text-align:center\">Generate English speech with Kokoro-82M. 30 second max output. Runs on CPU or CUDA if available.</div>"
     ),
     api_description=(
-        "Synthesize speech from text using Kokoro-82M. Returns (
-        "Parameters: text (str), speed (float 0.5–2.0), voice (str).
-        "Return the generated image to the user."
+        "Synthesize speech from text using Kokoro-82M. Returns a file path to a WAV (24 kHz mono) that is playable inline in the UI,"
+        " and exposed as a URL via MCP. Parameters: text (str), speed (float 0.5–2.0), voice (str)."
     ),
     allow_flagging="never",
 )
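The switch to type="filepath" on the gr.Audio output is what ties the component to the function's new -> str signature: Gradio expects the wired function to return a path to an audio file on disk. A self-contained sketch of that contract, with a fixed test tone standing in for the Kokoro model:

# Illustrative only: a dummy tone generator demonstrating the
# type="filepath" output contract used by kokoro_interface.
import wave

import numpy as np
import gradio as gr


def tone(_text: str) -> str:
    # Write a one-second 220 Hz tone and return its path, matching the
    # "return a str path" contract that type="filepath" expects.
    sr = 24_000
    t = np.linspace(0.0, 1.0, sr, endpoint=False)
    pcm16 = (0.3 * np.sin(2 * np.pi * 220.0 * t) * 32767.0).astype(np.int16)
    with wave.open("tone.wav", "wb") as w:
        w.setnchannels(1)
        w.setsampwidth(2)
        w.setframerate(sr)
        w.writeframes(pcm16.tobytes())
    return "tone.wav"


demo = gr.Interface(
    fn=tone,
    inputs=gr.Textbox(label="Text (ignored by this stub)"),
    outputs=gr.Audio(label="Audio", type="filepath", autoplay=True),
    allow_flagging="never",
)

if __name__ == "__main__":
    demo.launch()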