Update app.py
app.py CHANGED
@@ -25,7 +25,6 @@ from duckduckgo_search import DDGS
 from PIL import Image
 from huggingface_hub import InferenceClient
 import time
-import wave
 
 # Optional imports for Kokoro TTS (loaded lazily)
 import numpy as np
@@ -498,26 +497,11 @@ def _init_kokoro() -> None:
 )
 
 
-def _save_wav(waveform: np.ndarray, sample_rate: int = 24_000) -> str:
-    """Save float32 mono waveform [-1,1] to a 16-bit PCM WAV file and return its path."""
-    os.makedirs("outputs", exist_ok=True)
-    # Normalize/clip and convert to int16 PCM
-    wf = np.clip(waveform, -1.0, 1.0)
-    pcm16 = (wf * 32767.0).astype(np.int16)
-    fname = f"outputs/tts_{int(time.time())}_{random.randint(1000,9999)}.wav"
-    with wave.open(fname, "wb") as w:
-        w.setnchannels(1)
-        w.setsampwidth(2)  # 16-bit
-        w.setframerate(sample_rate)
-        w.writeframes(pcm16.tobytes())
-    return fname
-
-
 def Generate_Speech(  # <-- MCP tool #4 (Generate Speech)
     text: Annotated[str, "The text to synthesize (English)."],
     speed: Annotated[float, "Speech speed multiplier in 0.5–2.0; 1.0 = normal speed."] = 1.0,
     voice: Annotated[str, "Voice identifier. Example: 'af_heart' (US English, female, Heart)."] = "af_heart",
-) -> str:
+) -> Tuple[int, np.ndarray]:
     """
     Synthesize speech from text using the Kokoro-82M model.
 
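With `_save_wav` gone (and the `wave` import with it), the tool no longer touches the filesystem; anything downstream that still wants a .wav on disk has to do the conversion itself. A minimal sketch of that client-side conversion, reusing the int16 PCM logic from the deleted helper — the function name and output path here are illustrative, not part of app.py:

import wave

import numpy as np

def save_tuple_as_wav(sample_rate: int, waveform: np.ndarray, path: str = "out.wav") -> str:
    """Persist a (sample_rate, float32 waveform) pair as 16-bit PCM WAV."""
    # Same normalize/clip + int16 conversion the removed _save_wav performed
    pcm16 = (np.clip(waveform, -1.0, 1.0) * 32767.0).astype(np.int16)
    with wave.open(path, "wb") as w:
        w.setnchannels(1)            # Generate_Speech emits mono audio
        w.setsampwidth(2)            # 16-bit samples
        w.setframerate(sample_rate)  # 24_000 Hz for Kokoro
        w.writeframes(pcm16.tobytes())
    return path

Dropping the helper also means timestamped files no longer accumulate under outputs/; Gradio serializes the returned tuple itself. One caveat: the new `Tuple[int, np.ndarray]` annotation presumes `Tuple` is already imported from typing alongside `Annotated` — the diff does not touch that import block.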
@@ -532,9 +516,9 @@ def Generate_Speech(  # <-- MCP tool #4 (Generate Speech)
         voice: Voice identifier. Example: 'af_heart' (US English, female, Heart).
 
     Returns:
-
-
-
+        A tuple of (sample_rate_hz, audio_waveform) where:
+        - sample_rate_hz: int sample rate in Hz (24_000)
+        - audio_waveform: numpy.ndarray float32 mono waveform in range [-1, 1]
 
     Notes:
         - Requires the 'kokoro' package (>=0.9.4). If unavailable, an error is
@@ -560,9 +544,8 @@ def Generate_Speech(  # <-- MCP tool #4 (Generate Speech)
             audio = model(ps, ref_s, float(speed))
         except Exception as e:  # propagate as UI-friendly error
            raise gr.Error(f"Error generating audio: {str(e)}")
-        # Save to a WAV file and return its path
-        wav_path = _save_wav(audio.detach().cpu().numpy())
-        return wav_path
+        # Return 24 kHz mono waveform
+        return 24_000, audio.detach().cpu().numpy()
 
     # If pipeline produced no segments
     raise gr.Error("No audio was generated (empty synthesis result).")
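The new return line assumes `audio` is a PyTorch tensor coming out of the Kokoro model, so it is detached from the autograd graph and copied to host memory before NumPy sees it. A tiny self-contained sketch of that conversion chain on a stand-in tensor:

import torch

audio = torch.randn(24_000)          # stand-in for one synthesized segment
wav = audio.detach().cpu().numpy()   # float32 ndarray Gradio can consume
assert wav.dtype.name == "float32" and wav.ndim == 1

As in the replaced code, the function returns from inside the pipeline loop, so only the first synthesized segment is emitted; the trailing `raise` fires only when the pipeline yields nothing.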
@@ -688,14 +671,15 @@ kokoro_interface = gr.Interface(
         gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed"),
         gr.Textbox(label="Voice", value="af_heart", placeholder="e.g., af_heart"),
     ],
-    outputs=gr.Audio(label="Audio", type="filepath"),
+    outputs=gr.Audio(label="Audio", type="numpy"),
     title="Kokoro TTS",
     description=(
         "<div style=\"text-align:center\">Generate English speech with Kokoro-82M. 30 second max output. Runs on CPU or CUDA if available.</div>"
     ),
     api_description=(
-        "Synthesize speech from text using Kokoro-82M. Returns
-        "
+        "Synthesize speech from text using Kokoro-82M. Returns (sample_rate, waveform) suitable for playback. "
+        "Parameters: text (str), speed (float 0.5–2.0), voice (str). "
+        "Return the generated audio to the user."
     ),
     allow_flagging="never",
)
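With `type="numpy"`, `gr.Audio` expects the wrapped function to return a `(sample_rate, ndarray)` tuple instead of a file path, which matches the new `Generate_Speech` signature. A minimal standalone sketch of the same wiring, with a tone generator standing in for the TTS model:

import gradio as gr
import numpy as np

def beep(freq: float = 440.0) -> tuple[int, np.ndarray]:
    # One second of a sine tone at the requested frequency, float32 in [-1, 1]
    sr = 24_000
    t = np.linspace(0.0, 1.0, sr, endpoint=False)
    return sr, (0.5 * np.sin(2.0 * np.pi * freq * t)).astype(np.float32)

demo = gr.Interface(
    fn=beep,
    inputs=gr.Number(value=440.0, label="Frequency (Hz)"),
    outputs=gr.Audio(label="Audio", type="numpy"),
)
# demo.launch()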
|