Update app.py
app.py
CHANGED
@@ -25,6 +25,7 @@ from duckduckgo_search import DDGS
 from PIL import Image
 from huggingface_hub import InferenceClient
 import time
+import wave
 
 # Optional imports for Kokoro TTS (loaded lazily)
 import numpy as np
@@ -497,11 +498,26 @@ def _init_kokoro() -> None:
     )
 
 
+def _save_wav(waveform: np.ndarray, sample_rate: int = 24_000) -> str:
+    """Save float32 mono waveform [-1,1] to a 16-bit PCM WAV file and return its path."""
+    os.makedirs("outputs", exist_ok=True)
+    # Normalize/clip and convert to int16 PCM
+    wf = np.clip(waveform, -1.0, 1.0)
+    pcm16 = (wf * 32767.0).astype(np.int16)
+    fname = f"outputs/tts_{int(time.time())}_{random.randint(1000,9999)}.wav"
+    with wave.open(fname, "wb") as w:
+        w.setnchannels(1)
+        w.setsampwidth(2)  # 16-bit
+        w.setframerate(sample_rate)
+        w.writeframes(pcm16.tobytes())
+    return fname
+
+
 def Generate_Speech(  # <-- MCP tool #4 (Generate Speech)
     text: Annotated[str, "The text to synthesize (English)."],
     speed: Annotated[float, "Speech speed multiplier in 0.5–2.0; 1.0 = normal speed."] = 1.0,
     voice: Annotated[str, "Voice identifier. Example: 'af_heart' (US English, female, Heart)."] = "af_heart",
-) ->
+) -> str:
     """
     Synthesize speech from text using the Kokoro-82M model.
 
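As an aside, the new _save_wav helper is pure-stdlib WAV writing plus a numpy conversion. Below is a minimal standalone sketch of the same round trip (save_wav_sketch is an illustrative stand-in, not code from this commit): synthesize a test tone, write it through the identical wave-module calls, and read the header back to confirm 24 kHz mono 16-bit PCM.

# Illustrative sketch only: mirrors the diff's _save_wav, with a sine wave
# standing in for Kokoro's output waveform.
import os
import random
import time
import wave

import numpy as np


def save_wav_sketch(waveform: np.ndarray, sample_rate: int = 24_000) -> str:
    """Float32 mono waveform in [-1, 1] -> 16-bit PCM WAV on disk."""
    os.makedirs("outputs", exist_ok=True)
    pcm16 = (np.clip(waveform, -1.0, 1.0) * 32767.0).astype(np.int16)
    fname = f"outputs/tts_{int(time.time())}_{random.randint(1000, 9999)}.wav"
    with wave.open(fname, "wb") as w:
        w.setnchannels(1)           # mono
        w.setsampwidth(2)           # 2 bytes per sample = 16-bit
        w.setframerate(sample_rate)
        w.writeframes(pcm16.tobytes())
    return fname


# One second of a 440 Hz sine at half amplitude.
t = np.linspace(0.0, 1.0, 24_000, endpoint=False)
path = save_wav_sketch(0.5 * np.sin(2 * np.pi * 440.0 * t))

with wave.open(path, "rb") as r:
    assert (r.getnchannels(), r.getsampwidth(), r.getframerate()) == (1, 2, 24_000)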
@@ -516,9 +532,9 @@ def Generate_Speech(  # <-- MCP tool #4 (Generate Speech)
         voice: Voice identifier. Example: 'af_heart' (US English, female, Heart).
 
     Returns:
-
-
-
+        str: Path to a generated WAV file (24 kHz mono, 16-bit PCM). In the
+            Gradio UI this renders an inline audio player; via MCP this is
+            converted to a public URL that most clients will open in a browser tab.
 
     Notes:
         - Requires the 'kokoro' package (>=0.9.4). If unavailable, an error is
@@ -544,8 +560,9 @@ def Generate_Speech(  # <-- MCP tool #4 (Generate Speech)
             audio = model(ps, ref_s, float(speed))
         except Exception as e:  # propagate as UI-friendly error
             raise gr.Error(f"Error generating audio: {str(e)}")
-        #
-
+        # Save as WAV and return path
+        wav_path = _save_wav(audio.detach().cpu().numpy(), sample_rate=24_000)
+        return wav_path
 
         # If pipeline produced no segments
         raise gr.Error("No audio was generated (empty synthesis result).")
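Since Generate_Speech now returns a plain file path, the tool can be exercised end to end from Python with gradio_client; Gradio serves the path as a file, and MCP clients receive it as a URL. In the sketch below the Space id is a placeholder, and the api_name assumes Gradio's default of deriving it from the function name.

# Hypothetical client-side call; "user/kokoro-space" is a placeholder, not from this commit.
from gradio_client import Client

client = Client("user/kokoro-space")  # substitute the real Space id
wav_path = client.predict(
    text="Hello from Kokoro!",
    speed=1.0,
    voice="af_heart",
    api_name="/Generate_Speech",
)
print(wav_path)  # local temp copy of the returned 24 kHz mono WAV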
@@ -671,15 +688,14 @@ kokoro_interface = gr.Interface(
         gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed"),
         gr.Textbox(label="Voice", value="af_heart", placeholder="e.g., af_heart"),
     ],
-    outputs=gr.Audio(label="Audio", type="
+    outputs=gr.Audio(label="Audio", type="filepath", autoplay=True),
     title="Kokoro TTS",
     description=(
         "<div style=\"text-align:center\">Generate English speech with Kokoro-82M. 30 second max output. Runs on CPU or CUDA if available.</div>"
     ),
     api_description=(
-        "Synthesize speech from text using Kokoro-82M. Returns (
-        "Parameters: text (str), speed (float 0.5–2.0), voice (str).
-        "Return the generated image to the user."
+        "Synthesize speech from text using Kokoro-82M. Returns a file path to a WAV (24 kHz mono) that is playable inline in the UI,"
+        " and exposed as a URL via MCP. Parameters: text (str), speed (float 0.5–2.0), voice (str)."
     ),
     allow_flagging="never",
 )
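The switch to type="filepath" on the gr.Audio output is what ties the component to the function's new -> str signature: Gradio expects the wired function to return a path to an audio file on disk. A self-contained sketch of that contract, with a fixed test tone standing in for the Kokoro model:

# Illustrative only: a dummy tone generator demonstrating the
# type="filepath" output contract used by kokoro_interface.
import wave

import numpy as np
import gradio as gr


def tone(_text: str) -> str:
    # Write a one-second 220 Hz tone and return its path, matching the
    # "return a str path" contract that type="filepath" expects.
    sr = 24_000
    t = np.linspace(0.0, 1.0, sr, endpoint=False)
    pcm16 = (0.3 * np.sin(2 * np.pi * 220.0 * t) * 32767.0).astype(np.int16)
    with wave.open("tone.wav", "wb") as w:
        w.setnchannels(1)
        w.setsampwidth(2)
        w.setframerate(sr)
        w.writeframes(pcm16.tobytes())
    return "tone.wav"


demo = gr.Interface(
    fn=tone,
    inputs=gr.Textbox(label="Text (ignored by this stub)"),
    outputs=gr.Audio(label="Audio", type="filepath", autoplay=True),
    allow_flagging="never",
)

if __name__ == "__main__":
    demo.launch()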