Update app.py
app.py CHANGED
@@ -25,7 +25,6 @@ from duckduckgo_search import DDGS
 from PIL import Image
 from huggingface_hub import InferenceClient
 import time
-import wave
 
 # Optional imports for Kokoro TTS (loaded lazily)
 import numpy as np
@@ -498,26 +497,11 @@ def _init_kokoro() -> None:
 )
 
 
-def _save_wav(waveform: np.ndarray, sample_rate: int = 24_000) -> str:
-    """Save float32 mono waveform [-1,1] to a 16-bit PCM WAV file and return its path."""
-    os.makedirs("outputs", exist_ok=True)
-    # Normalize/clip and convert to int16 PCM
-    wf = np.clip(waveform, -1.0, 1.0)
-    pcm16 = (wf * 32767.0).astype(np.int16)
-    fname = f"outputs/tts_{int(time.time())}_{random.randint(1000,9999)}.wav"
-    with wave.open(fname, "wb") as w:
-        w.setnchannels(1)
-        w.setsampwidth(2)  # 16-bit
-        w.setframerate(sample_rate)
-        w.writeframes(pcm16.tobytes())
-    return fname
-
-
 def Generate_Speech(  # <-- MCP tool #4 (Generate Speech)
     text: Annotated[str, "The text to synthesize (English)."],
     speed: Annotated[float, "Speech speed multiplier in 0.5–2.0; 1.0 = normal speed."] = 1.0,
     voice: Annotated[str, "Voice identifier. Example: 'af_heart' (US English, female, Heart)."] = "af_heart",
-) -> str:
+) -> Tuple[int, np.ndarray]:
     """
     Synthesize speech from text using the Kokoro-82M model.
 
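With `_save_wav` gone (and the `wave` import with it), the tool no longer touches the filesystem; anything downstream that still wants a .wav on disk has to do the conversion itself. A minimal sketch of that client-side conversion, reusing the int16 PCM logic from the deleted helper — the function name and output path here are illustrative, not part of app.py:

import wave

import numpy as np

def save_tuple_as_wav(sample_rate: int, waveform: np.ndarray, path: str = "out.wav") -> str:
    """Persist a (sample_rate, float32 waveform) pair as 16-bit PCM WAV."""
    # Same normalize/clip + int16 conversion the removed _save_wav performed
    pcm16 = (np.clip(waveform, -1.0, 1.0) * 32767.0).astype(np.int16)
    with wave.open(path, "wb") as w:
        w.setnchannels(1)            # Generate_Speech emits mono audio
        w.setsampwidth(2)            # 16-bit samples
        w.setframerate(sample_rate)  # 24_000 Hz for Kokoro
        w.writeframes(pcm16.tobytes())
    return path

Dropping the helper also means timestamped files no longer accumulate under outputs/; Gradio serializes the returned tuple itself. One caveat: the new `Tuple[int, np.ndarray]` annotation presumes `Tuple` is already imported from typing alongside `Annotated` — the diff does not touch that import block.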
@@ -532,9 +516,9 @@ def Generate_Speech(  # <-- MCP tool #4 (Generate Speech)
         voice: Voice identifier. Example: 'af_heart' (US English, female, Heart).
 
     Returns:
-
-
-
+        A tuple of (sample_rate_hz, audio_waveform) where:
+        - sample_rate_hz: int sample rate in Hz (24_000)
+        - audio_waveform: numpy.ndarray float32 mono waveform in range [-1, 1]
 
     Notes:
         - Requires the 'kokoro' package (>=0.9.4). If unavailable, an error is
@@ -560,9 +544,8 @@ def Generate_Speech(  # <-- MCP tool #4 (Generate Speech)
             audio = model(ps, ref_s, float(speed))
         except Exception as e:  # propagate as UI-friendly error
            raise gr.Error(f"Error generating audio: {str(e)}")
-        # Save to a WAV file and return its path
-        wav_path = _save_wav(audio.detach().cpu().numpy())
-        return wav_path
+        # Return 24 kHz mono waveform
+        return 24_000, audio.detach().cpu().numpy()
 
     # If pipeline produced no segments
     raise gr.Error("No audio was generated (empty synthesis result).")
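The new return line assumes `audio` is a PyTorch tensor coming out of the Kokoro model, so it is detached from the autograd graph and copied to host memory before NumPy sees it. A tiny self-contained sketch of that conversion chain on a stand-in tensor:

import torch

audio = torch.randn(24_000)          # stand-in for one synthesized segment
wav = audio.detach().cpu().numpy()   # float32 ndarray Gradio can consume
assert wav.dtype.name == "float32" and wav.ndim == 1

As in the replaced code, the function returns from inside the pipeline loop, so only the first synthesized segment is emitted; the trailing `raise` fires only when the pipeline yields nothing.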
@@ -688,14 +671,15 @@ kokoro_interface = gr.Interface(
         gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed"),
         gr.Textbox(label="Voice", value="af_heart", placeholder="e.g., af_heart"),
     ],
-    outputs=gr.Audio(label="Audio", type="filepath"),
+    outputs=gr.Audio(label="Audio", type="numpy"),
     title="Kokoro TTS",
     description=(
         "<div style=\"text-align:center\">Generate English speech with Kokoro-82M. 30 second max output. Runs on CPU or CUDA if available.</div>"
     ),
     api_description=(
-        "Synthesize speech from text using Kokoro-82M. Returns
-        "
+        "Synthesize speech from text using Kokoro-82M. Returns (sample_rate, waveform) suitable for playback. "
+        "Parameters: text (str), speed (float 0.5–2.0), voice (str). "
+        "Return the generated audio to the user."
     ),
     allow_flagging="never",
)
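With `type="numpy"`, `gr.Audio` expects the wrapped function to return a `(sample_rate, ndarray)` tuple instead of a file path, which matches the new `Generate_Speech` signature. A minimal standalone sketch of the same wiring, with a tone generator standing in for the TTS model:

import gradio as gr
import numpy as np

def beep(freq: float = 440.0) -> tuple[int, np.ndarray]:
    # One second of a sine tone at the requested frequency, float32 in [-1, 1]
    sr = 24_000
    t = np.linspace(0.0, 1.0, sr, endpoint=False)
    return sr, (0.5 * np.sin(2.0 * np.pi * freq * t)).astype(np.float32)

demo = gr.Interface(
    fn=beep,
    inputs=gr.Number(value=440.0, label="Frequency (Hz)"),
    outputs=gr.Audio(label="Audio", type="numpy"),
)
# demo.launch()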
|