Spaces:
Runtime error
Runtime error
| """ | |
| Copyright (c) 2025 Xposed73 | |
| All rights reserved. | |
| This file is part of the Manim Voiceover project. | |
| """ | |
| import hashlib | |
| import json | |
| import numpy as np | |
| from pathlib import Path | |
| from manim_voiceover.services.base import SpeechService | |
| from kokoro_onnx import Kokoro | |
| from manim_voiceover.helper import remove_bookmarks, wav2mp3 | |
| from scipy.io.wavfile import write as write_wav | |
| from src.config.config import Config | |
class KokoroService(SpeechService):
    """Speech service for ``kokoro_self`` (local text-to-speech via Kokoro ONNX)."""

    def __init__(self, engine=None,
                 model_path: str = Config.KOKORO_MODEL_PATH,
                 voices_path: str = Config.KOKORO_VOICES_PATH,
                 voice: str = Config.KOKORO_DEFAULT_VOICE,
                 speed: float = Config.KOKORO_DEFAULT_SPEED,
                 lang: str = Config.KOKORO_DEFAULT_LANG,
                 **kwargs):
        """
        Loads the Kokoro model and stores the synthesis settings.

        Parameters:
            engine: Callable used to synthesize audio. It is invoked with the
                keyword arguments ``text``, ``output_file``, ``voice_name``,
                ``speed`` and ``lang``. Defaults to ``self.text_to_speech``.
            model_path (str): Path handed to ``Kokoro`` as the model file.
            voices_path (str): Path handed to ``Kokoro`` as the voices file.
            voice (str): Voice name forwarded to the engine.
            speed (float): Speed value forwarded to the engine.
            lang (str): Language code forwarded to the engine.
            **kwargs: Forwarded unchanged to ``SpeechService.__init__``.
        """
        self.kokoro = Kokoro(model_path, voices_path)
        self.voice = voice
        self.speed = speed
        self.lang = lang
        if engine is None:
            engine = self.text_to_speech  # Default to the local Kokoro engine
        self.engine = engine
        super().__init__(**kwargs)

    def get_data_hash(self, input_data: dict) -> str:
        """
        Generates a hash based on the input data dictionary.

        The hash is used to create a unique identifier for the input data.

        Parameters:
            input_data (dict): A JSON-serializable dictionary of input data
                (e.g., text, voice, etc.).

        Returns:
            str: The SHA-256 hex digest of the dictionary.
        """
        # Sorted keys make the digest independent of insertion order
        data_str = json.dumps(input_data, sort_keys=True)
        return hashlib.sha256(data_str.encode('utf-8')).hexdigest()

    def text_to_speech(self, text, output_file, voice_name, speed, lang):
        """
        Generates speech from text using Kokoro ONNX and saves it as a .wav file.

        The audio is peak-normalized to the range [-1, 1] before it is written
        as 16-bit integer PCM.

        Parameters:
            text: Text to synthesize.
            output_file: Destination path of the .wav file.
            voice_name: Voice passed to ``Kokoro.create``.
            speed: Speed passed to ``Kokoro.create``.
            lang: Language passed to ``Kokoro.create``.

        Returns:
            The ``output_file`` path that was written.
        """
        # Generate audio samples using Kokoro
        samples, sample_rate = self.kokoro.create(
            text, voice=voice_name, speed=speed, lang=lang
        )
        samples = np.asarray(samples)

        # np.max raises ValueError on a zero-size array, so guard the empty case
        peak = np.max(np.abs(samples)) if samples.size else 0
        if peak > 0:
            # Silent audio (peak == 0) is left as-is to avoid dividing by zero
            samples = samples / peak

        # Convert to 16-bit integer PCM format
        samples = (samples * 32767).astype("int16")

        write_wav(output_file, sample_rate, samples)
        print(f"Saved at {output_file}")
        return output_file

    def generate_from_text(self, text: str, cache_dir: str = None, path: str = None) -> dict:
        """
        Synthesizes ``text`` to an .mp3 file inside ``cache_dir``.

        A cached result is returned when ``get_cached_result`` finds one for
        the same input data. Otherwise the engine writes a temporary .wav
        file, which is converted to .mp3 and then removed.

        Parameters:
            text (str): Text to synthesize; bookmark tags are stripped before
                the text is sent to the engine.
            cache_dir (str): Output directory. Defaults to ``self.cache_dir``.
            path (str): Output file name relative to ``cache_dir``. Defaults
                to ``<hash of the input data>.mp3``.

        Returns:
            dict: ``input_text`` (the raw text), ``input_data`` (the cache key)
            and ``original_audio`` (the audio path relative to ``cache_dir``).
        """
        if cache_dir is None:
            cache_dir = self.cache_dir

        # Bookmark tags are markup, not speech: keep them out of the engine input
        input_text = remove_bookmarks(text)
        # Speed is part of the key so a different speed never reuses stale audio
        input_data = {
            "input_text": input_text,
            "service": "kokoro_self",
            "voice": self.voice,
            "lang": self.lang,
            "speed": self.speed,
        }
        cached_result = self.get_cached_result(input_data, cache_dir)
        if cached_result is not None:
            return cached_result

        if path is None:
            audio_path = self.get_data_hash(input_data) + ".mp3"
        else:
            audio_path = path

        mp3_audio_path = Path(cache_dir) / audio_path
        # with_suffix only touches the final extension, unlike str.replace,
        # which would also rewrite ".mp3" inside directory names
        wav_audio_path = mp3_audio_path.with_suffix(".wav")

        # Generate the intermediate .wav file with the configured engine
        self.engine(
            text=input_text,
            output_file=str(wav_audio_path),
            voice_name=self.voice,
            speed=self.speed,
            lang=self.lang,
        )

        # Convert .wav to .mp3
        wav2mp3(str(wav_audio_path), str(mp3_audio_path))

        # Remove the intermediate .wav file. wav2mp3 may already have deleted
        # it, so a missing file is not an error. Never delete the output
        # itself when the caller asked for a ".wav" path.
        if wav_audio_path != mp3_audio_path:
            wav_audio_path.unlink(missing_ok=True)

        json_dict = {
            "input_text": text,
            "input_data": input_data,
            "original_audio": audio_path,
        }
        return json_dict