Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,8 +1,4 @@
|
|
| 1 |
|
| 2 |
-
# import os
|
| 3 |
-
# os.system("pip install --no-cache-dir --upgrade --force-reinstall -r requirements.txt")
|
| 4 |
-
|
| 5 |
-
|
| 6 |
# Initialize a pipeline
|
| 7 |
from kokoro import KPipeline
|
| 8 |
# from IPython.display import display, Audio
|
|
@@ -177,6 +173,7 @@ def remove_silence_function(file_path,minimum_silence=50):
|
|
| 177 |
combined += chunk
|
| 178 |
combined.export(output_path, format=audio_format)
|
| 179 |
return output_path
|
|
|
|
| 180 |
def generate_and_save_audio(text, Language="American English",voice="af_bella", speed=1,remove_silence=False,keep_silence_up_to=0.05):
|
| 181 |
text=clean_text(text)
|
| 182 |
update_pipeline(Language)
|
|
@@ -205,6 +202,8 @@ def generate_and_save_audio(text, Language="American English",voice="af_bella",
|
|
| 205 |
audio_int16 = (audio_np * 32767).astype(np.int16) # Scale to 16-bit range
|
| 206 |
audio_bytes = audio_int16.tobytes() # Convert to bytes
|
| 207 |
# Write the audio chunk to the WAV file
|
|
|
|
|
|
|
| 208 |
wav_file.writeframes(audio_bytes)
|
| 209 |
if remove_silence:
|
| 210 |
keep_silence = int(keep_silence_up_to * 1000)
|
|
@@ -212,39 +211,44 @@ def generate_and_save_audio(text, Language="American English",voice="af_bella",
|
|
| 212 |
return new_wave_file,timestamps
|
| 213 |
return save_path,timestamps
|
| 214 |
|
|
|
|
|
|
|
| 215 |
def adjust_timestamps(timestamp_dict):
|
| 216 |
adjusted_timestamps = []
|
| 217 |
-
|
| 218 |
|
| 219 |
for segment_id in sorted(timestamp_dict.keys()):
|
| 220 |
segment = timestamp_dict[segment_id]
|
| 221 |
words = segment["words"]
|
|
|
|
| 222 |
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
|
|
|
| 227 |
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
|
| 232 |
-
|
| 233 |
-
|
|
|
|
| 234 |
|
| 235 |
adjusted_timestamps.append({
|
| 236 |
-
"word":
|
| 237 |
-
"start": round(
|
| 238 |
-
"end": round(
|
| 239 |
})
|
| 240 |
|
| 241 |
-
#
|
| 242 |
-
|
| 243 |
-
last_end_time = adjusted_timestamps[-1]["end"]
|
| 244 |
|
| 245 |
return adjusted_timestamps
|
| 246 |
|
| 247 |
|
|
|
|
| 248 |
import string
|
| 249 |
|
| 250 |
def write_word_srt(word_level_timestamps, output_file="word.srt", skip_punctuation=True):
|
|
@@ -278,6 +282,30 @@ def write_word_srt(word_level_timestamps, output_file="word.srt", skip_punctuati
|
|
| 278 |
|
| 279 |
import string
|
| 280 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
def write_sentence_srt(word_level_timestamps, output_file="subtitles.srt", max_words=8, min_pause=0.1):
|
| 282 |
subtitles = [] # Stores subtitle blocks
|
| 283 |
subtitle_words = [] # Temporary list for words in the current subtitle
|
|
@@ -343,6 +371,7 @@ def write_sentence_srt(word_level_timestamps, output_file="subtitles.srt", max_w
|
|
| 343 |
# Write subtitles to SRT file
|
| 344 |
with open(output_file, "w", encoding="utf-8") as f:
|
| 345 |
for i, (start, end, text) in enumerate(subtitles, start=1):
|
|
|
|
| 346 |
f.write(f"{i}\n{format_srt_time(start)} --> {format_srt_time(end)}\n{text}\n\n")
|
| 347 |
|
| 348 |
# print(f"SRT file '{output_file}' created successfully!")
|
|
@@ -591,6 +620,7 @@ import click
|
|
| 591 |
@click.option("--debug", is_flag=True, default=False, help="Enable debug mode.")
|
| 592 |
@click.option("--share", is_flag=True, default=False, help="Enable sharing of the interface.")
|
| 593 |
def main(debug, share):
|
|
|
|
| 594 |
demo1 = ui()
|
| 595 |
demo2 = tutorial()
|
| 596 |
demo = gr.TabbedInterface([demo1, demo2],["Multilingual TTS","VoicePack Explanation"],title="Kokoro TTS")#,theme='JohnSmith9982/small_and_pretty')
|
|
@@ -608,4 +638,4 @@ last_used_language = "a"
|
|
| 608 |
pipeline = KPipeline(lang_code=last_used_language)
|
| 609 |
temp_folder = create_audio_dir()
|
| 610 |
if __name__ == "__main__":
|
| 611 |
-
main()
|
|
|
|
| 1 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
# Initialize a pipeline
|
| 3 |
from kokoro import KPipeline
|
| 4 |
# from IPython.display import display, Audio
|
|
|
|
| 173 |
combined += chunk
|
| 174 |
combined.export(output_path, format=audio_format)
|
| 175 |
return output_path
|
| 176 |
+
|
| 177 |
def generate_and_save_audio(text, Language="American English",voice="af_bella", speed=1,remove_silence=False,keep_silence_up_to=0.05):
|
| 178 |
text=clean_text(text)
|
| 179 |
update_pipeline(Language)
|
|
|
|
| 202 |
audio_int16 = (audio_np * 32767).astype(np.int16) # Scale to 16-bit range
|
| 203 |
audio_bytes = audio_int16.tobytes() # Convert to bytes
|
| 204 |
# Write the audio chunk to the WAV file
|
| 205 |
+
duration_sec = len(audio_np) / 24000
|
| 206 |
+
timestamps[i]["duration"] = duration_sec
|
| 207 |
wav_file.writeframes(audio_bytes)
|
| 208 |
if remove_silence:
|
| 209 |
keep_silence = int(keep_silence_up_to * 1000)
|
|
|
|
| 211 |
return new_wave_file,timestamps
|
| 212 |
return save_path,timestamps
|
| 213 |
|
| 214 |
+
|
| 215 |
+
|
| 216 |
def adjust_timestamps(timestamp_dict):
    """Convert per-chunk word timestamps into one cumulative audio timeline.

    Args:
        timestamp_dict: Mapping of segment id -> {"words": [{"word", "start",
            "end"}, ...], "duration": <chunk length in seconds>}. Segments are
            processed in sorted key order.

    Returns:
        List of {"word", "start", "end"} dicts whose start/end are offset by
        the total duration of all preceding chunks, rounded to 3 decimals.
    """
    adjusted_timestamps = []
    last_global_end = 0  # Cumulative audio timeline (seconds)

    for segment_id in sorted(timestamp_dict.keys()):
        segment = timestamp_dict[segment_id]
        words = segment["words"]
        chunk_duration = segment["duration"]

        # Last usable word end inside this chunk. default=0 prevents a
        # ValueError when the chunk has words but none carry a valid end
        # (all None/0) — the bare max() would see an empty generator.
        last_word_end_in_chunk = max(
            (w["end"] for w in words if w["end"] not in [None, 0]),
            default=0,
        ) if words else 0

        # Trailing silence in this chunk. NOTE(review): currently unused
        # downstream of this function — kept for future use/diagnostics.
        silence_gap = chunk_duration - last_word_end_in_chunk
        if silence_gap < 0:  # Rare: end > duration due to rounding
            silence_gap = 0

        for word in words:
            start = word["start"] or 0   # Missing start -> chunk beginning
            end = word["end"] or start   # Missing end collapses to start

            adjusted_timestamps.append({
                "word": word["word"],
                "start": round(last_global_end + start, 3),
                "end": round(last_global_end + end, 3)
            })

        # Advance the global clock by the whole chunk, including its silence.
        last_global_end += chunk_duration

    return adjusted_timestamps
|
| 249 |
|
| 250 |
|
| 251 |
+
|
| 252 |
import string
|
| 253 |
|
| 254 |
def write_word_srt(word_level_timestamps, output_file="word.srt", skip_punctuation=True):
|
|
|
|
| 282 |
|
| 283 |
import string
|
| 284 |
|
| 285 |
+
|
| 286 |
+
def split_line_by_char_limit(text, max_chars=30):
    """Wrap text into lines of at most max_chars characters (soft limit).

    Splits on whitespace and packs words greedily. If the final line would be
    a single orphan word, it is merged into the previous line, so that line
    may slightly exceed max_chars. A word longer than max_chars gets a line
    of its own.

    Args:
        text: The text to wrap.
        max_chars: Soft maximum number of characters per line.

    Returns:
        The wrapped text, lines joined with newlines.
    """
    words = text.split()
    lines = []
    current_line = ""

    for word in words:
        if len(current_line + " " + word) <= max_chars:
            current_line = (current_line + " " + word).strip()
        else:
            # Bug fix: only flush a non-empty line. Previously, a first word
            # longer than max_chars appended "" and produced a leading blank
            # line in the subtitle text.
            if current_line:
                lines.append(current_line)
            current_line = word

    if current_line:
        # Avoid a single-word orphan line: glue it onto the previous line.
        if len(current_line.split()) == 1 and len(lines) > 0:
            lines[-1] += " " + current_line
        else:
            lines.append(current_line)

    return "\n".join(lines)
|
| 307 |
+
|
| 308 |
+
|
| 309 |
def write_sentence_srt(word_level_timestamps, output_file="subtitles.srt", max_words=8, min_pause=0.1):
|
| 310 |
subtitles = [] # Stores subtitle blocks
|
| 311 |
subtitle_words = [] # Temporary list for words in the current subtitle
|
|
|
|
| 371 |
# Write subtitles to SRT file
|
| 372 |
with open(output_file, "w", encoding="utf-8") as f:
|
| 373 |
for i, (start, end, text) in enumerate(subtitles, start=1):
|
| 374 |
+
text=split_line_by_char_limit(text, max_chars=30)
|
| 375 |
f.write(f"{i}\n{format_srt_time(start)} --> {format_srt_time(end)}\n{text}\n\n")
|
| 376 |
|
| 377 |
# print(f"SRT file '{output_file}' created successfully!")
|
|
|
|
| 620 |
@click.option("--debug", is_flag=True, default=False, help="Enable debug mode.")
|
| 621 |
@click.option("--share", is_flag=True, default=False, help="Enable sharing of the interface.")
|
| 622 |
def main(debug, share):
|
| 623 |
+
# def main(debug=True, share=True):
|
| 624 |
demo1 = ui()
|
| 625 |
demo2 = tutorial()
|
| 626 |
demo = gr.TabbedInterface([demo1, demo2],["Multilingual TTS","VoicePack Explanation"],title="Kokoro TTS")#,theme='JohnSmith9982/small_and_pretty')
|
|
|
|
| 638 |
pipeline = KPipeline(lang_code=last_used_language)
|
| 639 |
temp_folder = create_audio_dir()
|
| 640 |
if __name__ == "__main__":
|
| 641 |
+
main()
|