Commit e904bea
Parent(s): cc98bbc

Refactor multi-speaker TTS to improve subtitle generation by consolidating word boundary handling and processing phrases for SRT output
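The hunks below call a `format_time` helper that is not part of this diff. As a point of reference, here is a minimal sketch of what such a helper presumably looks like, assuming it receives the millisecond values the code derives by dividing edge-tts 100-nanosecond offsets by 10000 (the function name comes from the diff; the body is an illustration, not the repository's actual implementation):

```python
def format_time(ms: float) -> str:
    """Format a millisecond offset as an SRT timestamp (HH:MM:SS,mmm)."""
    total = int(ms)
    hours, total = divmod(total, 3_600_000)
    minutes, total = divmod(total, 60_000)
    seconds, millis = divmod(total, 1_000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"
```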
app.py CHANGED
@@ -3,10 +3,7 @@ import edge_tts
 import asyncio
 import tempfile
 import os
-import json
-import datetime
 import re
-import io
 
 
 async def get_voices():
@@ -371,7 +368,7 @@ async def multi_speaker_tts(text, speaker_settings, generate_subtitles=False):
     # Process each speaker segment with the corresponding voice
     with tempfile.TemporaryDirectory() as temp_dir:
         audio_segments = []
-
+        all_word_boundaries = []  # Collect all word boundaries for subtitle generation
         current_offset = 0  # Track the time offset in milliseconds
 
         for i, segment in enumerate(speaker_segments):
@@ -399,70 +396,19 @@ async def multi_speaker_tts(text, speaker_settings, generate_subtitles=False):
                 text, voice_short_name, rate=rate_str, volume=volume_str, pitch=pitch_str
             )
 
-            #
-
-
-
-
-
-
-
-
-
-
-
-
-            # Process word boundaries for subtitles
-            if word_boundaries:
-                # Group words into phrases for subtitles
-                phrases = []
-                current_phrase = []
-                current_text = ""
-                phrase_start = 0
-
-                for j, boundary in enumerate(word_boundaries):
-                    word = boundary["text"]
-                    start_time = boundary["offset"] / 10000
-                    duration = boundary["duration"] / 10000
-                    end_time = start_time + duration
-
-                    if not current_phrase:
-                        phrase_start = start_time
-
-                    current_phrase.append(boundary)
-
-                    if word in ['.', ',', '!', '?', ':', ';'] or word.startswith(('.', ',', '!', '?', ':', ';')):
-                        current_text = current_text.rstrip() + word + " "
-                    else:
-                        current_text += word + " "
-
-                    # Determine if we should end this phrase
-                    should_break = False
-
-                    if word.endswith(('.', '!', '?', ':', ';', ',')) or j == len(word_boundaries) - 1:
-                        should_break = True
-                    elif len(current_phrase) >= 5:
-                        should_break = True
-                    elif j < len(word_boundaries) - 1:
-                        next_start = word_boundaries[j + 1]["offset"] / 10000
-                        if next_start - end_time > 300:
-                            should_break = True
-
-                    if should_break or j == len(word_boundaries) - 1:
-                        if current_phrase:
-                            last_boundary = current_phrase[-1]
-                            phrase_end = (last_boundary["offset"] + last_boundary["duration"]) / 10000
-                            phrases.append({
-                                "text": f"[{speaker}] {current_text.strip()}",
-                                "start": phrase_start,
-                                "end": phrase_end
-                            })
-                            subtitle_entries.extend(phrases)
-                            current_phrase = []
-                            current_text = ""
-            else:
-                # Simple audio generation without subtitles
-                await communicate.save(segment_file)
+            # Collect word boundaries for subtitle generation
+            word_boundaries = []
+            async for chunk in communicate.stream():
+                if chunk["type"] == "audio":
+                    with open(segment_file, "ab") as audio_file:
+                        audio_file.write(chunk["data"])
+                elif chunk["type"] == "WordBoundary":
+                    # Adjust offset to account for previous segments
+                    adjusted_chunk = chunk.copy()
+                    adjusted_chunk["offset"] += current_offset * 10000  # Convert ms to 100ns units
+                    word_boundaries.append(adjusted_chunk)
+
+            all_word_boundaries.extend(word_boundaries)
 
             # Get duration of the generated audio
             from pydub import AudioSegment
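For context on the added streaming loop: `edge_tts.Communicate.stream()` is an async generator that yields `audio` chunks (raw bytes under `"data"`) and `WordBoundary` events whose `offset` and `duration` are expressed in 100-nanosecond units, which is why the code multiplies the millisecond `current_offset` by 10000 before adding it. A self-contained sketch of the same pattern (the wrapper name, voice, and output path are placeholders, not taken from app.py):

```python
import asyncio
import edge_tts

async def stream_with_boundaries(text: str, voice: str, out_path: str, offset_ms: int = 0):
    """Stream TTS audio to a file and collect word boundaries shifted by offset_ms."""
    communicate = edge_tts.Communicate(text, voice)
    boundaries = []
    with open(out_path, "wb") as audio_file:
        async for chunk in communicate.stream():
            if chunk["type"] == "audio":
                audio_file.write(chunk["data"])           # raw audio bytes
            elif chunk["type"] == "WordBoundary":
                adjusted = dict(chunk)
                adjusted["offset"] += offset_ms * 10_000  # ms -> 100 ns units
                boundaries.append(adjusted)
    return boundaries

# Hypothetical usage:
# boundaries = asyncio.run(stream_with_boundaries("Hello world", "en-US-AriaNeural", "out.mp3"))
```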
@@ -489,11 +435,58 @@ async def multi_speaker_tts(text, speaker_settings, generate_subtitles=False):
 
     # Generate subtitles file if requested
     if generate_subtitles and subtitle_path:
+        # Process all word boundaries into phrases for subtitles
+        phrases = []
+        current_phrase = []
+        current_text = ""
+        phrase_start = 0
+
+        for j, boundary in enumerate(all_word_boundaries):
+            word = boundary["text"]
+            start_time = boundary["offset"] / 10000
+            duration = boundary["duration"] / 10000
+            end_time = start_time + duration
+
+            if not current_phrase:
+                phrase_start = start_time
+
+            current_phrase.append(boundary)
+
+            if word in ['.', ',', '!', '?', ':', ';'] or word.startswith(('.', ',', '!', '?', ':', ';')):
+                current_text = current_text.rstrip() + word + " "
+            else:
+                current_text += word + " "
+
+            # Determine if we should end this phrase
+            should_break = False
+
+            if word.endswith(('.', '!', '?', ':', ';', ',')) or j == len(all_word_boundaries) - 1:
+                should_break = True
+            elif len(current_phrase) >= 5:
+                should_break = True
+            elif j < len(all_word_boundaries) - 1:
+                next_start = all_word_boundaries[j + 1]["offset"] / 10000
+                if next_start - end_time > 300:
+                    should_break = True
+
+            if should_break or j == len(all_word_boundaries) - 1:
+                if current_phrase:
+                    last_boundary = current_phrase[-1]
+                    phrase_end = (last_boundary["offset"] + last_boundary["duration"]) / 10000
+                    phrases.append({
+                        "text": current_text.strip(),
+                        "start": phrase_start,
+                        "end": phrase_end
+                    })
+                    current_phrase = []
+                    current_text = ""
+
+        # Write phrases to SRT file
         with open(subtitle_path, "w", encoding="utf-8") as f:
-            for i,
+            for i, phrase in enumerate(phrases):
                 f.write(f"{i+1}\n")
-                f.write(f"{format_time(
-                f.write(f"{
+                f.write(f"{format_time(phrase['start'])} --> {format_time(phrase['end'])}\n")
+                f.write(f"{phrase['text']}\n\n")
 
     return final_audio_path, subtitle_path, None
 
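For reference, each phrase dictionary from the loop above becomes one SRT cue: a 1-based index, a `start --> end` timestamp line, the phrase text, and a blank separator line. With illustrative timestamps and text, the write loop produces output of this shape:

```
1
00:00:00,000 --> 00:00:01,850
Hello there, how are you

2
00:00:02,100 --> 00:00:03,400
doing today?
```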