walidadebayo committed
Commit e904bea · Parent: cc98bbc

Refactor multi-speaker TTS to improve subtitle generation by consolidating word boundary handling and processing phrases for SRT output
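The pattern this commit consolidates: each speaker segment is synthesized once through edge_tts's streaming API, which yields both audio chunks and WordBoundary events; boundary offsets (reported in 100-nanosecond ticks) are shifted by the running offset of previously generated segments, so all boundaries can be pooled and phrased in a single pass at the end. A minimal standalone sketch of that pattern (the stream_segment helper, voice name, and output path below are illustrative, not part of this change):

import asyncio

import edge_tts


async def stream_segment(text, voice, out_path, current_offset_ms):
    """Synthesize one segment, returning offset-adjusted word boundaries."""
    communicate = edge_tts.Communicate(text, voice)
    boundaries = []
    with open(out_path, "ab") as audio_file:
        async for chunk in communicate.stream():
            if chunk["type"] == "audio":
                audio_file.write(chunk["data"])
            elif chunk["type"] == "WordBoundary":
                adjusted = chunk.copy()
                # edge-tts reports offsets in 100 ns ticks; 1 ms = 10,000 ticks
                adjusted["offset"] += current_offset_ms * 10000
                boundaries.append(adjusted)
    return boundaries


words = asyncio.run(stream_segment("Hello there, world.", "en-US-AriaNeural", "seg0.mp3", 0))
print([(w["text"], w["offset"] // 10000) for w in words])  # (word, start in ms)

Phrase grouping then happens once over the pooled list rather than per segment, which is what allows the per-segment subtitle branch to be removed below.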

Files changed (1)
  1. app.py +64 -71
app.py CHANGED

@@ -3,10 +3,7 @@ import edge_tts
 import asyncio
 import tempfile
 import os
-import json
-import datetime
 import re
-import io
 
 
 async def get_voices():
@@ -371,7 +368,7 @@ async def multi_speaker_tts(text, speaker_settings, generate_subtitles=False):
     # Process each speaker segment with the corresponding voice
     with tempfile.TemporaryDirectory() as temp_dir:
         audio_segments = []
-        subtitle_entries = []
+        all_word_boundaries = []  # Collect all word boundaries for subtitle generation
         current_offset = 0  # Track the time offset in milliseconds
 
         for i, segment in enumerate(speaker_segments):
@@ -399,70 +396,19 @@ async def multi_speaker_tts(text, speaker_settings, generate_subtitles=False):
                 text, voice_short_name, rate=rate_str, volume=volume_str, pitch=pitch_str
             )
 
-            # For subtitle generation, we need word boundaries
-            if generate_subtitles:
-                word_boundaries = []
-                async for chunk in communicate.stream():
-                    if chunk["type"] == "audio":
-                        with open(segment_file, "ab") as audio_file:
-                            audio_file.write(chunk["data"])
-                    elif chunk["type"] == "WordBoundary":
-                        # Adjust offset to account for previous segments
-                        adjusted_chunk = chunk.copy()
-                        adjusted_chunk["offset"] += current_offset * 10000  # Convert ms to 100ns units
-                        word_boundaries.append(adjusted_chunk)
-
-                # Process word boundaries for subtitles
-                if word_boundaries:
-                    # Group words into phrases for subtitles
-                    phrases = []
-                    current_phrase = []
-                    current_text = ""
-                    phrase_start = 0
-
-                    for j, boundary in enumerate(word_boundaries):
-                        word = boundary["text"]
-                        start_time = boundary["offset"] / 10000
-                        duration = boundary["duration"] / 10000
-                        end_time = start_time + duration
-
-                        if not current_phrase:
-                            phrase_start = start_time
-
-                        current_phrase.append(boundary)
-
-                        if word in ['.', ',', '!', '?', ':', ';'] or word.startswith(('.', ',', '!', '?', ':', ';')):
-                            current_text = current_text.rstrip() + word + " "
-                        else:
-                            current_text += word + " "
-
-                        # Determine if we should end this phrase
-                        should_break = False
-
-                        if word.endswith(('.', '!', '?', ':', ';', ',')) or j == len(word_boundaries) - 1:
-                            should_break = True
-                        elif len(current_phrase) >= 5:
-                            should_break = True
-                        elif j < len(word_boundaries) - 1:
-                            next_start = word_boundaries[j + 1]["offset"] / 10000
-                            if next_start - end_time > 300:
-                                should_break = True
-
-                        if should_break or j == len(word_boundaries) - 1:
-                            if current_phrase:
-                                last_boundary = current_phrase[-1]
-                                phrase_end = (last_boundary["offset"] + last_boundary["duration"]) / 10000
-                                phrases.append({
-                                    "text": f"[{speaker}] {current_text.strip()}",
-                                    "start": phrase_start,
-                                    "end": phrase_end
-                                })
-                                subtitle_entries.extend(phrases)
-                                current_phrase = []
-                                current_text = ""
-            else:
-                # Simple audio generation without subtitles
-                await communicate.save(segment_file)
+            # Collect word boundaries for subtitle generation
+            word_boundaries = []
+            async for chunk in communicate.stream():
+                if chunk["type"] == "audio":
+                    with open(segment_file, "ab") as audio_file:
+                        audio_file.write(chunk["data"])
+                elif chunk["type"] == "WordBoundary":
+                    # Adjust offset to account for previous segments
+                    adjusted_chunk = chunk.copy()
+                    adjusted_chunk["offset"] += current_offset * 10000  # Convert ms to 100ns units
+                    word_boundaries.append(adjusted_chunk)
+
+            all_word_boundaries.extend(word_boundaries)
 
             # Get duration of the generated audio
             from pydub import AudioSegment
@@ -489,11 +435,58 @@ async def multi_speaker_tts(text, speaker_settings, generate_subtitles=False):
 
     # Generate subtitles file if requested
     if generate_subtitles and subtitle_path:
+        # Process all word boundaries into phrases for subtitles
+        phrases = []
+        current_phrase = []
+        current_text = ""
+        phrase_start = 0
+
+        for j, boundary in enumerate(all_word_boundaries):
+            word = boundary["text"]
+            start_time = boundary["offset"] / 10000
+            duration = boundary["duration"] / 10000
+            end_time = start_time + duration
+
+            if not current_phrase:
+                phrase_start = start_time
+
+            current_phrase.append(boundary)
+
+            if word in ['.', ',', '!', '?', ':', ';'] or word.startswith(('.', ',', '!', '?', ':', ';')):
+                current_text = current_text.rstrip() + word + " "
+            else:
+                current_text += word + " "
+
+            # Determine if we should end this phrase
+            should_break = False
+
+            if word.endswith(('.', '!', '?', ':', ';', ',')) or j == len(all_word_boundaries) - 1:
+                should_break = True
+            elif len(current_phrase) >= 5:
+                should_break = True
+            elif j < len(all_word_boundaries) - 1:
+                next_start = all_word_boundaries[j + 1]["offset"] / 10000
+                if next_start - end_time > 300:
+                    should_break = True
+
+            if should_break or j == len(all_word_boundaries) - 1:
+                if current_phrase:
+                    last_boundary = current_phrase[-1]
+                    phrase_end = (last_boundary["offset"] + last_boundary["duration"]) / 10000
+                    phrases.append({
+                        "text": current_text.strip(),
+                        "start": phrase_start,
+                        "end": phrase_end
+                    })
+                current_phrase = []
+                current_text = ""
+
+        # Write phrases to SRT file
         with open(subtitle_path, "w", encoding="utf-8") as f:
-            for i, entry in enumerate(subtitle_entries):
+            for i, phrase in enumerate(phrases):
                 f.write(f"{i+1}\n")
-                f.write(f"{format_time(entry['start'])} --> {format_time(entry['end'])}\n")
-                f.write(f"{entry['text']}\n\n")
+                f.write(f"{format_time(phrase['start'])} --> {format_time(phrase['end'])}\n")
+                f.write(f"{phrase['text']}\n\n")
 
     return final_audio_path, subtitle_path, None
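The SRT writer above relies on a format_time() helper that exists elsewhere in app.py and is not shown in this change. A plausible reconstruction, assuming it takes a value in milliseconds (as the offset/10000 arithmetic above implies) and returns SRT's HH:MM:SS,mmm timestamp format:

def format_time(ms: float) -> str:
    """Milliseconds -> SRT timestamp HH:MM:SS,mmm (hypothetical reconstruction)."""
    ms = int(ms)
    hours, ms = divmod(ms, 3_600_000)
    minutes, ms = divmod(ms, 60_000)
    seconds, millis = divmod(ms, 1_000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"


print(format_time(83456.7))  # -> 00:01:23,456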