Commit e904bea
Parent(s): cc98bbc

Refactor multi-speaker TTS to improve subtitle generation by consolidating word boundary handling and processing phrases for SRT output
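The hunks below call a `format_time` helper that is not part of this diff. As a point of reference, here is a minimal sketch of what such a helper presumably looks like, assuming it receives the millisecond values the code derives by dividing edge-tts 100-nanosecond offsets by 10000 (the function name comes from the diff; the body is an illustration, not the repository's actual implementation):

```python
def format_time(ms: float) -> str:
    """Format a millisecond offset as an SRT timestamp (HH:MM:SS,mmm)."""
    total = int(ms)
    hours, total = divmod(total, 3_600_000)
    minutes, total = divmod(total, 60_000)
    seconds, millis = divmod(total, 1_000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"
```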
app.py CHANGED
@@ -3,10 +3,7 @@ import edge_tts
 import asyncio
 import tempfile
 import os
-import json
-import datetime
 import re
-import io
 
 
 async def get_voices():
@@ -371,7 +368,7 @@ async def multi_speaker_tts(text, speaker_settings, generate_subtitles=False):
     # Process each speaker segment with the corresponding voice
     with tempfile.TemporaryDirectory() as temp_dir:
         audio_segments = []
-
+        all_word_boundaries = []  # Collect all word boundaries for subtitle generation
         current_offset = 0  # Track the time offset in milliseconds
 
         for i, segment in enumerate(speaker_segments):
@@ -399,70 +396,19 @@ async def multi_speaker_tts(text, speaker_settings, generate_subtitles=False):
                 text, voice_short_name, rate=rate_str, volume=volume_str, pitch=pitch_str
             )
 
-            #
-
-
-
-
-
-
-
-
-
-
-
-
-            # Process word boundaries for subtitles
-            if word_boundaries:
-                # Group words into phrases for subtitles
-                phrases = []
-                current_phrase = []
-                current_text = ""
-                phrase_start = 0
-
-                for j, boundary in enumerate(word_boundaries):
-                    word = boundary["text"]
-                    start_time = boundary["offset"] / 10000
-                    duration = boundary["duration"] / 10000
-                    end_time = start_time + duration
-
-                    if not current_phrase:
-                        phrase_start = start_time
-
-                    current_phrase.append(boundary)
-
-                    if word in ['.', ',', '!', '?', ':', ';'] or word.startswith(('.', ',', '!', '?', ':', ';')):
-                        current_text = current_text.rstrip() + word + " "
-                    else:
-                        current_text += word + " "
-
-                    # Determine if we should end this phrase
-                    should_break = False
-
-                    if word.endswith(('.', '!', '?', ':', ';', ',')) or j == len(word_boundaries) - 1:
-                        should_break = True
-                    elif len(current_phrase) >= 5:
-                        should_break = True
-                    elif j < len(word_boundaries) - 1:
-                        next_start = word_boundaries[j + 1]["offset"] / 10000
-                        if next_start - end_time > 300:
-                            should_break = True
-
-                    if should_break or j == len(word_boundaries) - 1:
-                        if current_phrase:
-                            last_boundary = current_phrase[-1]
-                            phrase_end = (last_boundary["offset"] + last_boundary["duration"]) / 10000
-                            phrases.append({
-                                "text": f"[{speaker}] {current_text.strip()}",
-                                "start": phrase_start,
-                                "end": phrase_end
-                            })
-                            subtitle_entries.extend(phrases)
-                            current_phrase = []
-                            current_text = ""
-            else:
-                # Simple audio generation without subtitles
-                await communicate.save(segment_file)
+            # Collect word boundaries for subtitle generation
+            word_boundaries = []
+            async for chunk in communicate.stream():
+                if chunk["type"] == "audio":
+                    with open(segment_file, "ab") as audio_file:
+                        audio_file.write(chunk["data"])
+                elif chunk["type"] == "WordBoundary":
+                    # Adjust offset to account for previous segments
+                    adjusted_chunk = chunk.copy()
+                    adjusted_chunk["offset"] += current_offset * 10000  # Convert ms to 100ns units
+                    word_boundaries.append(adjusted_chunk)
+
+            all_word_boundaries.extend(word_boundaries)
 
             # Get duration of the generated audio
             from pydub import AudioSegment
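For context on the added streaming loop: `edge_tts.Communicate.stream()` is an async generator that yields `audio` chunks (raw bytes under `"data"`) and `WordBoundary` events whose `offset` and `duration` are expressed in 100-nanosecond units, which is why the code multiplies the millisecond `current_offset` by 10000 before adding it. A self-contained sketch of the same pattern (the wrapper name, voice, and output path are placeholders, not taken from app.py):

```python
import asyncio
import edge_tts

async def stream_with_boundaries(text: str, voice: str, out_path: str, offset_ms: int = 0):
    """Stream TTS audio to a file and collect word boundaries shifted by offset_ms."""
    communicate = edge_tts.Communicate(text, voice)
    boundaries = []
    with open(out_path, "wb") as audio_file:
        async for chunk in communicate.stream():
            if chunk["type"] == "audio":
                audio_file.write(chunk["data"])           # raw audio bytes
            elif chunk["type"] == "WordBoundary":
                adjusted = dict(chunk)
                adjusted["offset"] += offset_ms * 10_000  # ms -> 100 ns units
                boundaries.append(adjusted)
    return boundaries

# Hypothetical usage:
# boundaries = asyncio.run(stream_with_boundaries("Hello world", "en-US-AriaNeural", "out.mp3"))
```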
@@ -489,11 +435,58 @@ async def multi_speaker_tts(text, speaker_settings, generate_subtitles=False):
 
     # Generate subtitles file if requested
     if generate_subtitles and subtitle_path:
+        # Process all word boundaries into phrases for subtitles
+        phrases = []
+        current_phrase = []
+        current_text = ""
+        phrase_start = 0
+
+        for j, boundary in enumerate(all_word_boundaries):
+            word = boundary["text"]
+            start_time = boundary["offset"] / 10000
+            duration = boundary["duration"] / 10000
+            end_time = start_time + duration
+
+            if not current_phrase:
+                phrase_start = start_time
+
+            current_phrase.append(boundary)
+
+            if word in ['.', ',', '!', '?', ':', ';'] or word.startswith(('.', ',', '!', '?', ':', ';')):
+                current_text = current_text.rstrip() + word + " "
+            else:
+                current_text += word + " "
+
+            # Determine if we should end this phrase
+            should_break = False
+
+            if word.endswith(('.', '!', '?', ':', ';', ',')) or j == len(all_word_boundaries) - 1:
+                should_break = True
+            elif len(current_phrase) >= 5:
+                should_break = True
+            elif j < len(all_word_boundaries) - 1:
+                next_start = all_word_boundaries[j + 1]["offset"] / 10000
+                if next_start - end_time > 300:
+                    should_break = True
+
+            if should_break or j == len(all_word_boundaries) - 1:
+                if current_phrase:
+                    last_boundary = current_phrase[-1]
+                    phrase_end = (last_boundary["offset"] + last_boundary["duration"]) / 10000
+                    phrases.append({
+                        "text": current_text.strip(),
+                        "start": phrase_start,
+                        "end": phrase_end
+                    })
+                    current_phrase = []
+                    current_text = ""
+
+        # Write phrases to SRT file
         with open(subtitle_path, "w", encoding="utf-8") as f:
-            for i,
+            for i, phrase in enumerate(phrases):
                 f.write(f"{i+1}\n")
-                f.write(f"{format_time(
-                f.write(f"{
+                f.write(f"{format_time(phrase['start'])} --> {format_time(phrase['end'])}\n")
+                f.write(f"{phrase['text']}\n\n")
 
     return final_audio_path, subtitle_path, None
 
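For reference, each phrase dictionary from the loop above becomes one SRT cue: a 1-based index, a `start --> end` timestamp line, the phrase text, and a blank separator line. With illustrative timestamps and text, the write loop produces output of this shape:

```
1
00:00:00,000 --> 00:00:01,850
Hello there, how are you

2
00:00:02,100 --> 00:00:03,400
doing today?
```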