Spaces:

ResembleAI
/

Chatterbox

Running on Zero

App Files Files Community

johnm87 committed 5 days ago

Commit

1afd111

verified ·

1 Parent(s): af25078

Ref wav VAD trimming (#22)

Browse files

- vad trimming for ref wavs (96bdb699f097eb3c5905a7b4b89512f482ccaf77)
- rm pycache files (3646fe5d4c0343b60bd4a64342d81f0079d072df)
- add ref wav vad trimming option (f975abbb26a0f12189284b9c63b3743478864444)

Files changed (48) hide show

.gitignore +1 -0
app.py +12 -8
chatterbox/src/chatterbox/__pycache__/__init__.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/__pycache__/tts.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/__pycache__/utils.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/__pycache__/vc.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/__pycache__/__init__.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/__pycache__/const.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/__pycache__/decoder.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/__pycache__/f0_predictor.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/__pycache__/flow.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/__pycache__/flow_matching.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/__pycache__/hifigan.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/__pycache__/s3gen.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/__pycache__/xvector.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/decoder.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/flow_matching.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/transformer.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/__init__.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/activation.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/attention.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/convolution.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/embedding.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/encoder_layer.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/positionwise_feed_forward.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/subsampling.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/upsample_encoder.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/utils/__pycache__/class_utils.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/utils/__pycache__/mask.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/s3gen/utils/__pycache__/mel.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/s3tokenizer/__pycache__/__init__.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/s3tokenizer/__pycache__/s3tokenizer.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/t3/__pycache__/__init__.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/t3/__pycache__/llama_configs.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/t3/__pycache__/t3.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/t3/inference/__pycache__/alignment_stream_analyzer.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/t3/inference/__pycache__/t3_hf_backend.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/t3/modules/__pycache__/cond_enc.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/t3/modules/__pycache__/learned_pos_emb.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/t3/modules/__pycache__/perceiver.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/t3/modules/__pycache__/t3_config.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/tokenizers/__pycache__/__init__.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/tokenizers/__pycache__/tokenizer.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/voice_encoder/__pycache__/__init__.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/voice_encoder/__pycache__/config.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/voice_encoder/__pycache__/melspec.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/models/voice_encoder/__pycache__/voice_encoder.cpython-311.pyc +0 -0
chatterbox/src/chatterbox/tts.py +31 -5

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ __pycache__

app.py CHANGED Viewed

@@ -49,13 +49,14 @@ def generate_tts_audio(
     exaggeration_input: float = 0.5,
     temperature_input: float = 0.8,
     seed_num_input: int = 0,
-    cfgw_input: float = 0.5
 ) -> tuple[int, np.ndarray]:
     """
     Generate high-quality speech audio from text using ChatterboxTTS model with optional reference audio styling.
-    This tool synthesizes natural-sounding speech from input text. When a reference audio file
-    is provided, it captures the speaker's voice characteristics and speaking style. The generated audio
     maintains the prosody, tone, and vocal qualities of the reference speaker, or uses default voice if no reference is provided.
     Args:
@@ -78,17 +79,18 @@ def generate_tts_audio(
         set_seed(int(seed_num_input))
     print(f"Generating audio for text: '{text_input[:50]}...'")
     # Handle optional audio prompt
     generate_kwargs = {
         "exaggeration": exaggeration_input,
         "temperature": temperature_input,
         "cfg_weight": cfgw_input,
     }
     if audio_prompt_path_input:
         generate_kwargs["audio_prompt_path"] = audio_prompt_path_input
     wav = current_model.generate(
         text_input[:300],  # Truncate text to max chars
         **generate_kwargs
@@ -126,6 +128,7 @@ with gr.Blocks() as demo:
             with gr.Accordion("More options", open=False):
                 seed_num = gr.Number(value=0, label="Random seed (0 for random)")
                 temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
             run_btn = gr.Button("Generate", variant="primary")
@@ -141,8 +144,9 @@ with gr.Blocks() as demo:
             temp,
             seed_num,
             cfg_weight,
         ],
         outputs=[audio_output],
     )
-demo.launch(mcp_server=True)

     exaggeration_input: float = 0.5,
     temperature_input: float = 0.8,
     seed_num_input: int = 0,
+    cfgw_input: float = 0.5,
+    vad_trim_input: bool = False,
 ) -> tuple[int, np.ndarray]:
     """
     Generate high-quality speech audio from text using ChatterboxTTS model with optional reference audio styling.
+    This tool synthesizes natural-sounding speech from input text. When a reference audio file
+    is provided, it captures the speaker's voice characteristics and speaking style. The generated audio
     maintains the prosody, tone, and vocal qualities of the reference speaker, or uses default voice if no reference is provided.
     Args:
         set_seed(int(seed_num_input))
     print(f"Generating audio for text: '{text_input[:50]}...'")
     # Handle optional audio prompt
     generate_kwargs = {
         "exaggeration": exaggeration_input,
         "temperature": temperature_input,
         "cfg_weight": cfgw_input,
+        "vad_trim": vad_trim_input,
     }
     if audio_prompt_path_input:
         generate_kwargs["audio_prompt_path"] = audio_prompt_path_input
     wav = current_model.generate(
         text_input[:300],  # Truncate text to max chars
         **generate_kwargs
             with gr.Accordion("More options", open=False):
                 seed_num = gr.Number(value=0, label="Random seed (0 for random)")
                 temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
+                vad_trim = gr.Checkbox(label="Ref VAD trimming", value=False)
             run_btn = gr.Button("Generate", variant="primary")
             temp,
             seed_num,
             cfg_weight,
+            vad_trim,
         ],
         outputs=[audio_output],
     )
+demo.launch(mcp_server=True)

chatterbox/src/chatterbox/__pycache__/__init__.cpython-311.pyc DELETED Viewed

Binary file (275 Bytes)

chatterbox/src/chatterbox/__pycache__/tts.cpython-311.pyc DELETED Viewed

Binary file (13.3 kB)

chatterbox/src/chatterbox/__pycache__/utils.cpython-311.pyc DELETED Viewed

Binary file (858 Bytes)

chatterbox/src/chatterbox/__pycache__/vc.cpython-311.pyc DELETED Viewed

Binary file (5.44 kB)

chatterbox/src/chatterbox/models/s3gen/__pycache__/__init__.cpython-311.pyc DELETED Viewed

Binary file (294 Bytes)

chatterbox/src/chatterbox/models/s3gen/__pycache__/const.cpython-311.pyc DELETED Viewed

Binary file (190 Bytes)

chatterbox/src/chatterbox/models/s3gen/__pycache__/decoder.cpython-311.pyc DELETED Viewed

Binary file (16.9 kB)

chatterbox/src/chatterbox/models/s3gen/__pycache__/f0_predictor.cpython-311.pyc DELETED Viewed

Binary file (2.7 kB)

chatterbox/src/chatterbox/models/s3gen/__pycache__/flow.cpython-311.pyc DELETED Viewed

Binary file (13.7 kB)

chatterbox/src/chatterbox/models/s3gen/__pycache__/flow_matching.cpython-311.pyc DELETED Viewed

Binary file (13.3 kB)

chatterbox/src/chatterbox/models/s3gen/__pycache__/hifigan.cpython-311.pyc DELETED Viewed

Binary file (26.3 kB)

chatterbox/src/chatterbox/models/s3gen/__pycache__/s3gen.cpython-311.pyc DELETED Viewed

Binary file (13.7 kB)

chatterbox/src/chatterbox/models/s3gen/__pycache__/xvector.cpython-311.pyc DELETED Viewed

Binary file (24 kB)

chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/decoder.cpython-311.pyc DELETED Viewed

Binary file (21.3 kB)

chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/flow_matching.cpython-311.pyc DELETED Viewed

Binary file (6.46 kB)

chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/transformer.cpython-311.pyc DELETED Viewed

Binary file (14.7 kB)

chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/__init__.cpython-311.pyc DELETED Viewed

Binary file (190 Bytes)

chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/activation.cpython-311.pyc DELETED Viewed

Binary file (3.58 kB)

chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/attention.cpython-311.pyc DELETED Viewed

Binary file (15.7 kB)

chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/convolution.cpython-311.pyc DELETED Viewed

Binary file (5.54 kB)

chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/embedding.cpython-311.pyc DELETED Viewed

Binary file (17.3 kB)

chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/encoder_layer.cpython-311.pyc DELETED Viewed

Binary file (11.2 kB)

chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/positionwise_feed_forward.cpython-311.pyc DELETED Viewed

Binary file (6.24 kB)

chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/subsampling.cpython-311.pyc DELETED Viewed

Binary file (18.9 kB)

chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/upsample_encoder.cpython-311.pyc DELETED Viewed

Binary file (15.6 kB)

chatterbox/src/chatterbox/models/s3gen/utils/__pycache__/class_utils.cpython-311.pyc DELETED Viewed

Binary file (1.93 kB)

chatterbox/src/chatterbox/models/s3gen/utils/__pycache__/mask.cpython-311.pyc DELETED Viewed

Binary file (6.25 kB)

chatterbox/src/chatterbox/models/s3gen/utils/__pycache__/mel.cpython-311.pyc DELETED Viewed

Binary file (4.05 kB)

chatterbox/src/chatterbox/models/s3tokenizer/__pycache__/__init__.cpython-311.pyc DELETED Viewed

Binary file (1.37 kB)

chatterbox/src/chatterbox/models/s3tokenizer/__pycache__/s3tokenizer.cpython-311.pyc DELETED Viewed

Binary file (7.94 kB)

chatterbox/src/chatterbox/models/t3/__pycache__/__init__.cpython-311.pyc DELETED Viewed

Binary file (218 Bytes)

chatterbox/src/chatterbox/models/t3/__pycache__/llama_configs.cpython-311.pyc DELETED Viewed

Binary file (1.34 kB)

chatterbox/src/chatterbox/models/t3/__pycache__/t3.cpython-311.pyc DELETED Viewed

Binary file (15.8 kB)

chatterbox/src/chatterbox/models/t3/inference/__pycache__/alignment_stream_analyzer.cpython-311.pyc DELETED Viewed

Binary file (7.08 kB)

chatterbox/src/chatterbox/models/t3/inference/__pycache__/t3_hf_backend.cpython-311.pyc DELETED Viewed

Binary file (4.65 kB)

chatterbox/src/chatterbox/models/t3/modules/__pycache__/cond_enc.cpython-311.pyc DELETED Viewed

Binary file (5.37 kB)

chatterbox/src/chatterbox/models/t3/modules/__pycache__/learned_pos_emb.cpython-311.pyc DELETED Viewed

Binary file (2.54 kB)

chatterbox/src/chatterbox/models/t3/modules/__pycache__/perceiver.cpython-311.pyc DELETED Viewed

Binary file (12.6 kB)

chatterbox/src/chatterbox/models/t3/modules/__pycache__/t3_config.cpython-311.pyc DELETED Viewed

Binary file (1.27 kB)

chatterbox/src/chatterbox/models/tokenizers/__pycache__/__init__.cpython-311.pyc DELETED Viewed

Binary file (242 Bytes)

chatterbox/src/chatterbox/models/tokenizers/__pycache__/tokenizer.cpython-311.pyc DELETED Viewed

Binary file (3.1 kB)

chatterbox/src/chatterbox/models/voice_encoder/__pycache__/__init__.cpython-311.pyc DELETED Viewed

Binary file (281 Bytes)

chatterbox/src/chatterbox/models/voice_encoder/__pycache__/config.cpython-311.pyc DELETED Viewed

Binary file (859 Bytes)

chatterbox/src/chatterbox/models/voice_encoder/__pycache__/melspec.cpython-311.pyc DELETED Viewed

Binary file (3.59 kB)

chatterbox/src/chatterbox/models/voice_encoder/__pycache__/voice_encoder.cpython-311.pyc DELETED Viewed

Binary file (18.7 kB)

chatterbox/src/chatterbox/tts.py CHANGED Viewed

@@ -2,10 +2,12 @@ from dataclasses import dataclass
 from pathlib import Path
 import librosa
 import torch
 import perth
 import torch.nn.functional as F
 from huggingface_hub import hf_hub_download
 from .models.t3 import T3
 from .models.s3tokenizer import S3_SR, drop_invalid_tokens
@@ -121,6 +123,7 @@ class ChatterboxTTS:
         self.device = device
         self.conds = conds
         self.watermarker = perth.PerthImplicitWatermarker()
     @classmethod
     def from_local(cls, ckpt_dir, device) -> 'ChatterboxTTS':
@@ -162,11 +165,33 @@ class ChatterboxTTS:
         return cls.from_local(Path(local_path).parent, device)
-    def prepare_conditionals(self, wav_fpath, exaggeration=0.5):
-        ## Load reference wav
-        s3gen_ref_wav, _sr = librosa.load(wav_fpath, sr=S3GEN_SR)
-        ref_16k_wav = librosa.resample(s3gen_ref_wav, orig_sr=S3GEN_SR, target_sr=S3_SR)
         s3gen_ref_wav = s3gen_ref_wav[:self.DEC_COND_LEN]
         s3gen_ref_dict = self.s3gen.embed_ref(s3gen_ref_wav, S3GEN_SR, device=self.device)
@@ -195,9 +220,10 @@ class ChatterboxTTS:
         exaggeration=0.5,
         cfg_weight=0.5,
         temperature=0.8,
     ):
         if audio_prompt_path:
-            self.prepare_conditionals(audio_prompt_path, exaggeration=exaggeration)
         else:
             assert self.conds is not None, "Please `prepare_conditionals` first or specify `audio_prompt_path`"

 from pathlib import Path
 import librosa
+import numpy as np
 import torch
 import perth
 import torch.nn.functional as F
 from huggingface_hub import hf_hub_download
+from silero_vad import load_silero_vad, get_speech_timestamps
 from .models.t3 import T3
 from .models.s3tokenizer import S3_SR, drop_invalid_tokens
         self.device = device
         self.conds = conds
         self.watermarker = perth.PerthImplicitWatermarker()
+        self.silero_vad = load_silero_vad()
     @classmethod
     def from_local(cls, ckpt_dir, device) -> 'ChatterboxTTS':
         return cls.from_local(Path(local_path).parent, device)
+    def trim_excess_silence(self, wav, sr, max_silence_ms=400):
+        """
+        Trim excess silence from speech using the Silero VAD.
+
+        Each speech region reported by Silero is widened by about `max_silence_ms / 2`
+        on either side, and every sample outside the widened regions is dropped. Pauses
+        between speech regions are therefore capped at roughly `max_silence_ms`.
+
+        Args:
+            wav: Audio samples, indexed along their first axis (assumed to be a 1-D mono
+                array such as `librosa.load` returns -- TODO confirm for other callers).
+            sr: Sample rate of `wav`. Must be an integer multiple of 16kHz.
+            max_silence_ms: Longest stretch of silence, in milliseconds, kept between two
+                speech regions.
+
+        Returns:
+            The samples of `wav` inside the widened speech regions, or `wav` unchanged
+            when Silero detects no speech at all.
+        """
+        assert sr % 16_000 == 0, "Silero requires an integer multiple of 16kHz"
+
+        silero_regions = get_speech_timestamps(wav, self.silero_vad, sampling_rate=sr)
+        if not silero_regions:
+            # No speech found: keep the input rather than returning an empty reference,
+            # which is presumably unusable for the conditioning steps that follow.
+            return wav
+
+        # Widen the regions directly instead of convolving a sample-level mask with a
+        # box filter. The result matches np.convolve(..., mode="same") > 0 (left pad
+        # (width - 1) // 2, right pad width // 2) but costs O(len(wav)) and stays
+        # aligned with `wav` even when the clip is shorter than the filter.
+        width = int(sr * max_silence_ms / (2 * 1000))
+        pad_left = max(width - 1, 0) // 2
+        pad_right = width // 2
+        keep = np.zeros(len(wav), dtype=bool)
+        for region in silero_regions:
+            # A negative start would wrap around, so clamp it; an end past the array
+            # is clipped by slicing itself.
+            keep[max(0, region["start"] - pad_left):region["end"] + pad_right] = True
+
+        # Trim out silence
+        return wav[keep]
+    def prepare_conditionals(self, wav_fpath, exaggeration=0.5, vad_trim=False):
+        # Load reference wav at high SR and trim silence
+        ref_wav, highres_sr = librosa.load(wav_fpath, sr=48_000)
+        if vad_trim:
+            ref_wav = self.trim_excess_silence(ref_wav, highres_sr)
+        # Resample down
+        s3gen_ref_wav = librosa.resample(ref_wav, orig_sr=highres_sr, target_sr=S3GEN_SR)
+        ref_16k_wav = librosa.resample(ref_wav, orig_sr=highres_sr, target_sr=S3_SR)
         s3gen_ref_wav = s3gen_ref_wav[:self.DEC_COND_LEN]
         s3gen_ref_dict = self.s3gen.embed_ref(s3gen_ref_wav, S3GEN_SR, device=self.device)
         exaggeration=0.5,
         cfg_weight=0.5,
         temperature=0.8,
+        vad_trim=False,
     ):
         if audio_prompt_path:
+            self.prepare_conditionals(audio_prompt_path, exaggeration=exaggeration, vad_trim=vad_trim)
         else:
             assert self.conds is not None, "Please `prepare_conditionals` first or specify `audio_prompt_path`"