johnm87 committed on
Commit
1afd111
·
verified ·
1 Parent(s): af25078

Ref wav VAD trimming (#22)

Browse files

- vad trimming for ref wavs (96bdb699f097eb3c5905a7b4b89512f482ccaf77)
- rm pycache files (3646fe5d4c0343b60bd4a64342d81f0079d072df)
- add ref wav vad trimming option (f975abbb26a0f12189284b9c63b3743478864444)

Files changed (48) hide show
  1. .gitignore +1 -0
  2. app.py +12 -8
  3. chatterbox/src/chatterbox/__pycache__/__init__.cpython-311.pyc +0 -0
  4. chatterbox/src/chatterbox/__pycache__/tts.cpython-311.pyc +0 -0
  5. chatterbox/src/chatterbox/__pycache__/utils.cpython-311.pyc +0 -0
  6. chatterbox/src/chatterbox/__pycache__/vc.cpython-311.pyc +0 -0
  7. chatterbox/src/chatterbox/models/s3gen/__pycache__/__init__.cpython-311.pyc +0 -0
  8. chatterbox/src/chatterbox/models/s3gen/__pycache__/const.cpython-311.pyc +0 -0
  9. chatterbox/src/chatterbox/models/s3gen/__pycache__/decoder.cpython-311.pyc +0 -0
  10. chatterbox/src/chatterbox/models/s3gen/__pycache__/f0_predictor.cpython-311.pyc +0 -0
  11. chatterbox/src/chatterbox/models/s3gen/__pycache__/flow.cpython-311.pyc +0 -0
  12. chatterbox/src/chatterbox/models/s3gen/__pycache__/flow_matching.cpython-311.pyc +0 -0
  13. chatterbox/src/chatterbox/models/s3gen/__pycache__/hifigan.cpython-311.pyc +0 -0
  14. chatterbox/src/chatterbox/models/s3gen/__pycache__/s3gen.cpython-311.pyc +0 -0
  15. chatterbox/src/chatterbox/models/s3gen/__pycache__/xvector.cpython-311.pyc +0 -0
  16. chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/decoder.cpython-311.pyc +0 -0
  17. chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/flow_matching.cpython-311.pyc +0 -0
  18. chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/transformer.cpython-311.pyc +0 -0
  19. chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/__init__.cpython-311.pyc +0 -0
  20. chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/activation.cpython-311.pyc +0 -0
  21. chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/attention.cpython-311.pyc +0 -0
  22. chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/convolution.cpython-311.pyc +0 -0
  23. chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/embedding.cpython-311.pyc +0 -0
  24. chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/encoder_layer.cpython-311.pyc +0 -0
  25. chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/positionwise_feed_forward.cpython-311.pyc +0 -0
  26. chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/subsampling.cpython-311.pyc +0 -0
  27. chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/upsample_encoder.cpython-311.pyc +0 -0
  28. chatterbox/src/chatterbox/models/s3gen/utils/__pycache__/class_utils.cpython-311.pyc +0 -0
  29. chatterbox/src/chatterbox/models/s3gen/utils/__pycache__/mask.cpython-311.pyc +0 -0
  30. chatterbox/src/chatterbox/models/s3gen/utils/__pycache__/mel.cpython-311.pyc +0 -0
  31. chatterbox/src/chatterbox/models/s3tokenizer/__pycache__/__init__.cpython-311.pyc +0 -0
  32. chatterbox/src/chatterbox/models/s3tokenizer/__pycache__/s3tokenizer.cpython-311.pyc +0 -0
  33. chatterbox/src/chatterbox/models/t3/__pycache__/__init__.cpython-311.pyc +0 -0
  34. chatterbox/src/chatterbox/models/t3/__pycache__/llama_configs.cpython-311.pyc +0 -0
  35. chatterbox/src/chatterbox/models/t3/__pycache__/t3.cpython-311.pyc +0 -0
  36. chatterbox/src/chatterbox/models/t3/inference/__pycache__/alignment_stream_analyzer.cpython-311.pyc +0 -0
  37. chatterbox/src/chatterbox/models/t3/inference/__pycache__/t3_hf_backend.cpython-311.pyc +0 -0
  38. chatterbox/src/chatterbox/models/t3/modules/__pycache__/cond_enc.cpython-311.pyc +0 -0
  39. chatterbox/src/chatterbox/models/t3/modules/__pycache__/learned_pos_emb.cpython-311.pyc +0 -0
  40. chatterbox/src/chatterbox/models/t3/modules/__pycache__/perceiver.cpython-311.pyc +0 -0
  41. chatterbox/src/chatterbox/models/t3/modules/__pycache__/t3_config.cpython-311.pyc +0 -0
  42. chatterbox/src/chatterbox/models/tokenizers/__pycache__/__init__.cpython-311.pyc +0 -0
  43. chatterbox/src/chatterbox/models/tokenizers/__pycache__/tokenizer.cpython-311.pyc +0 -0
  44. chatterbox/src/chatterbox/models/voice_encoder/__pycache__/__init__.cpython-311.pyc +0 -0
  45. chatterbox/src/chatterbox/models/voice_encoder/__pycache__/config.cpython-311.pyc +0 -0
  46. chatterbox/src/chatterbox/models/voice_encoder/__pycache__/melspec.cpython-311.pyc +0 -0
  47. chatterbox/src/chatterbox/models/voice_encoder/__pycache__/voice_encoder.cpython-311.pyc +0 -0
  48. chatterbox/src/chatterbox/tts.py +31 -5
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__
app.py CHANGED
@@ -49,13 +49,14 @@ def generate_tts_audio(
49
  exaggeration_input: float = 0.5,
50
  temperature_input: float = 0.8,
51
  seed_num_input: int = 0,
52
- cfgw_input: float = 0.5
 
53
  ) -> tuple[int, np.ndarray]:
54
  """
55
  Generate high-quality speech audio from text using ChatterboxTTS model with optional reference audio styling.
56
-
57
- This tool synthesizes natural-sounding speech from input text. When a reference audio file
58
- is provided, it captures the speaker's voice characteristics and speaking style. The generated audio
59
  maintains the prosody, tone, and vocal qualities of the reference speaker, or uses default voice if no reference is provided.
60
 
61
  Args:
@@ -78,17 +79,18 @@ def generate_tts_audio(
78
  set_seed(int(seed_num_input))
79
 
80
  print(f"Generating audio for text: '{text_input[:50]}...'")
81
-
82
  # Handle optional audio prompt
83
  generate_kwargs = {
84
  "exaggeration": exaggeration_input,
85
  "temperature": temperature_input,
86
  "cfg_weight": cfgw_input,
 
87
  }
88
-
89
  if audio_prompt_path_input:
90
  generate_kwargs["audio_prompt_path"] = audio_prompt_path_input
91
-
92
  wav = current_model.generate(
93
  text_input[:300], # Truncate text to max chars
94
  **generate_kwargs
@@ -126,6 +128,7 @@ with gr.Blocks() as demo:
126
  with gr.Accordion("More options", open=False):
127
  seed_num = gr.Number(value=0, label="Random seed (0 for random)")
128
  temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
 
129
 
130
  run_btn = gr.Button("Generate", variant="primary")
131
 
@@ -141,8 +144,9 @@ with gr.Blocks() as demo:
141
  temp,
142
  seed_num,
143
  cfg_weight,
 
144
  ],
145
  outputs=[audio_output],
146
  )
147
 
148
- demo.launch(mcp_server=True)
 
49
  exaggeration_input: float = 0.5,
50
  temperature_input: float = 0.8,
51
  seed_num_input: int = 0,
52
+ cfgw_input: float = 0.5,
53
+ vad_trim_input: bool = False,
54
  ) -> tuple[int, np.ndarray]:
55
  """
56
  Generate high-quality speech audio from text using ChatterboxTTS model with optional reference audio styling.
57
+
58
+ This tool synthesizes natural-sounding speech from input text. When a reference audio file
59
+ is provided, it captures the speaker's voice characteristics and speaking style. The generated audio
60
  maintains the prosody, tone, and vocal qualities of the reference speaker, or uses default voice if no reference is provided.
61
 
62
  Args:
 
79
  set_seed(int(seed_num_input))
80
 
81
  print(f"Generating audio for text: '{text_input[:50]}...'")
82
+
83
  # Handle optional audio prompt
84
  generate_kwargs = {
85
  "exaggeration": exaggeration_input,
86
  "temperature": temperature_input,
87
  "cfg_weight": cfgw_input,
88
+ "vad_trim": vad_trim_input,
89
  }
90
+
91
  if audio_prompt_path_input:
92
  generate_kwargs["audio_prompt_path"] = audio_prompt_path_input
93
+
94
  wav = current_model.generate(
95
  text_input[:300], # Truncate text to max chars
96
  **generate_kwargs
 
128
  with gr.Accordion("More options", open=False):
129
  seed_num = gr.Number(value=0, label="Random seed (0 for random)")
130
  temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
131
+ vad_trim = gr.Checkbox(label="Ref VAD trimming", value=False)
132
 
133
  run_btn = gr.Button("Generate", variant="primary")
134
 
 
144
  temp,
145
  seed_num,
146
  cfg_weight,
147
+ vad_trim,
148
  ],
149
  outputs=[audio_output],
150
  )
151
 
152
+ demo.launch(mcp_server=True)
chatterbox/src/chatterbox/__pycache__/__init__.cpython-311.pyc DELETED
Binary file (275 Bytes)
 
chatterbox/src/chatterbox/__pycache__/tts.cpython-311.pyc DELETED
Binary file (13.3 kB)
 
chatterbox/src/chatterbox/__pycache__/utils.cpython-311.pyc DELETED
Binary file (858 Bytes)
 
chatterbox/src/chatterbox/__pycache__/vc.cpython-311.pyc DELETED
Binary file (5.44 kB)
 
chatterbox/src/chatterbox/models/s3gen/__pycache__/__init__.cpython-311.pyc DELETED
Binary file (294 Bytes)
 
chatterbox/src/chatterbox/models/s3gen/__pycache__/const.cpython-311.pyc DELETED
Binary file (190 Bytes)
 
chatterbox/src/chatterbox/models/s3gen/__pycache__/decoder.cpython-311.pyc DELETED
Binary file (16.9 kB)
 
chatterbox/src/chatterbox/models/s3gen/__pycache__/f0_predictor.cpython-311.pyc DELETED
Binary file (2.7 kB)
 
chatterbox/src/chatterbox/models/s3gen/__pycache__/flow.cpython-311.pyc DELETED
Binary file (13.7 kB)
 
chatterbox/src/chatterbox/models/s3gen/__pycache__/flow_matching.cpython-311.pyc DELETED
Binary file (13.3 kB)
 
chatterbox/src/chatterbox/models/s3gen/__pycache__/hifigan.cpython-311.pyc DELETED
Binary file (26.3 kB)
 
chatterbox/src/chatterbox/models/s3gen/__pycache__/s3gen.cpython-311.pyc DELETED
Binary file (13.7 kB)
 
chatterbox/src/chatterbox/models/s3gen/__pycache__/xvector.cpython-311.pyc DELETED
Binary file (24 kB)
 
chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/decoder.cpython-311.pyc DELETED
Binary file (21.3 kB)
 
chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/flow_matching.cpython-311.pyc DELETED
Binary file (6.46 kB)
 
chatterbox/src/chatterbox/models/s3gen/matcha/__pycache__/transformer.cpython-311.pyc DELETED
Binary file (14.7 kB)
 
chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/__init__.cpython-311.pyc DELETED
Binary file (190 Bytes)
 
chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/activation.cpython-311.pyc DELETED
Binary file (3.58 kB)
 
chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/attention.cpython-311.pyc DELETED
Binary file (15.7 kB)
 
chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/convolution.cpython-311.pyc DELETED
Binary file (5.54 kB)
 
chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/embedding.cpython-311.pyc DELETED
Binary file (17.3 kB)
 
chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/encoder_layer.cpython-311.pyc DELETED
Binary file (11.2 kB)
 
chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/positionwise_feed_forward.cpython-311.pyc DELETED
Binary file (6.24 kB)
 
chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/subsampling.cpython-311.pyc DELETED
Binary file (18.9 kB)
 
chatterbox/src/chatterbox/models/s3gen/transformer/__pycache__/upsample_encoder.cpython-311.pyc DELETED
Binary file (15.6 kB)
 
chatterbox/src/chatterbox/models/s3gen/utils/__pycache__/class_utils.cpython-311.pyc DELETED
Binary file (1.93 kB)
 
chatterbox/src/chatterbox/models/s3gen/utils/__pycache__/mask.cpython-311.pyc DELETED
Binary file (6.25 kB)
 
chatterbox/src/chatterbox/models/s3gen/utils/__pycache__/mel.cpython-311.pyc DELETED
Binary file (4.05 kB)
 
chatterbox/src/chatterbox/models/s3tokenizer/__pycache__/__init__.cpython-311.pyc DELETED
Binary file (1.37 kB)
 
chatterbox/src/chatterbox/models/s3tokenizer/__pycache__/s3tokenizer.cpython-311.pyc DELETED
Binary file (7.94 kB)
 
chatterbox/src/chatterbox/models/t3/__pycache__/__init__.cpython-311.pyc DELETED
Binary file (218 Bytes)
 
chatterbox/src/chatterbox/models/t3/__pycache__/llama_configs.cpython-311.pyc DELETED
Binary file (1.34 kB)
 
chatterbox/src/chatterbox/models/t3/__pycache__/t3.cpython-311.pyc DELETED
Binary file (15.8 kB)
 
chatterbox/src/chatterbox/models/t3/inference/__pycache__/alignment_stream_analyzer.cpython-311.pyc DELETED
Binary file (7.08 kB)
 
chatterbox/src/chatterbox/models/t3/inference/__pycache__/t3_hf_backend.cpython-311.pyc DELETED
Binary file (4.65 kB)
 
chatterbox/src/chatterbox/models/t3/modules/__pycache__/cond_enc.cpython-311.pyc DELETED
Binary file (5.37 kB)
 
chatterbox/src/chatterbox/models/t3/modules/__pycache__/learned_pos_emb.cpython-311.pyc DELETED
Binary file (2.54 kB)
 
chatterbox/src/chatterbox/models/t3/modules/__pycache__/perceiver.cpython-311.pyc DELETED
Binary file (12.6 kB)
 
chatterbox/src/chatterbox/models/t3/modules/__pycache__/t3_config.cpython-311.pyc DELETED
Binary file (1.27 kB)
 
chatterbox/src/chatterbox/models/tokenizers/__pycache__/__init__.cpython-311.pyc DELETED
Binary file (242 Bytes)
 
chatterbox/src/chatterbox/models/tokenizers/__pycache__/tokenizer.cpython-311.pyc DELETED
Binary file (3.1 kB)
 
chatterbox/src/chatterbox/models/voice_encoder/__pycache__/__init__.cpython-311.pyc DELETED
Binary file (281 Bytes)
 
chatterbox/src/chatterbox/models/voice_encoder/__pycache__/config.cpython-311.pyc DELETED
Binary file (859 Bytes)
 
chatterbox/src/chatterbox/models/voice_encoder/__pycache__/melspec.cpython-311.pyc DELETED
Binary file (3.59 kB)
 
chatterbox/src/chatterbox/models/voice_encoder/__pycache__/voice_encoder.cpython-311.pyc DELETED
Binary file (18.7 kB)
 
chatterbox/src/chatterbox/tts.py CHANGED
@@ -2,10 +2,12 @@ from dataclasses import dataclass
2
  from pathlib import Path
3
 
4
  import librosa
 
5
  import torch
6
  import perth
7
  import torch.nn.functional as F
8
  from huggingface_hub import hf_hub_download
 
9
 
10
  from .models.t3 import T3
11
  from .models.s3tokenizer import S3_SR, drop_invalid_tokens
@@ -121,6 +123,7 @@ class ChatterboxTTS:
121
  self.device = device
122
  self.conds = conds
123
  self.watermarker = perth.PerthImplicitWatermarker()
 
124
 
125
  @classmethod
126
  def from_local(cls, ckpt_dir, device) -> 'ChatterboxTTS':
@@ -162,11 +165,33 @@ class ChatterboxTTS:
162
 
163
  return cls.from_local(Path(local_path).parent, device)
164
 
165
- def prepare_conditionals(self, wav_fpath, exaggeration=0.5):
166
- ## Load reference wav
167
- s3gen_ref_wav, _sr = librosa.load(wav_fpath, sr=S3GEN_SR)
168
 
169
- ref_16k_wav = librosa.resample(s3gen_ref_wav, orig_sr=S3GEN_SR, target_sr=S3_SR)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
 
171
  s3gen_ref_wav = s3gen_ref_wav[:self.DEC_COND_LEN]
172
  s3gen_ref_dict = self.s3gen.embed_ref(s3gen_ref_wav, S3GEN_SR, device=self.device)
@@ -195,9 +220,10 @@ class ChatterboxTTS:
195
  exaggeration=0.5,
196
  cfg_weight=0.5,
197
  temperature=0.8,
 
198
  ):
199
  if audio_prompt_path:
200
- self.prepare_conditionals(audio_prompt_path, exaggeration=exaggeration)
201
  else:
202
  assert self.conds is not None, "Please `prepare_conditionals` first or specify `audio_prompt_path`"
203
 
 
2
  from pathlib import Path
3
 
4
  import librosa
5
+ import numpy as np
6
  import torch
7
  import perth
8
  import torch.nn.functional as F
9
  from huggingface_hub import hf_hub_download
10
+ from silero_vad import load_silero_vad, get_speech_timestamps
11
 
12
  from .models.t3 import T3
13
  from .models.s3tokenizer import S3_SR, drop_invalid_tokens
 
123
  self.device = device
124
  self.conds = conds
125
  self.watermarker = perth.PerthImplicitWatermarker()
126
+ self.silero_vad = load_silero_vad()
127
 
128
  @classmethod
129
  def from_local(cls, ckpt_dir, device) -> 'ChatterboxTTS':
 
165
 
166
  return cls.from_local(Path(local_path).parent, device)
167
 
168
+ def trim_excess_silence(self, wav, sr):
169
+ "Trim excess silence from speech. Input must be a multiple of 16kHz."
170
+ assert sr % 16_000 == 0, "Silero requires an integer multiple of 16kHz"
171
 
172
+ # Get VAD as sample-level bool array
173
+ silero_regions = get_speech_timestamps(wav, self.silero_vad, sampling_rate=sr)
174
+ vad = np.zeros_like(wav)
175
+ for region in silero_regions:
176
+ vad[region["start"]:region["end"]] = 1
177
+
178
+ # Dilate VAD
179
+ max_silence_ms = 400
180
+ cfilter = np.ones(int(sr * max_silence_ms / (2 * 1000)))
181
+ dilated_vad = np.convolve(vad, cfilter, mode="same") > 0
182
+
183
+ # Trim out silence
184
+ return wav[dilated_vad]
185
+
186
+ def prepare_conditionals(self, wav_fpath, exaggeration=0.5, vad_trim=False):
187
+ # Load reference wav at high SR and trim silence
188
+ ref_wav, highres_sr = librosa.load(wav_fpath, sr=48_000)
189
+ if vad_trim:
190
+ ref_wav = self.trim_excess_silence(ref_wav, highres_sr)
191
+
192
+ # Resample down
193
+ s3gen_ref_wav = librosa.resample(ref_wav, orig_sr=highres_sr, target_sr=S3GEN_SR)
194
+ ref_16k_wav = librosa.resample(ref_wav, orig_sr=highres_sr, target_sr=S3_SR)
195
 
196
  s3gen_ref_wav = s3gen_ref_wav[:self.DEC_COND_LEN]
197
  s3gen_ref_dict = self.s3gen.embed_ref(s3gen_ref_wav, S3GEN_SR, device=self.device)
 
220
  exaggeration=0.5,
221
  cfg_weight=0.5,
222
  temperature=0.8,
223
+ vad_trim=False,
224
  ):
225
  if audio_prompt_path:
226
+ self.prepare_conditionals(audio_prompt_path, exaggeration=exaggeration, vad_trim=vad_trim)
227
  else:
228
  assert self.conds is not None, "Please `prepare_conditionals` first or specify `audio_prompt_path`"
229