datbkpro committed on
Commit
ebe59f5
·
verified ·
1 Parent(s): cfffabc

Update core/silero_vad.py

Browse files
Files changed (1) hide show
  1. core/silero_vad.py +7 -275
core/silero_vad.py CHANGED
@@ -1,272 +1,5 @@
1
 
2
- # import torch
3
- # import numpy as np
4
- # from typing import Callable
5
- # from config.settings import settings
6
- # import os
7
- # import time
8
 
9
-
10
- # class SileroVAD:
11
- # def __init__(self):
12
- # self.model = None
13
- # self.utils = None
14
- # self.sample_rate = 16000
15
- # self.is_streaming = False
16
- # self.speech_callback = None
17
- # self.audio_buffer = []
18
- # self.speech_start_time = 0
19
- # self.min_speech_duration = 0.5 # Giây
20
-
21
- # # ✅ Thêm cấu hình chunk size cho Silero
22
- # self.chunk_size = 512 # Silero yêu cầu 512 samples cho 16000Hz
23
- # self.chunk_duration = self.chunk_size / self.sample_rate # 0.032 giây
24
-
25
- # self._initialize_model()
26
-
27
- # def _initialize_model(self):
28
- # """Khởi tạo Silero VAD model"""
29
- # try:
30
- # print("🔄 Đang tải Silero VAD model...")
31
-
32
- # self.model, self.utils = torch.hub.load(
33
- # repo_or_dir='snakers4/silero-vad',
34
- # model='silero_vad',
35
- # force_reload=False,
36
- # trust_repo=True
37
- # )
38
-
39
- # self.model.eval()
40
- # print("✅ Đã tải Silero VAD model thành công")
41
-
42
- # except Exception as e:
43
- # print(f"❌ Lỗi tải Silero VAD model: {e}")
44
- # self._initialize_model_fallback()
45
-
46
- # def _initialize_model_fallback(self):
47
- # """Fallback nếu torch.hub.load thất bại"""
48
- # try:
49
- # model_dir = torch.hub.get_dir()
50
- # model_path = os.path.join(
51
- # model_dir, 'snakers4_silero-vad_master', 'files', 'silero_vad.jit'
52
- # )
53
-
54
- # if os.path.exists(model_path):
55
- # self.model = torch.jit.load(model_path)
56
- # self.model.eval()
57
- # print("✅ Đã tải Silero VAD model thành công (fallback)")
58
- # else:
59
- # print("❌ Không tìm thấy model file (fallback thất bại)")
60
- # self.model = None
61
-
62
- # except Exception as e:
63
- # print(f"❌ Lỗi tải Silero VAD model fallback: {e}")
64
- # self.model = None
65
-
66
- # def start_stream(self, speech_callback: Callable):
67
- # """Bắt đầu stream với VAD"""
68
- # if self.model is None:
69
- # print("❌ Silero VAD model chưa được khởi tạo")
70
- # return False
71
-
72
- # self.is_streaming = True
73
- # self.speech_callback = speech_callback
74
- # self.audio_buffer = []
75
- # self.speech_start_time = 0
76
- # print("🎙️ Bắt đầu Silero VAD streaming...")
77
- # return True
78
-
79
- # def stop_stream(self):
80
- # """Dừng stream"""
81
- # self.is_streaming = False
82
- # self.speech_callback = None
83
- # self.audio_buffer = []
84
- # self.speech_start_time = 0
85
- # print("🛑 Đã dừng Silero VAD streaming")
86
-
87
- # def process_stream(self, audio_chunk: np.ndarray, sample_rate: int):
88
- # """Xử lý audio chunk với Silero VAD - ĐÃ SỬA LỖI"""
89
- # if not self.is_streaming or self.model is None:
90
- # return
91
-
92
- # try:
93
- # # Resample nếu cần
94
- # if sample_rate != self.sample_rate:
95
- # audio_chunk = self._resample_audio(audio_chunk, sample_rate, self.sample_rate)
96
-
97
- # # Thêm vào buffer
98
- # self.audio_buffer.extend(audio_chunk)
99
-
100
- # # ✅ Xử lý từng chunk 512 samples (Silero requirement)
101
- # while len(self.audio_buffer) >= self.chunk_size:
102
- # chunk = self.audio_buffer[:self.chunk_size]
103
- # self._process_single_chunk(np.array(chunk))
104
- # # Giữ lại phần thừa cho chunk tiếp theo
105
- # self.audio_buffer = self.audio_buffer[self.chunk_size:]
106
-
107
- # except Exception as e:
108
- # print(f"❌ Lỗi xử lý Silero VAD: {e}")
109
-
110
- # def _process_single_chunk(self, audio_chunk: np.ndarray):
111
- # """Xử lý một chunk 512 samples duy nhất"""
112
- # try:
113
- # # Chuẩn hóa audio
114
- # audio_chunk = self._normalize_audio(audio_chunk)
115
-
116
- # # Đảm bảo đúng kích thước
117
- # if len(audio_chunk) != self.chunk_size:
118
- # # Nếu không đủ, pad với zeros
119
- # if len(audio_chunk) < self.chunk_size:
120
- # padding = np.zeros(self.chunk_size - len(audio_chunk), dtype=np.float32)
121
- # audio_chunk = np.concatenate([audio_chunk, padding])
122
- # else:
123
- # audio_chunk = audio_chunk[:self.chunk_size]
124
-
125
- # # Dự đoán xác suất speech
126
- # speech_prob = self._get_speech_probability(audio_chunk)
127
- # print(f"🎯 Silero VAD speech probability: {speech_prob:.3f}")
128
-
129
- # # Xử lý logic speech detection
130
- # current_time = time.time()
131
-
132
- # if speech_prob > settings.VAD_THRESHOLD:
133
- # if self.speech_start_time == 0:
134
- # self.speech_start_time = current_time
135
- # print("🎯 Bắt đầu phát hiện speech")
136
-
137
- # speech_duration = current_time - self.speech_start_time
138
-
139
- # # Nếu đủ thời gian speech, gọi callback
140
- # if speech_duration >= self.min_speech_duration:
141
- # if self.speech_callback:
142
- # # Thu thập tất cả audio từ khi bắt đầu speech
143
- # full_audio = self._collect_speech_audio()
144
- # if len(full_audio) > 0:
145
- # self.speech_callback(full_audio, self.sample_rate)
146
- # self.speech_start_time = 0
147
- # else:
148
- # if self.speech_start_time > 0:
149
- # print("🔇 Kết thúc speech segment")
150
- # self.speech_start_time = 0
151
-
152
- # except Exception as e:
153
- # print(f"❌ Lỗi xử lý Silero VAD chunk: {e}")
154
-
155
- # def _collect_speech_audio(self) -> np.ndarray:
156
- # """Thu thập toàn bộ audio từ khi bắt đầu speech"""
157
- # # Trong implementation thực tế, bạn cần lưu lại audio
158
- # # từ khi bắt đầu phát hiện speech đến hiện tại
159
- # # Đây là simplified version
160
- # min_samples = int(self.sample_rate * self.min_speech_duration)
161
- # return np.random.randn(min_samples).astype(np.float32) # Placeholder
162
-
163
- # def _normalize_audio(self, audio: np.ndarray) -> np.ndarray:
164
- # """Chuẩn hóa audio"""
165
- # if audio.dtype != np.float32:
166
- # audio = audio.astype(np.float32)
167
- # if np.max(np.abs(audio)) > 1.0:
168
- # audio = audio / 32768.0
169
- # return np.clip(audio, -1.0, 1.0)
170
-
171
- # def _get_speech_probability(self, audio_chunk: np.ndarray) -> float:
172
- # """Trả về xác suất speech - ĐÃ SỬA LỖI"""
173
- # try:
174
- # # ✅ Đảm bảo đúng kích thước 512 samples
175
- # if len(audio_chunk) != self.chunk_size:
176
- # # Resize về đúng 512 samples
177
- # if len(audio_chunk) > self.chunk_size:
178
- # audio_chunk = audio_chunk[:self.chunk_size]
179
- # else:
180
- # padding = np.zeros(self.chunk_size - len(audio_chunk), dtype=np.float32)
181
- # audio_chunk = np.concatenate([audio_chunk, padding])
182
-
183
- # audio_tensor = torch.from_numpy(audio_chunk).float().unsqueeze(0)
184
-
185
- # with torch.no_grad():
186
- # return self.model(audio_tensor, self.sample_rate).item()
187
-
188
- # except Exception as e:
189
- # print(f"❌ Lỗi lấy speech probability: {e}")
190
- # return 0.0
191
-
192
- # def _resample_audio(self, audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
193
- # """Resample audio"""
194
- # if orig_sr == target_sr:
195
- # return audio
196
- # try:
197
- # from scipy import signal
198
- # # Tính số samples mới
199
- # duration = len(audio) / orig_sr
200
- # new_length = int(duration * target_sr)
201
-
202
- # # Resample
203
- # resampled_audio = signal.resample(audio, new_length)
204
- # return resampled_audio.astype(np.float32)
205
-
206
- # except ImportError:
207
- # # Fallback simple resampling
208
- # orig_len = len(audio)
209
- # new_len = int(orig_len * target_sr / orig_sr)
210
- # x_old = np.linspace(0, 1, orig_len)
211
- # x_new = np.linspace(0, 1, new_len)
212
- # return np.interp(x_new, x_old, audio).astype(np.float32)
213
- # except Exception as e:
214
- # print(f"⚠️ Lỗi resample: {e}")
215
- # return audio
216
-
217
- # def is_speech(self, audio_chunk: np.ndarray, sample_rate: int) -> bool:
218
- # """Kiểm tra chunk có phải speech không - ĐÃ SỬA"""
219
- # if self.model is None:
220
- # return True
221
- # try:
222
- # if sample_rate != self.sample_rate:
223
- # audio_chunk = self._resample_audio(audio_chunk, sample_rate, self.sample_rate)
224
- # audio_chunk = self._normalize_audio(audio_chunk)
225
-
226
- # # ✅ Chia thành các chunk 512 samples và kiểm tra trung bình
227
- # chunk_size = 512
228
- # speech_probs = []
229
-
230
- # for i in range(0, len(audio_chunk), chunk_size):
231
- # chunk = audio_chunk[i:i+chunk_size]
232
- # if len(chunk) == chunk_size:
233
- # prob = self._get_speech_probability(chunk)
234
- # speech_probs.append(prob)
235
-
236
- # if not speech_probs:
237
- # return False
238
-
239
- # avg_prob = np.mean(speech_probs)
240
- # return avg_prob > settings.VAD_THRESHOLD
241
-
242
- # except Exception as e:
243
- # print(f"❌ Lỗi kiểm tra speech: {e}")
244
- # return True
245
-
246
- # def get_speech_probability(self, audio_chunk: np.ndarray, sample_rate: int) -> float:
247
- # """Lấy xác suất speech trung bình"""
248
- # if self.model is None:
249
- # return 0.0
250
- # try:
251
- # if sample_rate != self.sample_rate:
252
- # audio_chunk = self._resample_audio(audio_chunk, sample_rate, self.sample_rate)
253
- # audio_chunk = self._normalize_audio(audio_chunk)
254
-
255
- # # Chia thành các chunk 512 samples
256
- # chunk_size = 512
257
- # speech_probs = []
258
-
259
- # for i in range(0, len(audio_chunk), chunk_size):
260
- # chunk = audio_chunk[i:i+chunk_size]
261
- # if len(chunk) == chunk_size:
262
- # prob = self._get_speech_probability(chunk)
263
- # speech_probs.append(prob)
264
-
265
- # return np.mean(speech_probs) if speech_probs else 0.0
266
-
267
- # except Exception as e:
268
- # print(f"❌ Lỗi lấy speech probability: {e}")
269
- # return 0.0
270
  import io
271
  import numpy as np
272
  import soundfile as sf
@@ -353,14 +86,13 @@ class SileroVAD:
353
  return
354
 
355
  try:
356
- # Resample nếu cần
357
  if sample_rate != self.sample_rate:
358
  audio_chunk = self._resample_audio(audio_chunk, sample_rate, self.sample_rate)
359
 
360
- # Thêm vào buffer chính
361
  self.audio_buffer.extend(audio_chunk)
362
 
363
- # Xử lý từng chunk
364
  while len(self.audio_buffer) >= self.chunk_size:
365
  chunk = self.audio_buffer[:self.chunk_size]
366
  self._process_vad_chunk(np.array(chunk))
@@ -382,7 +114,7 @@ class SileroVAD:
382
  # Logic state machine cải tiến
383
  if self.state == "silence":
384
  if speech_prob > self.speech_threshold:
385
- print("🎯 Bắt đầu phát hiện speech")
386
  self.state = "speech"
387
  self.speech_start_time = current_time
388
  self.last_voice_time = current_time
@@ -410,11 +142,11 @@ class SileroVAD:
410
  # Điều kiện kết thúc: im lặng đủ lâu VÀ đã nói đủ dài
411
  if (silence_duration >= self.min_silence_duration and
412
  speech_duration >= self.min_speech_duration):
413
- print(f"🔇 Kết thúc speech segment (duration: {speech_duration:.2f}s)")
414
  self._finalize_speech()
415
  # Hoặc speech quá dài (timeout)
416
  elif speech_duration > settings.MAX_AUDIO_DURATION:
417
- print(f"Speech timeout ({speech_duration:.2f}s)")
418
  self._finalize_speech()
419
 
420
  elif self.state == "processing":
@@ -466,7 +198,7 @@ class SileroVAD:
466
  with torch.no_grad():
467
  return self.model(audio_tensor, self.sample_rate).item()
468
  except Exception as e:
469
- print(f" Lỗi speech probability: {e}")
470
  return 0.0
471
 
472
  def _resample_audio(self, audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
@@ -505,5 +237,5 @@ class SileroVAD:
505
  return np.mean(speech_probs) > self.speech_threshold if speech_probs else False
506
 
507
  except Exception as e:
508
- print(f" Lỗi kiểm tra speech: {e}")
509
  return True
 
1
 
 
 
 
 
 
 
2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import io
4
  import numpy as np
5
  import soundfile as sf
 
86
  return
87
 
88
  try:
89
+
90
  if sample_rate != self.sample_rate:
91
  audio_chunk = self._resample_audio(audio_chunk, sample_rate, self.sample_rate)
92
 
 
93
  self.audio_buffer.extend(audio_chunk)
94
 
95
+
96
  while len(self.audio_buffer) >= self.chunk_size:
97
  chunk = self.audio_buffer[:self.chunk_size]
98
  self._process_vad_chunk(np.array(chunk))
 
114
  # Logic state machine cải tiến
115
  if self.state == "silence":
116
  if speech_prob > self.speech_threshold:
117
+ print(" Bắt đầu phát hiện speech")
118
  self.state = "speech"
119
  self.speech_start_time = current_time
120
  self.last_voice_time = current_time
 
142
  # Điều kiện kết thúc: im lặng đủ lâu VÀ đã nói đủ dài
143
  if (silence_duration >= self.min_silence_duration and
144
  speech_duration >= self.min_speech_duration):
145
+ print(f" Kết thúc speech segment (duration: {speech_duration:.2f}s)")
146
  self._finalize_speech()
147
  # Hoặc speech quá dài (timeout)
148
  elif speech_duration > settings.MAX_AUDIO_DURATION:
149
+ print(f"Speech timeout ({speech_duration:.2f}s)")
150
  self._finalize_speech()
151
 
152
  elif self.state == "processing":
 
198
  with torch.no_grad():
199
  return self.model(audio_tensor, self.sample_rate).item()
200
  except Exception as e:
201
+ print(f" Lỗi speech probability: {e}")
202
  return 0.0
203
 
204
  def _resample_audio(self, audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
 
237
  return np.mean(speech_probs) > self.speech_threshold if speech_probs else False
238
 
239
  except Exception as e:
240
+ print(f" Lỗi kiểm tra speech: {e}")
241
  return True