datbkpro committed on
Commit
01c964f
·
verified ·
1 Parent(s): 38d7d2a

Update core/silero_vad.py

Browse files
Files changed (1) hide show
  1. core/silero_vad.py +58 -38
core/silero_vad.py CHANGED
@@ -17,8 +17,8 @@ class SileroVAD:
17
  self.is_streaming = False
18
  self.speech_callback = None
19
  self.audio_buffer = []
20
- self.speech_buffer = [] # Buffer cho speech đang diễn ra
21
- self.state = "silence" # silence, speech, processing
22
  self.speech_start_time = 0
23
  self.last_voice_time = 0
24
 
@@ -32,7 +32,11 @@ class SileroVAD:
32
 
33
  # Buffer cho pre-speech
34
  self.pre_speech_samples = int(self.pre_speech_buffer * self.sample_rate)
35
- self.pre_speech_buffer = []
 
 
 
 
36
 
37
  self._initialize_model()
38
 
@@ -61,11 +65,13 @@ class SileroVAD:
61
  self.speech_callback = speech_callback
62
  self.audio_buffer = []
63
  self.speech_buffer = []
64
- self.pre_speech_buffer = []
 
 
65
  self.state = "silence"
66
  self.speech_start_time = 0
67
  self.last_voice_time = 0
68
- print("🎙️ Bắt đầu VAD streaming với cấu hình tối ưu...")
69
  return True
70
 
71
  def stop_stream(self):
@@ -74,23 +80,30 @@ class SileroVAD:
74
  self.speech_callback = None
75
  self.audio_buffer = []
76
  self.speech_buffer = []
77
- self.pre_speech_buffer = []
 
 
78
  self.state = "silence"
79
  print("🛑 Đã dừng VAD streaming")
80
 
81
  def process_stream(self, audio_chunk: np.ndarray, sample_rate: int):
82
- """Xử lý audio chunk với VAD tối ưu"""
83
  if not self.is_streaming or self.model is None:
84
  return
85
 
86
  try:
87
-
88
  if sample_rate != self.sample_rate:
89
  audio_chunk = self._resample_audio(audio_chunk, sample_rate, self.sample_rate)
90
 
 
91
  self.audio_buffer.extend(audio_chunk)
 
 
 
 
92
 
93
-
94
  while len(self.audio_buffer) >= self.chunk_size:
95
  chunk = self.audio_buffer[:self.chunk_size]
96
  self._process_vad_chunk(np.array(chunk))
@@ -100,7 +113,7 @@ class SileroVAD:
100
  print(f"❌ Lỗi xử lý VAD: {e}")
101
 
102
  def _process_vad_chunk(self, audio_chunk: np.ndarray):
103
- """Xử lý VAD cho một chunk - TỐI ƯU HÓA"""
104
  current_time = time.time()
105
 
106
  # Chuẩn hóa audio
@@ -109,25 +122,28 @@ class SileroVAD:
109
  # Lấy xác suất speech
110
  speech_prob = self._get_speech_probability(audio_chunk)
111
 
112
- # Logic state machine cải tiến
113
  if self.state == "silence":
114
  if speech_prob > self.speech_threshold:
115
  print("🎤 Bắt đầu phát hiện speech")
116
  self.state = "speech"
117
  self.speech_start_time = current_time
118
  self.last_voice_time = current_time
119
- # Khởi tạo speech buffer với pre-speech data
120
- self.speech_buffer = self.pre_speech_buffer.copy()
121
- self.speech_buffer.extend(audio_chunk)
 
 
 
122
  else:
123
- # Lưu pre-speech buffer (giới hạn kích thước)
124
- self.pre_speech_buffer.extend(audio_chunk)
125
- if len(self.pre_speech_buffer) > self.pre_speech_samples:
126
- self.pre_speech_buffer = self.pre_speech_buffer[-self.pre_speech_samples:]
127
 
128
  elif self.state == "speech":
129
- # Luôn thêm vào speech buffer
130
- self.speech_buffer.extend(audio_chunk)
 
131
 
132
  # Cập nhật thời gian voice cuối cùng
133
  if speech_prob > self.speech_threshold:
@@ -137,9 +153,7 @@ class SileroVAD:
137
  silence_duration = current_time - self.last_voice_time
138
  speech_duration = current_time - self.speech_start_time
139
 
140
- # 🎯 LOGIC KẾT THÚC THÔNG MINH - 3 TRƯỜNG HỢP:
141
-
142
- # 1. User nói ngắn (dưới min_speech) nhưng đã im lặng đủ lâu -> XỬ LÝ NGAY
143
  is_short_response = speech_duration < self.min_speech_duration
144
  is_long_silence_after_short = silence_duration >= self.min_silence_duration
145
 
@@ -147,31 +161,30 @@ class SileroVAD:
147
  print(f"🎯 Phát hiện phản hồi ngắn: {speech_duration:.2f}s, im lặng: {silence_duration:.2f}s")
148
  self._finalize_speech()
149
 
150
- # 2. User nói đủ dài VÀ im lặng đủ lâu -> XỬ LÝ BÌNH THƯỜNG
151
  elif (speech_duration >= self.min_speech_duration and
152
  silence_duration >= self.min_silence_duration):
153
  print(f"🎯 Kết thúc speech dài: {speech_duration:.2f}s")
154
  self._finalize_speech()
155
 
156
- # 3. Speech quá dài (timeout) -> XỬ LÝ DÙ ĐANG NÓI
157
  elif speech_duration > settings.MAX_AUDIO_DURATION:
158
  print(f"⏰ Speech timeout ({speech_duration:.2f}s) - xử lý dù đang nói")
159
  self._finalize_speech()
160
 
161
  elif self.state == "processing":
162
- # Đang xử lý, không nhận thêm audio
163
- pass
 
164
  def _finalize_speech(self):
165
- """Hoàn thành xử lý speech segment"""
166
- if not self.speech_buffer or len(self.speech_buffer) == 0:
167
- self.state = "silence"
168
  return
169
 
170
- # Chuyển sang state processing để tránh nhận thêm audio
171
  self.state = "processing"
172
 
173
- # Tạo audio array từ buffer
174
- speech_audio = np.array(self.speech_buffer, dtype=np.float32)
175
 
176
  # Gọi callback trong thread riêng
177
  if self.speech_callback:
@@ -181,11 +194,19 @@ class SileroVAD:
181
  daemon=True
182
  ).start()
183
 
184
- # Reset buffers nhưng giữ pre-speech
185
- self.speech_buffer = []
186
- self.audio_buffer = []
187
 
188
- # Quay lại state silence sau khi xử
 
 
 
 
 
 
 
 
189
  self.state = "silence"
190
 
191
  def _normalize_audio(self, audio: np.ndarray) -> np.ndarray:
@@ -232,7 +253,6 @@ class SileroVAD:
232
  audio_chunk = self._resample_audio(audio_chunk, sample_rate, self.sample_rate)
233
  audio_chunk = self._normalize_audio(audio_chunk)
234
 
235
- # Kiểm tra multiple chunks
236
  chunk_size = 512
237
  speech_probs = []
238
 
 
17
  self.is_streaming = False
18
  self.speech_callback = None
19
  self.audio_buffer = []
20
+ self.speech_buffer = []
21
+ self.state = "silence"
22
  self.speech_start_time = 0
23
  self.last_voice_time = 0
24
 
 
32
 
33
  # Buffer cho pre-speech
34
  self.pre_speech_samples = int(self.pre_speech_buffer * self.sample_rate)
35
+ self.pre_speech_buffer_data = []
36
+
37
+ # Double buffer system để tránh mất dữ liệu
38
+ self.active_speech_buffer = []
39
+ self.backup_speech_buffer = []
40
 
41
  self._initialize_model()
42
 
 
65
  self.speech_callback = speech_callback
66
  self.audio_buffer = []
67
  self.speech_buffer = []
68
+ self.pre_speech_buffer_data = []
69
+ self.active_speech_buffer = []
70
+ self.backup_speech_buffer = []
71
  self.state = "silence"
72
  self.speech_start_time = 0
73
  self.last_voice_time = 0
74
+ print("🎙️ Bắt đầu VAD streaming với double buffer system...")
75
  return True
76
 
77
  def stop_stream(self):
 
80
  self.speech_callback = None
81
  self.audio_buffer = []
82
  self.speech_buffer = []
83
+ self.pre_speech_buffer_data = []
84
+ self.active_speech_buffer = []
85
+ self.backup_speech_buffer = []
86
  self.state = "silence"
87
  print("🛑 Đã dừng VAD streaming")
88
 
89
  def process_stream(self, audio_chunk: np.ndarray, sample_rate: int):
90
+ """Xử lý audio chunk với VAD double buffer"""
91
  if not self.is_streaming or self.model is None:
92
  return
93
 
94
  try:
95
+ # Resample nếu cần
96
  if sample_rate != self.sample_rate:
97
  audio_chunk = self._resample_audio(audio_chunk, sample_rate, self.sample_rate)
98
 
99
+ # Thêm vào audio buffer
100
  self.audio_buffer.extend(audio_chunk)
101
+
102
+ # Đồng thời thêm vào backup buffer để tránh mất dữ liệu
103
+ if self.state == "speech":
104
+ self.backup_speech_buffer.extend(audio_chunk)
105
 
106
+ # Xử lý VAD theo chunks
107
  while len(self.audio_buffer) >= self.chunk_size:
108
  chunk = self.audio_buffer[:self.chunk_size]
109
  self._process_vad_chunk(np.array(chunk))
 
113
  print(f"❌ Lỗi xử lý VAD: {e}")
114
 
115
  def _process_vad_chunk(self, audio_chunk: np.ndarray):
116
+ """Xử lý VAD cho một chunk với double buffer"""
117
  current_time = time.time()
118
 
119
  # Chuẩn hóa audio
 
122
  # Lấy xác suất speech
123
  speech_prob = self._get_speech_probability(audio_chunk)
124
 
 
125
  if self.state == "silence":
126
  if speech_prob > self.speech_threshold:
127
  print("🎤 Bắt đầu phát hiện speech")
128
  self.state = "speech"
129
  self.speech_start_time = current_time
130
  self.last_voice_time = current_time
131
+
132
+ # Khởi tạo cả active và backup buffer
133
+ self.active_speech_buffer = self.pre_speech_buffer_data.copy()
134
+ self.active_speech_buffer.extend(audio_chunk)
135
+ self.backup_speech_buffer = self.active_speech_buffer.copy()
136
+
137
  else:
138
+ # Lưu pre-speech buffer
139
+ self.pre_speech_buffer_data.extend(audio_chunk)
140
+ if len(self.pre_speech_buffer_data) > self.pre_speech_samples:
141
+ self.pre_speech_buffer_data = self.pre_speech_buffer_data[-self.pre_speech_samples:]
142
 
143
  elif self.state == "speech":
144
+ # Thêm vào cả hai buffers
145
+ self.active_speech_buffer.extend(audio_chunk)
146
+ self.backup_speech_buffer.extend(audio_chunk)
147
 
148
  # Cập nhật thời gian voice cuối cùng
149
  if speech_prob > self.speech_threshold:
 
153
  silence_duration = current_time - self.last_voice_time
154
  speech_duration = current_time - self.speech_start_time
155
 
156
+ # Logic kết thúc thông minh
 
 
157
  is_short_response = speech_duration < self.min_speech_duration
158
  is_long_silence_after_short = silence_duration >= self.min_silence_duration
159
 
 
161
  print(f"🎯 Phát hiện phản hồi ngắn: {speech_duration:.2f}s, im lặng: {silence_duration:.2f}s")
162
  self._finalize_speech()
163
 
 
164
  elif (speech_duration >= self.min_speech_duration and
165
  silence_duration >= self.min_silence_duration):
166
  print(f"🎯 Kết thúc speech dài: {speech_duration:.2f}s")
167
  self._finalize_speech()
168
 
 
169
  elif speech_duration > settings.MAX_AUDIO_DURATION:
170
  print(f"⏰ Speech timeout ({speech_duration:.2f}s) - xử lý dù đang nói")
171
  self._finalize_speech()
172
 
173
  elif self.state == "processing":
174
+ # Trong khi đang xử lý, vẫn tiếp tục ghi vào backup buffer
175
+ self.backup_speech_buffer.extend(audio_chunk)
176
+
177
  def _finalize_speech(self):
178
+ """Hoàn thành xử lý speech segment với buffer switching"""
179
+ if not self.active_speech_buffer:
180
+ self._reset_buffers()
181
  return
182
 
183
+ # Chuyển sang state processing
184
  self.state = "processing"
185
 
186
+ # Sử dụng active buffer cho xử lý hiện tại
187
+ speech_audio = np.array(self.active_speech_buffer, dtype=np.float32)
188
 
189
  # Gọi callback trong thread riêng
190
  if self.speech_callback:
 
194
  daemon=True
195
  ).start()
196
 
197
+ # Chuẩn bị cho lần tiếp theo: chuyển backup buffer thành active buffer
198
+ self.active_speech_buffer = self.backup_speech_buffer.copy()
199
+ self.backup_speech_buffer = []
200
 
201
+ # Quay lại state speech để tiếp tục nhận dữ liệu
202
+ self.state = "speech"
203
+ self.last_voice_time = time.time()
204
+
205
+ def _reset_buffers(self):
206
+ """Reset tất cả buffers"""
207
+ self.active_speech_buffer = []
208
+ self.backup_speech_buffer = []
209
+ self.audio_buffer = []
210
  self.state = "silence"
211
 
212
  def _normalize_audio(self, audio: np.ndarray) -> np.ndarray:
 
253
  audio_chunk = self._resample_audio(audio_chunk, sample_rate, self.sample_rate)
254
  audio_chunk = self._normalize_audio(audio_chunk)
255
 
 
256
  chunk_size = 512
257
  speech_probs = []
258