| # import torch | |
| # import numpy as np | |
| # from typing import Callable | |
| # from config.settings import settings | |
| # import os | |
| # import time | |
| # class SileroVAD: | |
| # def __init__(self): | |
| # self.model = None | |
| # self.utils = None | |
| # self.sample_rate = 16000 | |
| # self.is_streaming = False | |
| # self.speech_callback = None | |
| # self.audio_buffer = [] | |
| # self.speech_start_time = 0 | |
| # self.min_speech_duration = 0.5 # Giây | |
| # # ✅ Thêm cấu hình chunk size cho Silero | |
| # self.chunk_size = 512 # Silero yêu cầu 512 samples cho 16000Hz | |
| # self.chunk_duration = self.chunk_size / self.sample_rate # 0.032 giây | |
| # self._initialize_model() | |
| # def _initialize_model(self): | |
| # """Khởi tạo Silero VAD model""" | |
| # try: | |
| # print("🔄 Đang tải Silero VAD model...") | |
| # self.model, self.utils = torch.hub.load( | |
| # repo_or_dir='snakers4/silero-vad', | |
| # model='silero_vad', | |
| # force_reload=False, | |
| # trust_repo=True | |
| # ) | |
| # self.model.eval() | |
| # print("✅ Đã tải Silero VAD model thành công") | |
| # except Exception as e: | |
| # print(f"❌ Lỗi tải Silero VAD model: {e}") | |
| # self._initialize_model_fallback() | |
| # def _initialize_model_fallback(self): | |
| # """Fallback nếu torch.hub.load thất bại""" | |
| # try: | |
| # model_dir = torch.hub.get_dir() | |
| # model_path = os.path.join( | |
| # model_dir, 'snakers4_silero-vad_master', 'files', 'silero_vad.jit' | |
| # ) | |
| # if os.path.exists(model_path): | |
| # self.model = torch.jit.load(model_path) | |
| # self.model.eval() | |
| # print("✅ Đã tải Silero VAD model thành công (fallback)") | |
| # else: | |
| # print("❌ Không tìm thấy model file (fallback thất bại)") | |
| # self.model = None | |
| # except Exception as e: | |
| # print(f"❌ Lỗi tải Silero VAD model fallback: {e}") | |
| # self.model = None | |
| # def start_stream(self, speech_callback: Callable): | |
| # """Bắt đầu stream với VAD""" | |
| # if self.model is None: | |
| # print("❌ Silero VAD model chưa được khởi tạo") | |
| # return False | |
| # self.is_streaming = True | |
| # self.speech_callback = speech_callback | |
| # self.audio_buffer = [] | |
| # self.speech_start_time = 0 | |
| # print("🎙️ Bắt đầu Silero VAD streaming...") | |
| # return True | |
| # def stop_stream(self): | |
| # """Dừng stream""" | |
| # self.is_streaming = False | |
| # self.speech_callback = None | |
| # self.audio_buffer = [] | |
| # self.speech_start_time = 0 | |
| # print("🛑 Đã dừng Silero VAD streaming") | |
| # def process_stream(self, audio_chunk: np.ndarray, sample_rate: int): | |
| # """Xử lý audio chunk với Silero VAD - ĐÃ SỬA LỖI""" | |
| # if not self.is_streaming or self.model is None: | |
| # return | |
| # try: | |
| # # Resample nếu cần | |
| # if sample_rate != self.sample_rate: | |
| # audio_chunk = self._resample_audio(audio_chunk, sample_rate, self.sample_rate) | |
| # # Thêm vào buffer | |
| # self.audio_buffer.extend(audio_chunk) | |
| # # ✅ Xử lý từng chunk 512 samples (Silero requirement) | |
| # while len(self.audio_buffer) >= self.chunk_size: | |
| # chunk = self.audio_buffer[:self.chunk_size] | |
| # self._process_single_chunk(np.array(chunk)) | |
| # # Giữ lại phần thừa cho chunk tiếp theo | |
| # self.audio_buffer = self.audio_buffer[self.chunk_size:] | |
| # except Exception as e: | |
| # print(f"❌ Lỗi xử lý Silero VAD: {e}") | |
| # def _process_single_chunk(self, audio_chunk: np.ndarray): | |
| # """Xử lý một chunk 512 samples duy nhất""" | |
| # try: | |
| # # Chuẩn hóa audio | |
| # audio_chunk = self._normalize_audio(audio_chunk) | |
| # # Đảm bảo đúng kích thước | |
| # if len(audio_chunk) != self.chunk_size: | |
| # # Nếu không đủ, pad với zeros | |
| # if len(audio_chunk) < self.chunk_size: | |
| # padding = np.zeros(self.chunk_size - len(audio_chunk), dtype=np.float32) | |
| # audio_chunk = np.concatenate([audio_chunk, padding]) | |
| # else: | |
| # audio_chunk = audio_chunk[:self.chunk_size] | |
| # # Dự đoán xác suất speech | |
| # speech_prob = self._get_speech_probability(audio_chunk) | |
| # print(f"🎯 Silero VAD speech probability: {speech_prob:.3f}") | |
| # # Xử lý logic speech detection | |
| # current_time = time.time() | |
| # if speech_prob > settings.VAD_THRESHOLD: | |
| # if self.speech_start_time == 0: | |
| # self.speech_start_time = current_time | |
| # print("🎯 Bắt đầu phát hiện speech") | |
| # speech_duration = current_time - self.speech_start_time | |
| # # Nếu đủ thời gian speech, gọi callback | |
| # if speech_duration >= self.min_speech_duration: | |
| # if self.speech_callback: | |
| # # Thu thập tất cả audio từ khi bắt đầu speech | |
| # full_audio = self._collect_speech_audio() | |
| # if len(full_audio) > 0: | |
| # self.speech_callback(full_audio, self.sample_rate) | |
| # self.speech_start_time = 0 | |
| # else: | |
| # if self.speech_start_time > 0: | |
| # print("🔇 Kết thúc speech segment") | |
| # self.speech_start_time = 0 | |
| # except Exception as e: | |
| # print(f"❌ Lỗi xử lý Silero VAD chunk: {e}") | |
| # def _collect_speech_audio(self) -> np.ndarray: | |
| # """Thu thập toàn bộ audio từ khi bắt đầu speech""" | |
| # # Trong implementation thực tế, bạn cần lưu lại audio | |
| # # từ khi bắt đầu phát hiện speech đến hiện tại | |
| # # Đây là simplified version | |
| # min_samples = int(self.sample_rate * self.min_speech_duration) | |
| # return np.random.randn(min_samples).astype(np.float32) # Placeholder | |
| # def _normalize_audio(self, audio: np.ndarray) -> np.ndarray: | |
| # """Chuẩn hóa audio""" | |
| # if audio.dtype != np.float32: | |
| # audio = audio.astype(np.float32) | |
| # if np.max(np.abs(audio)) > 1.0: | |
| # audio = audio / 32768.0 | |
| # return np.clip(audio, -1.0, 1.0) | |
| # def _get_speech_probability(self, audio_chunk: np.ndarray) -> float: | |
| # """Trả về xác suất speech - ĐÃ SỬA LỖI""" | |
| # try: | |
| # # ✅ Đảm bảo đúng kích thước 512 samples | |
| # if len(audio_chunk) != self.chunk_size: | |
| # # Resize về đúng 512 samples | |
| # if len(audio_chunk) > self.chunk_size: | |
| # audio_chunk = audio_chunk[:self.chunk_size] | |
| # else: | |
| # padding = np.zeros(self.chunk_size - len(audio_chunk), dtype=np.float32) | |
| # audio_chunk = np.concatenate([audio_chunk, padding]) | |
| # audio_tensor = torch.from_numpy(audio_chunk).float().unsqueeze(0) | |
| # with torch.no_grad(): | |
| # return self.model(audio_tensor, self.sample_rate).item() | |
| # except Exception as e: | |
| # print(f"❌ Lỗi lấy speech probability: {e}") | |
| # return 0.0 | |
| # def _resample_audio(self, audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray: | |
| # """Resample audio""" | |
| # if orig_sr == target_sr: | |
| # return audio | |
| # try: | |
| # from scipy import signal | |
| # # Tính số samples mới | |
| # duration = len(audio) / orig_sr | |
| # new_length = int(duration * target_sr) | |
| # # Resample | |
| # resampled_audio = signal.resample(audio, new_length) | |
| # return resampled_audio.astype(np.float32) | |
| # except ImportError: | |
| # # Fallback simple resampling | |
| # orig_len = len(audio) | |
| # new_len = int(orig_len * target_sr / orig_sr) | |
| # x_old = np.linspace(0, 1, orig_len) | |
| # x_new = np.linspace(0, 1, new_len) | |
| # return np.interp(x_new, x_old, audio).astype(np.float32) | |
| # except Exception as e: | |
| # print(f"⚠️ Lỗi resample: {e}") | |
| # return audio | |
| # def is_speech(self, audio_chunk: np.ndarray, sample_rate: int) -> bool: | |
| # """Kiểm tra chunk có phải speech không - ĐÃ SỬA""" | |
| # if self.model is None: | |
| # return True | |
| # try: | |
| # if sample_rate != self.sample_rate: | |
| # audio_chunk = self._resample_audio(audio_chunk, sample_rate, self.sample_rate) | |
| # audio_chunk = self._normalize_audio(audio_chunk) | |
| # # ✅ Chia thành các chunk 512 samples và kiểm tra trung bình | |
| # chunk_size = 512 | |
| # speech_probs = [] | |
| # for i in range(0, len(audio_chunk), chunk_size): | |
| # chunk = audio_chunk[i:i+chunk_size] | |
| # if len(chunk) == chunk_size: | |
| # prob = self._get_speech_probability(chunk) | |
| # speech_probs.append(prob) | |
| # if not speech_probs: | |
| # return False | |
| # avg_prob = np.mean(speech_probs) | |
| # return avg_prob > settings.VAD_THRESHOLD | |
| # except Exception as e: | |
| # print(f"❌ Lỗi kiểm tra speech: {e}") | |
| # return True | |
| # def get_speech_probability(self, audio_chunk: np.ndarray, sample_rate: int) -> float: | |
| # """Lấy xác suất speech trung bình""" | |
| # if self.model is None: | |
| # return 0.0 | |
| # try: | |
| # if sample_rate != self.sample_rate: | |
| # audio_chunk = self._resample_audio(audio_chunk, sample_rate, self.sample_rate) | |
| # audio_chunk = self._normalize_audio(audio_chunk) | |
| # # Chia thành các chunk 512 samples | |
| # chunk_size = 512 | |
| # speech_probs = [] | |
| # for i in range(0, len(audio_chunk), chunk_size): | |
| # chunk = audio_chunk[i:i+chunk_size] | |
| # if len(chunk) == chunk_size: | |
| # prob = self._get_speech_probability(chunk) | |
| # speech_probs.append(prob) | |
| # return np.mean(speech_probs) if speech_probs else 0.0 | |
| # except Exception as e: | |
| # print(f"❌ Lỗi lấy speech probability: {e}") | |
| # return 0.0 | |
| import io | |
| import numpy as np | |
| import soundfile as sf | |
| import time | |
| import traceback | |
| import threading | |
| import queue | |
| import torch | |
| from groq import Groq | |
| from typing import Optional, Dict, Any, Callable | |
| from config.settings import settings | |
class SileroVAD:
    """Streaming voice-activity detector wrapping the Silero VAD model.

    Audio handed to :meth:`process_stream` is cut into windows of
    ``chunk_size`` samples at ``sample_rate`` Hz. Every window is scored by
    the model and drives a small state machine (``"silence"`` <-> ``"speech"``).
    When a speech segment ends, the collected samples are passed to the
    registered callback on a daemon thread.

    Segment durations are measured in *processed samples*, not wall-clock
    time: all windows of one ``process_stream`` call are scored back to back,
    so wall-clock deltas between them are close to zero and cannot be used to
    detect trailing silence inside a single batch.
    """

    def __init__(self):
        self.model = None
        self.sample_rate = 16000
        self.is_streaming = False
        self.speech_callback = None

        # Detection parameters, read once from the project settings.
        self.chunk_size = 512  # samples scored per model call
        self.speech_threshold = settings.VAD_THRESHOLD
        self.min_speech_duration = settings.VAD_MIN_SPEECH_DURATION
        self.min_silence_duration = settings.VAD_MIN_SILENCE_DURATION
        self.speech_pad_duration = settings.VAD_SPEECH_PAD_DURATION
        self.max_speech_duration = settings.MAX_AUDIO_DURATION

        # Length of the pre-roll (audio kept from just before speech starts),
        # converted from seconds to samples.
        self.pre_speech_samples = int(
            settings.VAD_PRE_SPEECH_BUFFER * self.sample_rate
        )

        self._reset_stream_state()
        self._initialize_model()

    def _reset_stream_state(self) -> None:
        """Empty every buffer, zero the timing counters, go to ``"silence"``."""
        self.audio_buffer = []       # samples waiting to fill a full window
        self.speech_buffer = []      # samples of the segment being recorded
        self.pre_speech_buffer = []  # rolling pre-roll kept during silence
        self.state = "silence"       # "silence", "speech" or "processing"
        # Wall-clock timestamps, kept for informational purposes only.
        self.speech_start_time = 0
        self.last_voice_time = 0
        # Stream positions (in samples) used for all duration decisions.
        self._samples_processed = 0
        self._speech_start_sample = 0
        self._last_voice_sample = 0

    def _initialize_model(self):
        """Load the Silero VAD model through ``torch.hub``.

        On any failure the error is printed and ``self.model`` stays ``None``,
        which makes :meth:`start_stream` refuse to start.
        """
        try:
            print("🔄 Đang tải Silero VAD model...")
            # NOTE(review): trust_repo=True lets torch.hub run code fetched
            # from the remote repository without prompting.
            model, _ = torch.hub.load(
                repo_or_dir='snakers4/silero-vad',
                model='silero_vad',
                force_reload=False,
                trust_repo=True,
            )
            model.eval()
            self.model = model
            print("✅ Đã tải Silero VAD model thành công")
        except Exception as e:
            print(f"❌ Lỗi tải Silero VAD model: {e}")
            self.model = None

    def start_stream(self, speech_callback: Callable):
        """Begin a streaming session.

        Args:
            speech_callback: Called as ``speech_callback(audio, sample_rate)``
                with a float32 array for every finished speech segment.

        Returns:
            ``True`` when streaming started, ``False`` when no model is loaded.
        """
        if self.model is None:
            return False

        self._reset_stream_state()
        # NOTE(review): Silero VAD models are documented as keeping internal
        # state between calls; reset it when the model offers the hook so a
        # new stream does not inherit the previous one's context.
        reset_states = getattr(self.model, "reset_states", None)
        if callable(reset_states):
            reset_states()

        self.speech_callback = speech_callback
        self.is_streaming = True
        print("🎙️ Bắt đầu VAD streaming với cấu hình tối ưu...")
        return True

    def stop_stream(self):
        """Stop streaming, drop the callback and clear all buffers."""
        self.is_streaming = False
        self.speech_callback = None
        self._reset_stream_state()
        print("🛑 Đã dừng VAD streaming")

    def _prepare_audio(self, audio, sample_rate: int) -> np.ndarray:
        """Return *audio* as float32 in ``[-1, 1]`` at ``self.sample_rate``.

        Scaling happens before resampling because the resampler returns
        floats, after which integer PCM can no longer be told apart from
        already-normalised audio.
        """
        samples = self._normalize_audio(np.asarray(audio))
        if sample_rate != self.sample_rate:
            samples = self._resample_audio(samples, sample_rate, self.sample_rate)
            # Resampling can overshoot slightly; clip so the later per-window
            # normalisation does not mistake the overshoot for int16-range data.
            samples = np.clip(samples, -1.0, 1.0)
        return samples

    def process_stream(self, audio_chunk: np.ndarray, sample_rate: int):
        """Feed a batch of audio into the detector.

        The batch is normalised, resampled if needed, appended to the pending
        buffer and consumed in windows of ``chunk_size`` samples. A trailing
        partial window stays buffered for the next call. Does nothing when
        streaming is off or no model is loaded. Errors are printed, not raised.
        """
        if not self.is_streaming or self.model is None:
            return
        try:
            self.audio_buffer.extend(self._prepare_audio(audio_chunk, sample_rate))

            consumed = 0
            try:
                while len(self.audio_buffer) - consumed >= self.chunk_size:
                    window = self.audio_buffer[consumed:consumed + self.chunk_size]
                    consumed += self.chunk_size
                    self._process_vad_chunk(np.array(window, dtype=np.float32))
            finally:
                # Drop everything handed to the detector in one step, including
                # a window that raised, so it is not retried forever.
                del self.audio_buffer[:consumed]
        except Exception as e:
            print(f"❌ Lỗi xử lý VAD: {e}")

    def _remember_pre_speech(self, audio_chunk: np.ndarray) -> None:
        """Append a silent window to the pre-roll, keeping only the newest part."""
        limit = self.pre_speech_samples
        if limit <= 0:
            # A "[-0:]" slice would keep the whole list; no pre-roll is wanted.
            self.pre_speech_buffer = []
            return
        self.pre_speech_buffer.extend(audio_chunk)
        if len(self.pre_speech_buffer) > limit:
            del self.pre_speech_buffer[:-limit]

    def _process_vad_chunk(self, audio_chunk: np.ndarray):
        """Score one window and advance the silence/speech state machine."""
        audio_chunk = self._normalize_audio(audio_chunk)
        is_voiced = self._get_speech_probability(audio_chunk) > self.speech_threshold

        # Stream position at the end of this window, in samples.
        self._samples_processed += len(audio_chunk)
        position = self._samples_processed

        if self.state == "silence":
            if is_voiced:
                print("🎯 Bắt đầu phát hiện speech")
                self.state = "speech"
                self.speech_start_time = self.last_voice_time = time.time()
                self._speech_start_sample = self._last_voice_sample = position
                # The segment starts with the pre-roll; hand the list over so
                # the same pre-roll cannot be prepended to a later segment.
                self.speech_buffer = self.pre_speech_buffer
                self.pre_speech_buffer = []
                self.speech_buffer.extend(audio_chunk)
            else:
                self._remember_pre_speech(audio_chunk)

        elif self.state == "speech":
            # Every window belongs to the segment, voiced or not.
            self.speech_buffer.extend(audio_chunk)
            if is_voiced:
                self.last_voice_time = time.time()
                self._last_voice_sample = position

            silence_duration = (position - self._last_voice_sample) / self.sample_rate
            speech_duration = (position - self._speech_start_sample) / self.sample_rate

            # End of segment: quiet for long enough AND the segment (counted
            # from its first voiced window) is long enough.
            if (silence_duration >= self.min_silence_duration
                    and speech_duration >= self.min_speech_duration):
                print(f"🔇 Kết thúc speech segment (duration: {speech_duration:.2f}s)")
                self._finalize_speech()
            # Safety valve: cut segments that run too long.
            elif speech_duration > self.max_speech_duration:
                print(f"⏰ Speech timeout ({speech_duration:.2f}s)")
                self._finalize_speech()

        elif self.state == "processing":
            # Audio arriving in this state is ignored.
            pass

    def _finalize_speech(self):
        """Emit the recorded segment to the callback and return to silence.

        The callback runs on its own daemon thread, so detection continues
        immediately. Samples still pending in ``audio_buffer`` are kept so the
        rest of the current batch is analysed as well.
        """
        segment = self.speech_buffer
        self.speech_buffer = []
        self.state = "silence"
        if not segment:
            return

        speech_audio = np.array(segment, dtype=np.float32)
        callback = self.speech_callback
        if callback:
            threading.Thread(
                target=callback,
                args=(speech_audio, self.sample_rate),
                daemon=True,
            ).start()

    def _normalize_audio(self, audio: np.ndarray) -> np.ndarray:
        """Convert *audio* to float32 limited to ``[-1.0, 1.0]``.

        Integer input is always divided by 32768 (int16 full scale). Float
        input is divided only when its peak exceeds 1.0, i.e. when it appears
        to carry int16-range values. Empty input is returned as an empty
        float32 array.
        """
        audio = np.asarray(audio)
        # Decide on the source dtype: a near-silent integer window (values in
        # {-1, 0, 1}) must not be mistaken for full-scale float audio.
        integer_pcm = np.issubdtype(audio.dtype, np.integer)
        if audio.dtype != np.float32:
            audio = audio.astype(np.float32)
        if audio.size == 0:
            return audio
        if integer_pcm or np.max(np.abs(audio)) > 1.0:
            audio = audio / 32768.0
        return np.clip(audio, -1.0, 1.0)

    def _get_speech_probability(self, audio_chunk: np.ndarray) -> float:
        """Return the model's speech probability for one window.

        Returns ``0.0`` for a window whose length is not ``chunk_size`` and
        when the model call fails (the error is printed).
        """
        try:
            if len(audio_chunk) != self.chunk_size:
                return 0.0
            window = np.ascontiguousarray(audio_chunk, dtype=np.float32)
            audio_tensor = torch.from_numpy(window).float().unsqueeze(0)
            with torch.no_grad():
                return self.model(audio_tensor, self.sample_rate).item()
        except Exception as e:
            print(f"❌ Lỗi speech probability: {e}")
            return 0.0

    def _resample_audio(self, audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
        """Resample *audio* from ``orig_sr`` to ``target_sr``.

        Uses ``scipy.signal.resample`` and falls back to linear interpolation
        when SciPy cannot be imported. Returns the input object unchanged when
        the rates match. On an unexpected error a warning is printed and the
        input is returned as-is (best effort).
        """
        if orig_sr == target_sr:
            return audio
        try:
            audio = np.asarray(audio)
            # Multiply before dividing so an exact integer result is not
            # pushed just below the integer by floating-point rounding.
            new_length = int(len(audio) * target_sr / orig_sr)
            if new_length <= 0:
                return np.zeros(0, dtype=np.float32)
            try:
                from scipy import signal
            except ImportError:
                old_grid = np.linspace(0.0, 1.0, len(audio))
                new_grid = np.linspace(0.0, 1.0, new_length)
                return np.interp(new_grid, old_grid, audio).astype(np.float32)
            return signal.resample(audio, new_length).astype(np.float32)
        except Exception as e:
            print(f"⚠️ Lỗi resample: {e}")
            return audio

    def is_speech(self, audio_chunk: np.ndarray, sample_rate: int) -> bool:
        """Tell whether a standalone piece of audio contains speech.

        The audio is split into full windows of ``chunk_size`` samples; the
        mean speech probability is compared with ``speech_threshold``.

        Returns:
            ``True`` when the mean probability exceeds the threshold. Also
            ``True`` when no model is loaded or an error occurs (fail-open).
            ``False`` when the audio holds no complete window.
        """
        if self.model is None:
            return True
        try:
            audio = self._prepare_audio(audio_chunk, sample_rate)
            size = self.chunk_size
            # Only complete windows are scored; a trailing partial is ignored.
            speech_probs = [
                self._get_speech_probability(audio[start:start + size])
                for start in range(0, len(audio) - size + 1, size)
            ]
            if not speech_probs:
                return False
            return bool(np.mean(speech_probs) > self.speech_threshold)
        except Exception as e:
            print(f"❌ Lỗi kiểm tra speech: {e}")
            return True