# File size: 5,810 Bytes
# dbf2148
import queue
import threading
import time
from typing import Callable, List, Optional, Tuple

import numpy as np
import torch
import torchaudio
from speechbrain.inference import VAD

from config.settings import settings
class SpeechBrainVAD:
    """Streaming voice-activity detector backed by a SpeechBrain VAD model.

    Audio chunks are fed to :meth:`process_stream`. Chunks classified as
    speech are buffered; once silence has lasted at least
    ``min_silence_duration`` seconds the buffered speech is handed to the
    callback registered with :meth:`start_stream`. If the SpeechBrain model
    cannot be loaded, or fails at inference time, a simple energy threshold
    is used instead.
    """

    # Mean-square energy above which the fallback detector reports speech.
    _ENERGY_THRESHOLD = 0.01

    def __init__(self):
        """Read the VAD configuration from ``settings`` and load the model."""
        self.vad_model = None
        self.sample_rate = settings.SAMPLE_RATE
        self.threshold = settings.VAD_THRESHOLD
        self.min_silence_duration = settings.VAD_MIN_SILENCE_DURATION
        # NOTE(review): stored but not used by any method of this class.
        self.speech_pad_duration = settings.VAD_SPEECH_PAD_DURATION
        self.is_running = False
        # NOTE(review): never read or written by the methods of this class.
        self.audio_queue = queue.Queue()
        # Pre-processed speech chunks of the segment currently being captured.
        self.speech_buffer: List[np.ndarray] = []
        # time.monotonic() reading taken at the first silent chunk of a pause.
        self.silence_start_time: Optional[float] = None
        self.callback: Optional[Callable[[np.ndarray, int], None]] = None
        # Resample transforms keyed by (source rate, target rate) so the
        # transform is built once instead of on every chunk.
        self._resamplers = {}
        self._initialize_model()

    def _initialize_model(self):
        """Load the SpeechBrain VAD model named by ``settings.VAD_MODEL``.

        Best effort: on any failure the error is printed and ``vad_model``
        stays ``None``, which makes detection use the energy fallback.
        """
        try:
            print("🔄 Đang tải mô hình SpeechBrain VAD...")
            self.vad_model = VAD.from_hparams(
                source=settings.VAD_MODEL,
                savedir=f"pretrained_models/{settings.VAD_MODEL}"
            )
            print("✅ Đã tải mô hình VAD thành công")
        except Exception as e:
            print(f"❌ Lỗi tải mô hình VAD: {e}")
            self.vad_model = None

    def preprocess_audio(self, audio_data: np.ndarray, original_sr: int) -> np.ndarray:
        """Resample a chunk to ``self.sample_rate`` and peak-normalise it.

        Args:
            audio_data: Audio samples.
            original_sr: Sample rate of ``audio_data`` in Hz.

        Returns:
            The chunk at ``self.sample_rate`` scaled so its largest absolute
            sample is 1. Empty and all-zero chunks are returned unscaled.
        """
        # An empty chunk has no peak; np.max would raise on it.
        if np.size(audio_data) == 0:
            return audio_data

        if original_sr != self.sample_rate:
            audio_tensor = torch.from_numpy(audio_data).float()
            if audio_tensor.dim() > 1:
                # Downmix to mono by averaging over the first axis.
                # NOTE(review): assumes a channels-first layout, and the
                # downmix only happens when resampling -- confirm with callers.
                audio_tensor = audio_tensor.mean(dim=0)
            key = (original_sr, self.sample_rate)
            resampler = self._resamplers.get(key)
            if resampler is None:
                resampler = torchaudio.transforms.Resample(
                    orig_freq=original_sr,
                    new_freq=self.sample_rate
                )
                self._resamplers[key] = resampler
            audio_data = resampler(audio_tensor).numpy()

        # NOTE(review): normalising every chunk to full scale also amplifies
        # quiet background noise before it reaches the detector; confirm this
        # is intended, especially for the energy fallback.
        peak = np.max(np.abs(audio_data))
        if peak > 0:
            audio_data = audio_data / peak
        return audio_data

    def detect_voice_activity(self, audio_chunk: np.ndarray) -> bool:
        """Return whether ``audio_chunk`` contains speech.

        Uses the SpeechBrain model when it is loaded and falls back to
        :meth:`_energy_based_vad` when it is not, or when inference raises.
        An empty chunk is never speech.
        """
        if np.size(audio_chunk) == 0:
            return False
        if self.vad_model is None:
            return self._energy_based_vad(audio_chunk)
        try:
            # Add the batch dimension the model call is given.
            audio_tensor = torch.from_numpy(audio_chunk).float().unsqueeze(0)
            with torch.no_grad():
                prob = self.vad_model.get_speech_prob_chunk(audio_tensor)
            # The model output may hold one probability per frame, so reduce
            # it to a single score; for a one-element output the mean is that
            # element.
            return bool(prob.mean().item() > self.threshold)
        except Exception as e:
            print(f"❌ Lỗi VAD detection: {e}")
            return self._energy_based_vad(audio_chunk)

    def _energy_based_vad(self, audio_chunk: np.ndarray) -> bool:
        """Fallback detector: mean-square energy above a fixed threshold."""
        if np.size(audio_chunk) == 0:
            return False
        # Work in float64 so squaring integer PCM cannot overflow.
        samples = np.asarray(audio_chunk, dtype=np.float64)
        energy = np.mean(np.square(samples))
        return bool(energy > self._ENERGY_THRESHOLD)

    def process_stream(self, audio_chunk: np.ndarray, original_sr: int) -> Optional[bool]:
        """Feed one chunk of live audio through the detector.

        Speech chunks are appended to the current segment. The segment is
        flushed to the callback once silence has lasted at least
        ``min_silence_duration`` seconds. Silence is timed with
        ``time.monotonic()`` between calls, so chunks are expected to arrive
        in real time.

        Args:
            audio_chunk: Audio samples; ``None`` is ignored.
            original_sr: Sample rate of ``audio_chunk`` in Hz.

        Returns:
            ``True``/``False`` for speech/silence, or ``None`` when the stream
            is not running or no chunk was supplied.
        """
        if not self.is_running or audio_chunk is None:
            return None

        processed_audio = self.preprocess_audio(audio_chunk, original_sr)
        is_speech = self.detect_voice_activity(processed_audio)

        if is_speech:
            self.silence_start_time = None
            # Keep whole chunks (copied, so the caller may reuse its buffer)
            # rather than one Python object per sample.
            self.speech_buffer.append(np.array(processed_audio))
            print("🎤 Đang nói...")
        else:
            now = time.monotonic()
            if self.silence_start_time is None:
                self.silence_start_time = now
            silence_duration = now - self.silence_start_time
            if self.speech_buffer and silence_duration >= self.min_silence_duration:
                # The pause is long enough: the speech segment has ended.
                self._process_speech_segment()
        return is_speech

    def _process_speech_segment(self):
        """Deliver the buffered speech segment to the callback and reset."""
        if not self.speech_buffer:
            return
        # atleast_1d also accepts a buffer filled with individual samples.
        speech_audio = np.concatenate(
            [np.atleast_1d(chunk) for chunk in self.speech_buffer]
        )
        # Reset before calling out, so a callback that raises or re-enters
        # this object cannot have the same segment delivered twice.
        self.speech_buffer = []
        self.silence_start_time = None
        if callable(self.callback):
            self.callback(speech_audio, self.sample_rate)
        print("✅ Đã xử lý segment giọng nói")

    def start_stream(self, callback: Callable[[np.ndarray, int], None]) -> None:
        """Start accepting chunks.

        Args:
            callback: Called as ``callback(speech_audio, sample_rate)`` for
                every completed speech segment.
        """
        self.is_running = True
        self.callback = callback
        self.speech_buffer = []
        self.silence_start_time = None
        print("🎙️ Bắt đầu stream VAD...")

    def stop_stream(self) -> None:
        """Stop accepting chunks and flush any speech still buffered."""
        self.is_running = False
        if self.speech_buffer:
            self._process_speech_segment()
        print("🛑 Đã dừng stream VAD")

    def get_audio_chunk_from_stream(self, stream, chunk_size: int = 1024) -> Optional[np.ndarray]:
        """Read one chunk from a microphone-style input stream.

        Args:
            stream: Object exposing
                ``read(chunk_size, exception_on_overflow=False)`` and
                returning raw bytes (presumably a PyAudio input stream --
                confirm against the caller).
            chunk_size: Count passed to ``stream.read``.

        Returns:
            The bytes interpreted as int16 samples and scaled to float32 in
            [-1, 1), or ``None`` if reading or decoding fails.
        """
        try:
            data = stream.read(chunk_size, exception_on_overflow=False)
            audio_data = np.frombuffer(data, dtype=np.int16)
            return audio_data.astype(np.float32) / 32768.0
        except Exception as e:
            print(f"❌ Lỗi đọc audio stream: {e}")
            return None