File size: 5,810 Bytes
dbf2148
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import queue
import threading
import time
from typing import Callable, List, Optional, Tuple

import numpy as np
import torch
import torchaudio
from speechbrain.inference import VAD

from config.settings import settings

class SpeechBrainVAD:
    """Streaming voice-activity detector backed by SpeechBrain.

    Feeds audio chunks through a SpeechBrain VAD model (or an energy-based
    fallback when the model fails to load), buffers samples while speech is
    detected, and — once silence has lasted at least
    ``min_silence_duration`` seconds — delivers the completed speech segment
    to a user-supplied callback as ``callback(audio, sample_rate)``.
    """

    def __init__(self):
        # Model handle; stays None when loading fails, in which case
        # detection falls back to the energy heuristic.
        self.vad_model = None
        self.sample_rate = settings.SAMPLE_RATE
        self.threshold = settings.VAD_THRESHOLD
        self.min_silence_duration = settings.VAD_MIN_SILENCE_DURATION
        # NOTE(review): speech_pad_duration is stored but never applied when
        # segments are cut — confirm whether padding is still intended.
        self.speech_pad_duration = settings.VAD_SPEECH_PAD_DURATION
        self.is_running = False
        self.audio_queue = queue.Queue()
        self.speech_buffer = []          # samples of the in-progress speech segment
        self.silence_start_time = None   # wall-clock start of the current silence run
        self.callback = None             # invoked as callback(audio, sample_rate)

        self._initialize_model()

    def _initialize_model(self):
        """Load the SpeechBrain VAD model.

        On any failure the model is left as None so that
        detect_voice_activity() falls back to the energy-based heuristic
        instead of crashing.
        """
        try:
            print("🔄 Đang tải mô hình SpeechBrain VAD...")
            self.vad_model = VAD.from_hparams(
                source=settings.VAD_MODEL,
                savedir=f"pretrained_models/{settings.VAD_MODEL}",
            )
            print("✅ Đã tải mô hình VAD thành công")
        except Exception as e:
            print(f"❌ Lỗi tải mô hình VAD: {e}")
            self.vad_model = None

    def preprocess_audio(self, audio_data: np.ndarray, original_sr: int) -> np.ndarray:
        """Prepare raw audio for VAD: mono, target sample rate, peak-normalized.

        Args:
            audio_data: 1-D (or channels-first 2-D) float samples.
            original_sr: sample rate of ``audio_data``.

        Returns:
            Mono float array at ``self.sample_rate``, scaled to [-1, 1]
            (silence is returned unchanged to avoid division by zero).
        """
        if original_sr != self.sample_rate:
            audio_tensor = torch.from_numpy(audio_data).float()
            if audio_tensor.dim() > 1:
                audio_tensor = audio_tensor.mean(dim=0)  # downmix to mono

            resampler = torchaudio.transforms.Resample(
                orig_freq=original_sr,
                new_freq=self.sample_rate,
            )
            audio_data = resampler(audio_tensor).numpy()

        # Peak-normalize; guard against empty input and all-zero chunks,
        # where np.max would raise / divide by zero.
        peak = np.max(np.abs(audio_data)) if audio_data.size else 0.0
        if peak > 0:
            audio_data = audio_data / peak

        return audio_data

    def detect_voice_activity(self, audio_chunk: np.ndarray) -> bool:
        """Return True if the chunk contains speech.

        Uses the SpeechBrain model when available; any model error (or an
        unloaded model) falls back to the energy-based heuristic.
        """
        if self.vad_model is None:
            return self._energy_based_vad(audio_chunk)

        try:
            # Model expects a batched tensor: (1, num_samples).
            audio_tensor = torch.from_numpy(audio_chunk).float().unsqueeze(0)

            with torch.no_grad():
                prob = self.vad_model.get_speech_prob_chunk(audio_tensor)

            # get_speech_prob_chunk yields per-frame probabilities; average
            # them so chunks spanning multiple frames don't crash .item().
            return prob.mean().item() > self.threshold

        except Exception as e:
            print(f"❌ Lỗi VAD detection: {e}")
            return self._energy_based_vad(audio_chunk)

    def _energy_based_vad(self, audio_chunk: np.ndarray, threshold: float = 0.01) -> bool:
        """Fallback VAD: mean-square energy compared against ``threshold``."""
        energy = float(np.mean(audio_chunk ** 2)) if audio_chunk.size else 0.0
        return energy > threshold

    def process_stream(self, audio_chunk: np.ndarray, original_sr: int):
        """Process one chunk of a live stream.

        While speech is present, samples accumulate in ``speech_buffer``;
        after ``min_silence_duration`` seconds of silence the buffered
        segment is dispatched via ``_process_speech_segment``.

        Returns:
            True/False speech decision for this chunk, or None when the
            stream is not running.
        """
        if not self.is_running:
            return

        processed_audio = self.preprocess_audio(audio_chunk, original_sr)
        is_speech = self.detect_voice_activity(processed_audio)

        if is_speech:
            # Any speech resets the silence timer and grows the segment.
            self.silence_start_time = None
            self.speech_buffer.extend(processed_audio)
            print("🎤 Đang nói...")
        else:
            if self.silence_start_time is None:
                self.silence_start_time = time.time()
            elif len(self.speech_buffer) > 0:
                silence_duration = time.time() - self.silence_start_time
                if silence_duration >= self.min_silence_duration:
                    # Silence lasted long enough: the segment is finished.
                    self._process_speech_segment()

        return is_speech

    def _process_speech_segment(self):
        """Hand the buffered speech segment to the callback and reset state."""
        if len(self.speech_buffer) == 0:
            return

        speech_audio = np.array(self.speech_buffer)

        # callable(None) is False, so a missing callback is skipped safely.
        if callable(self.callback):
            self.callback(speech_audio, self.sample_rate)

        self.speech_buffer = []
        self.silence_start_time = None

        print("✅ Đã xử lý segment giọng nói")

    def start_stream(self, callback: Callable):
        """Arm the stream: register ``callback(audio, sample_rate)`` and reset state."""
        self.is_running = True
        self.callback = callback
        self.speech_buffer = []
        self.silence_start_time = None
        print("🎙️ Bắt đầu stream VAD...")

    def stop_stream(self):
        """Stop the stream, flushing any partially-buffered speech segment."""
        self.is_running = False
        if len(self.speech_buffer) > 0:
            self._process_speech_segment()
        print("🛑 Đã dừng stream VAD")

    def get_audio_chunk_from_stream(self, stream, chunk_size: int = 1024):
        """Read one chunk from a PyAudio-style stream as float32 in [-1, 1].

        Returns None on read errors instead of raising, so a transient
        glitch does not kill the capture loop.
        """
        try:
            data = stream.read(chunk_size, exception_on_overflow=False)
            audio_data = np.frombuffer(data, dtype=np.int16)
            return audio_data.astype(np.float32) / 32768.0  # int16 → [-1, 1]
        except Exception as e:
            print(f"❌ Lỗi đọc audio stream: {e}")
            return None