Spaces:

datbkpro
/

voicebot

Sleeping

App Files Community

datbkpro committed 7 days ago

Commit

cfffabc

verified ·

1 Parent(s): 00dbcb7

Update services/streaming_voice_service.py

Browse files

Files changed (1) hide show

services/streaming_voice_service.py +0 -454

services/streaming_voice_service.py CHANGED Viewed

@@ -14,460 +14,6 @@ from core.tts_service import EnhancedTTSService
 from core.speechbrain_vad import SpeechBrainVAD
 from core.silero_vad import SileroVAD
-# class StreamingVoiceService:
-#     def __init__(self, groq_client: Groq, rag_system: EnhancedRAGSystem, tts_service: EnhancedTTSService):
-#         self.client = groq_client
-#         self.rag_system = rag_system
-#         self.tts_service = tts_service
-#         # Khởi tạo VAD
-#         self.vad_processor = SileroVAD()
-#         self.is_listening = False
-#         self.speech_callback = None
-#         self.is_processing = False  # Tránh xử lý chồng chéo
-#         self.last_speech_time = 0
-#         self.silence_timeout = 2.0  # 2 giây im lặng thì dừng
-#         # Conversation context
-#         self.conversation_history = []
-#         self.current_transcription = ""
-#         # Audio buffer for VAD
-#         self.audio_buffer = []
-#         self.buffer_lock = threading.Lock()
-#     def start_listening(self, speech_callback: Callable) -> bool:
-#         """Bắt đầu lắng nghe với VAD"""
-#         if self.is_listening:
-#             return False
-#         self.speech_callback = speech_callback
-#         self.last_speech_time = time.time()
-#         success = self.vad_processor.start_stream(self._on_speech_detected)
-#         if success:
-#             self.is_listening = True
-#             self.is_processing = False
-#             print("🎙️ Đã bắt đầu lắng nghe với VAD")
-#         return success
-#     def stop_listening(self):
-#         """Dừng lắng nghe"""
-#         self.vad_processor.stop_stream()
-#         self.is_listening = False
-#         self.is_processing = False
-#         self.speech_callback = None
-#         with self.buffer_lock:
-#             self.audio_buffer = []
-#         print("🛑 Đã dừng lắng nghe")
-#     def process_audio_chunk(self, audio_data: tuple) -> Dict[str, Any]:
-#         """Xử lý audio chunk với VAD (dùng cho real-time streaming)"""
-#         if not audio_data or not self.is_listening or self.is_processing:
-#             return {
-#                 'transcription': "Đang lắng nghe...",
-#                 'response': "",
-#                 'tts_audio': None,
-#                 'status': 'listening'
-#             }
-#         try:
-#             sample_rate, audio_array = audio_data
-#             # Thêm vào buffer và xử lý với VAD
-#             with self.buffer_lock:
-#                 self.audio_buffer.extend(audio_array)
-#                 # Giới hạn buffer để tránh tràn bộ nhớ
-#                 max_buffer_samples = sample_rate * 10  # 10 giây
-#                 if len(self.audio_buffer) > max_buffer_samples:
-#                     self.audio_buffer = self.audio_buffer[-max_buffer_samples:]
-#             # Xử lý với VAD
-#             self.vad_processor.process_stream(audio_array, sample_rate)
-#             # Kiểm tra timeout im lặng
-#             current_time = time.time()
-#             if current_time - self.last_speech_time > self.silence_timeout and len(self.audio_buffer) > 0:
-#                 self._process_final_audio()
-#             return {
-#                 'transcription': "Đang lắng nghe...",
-#                 'response': "",
-#                 'tts_audio': None,
-#                 'status': 'listening'
-#             }
-#         except Exception as e:
-#             print(f"❌ Lỗi xử lý audio chunk: {e}")
-#             return {
-#                 'transcription': "",
-#                 'response': "",
-#                 'tts_audio': None,
-#                 'status': 'error'
-#             }
-#     def _on_speech_detected(self, speech_audio: np.ndarray, sample_rate: int):
-#         """Callback khi VAD phát hiện speech"""
-#         print(f"🎯 VAD phát hiện speech segment: {len(speech_audio)/sample_rate:.2f}s")
-#         self.last_speech_time = time.time()
-#         # Chỉ xử lý nếu không đang xử lý cái khác
-#         if self.is_processing:
-#             print("⚠️ Đang xử lý request trước đó, bỏ qua...")
-#             return
-#         self.is_processing = True
-#         try:
-#             # Chuyển đổi speech thành text
-#             transcription = self._transcribe_audio(speech_audio, sample_rate)
-#             if not transcription or len(transcription.strip()) < 2:
-#                 print("⚠️ Transcription quá ngắn hoặc trống")
-#                 self.is_processing = False
-#                 return
-#             print(f"📝 VAD Transcription: {transcription}")
-#             self.current_transcription = transcription
-#             # Tạo phản hồi AI
-#             response = self._generate_ai_response(transcription)
-#             # Tạo TTS
-#             tts_audio_path = self._text_to_speech(response)
-#             # Gửi kết quả đến callback
-#             if self.speech_callback:
-#                 self.speech_callback({
-#                     'transcription': transcription,
-#                     'response': response,
-#                     'tts_audio': tts_audio_path,
-#                     'status': 'completed'
-#                 })
-#         except Exception as e:
-#             print(f"❌ Lỗi trong _on_speech_detected: {e}")
-#         finally:
-#             # Cho phép xử lý tiếp sau khi TTS kết thúc
-#             threading.Timer(1.0, self._reset_processing).start()
-#     def _reset_processing(self):
-#         """Reset trạng thái xử lý sau khi hoàn thành"""
-#         self.is_processing = False
-#         with self.buffer_lock:
-#             self.audio_buffer = []
-#     def _process_final_audio(self):
-#         """Xử lý audio cuối cùng khi hết thời gian im lặng"""
-#         if self.is_processing or not self.audio_buffer:
-#             return
-#         try:
-#             with self.buffer_lock:
-#                 if not self.audio_buffer:
-#                     return
-#                 final_audio = np.array(self.audio_buffer)
-#                 self.audio_buffer = []
-#             # Chỉ xử lý nếu audio đủ dài
-#             if len(final_audio) > 16000 * 0.5:  # Ít nhất 0.5 giây
-#                 print("🔄 Xử lý audio cuối cùng do im lặng timeout")
-#                 self._on_speech_detected(final_audio, 16000)
-#         except Exception as e:
-#             print(f"❌ Lỗi xử lý final audio: {e}")
-#     def process_streaming_audio(self, audio_data: tuple) -> Dict[str, Any]:
-#         """Xử lý audio streaming (phương thức cũ cho compatibility)"""
-#         if not audio_data:
-#             return {
-#                 'transcription': "❌ Không có dữ liệu âm thanh",
-#                 'response': "Vui lòng nói lại",
-#                 'tts_audio': None,
-#                 'status': 'error'
-#             }
-#         # Nếu đang xử lý VAD, trả về trạng thái listening
-#         if self.is_processing:
-#             return {
-#                 'transcription': "Đang xử lý...",
-#                 'response': "",
-#                 'tts_audio': None,
-#                 'status': 'processing'
-#             }
-#         try:
-#             # Lấy dữ liệu audio từ Gradio
-#             sample_rate, audio_array = audio_data
-#             print(f"🎯 Nhận audio: {len(audio_array)} samples, SR: {sample_rate}")
-#             # Kiểm tra kiểu dữ liệu và chuyển đổi nếu cần
-#             if isinstance(audio_array, np.ndarray):
-#                 if audio_array.dtype == np.float32 or audio_array.dtype == np.float64:
-#                     # Chuyển từ float sang int16
-#                     audio_array = (audio_array * 32767).astype(np.int16)
-#             # Kiểm tra audio có dữ liệu không
-#             if len(audio_array) == 0:
-#                 return {
-#                     'transcription': "❌ Âm thanh trống",
-#                     'response': "Vui lòng nói lại",
-#                     'tts_audio': None,
-#                     'status': 'error'
-#                 }
-#             # Tính toán âm lượng
-#             audio_abs = np.abs(audio_array.astype(np.float32))
-#             audio_rms = np.sqrt(np.mean(audio_abs**2)) / 32767.0
-#             print(f"📊 Âm lượng RMS: {audio_rms:.4f}")
-#             if audio_rms < 0.005:
-#                 return {
-#                     'transcription': "❌ Âm thanh quá yếu",
-#                     'response': "Xin vui lòng nói to hơn",
-#                     'tts_audio': None,
-#                     'status': 'error'
-#                 }
-#             # Sử dụng VAD để kiểm tra speech
-#             if not self.vad_processor.is_speech(audio_array, sample_rate):
-#                 return {
-#                     'transcription': "❌ Không phát hiện giọng nói",
-#                     'response': "Vui lòng nói rõ hơn",
-#                     'tts_audio': None,
-#                     'status': 'error'
-#                 }
-#             # Chuyển đổi thành văn bản
-#             transcription = self._transcribe_audio(audio_array, sample_rate)
-#             if not transcription or len(transcription.strip()) == 0:
-#                 return {
-#                     'transcription': "❌ Không nghe rõ",
-#                     'response': "Xin vui lòng nói lại rõ hơn",
-#                     'tts_audio': None,
-#                     'status': 'error'
-#                 }
-#             # Kiểm tra nếu transcription quá ngắn
-#             if len(transcription.strip()) < 2:
-#                 return {
-#                     'transcription': "❌ Câu nói quá ngắn",
-#                     'response': "Xin vui lòng nói câu dài hơn",
-#                     'tts_audio': None,
-#                     'status': 'error'
-#                 }
-#             print(f"📝 Đã chuyển đổi: {transcription}")
-#             # Cập nhật transcription hiện tại
-#             self.current_transcription = transcription
-#             # Tạo phản hồi AI
-#             response = self._generate_ai_response(transcription)
-#             # Tạo TTS
-#             tts_audio_path = self._text_to_speech(response)
-#             return {
-#                 'transcription': transcription,
-#                 'response': response,
-#                 'tts_audio': tts_audio_path,
-#                 'status': 'completed'
-#             }
-#         except Exception as e:
-#             print(f"❌ Lỗi xử lý streaming audio: {e}")
-#             print(f"Chi tiết lỗi: {traceback.format_exc()}")
-#             return {
-#                 'transcription': f"❌ Lỗi: {str(e)}",
-#                 'response': "Xin lỗi, có lỗi xảy ra trong quá trình xử lý",
-#                 'tts_audio': None,
-#                 'status': 'error'
-#             }
-#     def _transcribe_audio(self, audio_data: np.ndarray, sample_rate: int) -> Optional[str]:
-#         """Chuyển audio -> text với xử lý sample rate cải tiến"""
-#         try:
-#             # Đảm bảo kiểu dữ liệu là int16
-#             if audio_data.dtype != np.int16:
-#                 if audio_data.dtype in [np.float32, np.float64]:
-#                     audio_data = (audio_data * 32767).astype(np.int16)
-#                 else:
-#                     audio_data = audio_data.astype(np.int16)
-#             # Chuẩn hóa audio data
-#             if audio_data.ndim > 1:
-#                 audio_data = np.mean(audio_data, axis=1).astype(np.int16)  # Chuyển sang mono
-#             # Resample nếu sample rate không phải 16000Hz (Whisper yêu cầu)
-#             target_sample_rate = 16000
-#             if sample_rate != target_sample_rate:
-#                 audio_data = self._resample_audio(audio_data, sample_rate, target_sample_rate)
-#                 sample_rate = target_sample_rate
-#                 print(f"🔄 Đã resample từ {sample_rate}Hz xuống {target_sample_rate}Hz")
-#             # Giới hạn độ dài audio
-#             max_duration = 10  # giây
-#             max_samples = sample_rate * max_duration
-#             if len(audio_data) > max_samples:
-#                 audio_data = audio_data[:max_samples]
-#                 print(f"⚠️ Cắt audio xuống còn {max_duration} giây")
-#             # Đảm bảo audio đủ dài
-#             min_duration = 0.5  # giây
-#             min_samples = int(sample_rate * min_duration)
-#             if len(audio_data) < min_samples:
-#                 # Pad audio nếu quá ngắn
-#                 padding = np.zeros(min_samples - len(audio_data), dtype=np.int16)
-#                 audio_data = np.concatenate([audio_data, padding])
-#                 print(f"⚠️ Đã pad audio lên {min_duration} giây")
-#             print(f"🔊 Gửi audio đến Whisper: {len(audio_data)} samples, {sample_rate}Hz")
-#             # Tạo temporary file trong memory
-#             buffer = io.BytesIO()
-#             sf.write(buffer, audio_data, sample_rate, format='wav', subtype='PCM_16')
-#             buffer.seek(0)
-#             # Gọi API Whisper với timeout
-#             import requests
-#             try:
-#                 transcription = self.client.audio.transcriptions.create(
-#                     model=settings.WHISPER_MODEL,
-#                     file=("speech.wav", buffer.read(), "audio/wav"),
-#                     response_format="text",
-#                     language="vi",
-#                     temperature=0.0,
-#                 )
-#             except requests.exceptions.Timeout:
-#                 print("❌ Whisper API timeout")
-#                 return None
-#             except Exception as e:
-#                 print(f"❌ Lỗi Whisper API: {e}")
-#                 return None
-#             # Xử lý response
-#             if hasattr(transcription, 'text'):
-#                 result = transcription.text.strip()
-#             elif isinstance(transcription, str):
-#                 result = transcription.strip()
-#             else:
-#                 result = str(transcription).strip()
-#             print(f"✅ Transcription thành công: '{result}'")
-#             return result
-#         except Exception as e:
-#             print(f"❌ Lỗi transcription: {e}")
-#             print(f"Audio details: dtype={audio_data.dtype}, shape={audio_data.shape}, sr={sample_rate}")
-#             return None
-#     def _resample_audio(self, audio_data: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
-#         """Resample audio sử dụng scipy - cải tiến độ chính xác"""
-#         try:
-#             from scipy import signal
-#             # Tính số samples mới
-#             duration = len(audio_data) / orig_sr
-#             new_length = int(duration * target_sr)
-#             # Resample sử dụng scipy.signal.resample với windowing
-#             resampled_audio = signal.resample(audio_data, new_length)
-#             # Chuyển lại về int16
-#             resampled_audio = np.clip(resampled_audio, -32768, 32767).astype(np.int16)
-#             return resampled_audio
-#         except ImportError:
-#             print("⚠️ Không có scipy, sử dụng simple resampling")
-#             # Simple resampling bằng interpolation
-#             orig_length = len(audio_data)
-#             new_length = int(orig_length * target_sr / orig_sr)
-#             # Linear interpolation
-#             x_old = np.linspace(0, 1, orig_length)
-#             x_new = np.linspace(0, 1, new_length)
-#             resampled_audio = np.interp(x_new, x_old, audio_data).astype(np.int16)
-#             return resampled_audio
-#         except Exception as e:
-#             print(f"❌ Lỗi resample: {e}")
-#             return audio_data
-#     def _generate_ai_response(self, user_input: str) -> str:
-#         """Sinh phản hồi AI với xử lý lỗi"""
-#         try:
-#             # Thêm vào lịch sử
-#             self.conversation_history.append({"role": "user", "content": user_input})
-#             # Tìm kiếm RAG
-#             rag_results = self.rag_system.semantic_search(user_input, top_k=2)
-#             context_text = "\n".join([f"- {result.get('text', str(result))}" for result in rag_results]) if rag_results else ""
-#             system_prompt = f"""Bạn là trợ lý AI thông minh chuyên về tiếng Việt.
-# Hãy trả lời ngắn gọn, tự nhiên và hữu ích (dưới 100 từ).
-# Thông tin tham khảo:
-# {context_text}
-# """
-#             messages = [{"role": "system", "content": system_prompt}]
-#             # Giữ lại 4 tin nhắn gần nhất
-#             messages.extend(self.conversation_history[-4:])
-#             completion = self.client.chat.completions.create(
-#                 model="llama-3.1-8b-instant",
-#                 messages=messages,
-#                 max_tokens=150,
-#                 temperature=0.7
-#             )
-#             response = completion.choices[0].message.content
-#             self.conversation_history.append({"role": "assistant", "content": response})
-#             # Giới hạn lịch sử
-#             if len(self.conversation_history) > 8:
-#                 self.conversation_history = self.conversation_history[-8:]
-#             return response
-#         except Exception as e:
-#             print(f"❌ Lỗi tạo AI response: {e}")
-#             return "Xin lỗi, tôi gặp lỗi khi tạo phản hồi. Vui lòng thử lại."
-#     def _text_to_speech(self, text: str) -> Optional[str]:
-#         """Chuyển văn bản thành giọng nói với xử lý lỗi"""
-#         try:
-#             if not text or text.startswith("❌") or text.startswith("Xin lỗi"):
-#                 return None
-#             tts_bytes = self.tts_service.text_to_speech(text, 'vi')
-#             if tts_bytes:
-#                 audio_path = self.tts_service.save_audio_to_file(tts_bytes)
-#                 print(f"✅ Đã tạo TTS: {audio_path}")
-#                 return audio_path
-#         except Exception as e:
-#             print(f"❌ Lỗi TTS: {e}")
-#         return None
-#     def clear_conversation(self):
-#         """Xóa lịch sử hội thoại"""
-#         self.conversation_history = []
-#         self.current_transcription = ""
-#         print("🗑️ Đã xóa lịch sử hội thoại")
-#     def get_conversation_state(self) -> dict:
-#         """Lấy trạng thái hội thoại"""
-#         return {
-#             'is_listening': self.is_listening,
-#             'is_processing': self.is_processing,
-#             'history_length': len(self.conversation_history),
-#             'current_transcription': self.current_transcription,
-#             'last_update': time.strftime("%H:%M:%S")
-#         }
 class StreamingVoiceService:
     def __init__(self, groq_client: Groq, rag_system: EnhancedRAGSystem, tts_service: EnhancedTTSService):
         self.client = groq_client

 from core.speechbrain_vad import SpeechBrainVAD
 from core.silero_vad import SileroVAD
 class StreamingVoiceService:
     def __init__(self, groq_client: Groq, rag_system: EnhancedRAGSystem, tts_service: EnhancedTTSService):
         self.client = groq_client