Real_Time_diarization

Sleeping

App Files Files Community

Saiyaswanth007 committed on May 25

Commit

3466e71

1 Parent(s): 7662a6a

Revert portg

Browse files

Files changed (1) hide show

app.py +315 -76

app.py CHANGED Viewed

@@ -283,7 +283,7 @@ class RealtimeSpeakerDiarization:
         self.audio_processor = None
         self.speaker_detector = None
         self.recorder = None
-        self.sentence_queue = queue.Queue()
         self.full_sentences = []
         self.sentence_speakers = []
         self.pending_sentences = []
@@ -294,6 +294,9 @@ class RealtimeSpeakerDiarization:
         self.max_speakers = DEFAULT_MAX_SPEAKERS
         self.current_conversation = ""
         self.audio_buffer = []
     def initialize_models(self):
         """Initialize the speaker encoder model"""
@@ -302,9 +305,25 @@ class RealtimeSpeakerDiarization:
             print(f"Using device: {device_str}")
             self.encoder = SpeechBrainEncoder(device=device_str)
-            success = self.encoder.load_model()
-            if success:
                 self.audio_processor = AudioProcessor(self.encoder)
                 self.speaker_detector = SpeakerChangeDetector(
                     embedding_dim=self.encoder.embedding_dim,
@@ -314,10 +333,52 @@ class RealtimeSpeakerDiarization:
                 print("ECAPA-TDNN model loaded successfully!")
                 return True
             else:
-                print("Failed to load ECAPA-TDNN model")
-                return False
         except Exception as e:
             print(f"Model initialization error: {e}")
             return False
     def live_text_detected(self, text):
@@ -346,8 +407,9 @@ class RealtimeSpeakerDiarization:
         if text:
             try:
                 bytes_data = self.recorder.last_transcription_bytes
-                self.sentence_queue.put((text, bytes_data))
-                self.pending_sentences.append(text)
             except Exception as e:
                 print(f"Error processing final text: {e}")
@@ -363,28 +425,31 @@ class RealtimeSpeakerDiarization:
                 # Extract speaker embedding
                 speaker_embedding = self.audio_processor.extract_embedding(audio_int16)
-                # Store sentence and embedding
-                self.full_sentences.append((text, speaker_embedding))
-                # Fill in missing speaker assignments
-                while len(self.sentence_speakers) < len(self.full_sentences) - 1:
-                    self.sentence_speakers.append(0)
-                # Detect speaker changes
-                speaker_id, similarity = self.speaker_detector.add_embedding(speaker_embedding)
-                self.sentence_speakers.append(speaker_id)
-                # Remove from pending
-                if text in self.pending_sentences:
-                    self.pending_sentences.remove(text)
-                # Update conversation display
-                self.current_conversation = self.get_formatted_conversation()
             except queue.Empty:
                 continue
             except Exception as e:
                 print(f"Error processing sentence: {e}")
     def start_recording(self):
         """Start the recording and transcription process"""
@@ -412,10 +477,22 @@ class RealtimeSpeakerDiarization:
                 'beam_size_realtime': REALTIME_BEAM_SIZE,
                 'buffer_size': BUFFER_SIZE,
                 'sample_rate': SAMPLE_RATE,
             }
             self.recorder = AudioToTextRecorder(**recorder_config)
             # Start sentence processing thread
             self.is_running = True
             self.sentence_thread = threading.Thread(target=self.process_sentence_queue, daemon=True)
@@ -428,7 +505,10 @@ class RealtimeSpeakerDiarization:
             return "Recording started successfully! FastRTC audio input ready."
         except Exception as e:
-            return f"Error starting recording: {e}"
     def run_transcription(self):
         """Run the transcription loop"""
@@ -443,8 +523,48 @@ class RealtimeSpeakerDiarization:
         self.is_running = False
         if self.recorder:
             self.recorder.stop()
         return "Recording stopped!"
     def clear_conversation(self):
         """Clear all conversation data"""
         self.full_sentences = []
@@ -553,11 +673,23 @@ class RealtimeSpeakerDiarization:
             else:
                 audio_bytes = audio_data
-            # Feed to recorder
-            self.recorder.feed_audio(audio_bytes)
         except Exception as e:
-            print(f"Error feeding audio data: {e}")
     def process_audio_chunk(self, audio_data, sample_rate=16000):
         """Process audio chunk from FastRTC input"""
@@ -565,34 +697,30 @@ class RealtimeSpeakerDiarization:
             return
         try:
-            # Convert float audio to int16 for the recorder
-            if audio_data.dtype == np.float32 or audio_data.dtype == np.float64:
-                if np.max(np.abs(audio_data)) <= 1.0:
-                    # Float audio is normalized to [-1, 1], convert to int16
-                    audio_int16 = (audio_data * 32767).astype(np.int16)
-                else:
-                    # Audio is already in higher range
-                    audio_int16 = audio_data.astype(np.int16)
-            else:
-                audio_int16 = audio_data
-            # Ensure correct shape (1, N) for the recorder
-            if len(audio_int16.shape) == 1:
-                audio_int16 = np.expand_dims(audio_int16, 0)
-            # Resample if needed
-            if sample_rate != SAMPLE_RATE:
-                audio_int16 = self._resample_audio(audio_int16, sample_rate, SAMPLE_RATE)
-            # Convert to bytes for feeding to recorder
-            audio_bytes = audio_int16.tobytes()
-            # Feed to recorder
-            self.feed_audio_data(audio_bytes)
         except Exception as e:
-            print(f"Error processing audio chunk: {e}")
     def _resample_audio(self, audio, orig_sr, target_sr):
         """Resample audio to target sample rate"""
         try:
@@ -613,6 +741,60 @@ class RealtimeSpeakerDiarization:
             print(f"Error resampling audio: {e}")
             return audio
 # FastRTC Audio Handler for Real-time Diarization
@@ -620,9 +802,10 @@ class DiarizationHandler(AsyncStreamHandler):
     def __init__(self, diarization_system):
         super().__init__()
         self.diarization_system = diarization_system
-        self.audio_queue = Queue()
         self.is_processing = False
         self.sample_rate = 16000  # Default sample rate
     def copy(self):
         """Return a fresh handler for each new stream connection"""
@@ -646,39 +829,75 @@ class DiarizationHandler(AsyncStreamHandler):
             else:
                 audio_data = frame
-            # Convert to numpy array if needed
-            if isinstance(audio_data, bytes):
-                # Convert bytes to numpy array (assuming 16-bit PCM)
-                audio_array = np.frombuffer(audio_data, dtype=np.int16)
-                # Normalize to float32 range [-1, 1]
-                audio_array = audio_array.astype(np.float32) / 32768.0
-            elif isinstance(audio_data, (list, tuple)):
-                audio_array = np.array(audio_data, dtype=np.float32)
-            elif isinstance(audio_data, np.ndarray):
-                audio_array = audio_data.astype(np.float32)
-            else:
-                print(f"Unknown audio data type: {type(audio_data)}")
-                return
-            # Ensure mono audio
-            if len(audio_array.shape) > 1 and audio_array.shape[1] > 1:
-                audio_array = np.mean(audio_array, axis=1)
-            # Ensure 1D array
-            if len(audio_array.shape) > 1:
-                audio_array = audio_array.flatten()
             # Get sample rate from frame if available
             sample_rate = getattr(frame, 'sample_rate', self.sample_rate)
-            # Process audio asynchronously to avoid blocking
-            await self.process_audio_async(audio_array, sample_rate)
         except Exception as e:
             print(f"Error in FastRTC audio receive: {e}")
             import traceback
             traceback.print_exc()
     async def process_audio_async(self, audio_data, sample_rate=16000):
         """Process audio data asynchronously"""
         try:
@@ -698,10 +917,30 @@ class DiarizationHandler(AsyncStreamHandler):
         print("FastRTC stream started")
         self.is_processing = True
     async def shutdown(self) -> None:
         """Clean up any resources when the stream ends"""
         print("FastRTC stream shutting down")
         self.is_processing = False
 # Global instances

         self.audio_processor = None
         self.speaker_detector = None
         self.recorder = None
+        self.sentence_queue = queue.Queue(maxsize=100)  # Add maxsize to prevent unlimited growth
         self.full_sentences = []
         self.sentence_speakers = []
         self.pending_sentences = []
         self.max_speakers = DEFAULT_MAX_SPEAKERS
         self.current_conversation = ""
         self.audio_buffer = []
+        # Add locks for thread safety
+        self._state_lock = threading.RLock()  # Reentrant lock for shared state
+        self._audio_lock = threading.Lock()   # Lock for audio processing
     def initialize_models(self):
         """Initialize the speaker encoder model"""
             print(f"Using device: {device_str}")
             self.encoder = SpeechBrainEncoder(device=device_str)
+            # Try to load model with timeout
+            import threading
+            load_success = [False]
+            def load_model_thread():
+                try:
+                    success = self.encoder.load_model()
+                    load_success[0] = success
+                except Exception as e:
+                    print(f"Error in model loading thread: {e}")
+            # Start loading in a thread with timeout
+            load_thread = threading.Thread(target=load_model_thread)
+            load_thread.daemon = True
+            load_thread.start()
+            load_thread.join(timeout=60)  # 60 second timeout for model loading
+            if load_success[0]:
                 self.audio_processor = AudioProcessor(self.encoder)
                 self.speaker_detector = SpeakerChangeDetector(
                     embedding_dim=self.encoder.embedding_dim,
                 print("ECAPA-TDNN model loaded successfully!")
                 return True
             else:
+                print("Failed to load ECAPA-TDNN model or timeout occurred")
+                return self._initialize_fallback()
         except Exception as e:
             print(f"Model initialization error: {e}")
+            import traceback
+            traceback.print_exc()
+            return self._initialize_fallback()
+    def _initialize_fallback(self):
+        """Initialize fallback mode when model loading fails.
+
+        Replaces the encoder with a local DummyEncoder (64-dimensional
+        pseudo-embeddings derived from signal energy) and rebuilds the audio
+        processor and a two-speaker change detector around it.
+
+        Returns:
+            True when the fallback components were created, False if that
+            also failed.
+        """
+        try:
+            print("Initializing fallback mode with simple speaker detection...")
+            # Create a simple embedding dimension
+            embedding_dim = 64
+            # Create a dummy encoder that produces energy-seeded pseudo-embeddings
+            class DummyEncoder:
+                def __init__(self):
+                    self.embedding_dim = embedding_dim
+                    self.model_loaded = True
+                def embed_utterance(self, audio, sr=16000):
+                    # Simple energy-based pseudo-embedding (not a real speaker embedding)
+                    if isinstance(audio, np.ndarray) and audio.size:
+                        # Widen first so np.abs cannot overflow on int16 input
+                        energy = float(np.mean(np.abs(audio.astype(np.float64))))
+                        if np.isfinite(energy):
+                            # Private generator: the same energy yields the same vector
+                            # without reseeding NumPy's process-wide random state
+                            seed = int(energy * 1000)
+                            return np.random.default_rng(seed).random(embedding_dim)
+                    # Empty, non-array or non-finite input: unseeded random vector
+                    return np.random.default_rng().random(embedding_dim)
+            # Set up system with fallback components
+            self.encoder = DummyEncoder()
+            self.audio_processor = AudioProcessor(self.encoder)
+            self.speaker_detector = SpeakerChangeDetector(
+                embedding_dim=embedding_dim,
+                change_threshold=self.change_threshold,
+                max_speakers=2  # Limit speakers in fallback mode
+            )
+            print("Fallback mode initialized - limited functionality!")
+            return True
+        except Exception as e:
+            print(f"Even fallback initialization failed: {e}")
             return False
     def live_text_detected(self, text):
         if text:
             try:
                 bytes_data = self.recorder.last_transcription_bytes
+                self.sentence_queue.put((text, bytes_data), timeout=1.0)  # Added timeout
+                with self._state_lock:
+                    self.pending_sentences.append(text)
             except Exception as e:
                 print(f"Error processing final text: {e}")
                 # Extract speaker embedding
                 speaker_embedding = self.audio_processor.extract_embedding(audio_int16)
+                with self._state_lock:
+                    # Store sentence and embedding
+                    self.full_sentences.append((text, speaker_embedding))
+                    # Fill in missing speaker assignments
+                    while len(self.sentence_speakers) < len(self.full_sentences) - 1:
+                        self.sentence_speakers.append(0)
+                    # Detect speaker changes
+                    speaker_id, similarity = self.speaker_detector.add_embedding(speaker_embedding)
+                    self.sentence_speakers.append(speaker_id)
+                    # Remove from pending
+                    if text in self.pending_sentences:
+                        self.pending_sentences.remove(text)
+                    # Update conversation display
+                    self.current_conversation = self.get_formatted_conversation()
             except queue.Empty:
                 continue
             except Exception as e:
                 print(f"Error processing sentence: {e}")
+                import traceback
+                traceback.print_exc()
     def start_recording(self):
         """Start the recording and transcription process"""
                 'beam_size_realtime': REALTIME_BEAM_SIZE,
                 'buffer_size': BUFFER_SIZE,
                 'sample_rate': SAMPLE_RATE,
+                'external_audio': True,  # Signal that we'll provide audio
             }
+            # Make sure we're not running already
+            if hasattr(self, 'is_running') and self.is_running:
+                self.stop_recording()
+                # Short pause to ensure cleanup completes
+                time.sleep(0.5)
             self.recorder = AudioToTextRecorder(**recorder_config)
+            # Reset state
+            with self._state_lock:
+                self.pending_sentences = []
+                self.last_realtime_text = ""
             # Start sentence processing thread
             self.is_running = True
             self.sentence_thread = threading.Thread(target=self.process_sentence_queue, daemon=True)
             return "Recording started successfully! FastRTC audio input ready."
         except Exception as e:
+            self.is_running = False
+            import traceback
+            traceback.print_exc()
+            return f"Error starting recording: {str(e)}"
     def run_transcription(self):
         """Run the transcription loop"""
         self.is_running = False
         if self.recorder:
             self.recorder.stop()
+        # Wait for threads to finish
+        self._cleanup_resources()
         return "Recording stopped!"
+    def _cleanup_resources(self):
+        """Clean up resources and threads after recording has been stopped.
+
+        Joins the sentence and transcription threads (up to 3 s each), trims
+        the stored history to the most recent 1000 entries, empties the audio
+        buffer and drains the sentence queue. Errors are printed, not raised.
+        """
+        try:
+            # Wait for threads to stop gracefully
+            for attr in ('sentence_thread', 'transcription_thread'):
+                worker = getattr(self, attr, None)
+                if worker is not None and worker.is_alive():
+                    worker.join(timeout=3.0)
+            # Clean up memory
+            with self._state_lock:
+                # Limit history size to prevent memory leaks
+                if len(self.full_sentences) > 1000:
+                    self.full_sentences = self.full_sentences[-1000:]
+                if len(self.sentence_speakers) > 1000:
+                    self.sentence_speakers = self.sentence_speakers[-1000:]
+            # Clear audio buffer
+            with self._audio_lock:
+                self.audio_buffer = []
+            # Clear queue: stop on queue.Empty instead of polling empty(), and let
+            # any other error reach the handler below rather than being swallowed
+            while True:
+                try:
+                    self.sentence_queue.get_nowait()
+                except queue.Empty:
+                    break
+        except Exception as e:
+            print(f"Error during resource cleanup: {e}")
+            import traceback
+            traceback.print_exc()
     def clear_conversation(self):
         """Clear all conversation data"""
         self.full_sentences = []
             else:
                 audio_bytes = audio_data
+            # Use the recorder's internal buffer mechanism
+            if hasattr(self.recorder, 'feed_audio') and callable(self.recorder.feed_audio):
+                self.recorder.feed_audio(audio_bytes)
+            else:
+                # Fallback: Direct access to the underlying buffer if the method doesn't exist
+                self.audio_buffer.append(audio_bytes)
+                # Process buffered audio when enough is accumulated
+                if len(self.audio_buffer) > 5:  # Process in small batches
+                    combined = b''.join(self.audio_buffer)
+                    if hasattr(self.recorder, '_process_audio'):
+                        self.recorder._process_audio(combined)
+                    self.audio_buffer = []
         except Exception as e:
+            print(f"Error feeding audio data: {str(e)}")
+            import traceback
+            traceback.print_exc()
     def process_audio_chunk(self, audio_data, sample_rate=16000):
         """Process audio chunk from FastRTC input"""
             return
         try:
+            with self._audio_lock:
+                # Use the normalized audio function
+                audio_int16 = self._normalize_audio_format(audio_data, target_dtype=np.int16, target_sample_rate=SAMPLE_RATE)
+                # Check if we got valid audio
+                if audio_int16.size == 0:
+                    print("Warning: Empty audio chunk received")
+                    return
+                # Resample if needed
+                if sample_rate != SAMPLE_RATE:
+                    audio_int16 = self._resample_audio(audio_int16, sample_rate, SAMPLE_RATE)
+                # Convert to bytes for feeding to recorder
+                audio_bytes = audio_int16.tobytes()
+                # Feed to recorder
+                self.feed_audio_data(audio_bytes)
         except Exception as e:
+            print(f"Error processing audio chunk: {str(e)}")
+            import traceback
+            traceback.print_exc()
     def _resample_audio(self, audio, orig_sr, target_sr):
         """Resample audio to target sample rate"""
         try:
             print(f"Error resampling audio: {e}")
             return audio
+    def _normalize_audio_format(self, audio_data, target_dtype=np.int16, target_sample_rate=SAMPLE_RATE):
+        """Normalize audio data to a consistent dtype and channel layout.
+
+        Args:
+            audio_data: Input audio as a numpy array, a list/tuple of samples,
+                or raw bytes (bytes are interpreted as 16-bit PCM).
+            target_dtype: Target data type (np.int16 or np.float32).
+            target_sample_rate: Accepted for interface compatibility only;
+                this method does not resample.
+        Returns:
+            Audio as a numpy array in target_dtype. Input with more than one
+            column is averaged over axis 1, and a 1-D int16 result is expanded
+            to shape (1, N). An empty array of target_dtype is returned when
+            normalization fails.
+        """
+        try:
+            # Convert bytes to numpy if needed
+            if isinstance(audio_data, (bytes, bytearray, memoryview)):
+                audio_array = np.frombuffer(audio_data, dtype=np.int16)
+            else:
+                audio_array = np.asarray(audio_data)
+            # Remember the source type: the downmix below always yields floats
+            src_is_float = np.issubdtype(audio_array.dtype, np.floating)
+            src_is_int16 = audio_array.dtype == np.int16
+            # Ensure mono audio first, so the dtype conversion is the last step
+            # and the result really has target_dtype
+            if audio_array.ndim > 1 and audio_array.shape[1] > 1:
+                audio_array = np.mean(audio_array, axis=1)
+            # Convert data type as needed
+            if target_dtype == np.int16:
+                # Float input normalized to [-1, 1] is scaled up to the int16 range
+                if src_is_float and audio_array.size and np.max(np.abs(audio_array)) <= 1.0:
+                    audio_array = audio_array * 32767
+                if audio_array.dtype != np.int16:
+                    # Clip so out-of-range samples saturate instead of wrapping
+                    audio_array = np.clip(audio_array, -32768, 32767).astype(np.int16)
+            elif target_dtype == np.float32:
+                if src_is_int16:
+                    audio_array = audio_array.astype(np.float32) / 32768.0
+                else:
+                    audio_array = audio_array.astype(np.float32, copy=False)
+            # Reshape if needed
+            if audio_array.ndim == 1 and target_dtype == np.int16:
+                audio_array = np.expand_dims(audio_array, 0)
+            return audio_array
+        except Exception as e:
+            print(f"Error normalizing audio format: {e}")
+            import traceback
+            traceback.print_exc()
+            # Return empty array of correct type as fallback
+            return np.array([], dtype=target_dtype)
 # FastRTC Audio Handler for Real-time Diarization
     def __init__(self, diarization_system):
         super().__init__()
         self.diarization_system = diarization_system
+        self.audio_queue = asyncio.Queue(maxsize=100)  # Use asyncio queue
         self.is_processing = False
         self.sample_rate = 16000  # Default sample rate
+        self.processing_task = None
     def copy(self):
         """Return a fresh handler for each new stream connection"""
             else:
                 audio_data = frame
             # Get sample rate from frame if available
             sample_rate = getattr(frame, 'sample_rate', self.sample_rate)
+            # Add to queue - non-blocking with timeout
+            try:
+                # Bounded wait: allow up to 0.1 s for queue space instead of blocking indefinitely
+                await asyncio.wait_for(
+                    self.audio_queue.put((audio_data, sample_rate)),
+                    timeout=0.1
+                )
+            except asyncio.TimeoutError:
+                # Queue is full, drop this chunk
+                print("Warning: Audio queue full, dropping frame")
+                return
         except Exception as e:
             print(f"Error in FastRTC audio receive: {e}")
             import traceback
             traceback.print_exc()
+    async def _process_audio_loop(self):
+        """Background task to process audio from queue.
+
+        Runs while is_processing is true: takes (audio_data, sample_rate)
+        items off audio_queue, converts each to a 1-D float32 array and passes
+        it to process_audio_async. Every dequeued item is marked done, whether
+        it was processed, skipped or failed.
+        """
+        while self.is_processing:
+            try:
+                # Get from queue with timeout to allow checking is_processing flag
+                try:
+                    audio_data, sample_rate = await asyncio.wait_for(
+                        self.audio_queue.get(),
+                        timeout=0.5
+                    )
+                except asyncio.TimeoutError:
+                    # No audio available, check if we should keep running
+                    continue
+                try:
+                    # Convert to numpy array if needed
+                    if isinstance(audio_data, bytes):
+                        # Convert bytes to numpy array (assuming 16-bit PCM)
+                        audio_array = np.frombuffer(audio_data, dtype=np.int16)
+                        # Normalize to float32 range [-1, 1]
+                        audio_array = audio_array.astype(np.float32) / 32768.0
+                    elif isinstance(audio_data, (list, tuple)):
+                        audio_array = np.array(audio_data, dtype=np.float32)
+                    elif isinstance(audio_data, np.ndarray):
+                        # Convert the frame just dequeued, not a previous iteration's array
+                        audio_array = audio_data.astype(np.float32)
+                    else:
+                        print(f"Unknown audio data type: {type(audio_data)}")
+                        continue
+                    # Ensure mono audio
+                    if len(audio_array.shape) > 1 and audio_array.shape[1] > 1:
+                        audio_array = np.mean(audio_array, axis=1)
+                    # Ensure 1D array
+                    if len(audio_array.shape) > 1:
+                        audio_array = audio_array.flatten()
+                    # Hand the frame to the handler's async processing step
+                    await self.process_audio_async(audio_array, sample_rate)
+                finally:
+                    # Mark as done on every path so the queue's unfinished count stays accurate
+                    self.audio_queue.task_done()
+            except Exception as e:
+                print(f"Error in audio processing loop: {e}")
+                import traceback
+                traceback.print_exc()
+                # Short sleep to avoid tight loop
+                await asyncio.sleep(0.1)
     async def process_audio_async(self, audio_data, sample_rate=16000):
         """Process audio data asynchronously"""
         try:
         print("FastRTC stream started")
         self.is_processing = True
+        # Start background processing task
+        self.processing_task = asyncio.create_task(self._process_audio_loop())
     async def shutdown(self) -> None:
         """Clean up any resources when the stream ends"""
         print("FastRTC stream shutting down")
         self.is_processing = False
+        # Wait for processing task to finish; drop the reference so a later
+        # shutdown call does not act on an already-cancelled task
+        task, self.processing_task = self.processing_task, None
+        if task:
+            try:
+                # Cancel and wait (at most 2 s) for the task to unwind
+                task.cancel()
+                await asyncio.wait([task], timeout=2.0)
+            except (asyncio.CancelledError, Exception) as e:
+                print(f"Error cancelling audio processing task: {e}")
+        # Clear queue
+        while not self.audio_queue.empty():
+            try:
+                self.audio_queue.get_nowait()
+                self.audio_queue.task_done()
+            except asyncio.QueueEmpty:
+                break
+            except ValueError:
+                # task_done() had already been called for every queued item
+                continue
 # Global instances