Real_Time_diarization

Sleeping

App Files Files Community

Saiyaswanth007 committed on May 24

Commit

78869ff

1 Parent(s): b3cc831

Code fixing

Browse files

Files changed (1) hide show

app.py +204 -176

app.py CHANGED Viewed

@@ -10,16 +10,12 @@ import torchaudio
 from scipy.spatial.distance import cosine
 from RealtimeSTT import AudioToTextRecorder
 from fastapi import FastAPI
 import json
 import io
 import wave
 import asyncio
 import uvicorn
-import logging
-# Configure logging to reduce noise
-logging.getLogger("uvicorn").setLevel(logging.WARNING)
-logging.getLogger("gradio").setLevel(logging.WARNING)
 # Simplified configuration parameters
 SILENCE_THRESHS = [0, 0.4]
@@ -76,15 +72,23 @@ class SpeechBrainEncoder:
         self.cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "speechbrain")
         os.makedirs(self.cache_dir, exist_ok=True)
     def load_model(self):
-        """Load the ECAPA-TDNN model with error handling"""
         try:
-            # Try to import speechbrain
-            try:
-                from speechbrain.pretrained import EncoderClassifier
-            except ImportError:
-                print("SpeechBrain not available. Using fallback embedding model.")
-                return self._load_fallback_model()
             self.model = EncoderClassifier.from_hparams(
                 source="speechbrain/spkrec-ecapa-voxceleb",
@@ -93,17 +97,10 @@ class SpeechBrainEncoder:
             )
             self.model_loaded = True
-            print("ECAPA-TDNN model loaded successfully!")
             return True
         except Exception as e:
             print(f"Error loading ECAPA-TDNN model: {e}")
-            return self._load_fallback_model()
-    def _load_fallback_model(self):
-        """Fallback to a simple embedding model if SpeechBrain is not available"""
-        print("Using fallback embedding model (simple spectral features)")
-        self.model_loaded = True
-        return True
     def embed_utterance(self, audio, sr=16000):
         """Extract speaker embedding from audio"""
@@ -111,48 +108,21 @@ class SpeechBrainEncoder:
             raise ValueError("Model not loaded. Call load_model() first.")
         try:
-            if self.model is not None:
-                # Use SpeechBrain model
-                if isinstance(audio, np.ndarray):
-                    waveform = torch.tensor(audio, dtype=torch.float32).unsqueeze(0)
-                else:
-                    waveform = audio.unsqueeze(0)
-                if sr != 16000:
-                    waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=16000)
-                with torch.no_grad():
-                    embedding = self.model.encode_batch(waveform)
-                return embedding.squeeze().cpu().numpy()
             else:
-                # Use fallback method - simple spectral features
-                return self._extract_simple_features(audio)
-        except Exception as e:
-            print(f"Error extracting embedding: {e}")
-            return self._extract_simple_features(audio)
-    def _extract_simple_features(self, audio):
-        """Simple fallback feature extraction"""
-        try:
-            # Ensure audio is numpy array
-            if isinstance(audio, torch.Tensor):
-                audio = audio.numpy()
-            # Basic spectral features as a fallback
-            fft = np.fft.fft(audio)
-            magnitude = np.abs(fft)
-            # Take first 192 features to match expected embedding dimension
-            features = magnitude[:self.embedding_dim] if len(magnitude) >= self.embedding_dim else np.pad(magnitude, (0, self.embedding_dim - len(magnitude)))
-            # Normalize
-            features = features / (np.linalg.norm(features) + 1e-8)
-            return features.astype(np.float32)
         except Exception as e:
-            print(f"Error in fallback feature extraction: {e}")
-            return np.random.randn(self.embedding_dim).astype(np.float32)
 class AudioProcessor:
@@ -321,7 +291,6 @@ class RealtimeSpeakerDiarization:
         self.change_threshold = DEFAULT_CHANGE_THRESHOLD
         self.max_speakers = DEFAULT_MAX_SPEAKERS
         self.current_conversation = ""
-        self.audio_buffer = []
     def initialize_models(self):
         """Initialize the speaker encoder model"""
@@ -339,10 +308,10 @@ class RealtimeSpeakerDiarization:
                     change_threshold=self.change_threshold,
                     max_speakers=self.max_speakers
                 )
-                print("Speaker diarization model loaded successfully!")
                 return True
             else:
-                print("Failed to load speaker diarization model")
                 return False
         except Exception as e:
             print(f"Model initialization error: {e}")
@@ -362,31 +331,19 @@ class RealtimeSpeakerDiarization:
             self.last_realtime_text = text
             if prob_sentence_end and FAST_SENTENCE_END:
-                if self.recorder:
-                    self.recorder.stop()
             elif prob_sentence_end:
-                if self.recorder:
-                    self.recorder.post_speech_silence_duration = SILENCE_THRESHS[0]
             else:
-                if self.recorder:
-                    self.recorder.post_speech_silence_duration = SILENCE_THRESHS[1]
     def process_final_text(self, text):
         """Process final transcribed text with speaker embedding"""
         text = text.strip()
         if text:
             try:
-                if self.recorder and hasattr(self.recorder, 'last_transcription_bytes'):
-                    bytes_data = self.recorder.last_transcription_bytes
-                    self.sentence_queue.put((text, bytes_data))
-                else:
-                    # Use audio buffer as fallback
-                    if self.audio_buffer:
-                        audio_data = np.concatenate(self.audio_buffer)
-                        bytes_data = audio_data.tobytes()
-                        self.sentence_queue.put((text, bytes_data))
-                        self.audio_buffer = []  # Clear buffer after use
                 self.pending_sentences.append(text)
             except Exception as e:
                 print(f"Error processing final text: {e}")
@@ -432,51 +389,40 @@ class RealtimeSpeakerDiarization:
             return "Please initialize models first!"
         try:
-            # Check if RealtimeSTT is available
-            try:
-                from RealtimeSTT import AudioToTextRecorder
-                recorder_available = True
-            except ImportError:
-                print("RealtimeSTT not available. Using simulated audio processing.")
-                recorder_available = False
-            if recorder_available:
-                # Setup recorder configuration
-                recorder_config = {
-                    'spinner': False,
-                    'use_microphone': True,
-                    'model': FINAL_TRANSCRIPTION_MODEL,
-                    'language': TRANSCRIPTION_LANGUAGE,
-                    'silero_sensitivity': SILERO_SENSITIVITY,
-                    'webrtc_sensitivity': WEBRTC_SENSITIVITY,
-                    'post_speech_silence_duration': SILENCE_THRESHS[1],
-                    'min_length_of_recording': MIN_LENGTH_OF_RECORDING,
-                    'pre_recording_buffer_duration': PRE_RECORDING_BUFFER_DURATION,
-                    'min_gap_between_recordings': 0,
-                    'enable_realtime_transcription': True,
-                    'realtime_processing_pause': 0,
-                    'realtime_model_type': REALTIME_TRANSCRIPTION_MODEL,
-                    'on_realtime_transcription_update': self.live_text_detected,
-                    'beam_size': FINAL_BEAM_SIZE,
-                    'beam_size_realtime': REALTIME_BEAM_SIZE,
-                    'buffer_size': BUFFER_SIZE,
-                    'sample_rate': SAMPLE_RATE,
-                }
-                self.recorder = AudioToTextRecorder(**recorder_config)
             # Start sentence processing thread
             self.is_running = True
             self.sentence_thread = threading.Thread(target=self.process_sentence_queue, daemon=True)
             self.sentence_thread.start()
-            if recorder_available:
-                # Start transcription thread
-                self.transcription_thread = threading.Thread(target=self.run_transcription, daemon=True)
-                self.transcription_thread.start()
-                return "Recording started successfully! Please speak into your microphone."
-            else:
-                return "Simulation mode active. Speaker diarization ready for audio input."
         except Exception as e:
             return f"Error starting recording: {e}"
@@ -484,7 +430,7 @@ class RealtimeSpeakerDiarization:
     def run_transcription(self):
         """Run the transcription loop"""
         try:
-            while self.is_running and self.recorder:
                 self.recorder.text(self.process_final_text)
         except Exception as e:
             print(f"Transcription error: {e}")
@@ -493,10 +439,7 @@ class RealtimeSpeakerDiarization:
         """Stop the recording process"""
         self.is_running = False
         if self.recorder:
-            try:
-                self.recorder.stop()
-            except:
-                pass
         return "Recording stopped!"
     def clear_conversation(self):
@@ -507,7 +450,6 @@ class RealtimeSpeakerDiarization:
         self.displayed_text = ""
         self.last_realtime_text = ""
         self.current_conversation = "Conversation cleared!"
-        self.audio_buffer = []
         if self.speaker_detector:
             self.speaker_detector = SpeakerChangeDetector(
@@ -589,42 +531,43 @@ class RealtimeSpeakerDiarization:
             return f"Error getting status: {e}"
     def process_audio(self, audio_data):
-        """Process audio data from external sources"""
-        if not self.is_running:
             return
         try:
-            # Handle different audio data formats
-            if isinstance(audio_data, tuple) and len(audio_data) == 2:
-                sample_rate, audio_array = audio_data
-            else:
-                audio_array = audio_data
-                sample_rate = SAMPLE_RATE
             # Convert to int16 format
             if audio_array.dtype != np.int16:
-                if audio_array.dtype == np.float32 or audio_array.dtype == np.float64:
-                    audio_array = (audio_array * 32767).astype(np.int16)
-                else:
-                    audio_array = audio_array.astype(np.int16)
-            # Store in buffer for later processing
-            self.audio_buffer.append(audio_array)
-            # Process if we have enough audio data
-            if len(self.audio_buffer) > 10:  # Process every ~0.5 seconds of audio
-                combined_audio = np.concatenate(self.audio_buffer)
-                # Simulate transcription for demonstration
-                if len(combined_audio) > SAMPLE_RATE:  # At least 1 second of audio
-                    # In a real implementation, this would be transcribed text
-                    demo_text = f"Sample speech segment {len(self.full_sentences) + 1}"
-                    self.process_final_text(demo_text)
-                self.audio_buffer = []  # Clear buffer
         except Exception as e:
-            print(f"Error processing audio: {e}")
 # Global instance
@@ -670,6 +613,61 @@ def get_status():
     return diarization_system.get_status_info()
 # Create Gradio interface
 def create_interface():
     with gr.Blocks(title="Real-time Speaker Diarization", theme=gr.themes.Monochrome()) as interface:
@@ -678,13 +676,31 @@ def create_interface():
         with gr.Row():
             with gr.Column(scale=2):
-                # Audio input component
-                audio_input = gr.Audio(
-                    label="🎙️ Audio Input",
-                    sources=["microphone"],
-                    type="numpy",
-                    streaming=True
-                )
                 # Main conversation display
                 conversation_output = gr.HTML(
@@ -735,9 +751,11 @@ def create_interface():
                 gr.Markdown("""
                 1. Click **Initialize System** to load models
                 2. Click **Start Recording** to begin processing
-                3. Use the microphone input above to record audio
-                4. Watch real-time transcription with speaker labels
-                5. Adjust settings as needed
                 """)
                 # Speaker color legend
@@ -747,13 +765,20 @@ def create_interface():
                     color_info.append(f'<span style="color:{color};">■</span> Speaker {i+1} ({name})')
                 gr.HTML("<br>".join(color_info[:DEFAULT_MAX_SPEAKERS]))
-        # Audio processing function
-        def process_audio_stream(audio_data):
-            if audio_data is not None and diarization_system.is_running:
-                diarization_system.process_audio(audio_data)
-                return diarization_system.get_formatted_conversation()
-            return None
         # Auto-refresh conversation and status
         def refresh_display():
@@ -822,13 +847,6 @@ def create_interface():
             outputs=[status_output]
         )
-        # Connect audio input to processing function
-        audio_input.stream(
-            process_audio_stream,
-            inputs=[audio_input],
-            outputs=[conversation_output]
-        )
         # Auto-refresh every 2 seconds when recording
         refresh_timer = gr.Timer(2.0)
         refresh_timer.tick(
@@ -848,6 +866,16 @@ gradio_interface = create_interface()
 # 3) Mount Gradio onto FastAPI at root
 app = gr.mount_gradio_app(app, gradio_interface, path="/")
-# 4) Local dev via uvicorn; HF Spaces will auto-detect 'app' and ignore this
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=7860)

 from scipy.spatial.distance import cosine
 from RealtimeSTT import AudioToTextRecorder
 from fastapi import FastAPI
+from fastrtc import Stream, AsyncStreamHandler, ReplyOnPause, get_cloudflare_turn_credentials_async, get_cloudflare_turn_credentials
 import json
 import io
 import wave
 import asyncio
 import uvicorn
 # Simplified configuration parameters
 SILENCE_THRESHS = [0, 0.4]
         self.cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "speechbrain")
         os.makedirs(self.cache_dir, exist_ok=True)
+    def _download_model(self):
+        """Download the pre-trained SpeechBrain ECAPA-TDNN checkpoint if absent.
+
+        The checkpoint is stored in ``self.cache_dir`` as
+        ``embedding_model.ckpt``.  The transfer is written to a temporary
+        ``.part`` file and only renamed into place once it has completed, so
+        an interrupted download never leaves a truncated file that a later
+        call would mistake for a valid checkpoint.
+
+        Returns:
+            str: Path of the checkpoint file inside the cache directory.
+
+        Raises:
+            OSError: If the download or the final rename fails
+                (``urllib.error.URLError`` is a subclass of ``OSError``).
+        """
+        # Imported locally so this method does not rely on a module-level import.
+        import urllib.request
+
+        model_url = "https://huggingface.co/speechbrain/spkrec-ecapa-voxceleb/resolve/main/embedding_model.ckpt"
+        model_path = os.path.join(self.cache_dir, "embedding_model.ckpt")
+        if not os.path.exists(model_path):
+            print(f"Downloading ECAPA-TDNN model to {model_path}...")
+            partial_path = model_path + ".part"
+            try:
+                urllib.request.urlretrieve(model_url, partial_path)
+                # Publish the file only after the transfer finished.
+                os.replace(partial_path, model_path)
+            finally:
+                # Remove leftovers from a failed or aborted transfer.
+                if os.path.exists(partial_path):
+                    os.remove(partial_path)
+        return model_path
     def load_model(self):
+        """Load the ECAPA-TDNN model"""
         try:
+            from speechbrain.pretrained import EncoderClassifier
+            model_path = self._download_model()
             self.model = EncoderClassifier.from_hparams(
                 source="speechbrain/spkrec-ecapa-voxceleb",
             )
             self.model_loaded = True
             return True
         except Exception as e:
             print(f"Error loading ECAPA-TDNN model: {e}")
+            return False
     def embed_utterance(self, audio, sr=16000):
         """Extract speaker embedding from audio"""
             raise ValueError("Model not loaded. Call load_model() first.")
         try:
+            if isinstance(audio, np.ndarray):
+                waveform = torch.tensor(audio, dtype=torch.float32).unsqueeze(0)
             else:
+                waveform = audio.unsqueeze(0)
+            if sr != 16000:
+                waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=16000)
+            with torch.no_grad():
+                embedding = self.model.encode_batch(waveform)
+            return embedding.squeeze().cpu().numpy()
         except Exception as e:
+            print(f"Error extracting embedding: {e}")
+            return np.zeros(self.embedding_dim)
 class AudioProcessor:
         self.change_threshold = DEFAULT_CHANGE_THRESHOLD
         self.max_speakers = DEFAULT_MAX_SPEAKERS
         self.current_conversation = ""
     def initialize_models(self):
         """Initialize the speaker encoder model"""
                     change_threshold=self.change_threshold,
                     max_speakers=self.max_speakers
                 )
+                print("ECAPA-TDNN model loaded successfully!")
                 return True
             else:
+                print("Failed to load ECAPA-TDNN model")
                 return False
         except Exception as e:
             print(f"Model initialization error: {e}")
             self.last_realtime_text = text
             if prob_sentence_end and FAST_SENTENCE_END:
+                self.recorder.stop()
             elif prob_sentence_end:
+                self.recorder.post_speech_silence_duration = SILENCE_THRESHS[0]
             else:
+                self.recorder.post_speech_silence_duration = SILENCE_THRESHS[1]
     def process_final_text(self, text):
         """Process final transcribed text with speaker embedding"""
         text = text.strip()
         if text:
             try:
+                bytes_data = self.recorder.last_transcription_bytes
+                self.sentence_queue.put((text, bytes_data))
                 self.pending_sentences.append(text)
             except Exception as e:
                 print(f"Error processing final text: {e}")
             return "Please initialize models first!"
         try:
+            # Setup recorder configuration for WebRTC input
+            recorder_config = {
+                'spinner': False,
+                'use_microphone': False,  # We'll feed audio manually
+                'model': FINAL_TRANSCRIPTION_MODEL,
+                'language': TRANSCRIPTION_LANGUAGE,
+                'silero_sensitivity': SILERO_SENSITIVITY,
+                'webrtc_sensitivity': WEBRTC_SENSITIVITY,
+                'post_speech_silence_duration': SILENCE_THRESHS[1],
+                'min_length_of_recording': MIN_LENGTH_OF_RECORDING,
+                'pre_recording_buffer_duration': PRE_RECORDING_BUFFER_DURATION,
+                'min_gap_between_recordings': 0,
+                'enable_realtime_transcription': True,
+                'realtime_processing_pause': 0,
+                'realtime_model_type': REALTIME_TRANSCRIPTION_MODEL,
+                'on_realtime_transcription_update': self.live_text_detected,
+                'beam_size': FINAL_BEAM_SIZE,
+                'beam_size_realtime': REALTIME_BEAM_SIZE,
+                'buffer_size': BUFFER_SIZE,
+                'sample_rate': SAMPLE_RATE,
+            }
+            self.recorder = AudioToTextRecorder(**recorder_config)
             # Start sentence processing thread
             self.is_running = True
             self.sentence_thread = threading.Thread(target=self.process_sentence_queue, daemon=True)
             self.sentence_thread.start()
+            # Start transcription thread
+            self.transcription_thread = threading.Thread(target=self.run_transcription, daemon=True)
+            self.transcription_thread.start()
+            return "Recording started successfully! FastRTC audio input ready."
         except Exception as e:
             return f"Error starting recording: {e}"
     def run_transcription(self):
         """Run the transcription loop"""
         try:
+            while self.is_running:
                 self.recorder.text(self.process_final_text)
         except Exception as e:
             print(f"Transcription error: {e}")
         """Stop the recording process"""
         self.is_running = False
         if self.recorder:
+            self.recorder.stop()
         return "Recording stopped!"
     def clear_conversation(self):
         self.displayed_text = ""
         self.last_realtime_text = ""
         self.current_conversation = "Conversation cleared!"
         if self.speaker_detector:
             self.speaker_detector = SpeakerChangeDetector(
             return f"Error getting status: {e}"
     def process_audio(self, audio_data):
+        """Convert one FastRTC audio frame to 16-bit PCM bytes and feed the recorder.
+
+        Args:
+            audio_data: ``(sample_rate, numpy_array)`` pair in the FastRTC format.
+
+        Returns:
+            None.  Frames are dropped while the system is not running or no
+            recorder exists; conversion or feed errors are printed, not raised.
+        """
+        if not self.is_running or not self.recorder:
             return
         try:
+            # Extract audio data from FastRTC format (sample_rate, numpy_array)
+            # NOTE(review): sample_rate is not forwarded to the recorder; presumably
+            # the stream must already match the recorder's configured rate - confirm.
+            sample_rate, audio_array = audio_data
             # Convert to int16 format
             if audio_array.dtype != np.int16:
+                if np.issubdtype(audio_array.dtype, np.floating):
+                    # Clip first: float samples outside [-1, 1] would overflow
+                    # the int16 range after scaling.
+                    audio_array = (np.clip(audio_array, -1.0, 1.0) * 32767).astype(np.int16)
+                else:
+                    # Integer input is taken as PCM sample values already:
+                    # cast only, do not multiply by 32767.
+                    audio_array = audio_array.astype(np.int16)
+            # Convert to bytes and feed to recorder
+            audio_bytes = audio_array.tobytes()
+            self.recorder.feed_audio(audio_bytes)
         except Exception as e:
+            print(f"Error processing FastRTC audio: {e}")
+# FastRTC Audio Handler
+class DiarizationHandler(AsyncStreamHandler):
+    """FastRTC stream handler that forwards received audio to the diarization system."""
+
+    def __init__(self, diarization_system):
+        """Keep a reference to the diarization system that will consume the audio."""
+        super().__init__()
+        self.diarization_system = diarization_system
+
+    def copy(self):
+        """Return a fresh handler for a new stream connection, bound to the same system."""
+        return DiarizationHandler(self.diarization_system)
+
+    async def emit(self):
+        """Emit nothing; this handler only consumes audio."""
+        return None
+
+    async def receive(self, data):
+        """Pass a received audio frame on, but only while the system is running."""
+        system = self.diarization_system
+        if not system.is_running:
+            return
+        system.process_audio(data)
 # Global instance
     return diarization_system.get_status_info()
+# Get Cloudflare TURN credentials for FastRTC
+async def get_cloudflare_credentials():
+    """Return the client-side WebRTC ICE configuration.
+
+    The Hugging Face token is read from the ``HF_TOKEN`` environment
+    variable, falling back to ``HfApi().token``.  With a token, Cloudflare
+    TURN credentials are requested; if that request fails, or no token is
+    available, a STUN-only configuration is returned instead.
+
+    Returns:
+        The value produced by ``get_cloudflare_turn_credentials_async``, or a
+        dict with a single Google STUN entry under ``"iceServers"``.
+    """
+    # Check if HF_TOKEN is set in environment
+    hf_token = os.environ.get("HF_TOKEN")
+    # If not set, use a default Hugging Face token if available
+    if not hf_token:
+        # Log a warning that user should set their own token
+        print("Warning: HF_TOKEN environment variable not set. Please set your own Hugging Face token.")
+        try:
+            # Imported inside the try so a missing huggingface_hub package
+            # degrades to the STUN fallback instead of raising ImportError.
+            from huggingface_hub import HfApi
+            api = HfApi()
+            hf_token = api.token
+            if not hf_token:
+                print("Error: No Hugging Face token available. TURN relay may not work properly.")
+        except Exception:
+            # Best-effort lookup; unlike a bare except this still lets
+            # KeyboardInterrupt and SystemExit propagate.
+            print("Error: Failed to get Hugging Face token. TURN relay may not work properly.")
+    # Get Cloudflare TURN credentials using the Hugging Face token
+    if hf_token:
+        try:
+            return await get_cloudflare_turn_credentials_async(hf_token=hf_token)
+        except Exception as e:
+            print(f"Error getting Cloudflare TURN credentials: {e}")
+    # Fallback to a default configuration that may not work
+    return {
+        "iceServers": [
+            {
+                "urls": "stun:stun.l.google.com:19302"
+            }
+        ]
+    }
+# Setup FastRTC stream handler with TURN server configuration
+def setup_fastrtc_handler():
+    """Build the receive-only FastRTC audio stream with TURN configuration.
+
+    Returns:
+        The ``Stream`` wired to a ``DiarizationHandler`` for the global
+        diarization system.
+    """
+    audio_handler = DiarizationHandler(diarization_system)
+    # Server-side credentials are fetched once, with a longer TTL.
+    turn_credentials = get_cloudflare_turn_credentials(ttl=360000)
+    stream_options = {
+        "handler": audio_handler,
+        "modality": "audio",
+        "mode": "receive",
+        # Async function: called for client-side credentials.
+        "rtc_configuration": get_cloudflare_credentials,
+        "server_rtc_configuration": turn_credentials,
+    }
+    return Stream(**stream_options)
 # Create Gradio interface
 def create_interface():
     with gr.Blocks(title="Real-time Speaker Diarization", theme=gr.themes.Monochrome()) as interface:
         with gr.Row():
             with gr.Column(scale=2):
+                # FastRTC Audio Component
+                fastrtc_html = gr.HTML("""
+                <div class="fastrtc-container" style="margin-bottom: 20px;">
+                    <h3>🎙️ FastRTC Audio Input</h3>
+                    <p>Click the button below to start the audio stream:</p>
+                    <button id="start-fastrtc" style="background: #3498db; color: white; padding: 10px 20px; border: none; border-radius: 5px; cursor: pointer;">
+                        Start FastRTC Audio
+                    </button>
+                    <div id="fastrtc-status" style="margin-top: 10px; font-style: italic;">Not connected</div>
+                    <script>
+                        document.getElementById('start-fastrtc').addEventListener('click', function() {
+                            document.getElementById('fastrtc-status').textContent = 'Connecting...';
+                            // FastRTC will initialize the connection
+                            fetch('/start-rtc', { method: 'POST' })
+                                .then(response => response.text())
+                                .then(data => {
+                                    document.getElementById('fastrtc-status').textContent = 'Connected! Speak now...';
+                                })
+                                .catch(error => {
+                                    document.getElementById('fastrtc-status').textContent = 'Connection error: ' + error;
+                                });
+                        });
+                    </script>
+                </div>
+                """)
                 # Main conversation display
                 conversation_output = gr.HTML(
                 gr.Markdown("""
                 1. Click **Initialize System** to load models
                 2. Click **Start Recording** to begin processing
+                3. Click **Start FastRTC Audio** to connect your microphone
+                4. Allow microphone access when prompted
+                5. Speak into your microphone
+                6. Watch real-time transcription with speaker labels
+                7. Adjust settings as needed
                 """)
                 # Speaker color legend
                     color_info.append(f'<span style="color:{color};">■</span> Speaker {i+1} ({name})')
                 gr.HTML("<br>".join(color_info[:DEFAULT_MAX_SPEAKERS]))
+                # FastRTC Integration Notice
+                gr.Markdown("""
+                ## ℹ️ About FastRTC
+                This app uses FastRTC for low-latency audio streaming.
+                For optimal performance, use a modern browser and allow microphone access when prompted.
+                """)
+                # Hugging Face Token Information
+                gr.Markdown("""
+                ## 🔑 Hugging Face Token
+                This app uses Cloudflare TURN server via Hugging Face integration.
+                If audio connection fails, set your HF_TOKEN environment variable in the Space settings.
+                """)
         # Auto-refresh conversation and status
         def refresh_display():
             outputs=[status_output]
         )
         # Auto-refresh every 2 seconds when recording
         refresh_timer = gr.Timer(2.0)
         refresh_timer.tick(
 # 3) Mount Gradio onto FastAPI at root
 app = gr.mount_gradio_app(app, gradio_interface, path="/")
+# 4) Initialize and mount FastRTC stream on the same app
+rtc_stream = setup_fastrtc_handler()
+rtc_stream.mount(app)
+# 5) Expose an endpoint to trigger the client-side RTC handshake
+@app.post("/start-rtc")
+async def start_rtc():
+    """Handle POST /start-rtc by awaiting the stream's ``start_client()`` call.
+
+    Returns:
+        dict: ``{"status": "success"}`` once the awaited call has completed.
+    """
+    # NOTE(review): start_client() is assumed to exist on the FastRTC Stream
+    # object - confirm against the installed fastrtc version.
+    await rtc_stream.start_client()
+    return {"status": "success"}
+# 6) Local dev via uvicorn; HF Spaces will auto-detect 'app' and ignore this
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=7860)