Real_Time_diarization

Sleeping

App Files Community

Saiyaswanth007 committed on May 23

Commit

88f78ff

1 Parent(s): 7609dee

Updated code

Browse files

Files changed (2) hide show

app.py +130 -286
realtime_diarize.py +0 -581

app.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import gradio as gr
 import numpy as np
-import soundcard as sc
 import queue
 import torch
 import time
@@ -9,8 +8,9 @@ import os
 import urllib.request
 import torchaudio
 from scipy.spatial.distance import cosine
-from RealtimeSTT import AudioToTextRecorder
 import json
 # Simplified configuration parameters
 SILENCE_THRESHS = [0, 0.4]
@@ -33,7 +33,6 @@ ABSOLUTE_MAX_SPEAKERS = 10
 # Global variables
 FAST_SENTENCE_END = True
-USE_MICROPHONE = False
 SAMPLE_RATE = 16000
 BUFFER_SIZE = 512
 CHANNELS = 1
@@ -58,6 +57,9 @@ SPEAKER_COLOR_NAMES = [
 ]
 class SpeechBrainEncoder:
     """ECAPA-TDNN encoder from SpeechBrain for speaker embeddings"""
     def __init__(self, device="cpu"):
@@ -68,24 +70,11 @@ class SpeechBrainEncoder:
         self.cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "speechbrain")
         os.makedirs(self.cache_dir, exist_ok=True)
-    def _download_model(self):
-        """Download pre-trained SpeechBrain ECAPA-TDNN model if not present"""
-        model_url = "https://huggingface.co/speechbrain/spkrec-ecapa-voxceleb/resolve/main/embedding_model.ckpt"
-        model_path = os.path.join(self.cache_dir, "embedding_model.ckpt")
-        if not os.path.exists(model_path):
-            print(f"Downloading ECAPA-TDNN model to {model_path}...")
-            urllib.request.urlretrieve(model_url, model_path)
-        return model_path
     def load_model(self):
         """Load the ECAPA-TDNN model"""
         try:
             from speechbrain.pretrained import EncoderClassifier
-            model_path = self._download_model()
             self.model = EncoderClassifier.from_hparams(
                 source="speechbrain/spkrec-ecapa-voxceleb",
                 savedir=self.cache_dir,
@@ -93,9 +82,10 @@ class SpeechBrainEncoder:
             )
             self.model_loaded = True
             return True
         except Exception as e:
-            print(f"Error loading ECAPA-TDNN model: {e}")
             return False
     def embed_utterance(self, audio, sr=16000):
@@ -126,16 +116,21 @@ class AudioProcessor:
     def __init__(self, encoder):
         self.encoder = encoder
-    def extract_embedding(self, audio_int16):
         try:
-            float_audio = audio_int16.astype(np.float32) / 32768.0
             if np.abs(float_audio).max() > 1.0:
                 float_audio = float_audio / np.abs(float_audio).max()
-            embedding = self.encoder.embed_utterance(float_audio)
             return embedding
         except Exception as e:
             print(f"Embedding extraction error: {e}")
             return np.zeros(self.encoder.embedding_dim)
@@ -271,20 +266,14 @@ class SpeakerChangeDetector:
         }
-class RealtimeSpeakerDiarization:
     def __init__(self):
         self.encoder = None
         self.audio_processor = None
         self.speaker_detector = None
-        self.recorder = None
-        self.recording_thread = None
-        self.sentence_queue = queue.Queue()
         self.full_sentences = []
         self.sentence_speakers = []
-        self.pending_sentences = []
-        self.displayed_text = ""
-        self.last_realtime_text = ""
-        self.is_running = False
         self.change_threshold = DEFAULT_CHANGE_THRESHOLD
         self.max_speakers = DEFAULT_MAX_SPEAKERS
@@ -294,6 +283,7 @@ class RealtimeSpeakerDiarization:
             device_str = "cuda" if torch.cuda.is_available() else "cpu"
             print(f"Using device: {device_str}")
             self.encoder = SpeechBrainEncoder(device=device_str)
             success = self.encoder.load_model()
@@ -304,170 +294,62 @@ class RealtimeSpeakerDiarization:
                     change_threshold=self.change_threshold,
                     max_speakers=self.max_speakers
                 )
-                print("ECAPA-TDNN model loaded successfully!")
                 return True
             else:
-                print("Failed to load ECAPA-TDNN model")
                 return False
         except Exception as e:
             print(f"Model initialization error: {e}")
             return False
-    def live_text_detected(self, text):
-        """Callback for real-time transcription updates"""
-        text = text.strip()
-        if text:
-            sentence_delimiters = '.?!。'
-            prob_sentence_end = (
-                len(self.last_realtime_text) > 0
-                and text[-1] in sentence_delimiters
-                and self.last_realtime_text[-1] in sentence_delimiters
-            )
-            self.last_realtime_text = text
-            if prob_sentence_end and FAST_SENTENCE_END:
-                self.recorder.stop()
-            elif prob_sentence_end:
-                self.recorder.post_speech_silence_duration = SILENCE_THRESHS[0]
-            else:
-                self.recorder.post_speech_silence_duration = SILENCE_THRESHS[1]
-    def process_final_text(self, text):
-        """Process final transcribed text with speaker embedding"""
-        text = text.strip()
-        if text:
-            try:
-                bytes_data = self.recorder.last_transcription_bytes
-                self.sentence_queue.put((text, bytes_data))
-                self.pending_sentences.append(text)
-            except Exception as e:
-                print(f"Error processing final text: {e}")
-    def process_sentence_queue(self):
-        """Process sentences in the queue for speaker detection"""
-        while self.is_running:
-            try:
-                text, bytes_data = self.sentence_queue.get(timeout=1)
-                # Convert audio data to int16
-                audio_int16 = np.int16(bytes_data * 32767)
-                # Extract speaker embedding
-                speaker_embedding = self.audio_processor.extract_embedding(audio_int16)
-                # Store sentence and embedding
-                self.full_sentences.append((text, speaker_embedding))
-                # Fill in missing speaker assignments
-                while len(self.sentence_speakers) < len(self.full_sentences) - 1:
-                    self.sentence_speakers.append(0)
-                # Detect speaker changes
-                speaker_id, similarity = self.speaker_detector.add_embedding(speaker_embedding)
-                self.sentence_speakers.append(speaker_id)
-                # Remove from pending
-                if text in self.pending_sentences:
-                    self.pending_sentences.remove(text)
-            except queue.Empty:
-                continue
-            except Exception as e:
-                print(f"Error processing sentence: {e}")
-    def start_recording(self):
-        """Start the recording and transcription process"""
-        if self.encoder is None:
-            return "Please initialize models first!"
         try:
-            # Setup recorder configuration
-            recorder_config = {
-                'spinner': False,
-                'use_microphone': USE_MICROPHONE,
-                'model': FINAL_TRANSCRIPTION_MODEL,
-                'language': TRANSCRIPTION_LANGUAGE,
-                'silero_sensitivity': SILERO_SENSITIVITY,
-                'webrtc_sensitivity': WEBRTC_SENSITIVITY,
-                'post_speech_silence_duration': SILENCE_THRESHS[1],
-                'min_length_of_recording': MIN_LENGTH_OF_RECORDING,
-                'pre_recording_buffer_duration': PRE_RECORDING_BUFFER_DURATION,
-                'min_gap_between_recordings': 0,
-                'enable_realtime_transcription': True,
-                'realtime_processing_pause': 0,
-                'realtime_model_type': REALTIME_TRANSCRIPTION_MODEL,
-                'on_realtime_transcription_update': self.live_text_detected,
-                'beam_size': FINAL_BEAM_SIZE,
-                'beam_size_realtime': REALTIME_BEAM_SIZE,
-                'buffer_size': BUFFER_SIZE,
-                'sample_rate': SAMPLE_RATE,
-            }
-            self.recorder = AudioToTextRecorder(**recorder_config)
-            # Start sentence processing thread
-            self.is_running = True
-            self.sentence_thread = threading.Thread(target=self.process_sentence_queue, daemon=True)
-            self.sentence_thread.start()
-            # Start audio capture thread
-            self.audio_thread = threading.Thread(target=self.capture_audio, daemon=True)
-            self.audio_thread.start()
-            # Start transcription thread
-            self.transcription_thread = threading.Thread(target=self.run_transcription, daemon=True)
-            self.transcription_thread.start()
-            return "Recording started successfully!"
-        except Exception as e:
-            return f"Error starting recording: {e}"
-    def capture_audio(self):
-        """Capture audio from default speaker/microphone"""
-        try:
-            device_id = str(sc.default_speaker().name if not USE_MICROPHONE else sc.default_microphone().name)
-            include_loopback = not USE_MICROPHONE
-            with sc.get_microphone(id=device_id, include_loopback=include_loopback).recorder(
-                samplerate=SAMPLE_RATE, blocksize=BUFFER_SIZE
-            ) as mic:
-                while self.is_running:
-                    audio_data = mic.record(numframes=BUFFER_SIZE)
-                    if audio_data.shape[1] > 1 and CHANNELS == 1:
-                        audio_data = audio_data[:, 0]
-                    audio_int16 = (audio_data.flatten() * 32767).astype(np.int16)
-                    audio_bytes = audio_int16.tobytes()
-                    self.recorder.feed_audio(audio_bytes)
-        except Exception as e:
-            print(f"Audio capture error: {e}")
-    def run_transcription(self):
-        """Run the transcription loop"""
-        try:
-            while self.is_running:
-                self.recorder.text(self.process_final_text)
         except Exception as e:
-            print(f"Transcription error: {e}")
-    def stop_recording(self):
-        """Stop the recording process"""
-        self.is_running = False
-        if self.recorder:
-            self.recorder.stop()
-        return "Recording stopped!"
     def clear_conversation(self):
         """Clear all conversation data"""
         self.full_sentences = []
         self.sentence_speakers = []
-        self.pending_sentences = []
-        self.displayed_text = ""
-        self.last_realtime_text = ""
         if self.speaker_detector:
             self.speaker_detector = SpeakerChangeDetector(
@@ -476,7 +358,7 @@ class RealtimeSpeakerDiarization:
                 max_speakers=self.max_speakers
             )
-        return "Conversation cleared!"
     def update_settings(self, threshold, max_speakers):
         """Update speaker detection settings"""
@@ -487,18 +369,22 @@ class RealtimeSpeakerDiarization:
             self.speaker_detector.set_change_threshold(threshold)
             self.speaker_detector.set_max_speakers(max_speakers)
-        return f"Settings updated: Threshold={threshold:.2f}, Max Speakers={max_speakers}"
     def get_formatted_conversation(self):
         """Get the formatted conversation with speaker colors"""
         try:
             sentences_with_style = []
-            # Process completed sentences
             for i, sentence in enumerate(self.full_sentences):
                 sentence_text, _ = sentence
                 if i >= len(self.sentence_speakers):
                     color = "#FFFFFF"
                 else:
                     speaker_id = self.sentence_speakers[i]
                     color = self.speaker_detector.get_color_for_speaker(speaker_id)
@@ -507,15 +393,7 @@ class RealtimeSpeakerDiarization:
                 sentences_with_style.append(
                     f'<span style="color:{color};"><b>{speaker_name}:</b> {sentence_text}</span>')
-            # Add pending sentences
-            for pending_sentence in self.pending_sentences:
-                sentences_with_style.append(
-                    f'<span style="color:#60FFFF;"><b>Processing:</b> {pending_sentence}</span>')
-            if sentences_with_style:
-                return "<br><br>".join(sentences_with_style)
-            else:
-                return "Waiting for speech input..."
         except Exception as e:
             return f"Error formatting conversation: {e}"
@@ -533,7 +411,7 @@ class RealtimeSpeakerDiarization:
                 f"**Active Speakers:** {status['active_speakers']} of {status['max_speakers']}",
                 f"**Last Similarity:** {status['last_similarity']:.3f}",
                 f"**Change Threshold:** {status['threshold']:.2f}",
-                f"**Total Sentences:** {len(self.full_sentences)}",
                 "",
                 "**Speaker Segment Counts:**"
             ]
@@ -549,26 +427,21 @@ class RealtimeSpeakerDiarization:
 # Global instance
-diarization_system = RealtimeSpeakerDiarization()
 def initialize_system():
     """Initialize the diarization system"""
     success = diarization_system.initialize_models()
     if success:
-        return "✅ System initialized successfully! Models loaded."
     else:
-        return "❌ Failed to initialize system. Please check the logs."
-def start_recording():
-    """Start recording and transcription"""
-    return diarization_system.start_recording()
-def stop_recording():
-    """Stop recording and transcription"""
-    return diarization_system.stop_recording()
 def clear_conversation():
@@ -581,44 +454,52 @@ def update_settings(threshold, max_speakers):
     return diarization_system.update_settings(threshold, max_speakers)
-def get_conversation():
-    """Get the current conversation"""
-    return diarization_system.get_formatted_conversation()
-def get_status():
-    """Get system status"""
-    return diarization_system.get_status_info()
 # Create Gradio interface
 def create_interface():
-    with gr.Blocks(title="Real-time Speaker Diarization", theme=gr.themes.Dark()) as app:
-        gr.Markdown("# 🎤 Real-time Speech Recognition with Speaker Diarization")
-        gr.Markdown("This app performs real-time speech recognition with automatic speaker identification and color-coding.")
         with gr.Row():
             with gr.Column(scale=2):
-                # Main conversation display
-                conversation_output = gr.HTML(
-                    value="<i>Click 'Initialize System' to start...</i>",
-                    label="Live Conversation"
-                )
-                # Control buttons
                 with gr.Row():
-                    init_btn = gr.Button("🔧 Initialize System", variant="secondary")
-                    start_btn = gr.Button("🎙️ Start Recording", variant="primary", interactive=False)
-                    stop_btn = gr.Button("⏹️ Stop Recording", variant="stop", interactive=False)
-                    clear_btn = gr.Button("🗑️ Clear Conversation", interactive=False)
-                # Status display
                 status_output = gr.Textbox(
-                    label="System Status",
-                    value="System not initialized",
-                    lines=8,
                     interactive=False
                 )
             with gr.Column(scale=1):
                 # Settings panel
@@ -630,7 +511,7 @@ def create_interface():
                     step=0.05,
                     value=DEFAULT_CHANGE_THRESHOLD,
                     label="Speaker Change Sensitivity",
-                    info="Lower values = more sensitive to speaker changes"
                 )
                 max_speakers_slider = gr.Slider(
@@ -641,88 +522,51 @@ def create_interface():
                     label="Maximum Number of Speakers"
                 )
-                update_settings_btn = gr.Button("Update Settings")
                 # Speaker color legend
                 gr.Markdown("## 🎨 Speaker Colors")
                 color_info = []
-                for i, (color, name) in enumerate(zip(SPEAKER_COLORS, SPEAKER_COLOR_NAMES)):
-                    color_info.append(f'<span style="color:{color};">■</span> Speaker {i+1} ({name})')
-                gr.HTML("<br>".join(color_info[:DEFAULT_MAX_SPEAKERS]))
-        # Auto-refresh conversation and status
-        def refresh_display():
-            return get_conversation(), get_status()
         # Event handlers
-        def on_initialize():
-            result = initialize_system()
-            if "successfully" in result:
-                return (
-                    result,
-                    gr.update(interactive=True),   # start_btn
-                    gr.update(interactive=True),   # clear_btn
-                    get_conversation(),
-                    get_status()
-                )
-            else:
-                return (
-                    result,
-                    gr.update(interactive=False),  # start_btn
-                    gr.update(interactive=False),  # clear_btn
-                    get_conversation(),
-                    get_status()
-                )
-        def on_start():
-            result = start_recording()
-            return (
-                result,
-                gr.update(interactive=False),  # start_btn
-                gr.update(interactive=True),   # stop_btn
-            )
-        def on_stop():
-            result = stop_recording()
-            return (
-                result,
-                gr.update(interactive=True),   # start_btn
-                gr.update(interactive=False),  # stop_btn
-            )
-        # Connect event handlers
         init_btn.click(
-            on_initialize,
-            outputs=[status_output, start_btn, clear_btn, conversation_output, status_output]
         )
-        start_btn.click(
-            on_start,
-            outputs=[status_output, start_btn, stop_btn]
         )
-        stop_btn.click(
-            on_stop,
-            outputs=[status_output, start_btn, stop_btn]
         )
         clear_btn.click(
             clear_conversation,
-            outputs=[status_output]
         )
         update_settings_btn.click(
             update_settings,
             inputs=[threshold_slider, max_speakers_slider],
-            outputs=[status_output]
-        )
-        # Auto-refresh every 2 seconds when recording
-        refresh_timer = gr.Timer(2.0)
-        refresh_timer.tick(
-            refresh_display,
-            outputs=[conversation_output, status_output]
         )
     return app

 import gradio as gr
 import numpy as np
 import queue
 import torch
 import time
 import urllib.request
 import torchaudio
 from scipy.spatial.distance import cosine
 import json
+import io
+import wave
 # Simplified configuration parameters
 SILENCE_THRESHS = [0, 0.4]
 # Global variables
 FAST_SENTENCE_END = True
 SAMPLE_RATE = 16000
 BUFFER_SIZE = 512
 CHANNELS = 1
 ]
 class SpeechBrainEncoder:
     """ECAPA-TDNN encoder from SpeechBrain for speaker embeddings"""
     def __init__(self, device="cpu"):
         self.cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "speechbrain")
         os.makedirs(self.cache_dir, exist_ok=True)
     def load_model(self):
         """Load the ECAPA-TDNN model"""
         try:
             from speechbrain.pretrained import EncoderClassifier
             self.model = EncoderClassifier.from_hparams(
                 source="speechbrain/spkrec-ecapa-voxceleb",
                 savedir=self.cache_dir,
             )
             self.model_loaded = True
+            print("ECAPA-TDNN model loaded successfully!")
             return True
         except Exception as e:
+            print(f"SpeechBrain not available: {e}")
             return False
     def embed_utterance(self, audio, sr=16000):
     def __init__(self, encoder):
         self.encoder = encoder
+    def extract_embedding(self, audio_data, sample_rate=16000):
         try:
+            # Ensure audio is float32 and normalized
+            if audio_data.dtype == np.int16:
+                float_audio = audio_data.astype(np.float32) / 32768.0
+            else:
+                float_audio = audio_data.astype(np.float32)
+            # Normalize if needed
             if np.abs(float_audio).max() > 1.0:
                 float_audio = float_audio / np.abs(float_audio).max()
+            embedding = self.encoder.embed_utterance(float_audio, sample_rate)
             return embedding
         except Exception as e:
             print(f"Embedding extraction error: {e}")
             return np.zeros(self.encoder.embedding_dim)
         }
+class GradioSpeakerDiarization:
     def __init__(self):
         self.encoder = None
         self.audio_processor = None
         self.speaker_detector = None
         self.full_sentences = []
         self.sentence_speakers = []
+        self.is_initialized = False
         self.change_threshold = DEFAULT_CHANGE_THRESHOLD
         self.max_speakers = DEFAULT_MAX_SPEAKERS
             device_str = "cuda" if torch.cuda.is_available() else "cpu"
             print(f"Using device: {device_str}")
+            # Load SpeechBrain encoder
             self.encoder = SpeechBrainEncoder(device=device_str)
             success = self.encoder.load_model()
                     change_threshold=self.change_threshold,
                     max_speakers=self.max_speakers
                 )
+                self.is_initialized = True
                 return True
             else:
                 return False
         except Exception as e:
             print(f"Model initialization error: {e}")
             return False
+    def transcribe_audio(self, audio_input):
+        """Process audio input and perform transcription with speaker diarization"""
+        if not self.is_initialized:
+            return "❌ Please initialize the system first!", self.get_formatted_conversation(), self.get_status_info()
+        if audio_input is None:
+            return "No audio received", self.get_formatted_conversation(), self.get_status_info()
         try:
+            # Handle different audio input formats
+            if isinstance(audio_input, tuple):
+                sample_rate, audio_data = audio_input
+            else:
+                # Assume it's a file path
+                import librosa
+                audio_data, sample_rate = librosa.load(audio_input, sr=16000)
+            # Ensure audio is in the right format
+            if len(audio_data.shape) > 1:
+                audio_data = audio_data.mean(axis=1)  # Convert to mono
+            # Perform simple transcription (placeholder - you'd want to integrate with Whisper or similar)
+            # For now, we'll just do speaker diarization
+            transcription = f"Audio segment {len(self.full_sentences) + 1} (duration: {len(audio_data)/sample_rate:.1f}s)"
+            # Extract speaker embedding
+            speaker_embedding = self.audio_processor.extract_embedding(audio_data, sample_rate)
+            # Store sentence and embedding
+            self.full_sentences.append((transcription, speaker_embedding))
+            # Detect speaker changes
+            speaker_id, similarity = self.speaker_detector.add_embedding(speaker_embedding)
+            self.sentence_speakers.append(speaker_id)
+            status_msg = f"✅ Processed audio segment. Detected as Speaker {speaker_id + 1} (similarity: {similarity:.3f})"
+            return status_msg, self.get_formatted_conversation(), self.get_status_info()
         except Exception as e:
+            error_msg = f"❌ Error processing audio: {str(e)}"
+            return error_msg, self.get_formatted_conversation(), self.get_status_info()
     def clear_conversation(self):
         """Clear all conversation data"""
         self.full_sentences = []
         self.sentence_speakers = []
         if self.speaker_detector:
             self.speaker_detector = SpeakerChangeDetector(
                 max_speakers=self.max_speakers
             )
+        return "Conversation cleared!", self.get_formatted_conversation(), self.get_status_info()
     def update_settings(self, threshold, max_speakers):
         """Update speaker detection settings"""
             self.speaker_detector.set_change_threshold(threshold)
             self.speaker_detector.set_max_speakers(max_speakers)
+        status_msg = f"Settings updated: Threshold={threshold:.2f}, Max Speakers={max_speakers}"
+        return status_msg, self.get_formatted_conversation(), self.get_status_info()
     def get_formatted_conversation(self):
         """Get the formatted conversation with speaker colors"""
         try:
+            if not self.full_sentences:
+                return "No audio processed yet. Upload an audio file or record using the microphone."
             sentences_with_style = []
             for i, sentence in enumerate(self.full_sentences):
                 sentence_text, _ = sentence
                 if i >= len(self.sentence_speakers):
                     color = "#FFFFFF"
+                    speaker_name = "Unknown"
                 else:
                     speaker_id = self.sentence_speakers[i]
                     color = self.speaker_detector.get_color_for_speaker(speaker_id)
                 sentences_with_style.append(
                     f'<span style="color:{color};"><b>{speaker_name}:</b> {sentence_text}</span>')
+            return "<br><br>".join(sentences_with_style)
         except Exception as e:
             return f"Error formatting conversation: {e}"
                 f"**Active Speakers:** {status['active_speakers']} of {status['max_speakers']}",
                 f"**Last Similarity:** {status['last_similarity']:.3f}",
                 f"**Change Threshold:** {status['threshold']:.2f}",
+                f"**Total Segments:** {len(self.full_sentences)}",
                 "",
                 "**Speaker Segment Counts:**"
             ]
 # Global instance
+diarization_system = GradioSpeakerDiarization()
 def initialize_system():
     """Initialize the diarization system"""
     success = diarization_system.initialize_models()
     if success:
+        return "✅ System initialized successfully! Models loaded.", "", ""
     else:
+        return "❌ Failed to initialize system. Please check the logs.", "", ""
+def process_audio(audio):
+    """Process uploaded or recorded audio"""
+    return diarization_system.transcribe_audio(audio)
 def clear_conversation():
     return diarization_system.update_settings(threshold, max_speakers)
 # Create Gradio interface
 def create_interface():
+    with gr.Blocks(title="Speaker Diarization", theme=gr.themes.Soft()) as app:
+        gr.Markdown("# 🎤 Audio Speaker Diarization")
+        gr.Markdown("Upload audio files or record directly to identify different speakers using voice characteristics.")
         with gr.Row():
             with gr.Column(scale=2):
+                # Initialize button
                 with gr.Row():
+                    init_btn = gr.Button("🔧 Initialize System", variant="primary", size="lg")
+                # Audio input options
+                gr.Markdown("### 📁 Audio Input")
+                with gr.Tab("Upload Audio File"):
+                    audio_file = gr.Audio(
+                        label="Upload Audio File",
+                        type="filepath",
+                        sources=["upload"]
+                    )
+                    process_file_btn = gr.Button("Process Audio File", variant="secondary")
+                with gr.Tab("Record Audio"):
+                    audio_mic = gr.Audio(
+                        label="Record Audio",
+                        type="numpy",
+                        sources=["microphone"]
+                    )
+                    process_mic_btn = gr.Button("Process Recording", variant="secondary")
+                # Results display
                 status_output = gr.Textbox(
+                    label="Status",
+                    value="Click 'Initialize System' to start...",
+                    lines=2,
                     interactive=False
                 )
+                conversation_output = gr.HTML(
+                    value="<i>System not initialized...</i>",
+                    label="Speaker Analysis Results"
+                )
+                # Control buttons
+                with gr.Row():
+                    clear_btn = gr.Button("🗑️ Clear Results", variant="stop")
             with gr.Column(scale=1):
                 # Settings panel
                     step=0.05,
                     value=DEFAULT_CHANGE_THRESHOLD,
                     label="Speaker Change Sensitivity",
+                    info="Lower = more sensitive to speaker changes"
                 )
                 max_speakers_slider = gr.Slider(
                     label="Maximum Number of Speakers"
                 )
+                update_settings_btn = gr.Button("Update Settings", variant="secondary")
+                # System status
+                system_status = gr.Textbox(
+                    label="System Status",
+                    value="System not initialized",
+                    lines=12,
+                    interactive=False
+                )
                 # Speaker color legend
                 gr.Markdown("## 🎨 Speaker Colors")
                 color_info = []
+                for i, (color, name) in enumerate(zip(SPEAKER_COLORS[:DEFAULT_MAX_SPEAKERS], SPEAKER_COLOR_NAMES[:DEFAULT_MAX_SPEAKERS])):
+                    color_info.append(f'<span style="color:{color};">●</span> Speaker {i+1} ({name})')
+                gr.HTML("<br>".join(color_info))
         # Event handlers
         init_btn.click(
+            initialize_system,
+            outputs=[status_output, conversation_output, system_status]
         )
+        process_file_btn.click(
+            process_audio,
+            inputs=[audio_file],
+            outputs=[status_output, conversation_output, system_status]
         )
+        process_mic_btn.click(
+            process_audio,
+            inputs=[audio_mic],
+            outputs=[status_output, conversation_output, system_status]
         )
         clear_btn.click(
             clear_conversation,
+            outputs=[status_output, conversation_output, system_status]
         )
         update_settings_btn.click(
             update_settings,
             inputs=[threshold_slider, max_speakers_slider],
+            outputs=[status_output, conversation_output, system_status]
         )
     return app

realtime_diarize.py DELETED Viewed

@@ -1,581 +0,0 @@
-import gradio as gr
-import numpy as np
-import queue
-import torch
-import time
-import threading
-import os
-import urllib.request
-import torchaudio
-from scipy.spatial.distance import cosine
-import json
-import io
-import wave
# Simplified configuration parameters.
# NOTE(review): nothing else in this module reads the transcription / VAD
# settings in this first group; presumably they are kept for an external
# speech-to-text pipeline -- confirm before pruning.
SILENCE_THRESHS = [0, 0.4]
FINAL_TRANSCRIPTION_MODEL = "distil-large-v3"
FINAL_BEAM_SIZE = 5
REALTIME_TRANSCRIPTION_MODEL = "distil-small.en"
REALTIME_BEAM_SIZE = 5
TRANSCRIPTION_LANGUAGE = "en"
SILERO_SENSITIVITY = 0.4
WEBRTC_SENSITIVITY = 3
MIN_LENGTH_OF_RECORDING = 0.7
PRE_RECORDING_BUFFER_DURATION = 0.35

# Speaker change detection parameters.
DEFAULT_CHANGE_THRESHOLD = 0.7  # similarity below this may trigger a speaker change
EMBEDDING_HISTORY_SIZE = 5      # number of most recent embeddings the detector keeps
MIN_SEGMENT_DURATION = 1.0      # minimum time between two detected speaker changes
DEFAULT_MAX_SPEAKERS = 4
ABSOLUTE_MAX_SPEAKERS = 10      # hard upper bound applied to any requested speaker count

# Global variables.
FAST_SENTENCE_END = True
SAMPLE_RATE = 16000
BUFFER_SIZE = 512
CHANNELS = 1

# Speaker colors: one (hex code, display name) pair per speaker slot, in
# speaker-id order. The two public lists below are derived from this table so
# that a colour and its name cannot drift apart.
_SPEAKER_PALETTE = (
    ("#FFFF00", "Yellow"),
    ("#FF0000", "Red"),
    ("#00FF00", "Green"),
    ("#00FFFF", "Cyan"),
    ("#FF00FF", "Magenta"),
    ("#0000FF", "Blue"),
    ("#FF8000", "Orange"),
    ("#00FF80", "Spring Green"),
    ("#8000FF", "Purple"),
    ("#FFFFFF", "White"),
)
SPEAKER_COLORS = [hex_code for hex_code, _ in _SPEAKER_PALETTE]
SPEAKER_COLOR_NAMES = [label for _, label in _SPEAKER_PALETTE]
class SpeechBrainEncoder:
    """ECAPA-TDNN encoder from SpeechBrain for speaker embeddings.

    The model is loaded lazily by :meth:`load_model`; until that succeeds,
    :meth:`embed_utterance` raises ``ValueError``.
    """

    def __init__(self, device="cpu"):
        """Record the target device and make sure the model cache directory exists.

        Args:
            device: Device string forwarded to SpeechBrain via ``run_opts``.
        """
        self.device = device
        self.model = None
        self.embedding_dim = 192
        self.model_loaded = False
        self.cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "speechbrain")
        os.makedirs(self.cache_dir, exist_ok=True)

    def load_model(self):
        """Load the ECAPA-TDNN model.

        Returns:
            True when the model was loaded, False otherwise. Failures are
            printed rather than raised so the UI can report them.
        """
        # SpeechBrain 1.0 moved the pretrained interfaces to
        # ``speechbrain.inference``; fall back to the legacy module path so
        # older installations keep working.
        try:
            try:
                from speechbrain.inference.speaker import EncoderClassifier
            except ImportError:
                from speechbrain.pretrained import EncoderClassifier
        except ImportError as e:
            print(f"SpeechBrain not available: {e}")
            return False

        # Keep "library missing" and "model failed to load" apart: the latter
        # is usually a download/cache/device problem, not an install problem.
        try:
            self.model = EncoderClassifier.from_hparams(
                source="speechbrain/spkrec-ecapa-voxceleb",
                savedir=self.cache_dir,
                run_opts={"device": self.device}
            )
        except Exception as e:
            print(f"Failed to load ECAPA-TDNN model: {e}")
            return False

        self.model_loaded = True
        print("ECAPA-TDNN model loaded successfully!")
        return True

    def embed_utterance(self, audio, sr=16000):
        """Extract a speaker embedding from audio.

        Args:
            audio: A torch tensor, a NumPy array, or any array-like of samples
                (e.g. a list of floats). Non-tensor input is converted to a
                float32 tensor.
            sr: Sample rate of ``audio``; anything other than 16000 is
                resampled to 16000 before encoding.

        Returns:
            The embedding as a NumPy array, or a zero vector of length
            ``embedding_dim`` when extraction fails (the error is printed).

        Raises:
            ValueError: If the model has not been loaded yet.
        """
        if not self.model_loaded:
            raise ValueError("Model not loaded. Call load_model() first.")
        try:
            if torch.is_tensor(audio):
                waveform = audio
            else:
                waveform = torch.tensor(np.asarray(audio), dtype=torch.float32)
            # Add the leading batch dimension expected by encode_batch.
            waveform = waveform.unsqueeze(0)
            if sr != 16000:
                waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=16000)
            with torch.no_grad():
                embedding = self.model.encode_batch(waveform)
            return embedding.squeeze().cpu().numpy()
        except Exception as e:
            print(f"Error extracting embedding: {e}")
            return np.zeros(self.embedding_dim)
class AudioProcessor:
    """Processes audio data to extract speaker embeddings."""

    def __init__(self, encoder):
        """Store the encoder used to turn waveforms into embeddings.

        Args:
            encoder: Object exposing ``embed_utterance(audio, sr)`` and an
                ``embedding_dim`` attribute.
        """
        self.encoder = encoder

    def extract_embedding(self, audio_data, sample_rate=16000):
        """Convert raw samples to float32 and return their speaker embedding.

        Args:
            audio_data: NumPy array or array-like of samples. ``int16`` input
                is scaled by 1/32768; any other dtype is cast to float32.
            sample_rate: Sample rate forwarded to the encoder.

        Returns:
            The encoder's embedding, or a zero vector of length
            ``encoder.embedding_dim`` for empty input or on failure.
        """
        try:
            samples = np.asarray(audio_data)
            if samples.size == 0:
                # Nothing to embed; avoid calling max() on an empty array.
                return np.zeros(self.encoder.embedding_dim)

            # Ensure audio is float32 and normalized
            if samples.dtype == np.int16:
                float_audio = samples.astype(np.float32) / 32768.0
            else:
                float_audio = samples.astype(np.float32)

            # Peak-normalize only when the signal exceeds the [-1, 1] range.
            peak = np.abs(float_audio).max()
            if peak > 1.0:
                float_audio = float_audio / peak

            return self.encoder.embed_utterance(float_audio, sample_rate)
        except Exception as e:
            print(f"Embedding extraction error: {e}")
            return np.zeros(self.encoder.embedding_dim)
class SpeakerChangeDetector:
    """Speaker change detector that supports a configurable number of speakers.

    Each speaker slot keeps up to 30 recent embeddings and their mean. A new
    embedding is compared (cosine similarity) with the current speaker's mean;
    when the similarity drops below ``change_threshold`` and at least
    ``MIN_SEGMENT_DURATION`` has elapsed since the last change, the detector
    switches to the best-matching known speaker or opens a new slot.
    """

    def __init__(self, embedding_dim=192, change_threshold=DEFAULT_CHANGE_THRESHOLD, max_speakers=DEFAULT_MAX_SPEAKERS):
        """Create a detector with speaker 0 active.

        Args:
            embedding_dim: Length of the embedding vectors (stored only).
            change_threshold: Similarity below which a change is considered.
            max_speakers: Requested number of speaker slots; coerced to an
                integer in ``[1, ABSOLUTE_MAX_SPEAKERS]``.
        """
        self.embedding_dim = embedding_dim
        self.change_threshold = change_threshold
        self.max_speakers = self._normalize_speaker_limit(max_speakers)
        self.current_speaker = 0
        self.previous_embeddings = []
        self.last_change_time = time.time()
        self.mean_embeddings = [None] * self.max_speakers
        self.speaker_embeddings = [[] for _ in range(self.max_speakers)]
        self.last_similarity = 0.0
        self.active_speakers = {0}

    @staticmethod
    def _normalize_speaker_limit(max_speakers):
        """Coerce a requested speaker count to an int in [1, ABSOLUTE_MAX_SPEAKERS]."""
        # UI sliders may deliver floats, which would break slicing and range();
        # a count below 1 would leave no slot for the current speaker.
        return max(1, min(int(max_speakers), ABSOLUTE_MAX_SPEAKERS))

    def set_max_speakers(self, max_speakers):
        """Update the maximum number of speakers.

        Shrinking drops the slots at or above the new limit and resets the
        current speaker to 0 if it was dropped; growing appends empty slots.
        """
        new_max = self._normalize_speaker_limit(max_speakers)
        if new_max < self.max_speakers:
            self.active_speakers = {s for s in self.active_speakers if s < new_max}
            if self.current_speaker >= new_max:
                self.current_speaker = 0
            self.mean_embeddings = self.mean_embeddings[:new_max]
            self.speaker_embeddings = self.speaker_embeddings[:new_max]
        elif new_max > self.max_speakers:
            extra = new_max - self.max_speakers
            self.mean_embeddings.extend([None] * extra)
            self.speaker_embeddings.extend([[] for _ in range(extra)])
        self.max_speakers = new_max

    def set_change_threshold(self, threshold):
        """Update the threshold for detecting speaker changes (clamped to [0.1, 0.99])."""
        self.change_threshold = max(0.1, min(threshold, 0.99))

    def add_embedding(self, embedding, timestamp=None):
        """Add a new embedding and check if there's a speaker change.

        Args:
            embedding: Speaker embedding (NumPy array) for the new segment.
            timestamp: Time of the segment; defaults to ``time.time()``. A
                value of 0 is honoured as a real timestamp.

        Returns:
            ``(speaker_id, similarity)``; the very first embedding always
            returns ``(current_speaker, 1.0)``.
        """
        # ``timestamp or time.time()`` would discard a legitimate 0 timestamp.
        current_time = time.time() if timestamp is None else timestamp

        if not self.previous_embeddings:
            if timestamp is not None:
                # Caller-supplied timestamps may not be wall-clock time; anchor
                # the change clock to them instead of the construction time.
                self.last_change_time = current_time
            self.previous_embeddings.append(embedding)
            self.speaker_embeddings[self.current_speaker].append(embedding)
            if self.mean_embeddings[self.current_speaker] is None:
                self.mean_embeddings[self.current_speaker] = embedding.copy()
            return self.current_speaker, 1.0

        # Compare against the current speaker's mean, falling back to the most
        # recent embedding when that speaker has no mean yet.
        reference = self.mean_embeddings[self.current_speaker]
        if reference is None:
            reference = self.previous_embeddings[-1]
        similarity = 1.0 - cosine(embedding, reference)
        self.last_similarity = similarity

        time_since_last_change = current_time - self.last_change_time
        is_speaker_change = False
        if time_since_last_change >= MIN_SEGMENT_DURATION and similarity < self.change_threshold:
            # Look for a known speaker that matches better than the current one.
            best_speaker = self.current_speaker
            best_similarity = similarity
            for speaker_id in range(self.max_speakers):
                if speaker_id == self.current_speaker:
                    continue
                speaker_mean = self.mean_embeddings[speaker_id]
                if speaker_mean is None:
                    continue
                speaker_similarity = 1.0 - cosine(embedding, speaker_mean)
                if speaker_similarity > best_similarity:
                    best_similarity = speaker_similarity
                    best_speaker = speaker_id

            if best_speaker != self.current_speaker:
                is_speaker_change = True
                self.current_speaker = best_speaker
            elif len(self.active_speakers) < self.max_speakers:
                # No better known speaker: open the lowest unused slot.
                for new_id in range(self.max_speakers):
                    if new_id not in self.active_speakers:
                        is_speaker_change = True
                        self.current_speaker = new_id
                        self.active_speakers.add(new_id)
                        break

        if is_speaker_change:
            self.last_change_time = current_time

        self.previous_embeddings.append(embedding)
        if len(self.previous_embeddings) > EMBEDDING_HISTORY_SIZE:
            self.previous_embeddings.pop(0)

        # Record the embedding for whichever speaker is now current and
        # refresh that speaker's mean over its 30 most recent embeddings.
        history = self.speaker_embeddings[self.current_speaker]
        history.append(embedding)
        self.active_speakers.add(self.current_speaker)
        if len(history) > 30:
            history = history[-30:]
            self.speaker_embeddings[self.current_speaker] = history
        if history:
            self.mean_embeddings[self.current_speaker] = np.mean(history, axis=0)
        return self.current_speaker, similarity

    def get_color_for_speaker(self, speaker_id):
        """Return color for speaker ID, or white for an out-of-range ID."""
        if 0 <= speaker_id < len(SPEAKER_COLORS):
            return SPEAKER_COLORS[speaker_id]
        return "#FFFFFF"

    def get_status_info(self):
        """Return status information about the speaker change detector as a dict."""
        speaker_counts = [len(self.speaker_embeddings[i]) for i in range(self.max_speakers)]
        return {
            "current_speaker": self.current_speaker,
            "speaker_counts": speaker_counts,
            "active_speakers": len(self.active_speakers),
            "max_speakers": self.max_speakers,
            "last_similarity": self.last_similarity,
            "threshold": self.change_threshold
        }
class GradioSpeakerDiarization:
    """Application state and handlers behind the Gradio diarization UI.

    Every handler returns the same triple
    ``(status message, formatted conversation HTML, status text)``.
    """

    def __init__(self):
        self.encoder = None
        self.audio_processor = None
        self.speaker_detector = None
        # full_sentences[i] is (text, embedding); sentence_speakers[i] is the
        # speaker id assigned to that sentence.
        self.full_sentences = []
        self.sentence_speakers = []
        self.is_initialized = False
        self.change_threshold = DEFAULT_CHANGE_THRESHOLD
        self.max_speakers = DEFAULT_MAX_SPEAKERS

    def initialize_models(self):
        """Initialize the speaker encoder model.

        Returns:
            True when the encoder loaded and the processing objects were
            created, False otherwise (errors are printed).
        """
        try:
            device_str = "cuda" if torch.cuda.is_available() else "cpu"
            print(f"Using device: {device_str}")
            # Load SpeechBrain encoder
            self.encoder = SpeechBrainEncoder(device=device_str)
            if not self.encoder.load_model():
                return False
            self.audio_processor = AudioProcessor(self.encoder)
            self.speaker_detector = SpeakerChangeDetector(
                embedding_dim=self.encoder.embedding_dim,
                change_threshold=self.change_threshold,
                max_speakers=self.max_speakers
            )
            self.is_initialized = True
            return True
        except Exception as e:
            print(f"Model initialization error: {e}")
            return False

    def transcribe_audio(self, audio_input):
        """Process audio input and perform speaker diarization.

        Args:
            audio_input: Either a ``(sample_rate, samples)`` tuple or a file
                path that is loaded with librosa at 16 kHz.

        Returns:
            ``(status message, conversation HTML, status text)``. Errors are
            reported in the status message instead of being raised.
        """
        if not self.is_initialized:
            return "❌ Please initialize the system first!", self.get_formatted_conversation(), self.get_status_info()
        if audio_input is None:
            return "No audio received", self.get_formatted_conversation(), self.get_status_info()
        try:
            # Handle different audio input formats
            if isinstance(audio_input, tuple):
                sample_rate, audio_data = audio_input
            else:
                # Assume it's a file path
                import librosa
                audio_data, sample_rate = librosa.load(audio_input, sr=16000)

            # Convert to mono. int16 samples are scaled to [-1, 1) first:
            # averaging would otherwise turn them into un-scaled floats that
            # skip the int16 handling further down the pipeline.
            if len(audio_data.shape) > 1:
                if audio_data.dtype == np.int16:
                    audio_data = audio_data.astype(np.float32) / 32768.0
                audio_data = audio_data.mean(axis=1)

            # Perform simple transcription (placeholder - you'd want to integrate with Whisper or similar)
            # For now, we'll just do speaker diarization
            transcription = f"Audio segment {len(self.full_sentences) + 1} (duration: {len(audio_data)/sample_rate:.1f}s)"

            # Extract the embedding and assign a speaker BEFORE recording the
            # segment, so a failure cannot leave the two lists out of step.
            speaker_embedding = self.audio_processor.extract_embedding(audio_data, sample_rate)
            speaker_id, similarity = self.speaker_detector.add_embedding(speaker_embedding)
            self.full_sentences.append((transcription, speaker_embedding))
            self.sentence_speakers.append(speaker_id)

            status_msg = f"✅ Processed audio segment. Detected as Speaker {speaker_id + 1} (similarity: {similarity:.3f})"
            return status_msg, self.get_formatted_conversation(), self.get_status_info()
        except Exception as e:
            error_msg = f"❌ Error processing audio: {str(e)}"
            return error_msg, self.get_formatted_conversation(), self.get_status_info()

    def clear_conversation(self):
        """Clear all conversation data and reset the speaker detector."""
        self.full_sentences = []
        self.sentence_speakers = []
        if self.speaker_detector:
            self.speaker_detector = SpeakerChangeDetector(
                embedding_dim=self.encoder.embedding_dim,
                change_threshold=self.change_threshold,
                max_speakers=self.max_speakers
            )
        return "Conversation cleared!", self.get_formatted_conversation(), self.get_status_info()

    def update_settings(self, threshold, max_speakers):
        """Update speaker detection settings.

        Args:
            threshold: New change threshold.
            max_speakers: New speaker limit; coerced to ``int`` because UI
                sliders may deliver a float.
        """
        max_speakers = int(max_speakers)
        self.change_threshold = threshold
        self.max_speakers = max_speakers
        if self.speaker_detector:
            self.speaker_detector.set_change_threshold(threshold)
            self.speaker_detector.set_max_speakers(max_speakers)
        status_msg = f"Settings updated: Threshold={threshold:.2f}, Max Speakers={max_speakers}"
        return status_msg, self.get_formatted_conversation(), self.get_status_info()

    def get_formatted_conversation(self):
        """Get the formatted conversation with speaker colors as an HTML string."""
        try:
            if not self.full_sentences:
                return "No audio processed yet. Upload an audio file or record using the microphone."
            sentences_with_style = []
            for i, (sentence_text, _) in enumerate(self.full_sentences):
                if i >= len(self.sentence_speakers):
                    # Sentence without a recorded speaker assignment.
                    color = "#FFFFFF"
                    speaker_name = "Unknown"
                else:
                    speaker_id = self.sentence_speakers[i]
                    color = self.speaker_detector.get_color_for_speaker(speaker_id)
                    speaker_name = f"Speaker {speaker_id + 1}"
                sentences_with_style.append(
                    f'<span style="color:{color};"><b>{speaker_name}:</b> {sentence_text}</span>')
            return "<br><br>".join(sentences_with_style)
        except Exception as e:
            return f"Error formatting conversation: {e}"

    def get_status_info(self):
        """Get current status information as a multi-line string."""
        if not self.speaker_detector:
            return "Speaker detector not initialized"
        try:
            status = self.speaker_detector.get_status_info()
            status_lines = [
                f"**Current Speaker:** {status['current_speaker'] + 1}",
                f"**Active Speakers:** {status['active_speakers']} of {status['max_speakers']}",
                f"**Last Similarity:** {status['last_similarity']:.3f}",
                f"**Change Threshold:** {status['threshold']:.2f}",
                f"**Total Segments:** {len(self.full_sentences)}",
                "",
                "**Speaker Segment Counts:**"
            ]
            for i in range(status['max_speakers']):
                color_name = SPEAKER_COLOR_NAMES[i] if i < len(SPEAKER_COLOR_NAMES) else f"Speaker {i+1}"
                status_lines.append(f"Speaker {i+1} ({color_name}): {status['speaker_counts'][i]}")
            return "\n".join(status_lines)
        except Exception as e:
            return f"Error getting status: {e}"
# Global instance shared by the module-level handler functions below
# (initialize_system, process_audio, clear_conversation, update_settings).
diarization_system = GradioSpeakerDiarization()
def initialize_system():
    """Initialize the diarization system.

    Returns:
        ``(status message, conversation text, status text)``. The last two
        reflect the system's actual state, like every other handler, instead
        of blanking the conversation and status panels.
    """
    if diarization_system.initialize_models():
        status_msg = "✅ System initialized successfully! Models loaded."
    else:
        status_msg = "❌ Failed to initialize system. Please check the logs."
    return (
        status_msg,
        diarization_system.get_formatted_conversation(),
        diarization_system.get_status_info(),
    )
def process_audio(audio):
    """Run uploaded or recorded audio through the shared diarization system.

    Returns whatever ``transcribe_audio`` returns for ``audio``.
    """
    result = diarization_system.transcribe_audio(audio)
    return result
def clear_conversation():
    """Reset the shared diarization system's conversation and return its result."""
    result = diarization_system.clear_conversation()
    return result
def update_settings(threshold, max_speakers):
    """Forward new detection settings to the shared diarization system.

    Returns whatever the system's ``update_settings`` returns.
    """
    result = diarization_system.update_settings(threshold, max_speakers)
    return result
# Create Gradio interface
def create_interface():
    """Build the Gradio Blocks app and wire its buttons to the handlers.

    Layout: a wide left column (initialize button, audio inputs, status and
    results, clear button) and a narrow right column (settings, system status,
    speaker colour legend).

    Returns:
        The constructed ``gr.Blocks`` application (not yet launched).
    """
    with gr.Blocks(title="Speaker Diarization", theme=gr.themes.Soft()) as app:
        gr.Markdown("# 🎤 Audio Speaker Diarization")
        gr.Markdown("Upload audio files or record directly to identify different speakers using voice characteristics.")

        with gr.Row():
            with gr.Column(scale=2):
                # Initialize button
                with gr.Row():
                    init_btn = gr.Button("🔧 Initialize System", variant="primary", size="lg")

                # Audio input options
                gr.Markdown("### 📁 Audio Input")
                with gr.Tab("Upload Audio File"):
                    audio_file = gr.Audio(
                        label="Upload Audio File",
                        type="filepath",
                        sources=["upload"]
                    )
                    process_file_btn = gr.Button("Process Audio File", variant="secondary")
                with gr.Tab("Record Audio"):
                    audio_mic = gr.Audio(
                        label="Record Audio",
                        type="numpy",
                        sources=["microphone"]
                    )
                    process_mic_btn = gr.Button("Process Recording", variant="secondary")

                # Results display
                status_output = gr.Textbox(
                    label="Status",
                    value="Click 'Initialize System' to start...",
                    lines=2,
                    interactive=False
                )
                conversation_output = gr.HTML(
                    value="<i>System not initialized...</i>",
                    label="Speaker Analysis Results"
                )

                # Control buttons
                with gr.Row():
                    clear_btn = gr.Button("🗑️ Clear Results", variant="stop")

            with gr.Column(scale=1):
                # Settings panel
                gr.Markdown("## ⚙️ Settings")
                threshold_slider = gr.Slider(
                    minimum=0.1,
                    maximum=0.95,
                    step=0.05,
                    value=DEFAULT_CHANGE_THRESHOLD,
                    label="Speaker Change Sensitivity",
                    info="Lower = more sensitive to speaker changes"
                )
                max_speakers_slider = gr.Slider(
                    minimum=2,
                    maximum=ABSOLUTE_MAX_SPEAKERS,
                    step=1,
                    value=DEFAULT_MAX_SPEAKERS,
                    label="Maximum Number of Speakers"
                )
                update_settings_btn = gr.Button("Update Settings", variant="secondary")

                # System status
                system_status = gr.Textbox(
                    label="System Status",
                    value="System not initialized",
                    lines=12,
                    interactive=False
                )

                # Speaker color legend (one entry per default speaker slot)
                gr.Markdown("## 🎨 Speaker Colors")
                legend_pairs = zip(
                    SPEAKER_COLORS[:DEFAULT_MAX_SPEAKERS],
                    SPEAKER_COLOR_NAMES[:DEFAULT_MAX_SPEAKERS],
                )
                color_info = [
                    f'<span style="color:{color};">●</span> Speaker {number} ({name})'
                    for number, (color, name) in enumerate(legend_pairs, start=1)
                ]
                gr.HTML("<br>".join(color_info))

        # Event handlers: every handler refreshes the same three components.
        shared_outputs = [status_output, conversation_output, system_status]
        init_btn.click(initialize_system, outputs=shared_outputs)
        process_file_btn.click(process_audio, inputs=[audio_file], outputs=shared_outputs)
        process_mic_btn.click(process_audio, inputs=[audio_mic], outputs=shared_outputs)
        clear_btn.click(clear_conversation, outputs=shared_outputs)
        update_settings_btn.click(
            update_settings,
            inputs=[threshold_slider, max_speakers_slider],
            outputs=shared_outputs
        )
    return app
if __name__ == "__main__":
    # Script entry point: build the UI and serve it on all interfaces,
    # port 7860, with a Gradio share link requested.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": True,
    }
    app = create_interface()
    app.launch(**launch_options)