Commit af81629 · 1 parent: 10c2754
Commit message: requirements

Files changed:
- app.py +415 -350
- requirements.txt +2 -2
app.py
CHANGED
@@ -2,452 +2,517 @@ import gradio as gr
 import numpy as np
 import torch
 import torchaudio
 from scipy.spatial.distance import cosine
 import tempfile
-import os
-import warnings
-warnings.filterwarnings("ignore", category=UserWarning)

-# Speaker detection
-CHANGE_THRESHOLD = 0.65
-MAX_SPEAKERS = 4
-MIN_SEGMENT_DURATION = 1.0
-EMBEDDING_HISTORY_SIZE = 3
-SPEAKER_MEMORY_SIZE = 20

 SPEAKER_COLORS = [
     "#FFD700",  # Gold
     "#FF6B6B",  # Red
     "#4ECDC4",  # Teal
     "#45B7D1",  # Blue
-    "#96CEB4",  # Green
-    "#FFEAA7",  # Yellow
-    "#DDA0DD",  # Plum
-    "#98D8C8",  # Mint Green
 ]

     def __init__(self, device="cpu"):
         self.device = device
         self.embedding_dim = 128
-        self.model_loaded = False
-        self._setup_model()

-    def _setup_model(self):
-        """Setup a simple MFCC-based feature extractor"""
-        try:
-            self.mfcc_transform = torchaudio.transforms.MFCC(
-                sample_rate=Config.SAMPLE_RATE,
-                n_mfcc=13,
-                melkwargs={"n_fft": 400, "hop_length": 160, "n_mels": 23}
-            ).to(self.device)
-            self.model_loaded = True
-            print("Simple MFCC-based encoder initialized")
-        except Exception as e:
-            print(f"Error setting up encoder: {e}")
-            self.model_loaded = False

-    def extract_embedding(self, audio):
-        """Extract speaker embedding from audio"""
-        if not self.model_loaded:
-            return np.zeros(self.embedding_dim)
         try:
-            # Ensure audio is float32 and normalized
             if isinstance(audio, np.ndarray):

-                mfcc.std(dim=2).flatten(),
-                mfcc.max(dim=2)[0].flatten(),
-                mfcc.min(dim=2)[0].flatten()
-            ])

-            if embedding.size(0) > self.embedding_dim:
-                embedding = embedding[:self.embedding_dim]
-            elif embedding.size(0) < self.embedding_dim:
-                padding = torch.zeros(self.embedding_dim - embedding.size(0))
-                embedding = torch.cat([embedding, padding])

-            return embedding.cpu().numpy()

         except Exception as e:
             print(f"Error extracting embedding: {e}")
-            return np.zeros(self.embedding_dim)

-    def __init__(self,
-        self.speaker_embeddings = [[] for _ in range(max_speakers)]
-        self.speaker_centroids = [None] * max_speakers
-        self.active_speakers = {0}

-    def reset(self):
-        """Reset speaker detection state"""
         self.current_speaker = 0
         self.speaker_embeddings = [[] for _ in range(self.max_speakers)]
-        self.speaker_centroids = [None] * self.max_speakers
-        self.active_speakers = {0}

-    def detect_speaker(self, embedding):

-            len(self.active_speakers) < self.max_speakers

-        self.current_speaker =

-        # Update speaker model
-        self._update_speaker_model(self.current_speaker, embedding)
         return self.current_speaker, similarity

-    def _update_speaker_model(self, speaker_id, embedding):
-        if len(self.speaker_embeddings[speaker_id]) > Config.SPEAKER_MEMORY_SIZE:
-            self.speaker_embeddings[speaker_id] = \
-                self.speaker_embeddings[speaker_id][-Config.SPEAKER_MEMORY_SIZE:]

-        # Update centroid
-        if self.speaker_embeddings[speaker_id]:
-            self.speaker_centroids[speaker_id] = np.mean(
-                self.speaker_embeddings[speaker_id], axis=0
-            )

     def __init__(self):
-        self.encoder =

         try:
-            )
-            print("Whisper model loaded successfully")
-        except Exception as e:
-            print(f"Error loading Whisper model: {e}")
-            self.transcriber = None

-        if audio_file is None:
-            return "Please upload an audio file.", ""
         try:

-            waveform = waveform.mean(dim=0, keepdim=True)

-            # Convert to numpy
-            audio_data = waveform.squeeze().numpy()

-            # Transcribe entire audio
-            if self.transcriber:
-                transcription_result = self.transcriber(audio_file)
-                full_transcription = transcription_result['text']
-            else:
-                full_transcription = "Transcription service unavailable"

-            # Process audio in chunks for speaker detection
-            chunk_duration = 3.0  # 3 second chunks
-            chunk_samples = int(chunk_duration * Config.SAMPLE_RATE)
-            results = []

-                if len(chunk) < Config.SAMPLE_RATE:  # Skip chunks less than 1 second
-                    continue

-                # Extract speaker embedding
-                embedding = self.encoder.extract_embedding(chunk)
-                speaker_id, similarity = self.detector.detect_speaker(embedding)

-                # Get timestamp
-                start_time = i / Config.SAMPLE_RATE
-                end_time = (i + len(chunk)) / Config.SAMPLE_RATE

-                # Transcribe chunk
-                if self.transcriber and len(chunk) > Config.SAMPLE_RATE:
-                    # Save chunk temporarily for transcription
-                    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
-                        torchaudio.save(tmp_file.name, torch.tensor(chunk).unsqueeze(0), Config.SAMPLE_RATE)
-                        chunk_result = self.transcriber(tmp_file.name)
-                        chunk_text = chunk_result['text'].strip()
-                    os.unlink(tmp_file.name)  # Clean up temp file
-                else:
-                    chunk_text = ""

-                if chunk_text:  # Only add if there's actual text
-                    results.append({
-                        'speaker_id': speaker_id,
-                        'start_time': start_time,
-                        'end_time': end_time,
-                        'text': chunk_text,
-                        'similarity': similarity
-                    })

         except Exception as e:

-        return "".join(

-# Global processor instance
-processor = AudioProcessor()

-    """Process audio
-        return "Please upload an audio file.", ""

-# Create Gradio interface
 def create_interface():
     """Create Gradio interface"""
     with gr.Blocks(
         theme=gr.themes.Soft(),
-        title="Speaker Diarization & Transcription",
         css="""
         }
         }
         """
     ) as demo:
         gr.Markdown(
             """
             """
         )
         with gr.Row():
-            with gr.Column(scale=
                 audio_input = gr.Audio(
-                    type="
                 )

-                sensitivity_slider = gr.Slider(
                     minimum=0.1,
-                    maximum=
-                    value=
                     step=0.05,
-                    label="Speaker Change
-                    info="
                 )

-                ### Tips:
-                - Works best with clear speech
-                - Supports multiple file formats
-                - Different speakers shown in different colors
-                - Processing may take a moment for longer files
-                """
             )

-                full_transcription = gr.Textbox(
-                    label="Complete Transcription",
-                    lines=15,
-                    max_lines=20,
-                    show_copy_button=True
-                )

         # Event handlers
-            fn=
-            inputs=[audio_input,
-            outputs=[
-            show_progress=
         )

-            inputs=[audio_input, sensitivity_slider],
-            outputs=[diarized_output, full_transcription],
-            show_progress=True
         )

-        - **MFCC features** for speaker embedding extraction
-        - **Cosine similarity** for speaker change detection
-        - **OpenAI Whisper** for speech-to-text transcription
-        - **Gradio** for the web interface

-        **Note**: This is a simplified speaker diarization system. For production use,
-        consider more advanced speaker embedding models like speechbrain or pyannote.audio.
-        """
         )

     return demo

 if __name__ == "__main__":
     demo = create_interface()
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
-        share=
-        show_error=True
     )

 import numpy as np
 import torch
 import torchaudio
+import threading
+import queue
+import time
+import os
+import urllib.request
 from scipy.spatial.distance import cosine
+from collections import deque
 import tempfile
+import librosa

+# Configuration parameters
+FINAL_TRANSCRIPTION_MODEL = "openai/whisper-small"
+TRANSCRIPTION_LANGUAGE = "en"
+DEFAULT_CHANGE_THRESHOLD = 0.7
+EMBEDDING_HISTORY_SIZE = 5
+MIN_SEGMENT_DURATION = 1.0
+DEFAULT_MAX_SPEAKERS = 4
+ABSOLUTE_MAX_SPEAKERS = 6
+SAMPLE_RATE = 16000

+# Speaker colors for up to 6 speakers
 SPEAKER_COLORS = [
     "#FFD700",  # Gold
     "#FF6B6B",  # Red
     "#4ECDC4",  # Teal
     "#45B7D1",  # Blue
+    "#96CEB4",  # Green
+    "#FFEAA7",  # Yellow
 ]

+SPEAKER_COLOR_NAMES = [
+    "Gold", "Red", "Teal", "Blue", "Green", "Yellow"
+]

+class SpeechBrainEncoder:
+    """Simplified encoder for speaker embeddings using torch audio features"""
     def __init__(self, device="cpu"):
         self.device = device
         self.embedding_dim = 128
+        self.model_loaded = True

+    def load_model(self):
+        """Model loading simulation"""
+        return True

+    def embed_utterance(self, audio, sr=16000):
+        """Extract simple spectral features as speaker embedding"""
         try:
             if isinstance(audio, np.ndarray):
+                waveform = torch.tensor(audio, dtype=torch.float32)
+            else:
+                waveform = audio

+            if len(waveform.shape) == 1:
+                waveform = waveform.unsqueeze(0)

+            # Resample if needed
+            if sr != 16000:
+                waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=16000)

+            # Extract MFCC features as a simple embedding
+            mfcc_transform = torchaudio.transforms.MFCC(
+                sample_rate=16000,
+                n_mfcc=13,
+                melkwargs={'n_mels': 40}
+            )

+            mfcc = mfcc_transform(waveform)
+            # Take mean across time dimension and flatten
+            embedding = mfcc.mean(dim=2).flatten()

+            # Pad or truncate to fixed size
+            if len(embedding) > self.embedding_dim:
+                embedding = embedding[:self.embedding_dim]
+            elif len(embedding) < self.embedding_dim:
+                padding = torch.zeros(self.embedding_dim - len(embedding))
+                embedding = torch.cat([embedding, padding])

+            return embedding.numpy()

         except Exception as e:
             print(f"Error extracting embedding: {e}")
+            return np.random.randn(self.embedding_dim)

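For reference, `embed_utterance` returns a plain 128-dimensional NumPy vector, and every similarity in this file is computed as `1 - cosine distance`. A minimal sketch of that comparison (synthetic audio, illustrative values only, assuming app.py is importable as a module):

import numpy as np
from scipy.spatial.distance import cosine
from app import SpeechBrainEncoder  # app.py from this Space

enc = SpeechBrainEncoder()

# Two one-second bursts of synthetic audio at 16 kHz (illustrative only)
emb_a = enc.embed_utterance(np.random.randn(16000).astype(np.float32), sr=16000)
emb_b = enc.embed_utterance(np.random.randn(16000).astype(np.float32), sr=16000)

similarity = 1.0 - cosine(emb_a, emb_b)  # 1.0 = same direction, near 0 = unrelated
print(f"similarity: {similarity:.3f}")
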
+class SpeakerChangeDetector:
+    """Speaker change detector for real-time diarization"""
+    def __init__(self, embedding_dim=128, change_threshold=DEFAULT_CHANGE_THRESHOLD, max_speakers=DEFAULT_MAX_SPEAKERS):
+        self.embedding_dim = embedding_dim
+        self.change_threshold = change_threshold
+        self.max_speakers = min(max_speakers, ABSOLUTE_MAX_SPEAKERS)
         self.current_speaker = 0
+        self.previous_embeddings = []
+        self.last_change_time = time.time()
+        self.mean_embeddings = [None] * self.max_speakers
         self.speaker_embeddings = [[] for _ in range(self.max_speakers)]
+        self.last_similarity = 0.0
+        self.active_speakers = set([0])

+    def set_max_speakers(self, max_speakers):
+        """Update the maximum number of speakers"""
+        new_max = min(max_speakers, ABSOLUTE_MAX_SPEAKERS)

+        if new_max < self.max_speakers:
+            for speaker_id in list(self.active_speakers):
+                if speaker_id >= new_max:
+                    self.active_speakers.discard(speaker_id)
+            if self.current_speaker >= new_max:
+                self.current_speaker = 0

+        if new_max > self.max_speakers:
+            self.mean_embeddings.extend([None] * (new_max - self.max_speakers))
+            self.speaker_embeddings.extend([[] for _ in range(new_max - self.max_speakers)])
         else:
+            self.mean_embeddings = self.mean_embeddings[:new_max]
+            self.speaker_embeddings = self.speaker_embeddings[:new_max]

+        self.max_speakers = new_max

+    def set_change_threshold(self, threshold):
+        """Update the threshold for detecting speaker changes"""
+        self.change_threshold = max(0.1, min(threshold, 0.99))

+    def add_embedding(self, embedding, timestamp=None):
+        """Add a new embedding and check if there's a speaker change"""
+        current_time = timestamp or time.time()

+        if not self.previous_embeddings:
+            self.previous_embeddings.append(embedding)
+            self.speaker_embeddings[self.current_speaker].append(embedding)
+            if self.mean_embeddings[self.current_speaker] is None:
+                self.mean_embeddings[self.current_speaker] = embedding.copy()
+            return self.current_speaker, 1.0

+        current_mean = self.mean_embeddings[self.current_speaker]
+        if current_mean is not None:
+            similarity = 1.0 - cosine(embedding, current_mean)
+        else:
+            similarity = 1.0 - cosine(embedding, self.previous_embeddings[-1])

+        self.last_similarity = similarity

+        time_since_last_change = current_time - self.last_change_time
+        is_speaker_change = False

+        if time_since_last_change >= MIN_SEGMENT_DURATION:
+            if similarity < self.change_threshold:
+                best_speaker = self.current_speaker
+                best_similarity = similarity

+                for speaker_id in range(self.max_speakers):
+                    if speaker_id == self.current_speaker:
+                        continue

+                    speaker_mean = self.mean_embeddings[speaker_id]

+                    if speaker_mean is not None:
+                        speaker_similarity = 1.0 - cosine(embedding, speaker_mean)
+                        if speaker_similarity > best_similarity:
+                            best_similarity = speaker_similarity
+                            best_speaker = speaker_id

+                if best_speaker != self.current_speaker:
+                    is_speaker_change = True
+                    self.current_speaker = best_speaker
+                elif len(self.active_speakers) < self.max_speakers:
+                    for new_id in range(self.max_speakers):
+                        if new_id not in self.active_speakers:
+                            is_speaker_change = True
+                            self.current_speaker = new_id
+                            self.active_speakers.add(new_id)
+                            break

+        if is_speaker_change:
+            self.last_change_time = current_time

+        self.previous_embeddings.append(embedding)
+        if len(self.previous_embeddings) > EMBEDDING_HISTORY_SIZE:
+            self.previous_embeddings.pop(0)

+        self.speaker_embeddings[self.current_speaker].append(embedding)
+        self.active_speakers.add(self.current_speaker)

+        if len(self.speaker_embeddings[self.current_speaker]) > 30:
+            self.speaker_embeddings[self.current_speaker] = self.speaker_embeddings[self.current_speaker][-30:]

+        if self.speaker_embeddings[self.current_speaker]:
+            self.mean_embeddings[self.current_speaker] = np.mean(
+                self.speaker_embeddings[self.current_speaker], axis=0
+            )

         return self.current_speaker, similarity

+    def get_color_for_speaker(self, speaker_id):
+        """Return color for speaker ID"""
+        if 0 <= speaker_id < len(SPEAKER_COLORS):
+            return SPEAKER_COLORS[speaker_id]
+        return "#FFFFFF"

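A minimal sketch of the detector's decision logic on synthetic embeddings (hand-picked vectors rather than real voices; explicit timestamps step past the MIN_SEGMENT_DURATION gate; assumes app.py is importable):

import numpy as np
from app import SpeakerChangeDetector  # app.py from this Space

det = SpeakerChangeDetector(change_threshold=0.7, max_speakers=2)
t0 = det.last_change_time

voice_a = np.ones(128)    # stand-in embedding for speaker A
voice_b = -np.ones(128)   # maximally dissimilar stand-in for speaker B

print(det.add_embedding(voice_a, timestamp=t0 + 1.0))  # (0, 1.0): first embedding seeds speaker 1
print(det.add_embedding(voice_a, timestamp=t0 + 2.0))  # (0, 1.0): similarity above threshold
print(det.add_embedding(voice_b, timestamp=t0 + 3.0))  # (1, -1.0): below threshold, new speaker allocated
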
+class RealTimeASRDiarization:
+    """Main class for real-time ASR with speaker diarization"""
     def __init__(self):
+        self.encoder = SpeechBrainEncoder()
+        self.encoder.load_model()
+        self.speaker_detector = SpeakerChangeDetector()
+        self.transcription_queue = queue.Queue()
+        self.conversation_history = []
+        self.is_processing = False

+        # Load Whisper model
         try:
+            import whisper
+            self.whisper_model = whisper.load_model("base")
+        except ImportError:
+            print("Whisper not available, using mock transcription")
+            self.whisper_model = None

+    def transcribe_audio(self, audio_data, sr=16000):
+        """Transcribe audio using Whisper"""
         try:
+            if self.whisper_model is None:
+                return "Mock transcription: Hello, this is a test."

+            # Ensure audio is the right format
+            if isinstance(audio_data, tuple):
+                sr, audio_data = audio_data

+            if len(audio_data.shape) > 1:
+                audio_data = audio_data.mean(axis=1)

+            # Normalize audio
+            audio_data = audio_data.astype(np.float32)
+            if np.abs(audio_data).max() > 1.0:
+                audio_data = audio_data / np.abs(audio_data).max()

+            # Resample to 16kHz if needed
+            if sr != 16000:
+                audio_data = librosa.resample(audio_data, orig_sr=sr, target_sr=16000)

+            # Transcribe
+            result = self.whisper_model.transcribe(audio_data, language="en")
+            return result["text"].strip()

         except Exception as e:
+            print(f"Transcription error: {e}")
+            return ""

+    def extract_speaker_embedding(self, audio_data, sr=16000):
+        """Extract speaker embedding from audio"""
+        return self.encoder.embed_utterance(audio_data, sr)

+    def process_audio_segment(self, audio_data, sr=16000):
+        """Process an audio segment for transcription and speaker identification"""
+        if len(audio_data) < sr * 0.5:  # Skip very short segments
+            return None, None, None

+        # Transcribe the audio
+        transcription = self.transcribe_audio(audio_data, sr)

+        if not transcription:
+            return None, None, None

+        # Extract speaker embedding
+        embedding = self.extract_speaker_embedding(audio_data, sr)

+        # Detect speaker
+        speaker_id, similarity = self.speaker_detector.add_embedding(embedding)

+        return transcription, speaker_id, similarity

+    def update_conversation(self, transcription, speaker_id):
+        """Update conversation history with new transcription"""
+        speaker_name = f"Speaker {speaker_id + 1}"
+        color = self.speaker_detector.get_color_for_speaker(speaker_id)

+        entry = {
+            "speaker": speaker_name,
+            "text": transcription,
+            "color": color,
+            "timestamp": time.time()
+        }

+        self.conversation_history.append(entry)
+        return entry

+    def format_conversation_html(self):
+        """Format conversation history as HTML"""
+        if not self.conversation_history:
+            return "<p><i>No conversation yet. Start speaking to see real-time transcription with speaker diarization.</i></p>"

+        html_parts = []
+        for entry in self.conversation_history:
+            html_parts.append(
+                f'<p><span style="color: {entry["color"]}; font-weight: bold;">'
+                f'{entry["speaker"]}:</span> {entry["text"]}</p>'
             )

+        return "".join(html_parts)

+    def get_status_info(self):
+        """Get current status information"""
+        status = {
+            "active_speakers": len(self.speaker_detector.active_speakers),
+            "max_speakers": self.speaker_detector.max_speakers,
+            "current_speaker": self.speaker_detector.current_speaker + 1,
+            "total_segments": len(self.conversation_history),
+            "threshold": self.speaker_detector.change_threshold
+        }
+        return status

+    def clear_conversation(self):
+        """Clear conversation history and reset speaker detector"""
+        self.conversation_history = []
+        self.speaker_detector = SpeakerChangeDetector(
+            change_threshold=self.speaker_detector.change_threshold,
+            max_speakers=self.speaker_detector.max_speakers
+        )

+    def set_parameters(self, threshold, max_speakers):
+        """Update parameters"""
+        self.speaker_detector.set_change_threshold(threshold)
+        self.speaker_detector.set_max_speakers(max_speakers)


+# Global instance
+asr_system = RealTimeASRDiarization()

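A sketch of driving the pipeline directly from a script (synthetic input; with no real speech Whisper may return an empty string and the segment is skipped, while the mock path always returns text):

import numpy as np

segment = np.random.randn(16000).astype(np.float32)  # 1 s of noise at 16 kHz
text, speaker_id, similarity = asr_system.process_audio_segment(segment, sr=16000)
if text:
    entry = asr_system.update_conversation(text, speaker_id)
    print(entry["speaker"], "->", entry["text"])
print(asr_system.format_conversation_html())
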
+def process_audio_realtime(audio_data, threshold, max_speakers):
+    """Process audio in real-time"""
+    global asr_system

+    if audio_data is None:
+        return asr_system.format_conversation_html(), get_status_display()

+    # Update parameters
+    asr_system.set_parameters(threshold, max_speakers)

+    try:
+        # Gradio delivers microphone audio as a (sample_rate, ndarray) tuple
+        sr, audio_array = audio_data

+        # Convert integer PCM to float32 in [-1, 1] (check dtype before casting)
+        if audio_array.dtype == np.int16:
+            audio_array = audio_array.astype(np.float32) / 32768.0
+        elif audio_array.dtype == np.int32:
+            audio_array = audio_array.astype(np.float32) / 2147483648.0
+        elif audio_array.dtype != np.float32:
+            audio_array = audio_array.astype(np.float32)

+        # Process the audio segment
+        transcription, speaker_id, similarity = asr_system.process_audio_segment(audio_array, sr)

+        if transcription and speaker_id is not None:
+            # Update conversation
+            asr_system.update_conversation(transcription, speaker_id)

+    except Exception as e:
+        print(f"Error processing audio: {e}")

+    return asr_system.format_conversation_html(), get_status_display()

+def get_status_display():
+    """Get formatted status display"""
+    status = asr_system.get_status_info()

+    status_html = f"""
+    <div style="font-family: monospace; font-size: 12px;">
+    <strong>Status:</strong><br>
+    Current Speaker: {status['current_speaker']}<br>
+    Active Speakers: {status['active_speakers']} / {status['max_speakers']}<br>
+    Total Segments: {status['total_segments']}<br>
+    Threshold: {status['threshold']:.2f}<br>
+    </div>
+    """

+    return status_html


+def clear_conversation():
+    """Clear the conversation"""
+    global asr_system
+    asr_system.clear_conversation()
+    return asr_system.format_conversation_html(), get_status_display()

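The streaming `gr.Audio` component defined below hands this callback a `(sample_rate, ndarray)` tuple; a quick way to exercise the handler outside the UI (synthetic int16 chunk, illustrative only):

import numpy as np

# Simulate one streamed microphone chunk: 1 s of int16 PCM at 48 kHz
chunk = (48000, (np.random.randn(48000) * 3000).astype(np.int16))
html, status = process_audio_realtime(chunk, threshold=0.7, max_speakers=4)
print(status)
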
 def create_interface():
     """Create Gradio interface"""

     with gr.Blocks(
+        title="Real-time ASR with Speaker Diarization",
         theme=gr.themes.Soft(),
         css="""
+        .conversation-box {
+            height: 400px;
+            overflow-y: auto;
+            border: 1px solid #ddd;
+            padding: 10px;
+            background-color: #f9f9f9;
         }
+        .status-box {
+            border: 1px solid #ccc;
+            padding: 10px;
+            background-color: #f0f0f0;
         }
         """
     ) as demo:

         gr.Markdown(
             """
+            # 🎤 Real-time ASR with Live Speaker Diarization
+
+            This application provides real-time speech recognition with speaker diarization.
+            It can distinguish between different speakers and display their conversations in different colors.

+            **Instructions:**
+            1. Adjust the speaker change threshold and maximum speakers
+            2. Click the microphone button to start recording
+            3. Speak naturally - the system will detect speaker changes and transcribe speech
+            4. Each speaker will be assigned a different color
             """
         )

         with gr.Row():
+            with gr.Column(scale=3):
+                # Main conversation display
+                conversation_display = gr.HTML(
+                    value="<p><i>Click the microphone to start recording...</i></p>",
+                    elem_classes=["conversation-box"]
+                )
+
+                # Audio input
                 audio_input = gr.Audio(
+                    source="microphone",
+                    type="numpy",
+                    streaming=True,
+                    label="🎤 Microphone Input"
                 )

+            with gr.Column(scale=1):
+                # Controls
+                gr.Markdown("### Controls")
+
+                threshold_slider = gr.Slider(
                     minimum=0.1,
+                    maximum=0.9,
+                    value=DEFAULT_CHANGE_THRESHOLD,
                     step=0.05,
+                    label="Speaker Change Threshold",
+                    info="Higher values = less sensitive to speaker changes"
                 )

+                max_speakers_slider = gr.Slider(
+                    minimum=2,
+                    maximum=ABSOLUTE_MAX_SPEAKERS,
+                    value=DEFAULT_MAX_SPEAKERS,
+                    step=1,
+                    label="Maximum Speakers",
+                    info="Maximum number of different speakers to detect"
+                )

+                clear_btn = gr.Button("🗑️ Clear Conversation", variant="secondary")
+
+                # Status display
+                gr.Markdown("### Status")
+                status_display = gr.HTML(
+                    value=get_status_display(),
+                    elem_classes=["status-box"]
                 )
+
+                # Speaker color legend
+                gr.Markdown("### Speaker Colors")
+                legend_html = ""
+                for i in range(ABSOLUTE_MAX_SPEAKERS):
+                    color = SPEAKER_COLORS[i]
+                    name = SPEAKER_COLOR_NAMES[i]
+                    legend_html += f'<p><span style="color: {color}; font-weight: bold;">● Speaker {i+1} ({name})</span></p>'
+
+                gr.HTML(legend_html)

         # Event handlers
+        audio_input.change(
+            fn=process_audio_realtime,
+            inputs=[audio_input, threshold_slider, max_speakers_slider],
+            outputs=[conversation_display, status_display],
+            show_progress=False
         )

+        clear_btn.click(
+            fn=clear_conversation,
+            outputs=[conversation_display, status_display]
         )

+        # Update status periodically
+        demo.load(
+            fn=lambda: (asr_system.format_conversation_html(), get_status_display()),
+            outputs=[conversation_display, status_display],
+            every=2
         )

     return demo

+
 if __name__ == "__main__":
+    # Create and launch the interface
     demo = create_interface()
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
+        share=True
     )
requirements.txt
CHANGED
@@ -128,7 +128,7 @@ pytz==2024.1
 PyYAML==6.0.1
 RealTimeSTT==0.1.13
 regex==2023.12.25
-requests==2.
+requests==2.32.3
 safetensors==0.4.2
 scikit-learn==1.4.1.post1
 scipy==1.15.2
@@ -163,7 +163,7 @@ absl-py==2.1.0
 # … any other non-PyTorch dependencies …
 torch==2.2.2+cpu
 torchaudio==2.2.2+cpu
-tqdm==4.
+tqdm==4.67.1
 trainer==0.0.36
 traitlets==5.14.2
 transformers==4.39.2