	Update app.py
app.py CHANGED
@@ -6,7 +6,6 @@ import numpy as np
 import os
 import gc
 import re
-from difflib import SequenceMatcher
 
 class MultiModelASRInterface:
     def __init__(self):
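The only change in this hunk is dropping `from difflib import SequenceMatcher`, which suggests the refactor below no longer relies on ratio-based string similarity anywhere in the WER path. For contrast, a minimal sketch (illustrative only, not code from this commit) of what `SequenceMatcher` measures versus what WER needs:

```python
from difflib import SequenceMatcher

# difflib gives a character-level similarity ratio in [0, 1].
# It is not a word error rate: a single substituted word barely moves it,
# whereas word-level WER counts it as a full error.
ref = "the cat sat on the mat"
hyp = "the cat sat on a mat"
print(SequenceMatcher(None, ref, hyp).ratio())  # high (~0.9+), despite 1 of 6 words wrong
```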
@@ -120,7 +119,7 @@ class MultiModelASRInterface:
 
     def preprocess_audio(self, audio):
         """
-        Preprocess audio for
+        Preprocess audio for ASR models.
 
         Args:
             audio: Audio data (numpy array or file path)
@@ -146,7 +145,7 @@ class MultiModelASRInterface:
             audio_data = np.mean(audio_data, axis=1)
             print(f"Converted to mono: shape={audio_data.shape}")
 
-        # Resample to 16kHz if needed (
+        # Resample to 16kHz if needed (models expect 16kHz)
         if sample_rate != 16000:
             print(f"Resampling from {sample_rate}Hz to 16000Hz")
             audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
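For context on this hunk: the models consume 16 kHz mono float audio, so preprocessing mixes channels down and resamples. A standalone sketch of the same two steps (the file name is a placeholder; `librosa.resample` takes `orig_sr`/`target_sr` as keyword arguments, which librosa ≥ 0.10 requires):

```python
import numpy as np
import librosa

def to_16k_mono(audio_data: np.ndarray, sample_rate: int) -> np.ndarray:
    """Mix multi-channel audio down to mono and resample to 16 kHz."""
    if audio_data.ndim > 1:
        audio_data = np.mean(audio_data, axis=1)  # average channels -> mono
    if sample_rate != 16000:
        audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
    return audio_data.astype(np.float32)

# Example: load at native rate, then normalize to what the ASR models expect.
# "sample.wav" is a placeholder path.
y, sr = librosa.load("sample.wav", sr=None, mono=False)
y = to_16k_mono(y.T if y.ndim > 1 else y, sr)  # librosa returns (channels, samples)
```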
@@ -361,229 +360,215 @@ class MultiModelASRInterface:
             info = self.available_models[model_name]
             return f"**{info['name']}**\n{info['description']}\nMemory: {info['size']}"
         return "Model information not available"
 
-                gr.Markdown("### 📊 WER Analysis")
-                reference_input = gr.Textbox(
-                    label="Reference Text (Optional)",
-                    placeholder="Enter the correct/expected text to calculate WER...",
-                    lines=3,
-                    max_lines=5
-                )
-
-                wer_output = gr.Markdown("Enter reference text to see WER analysis")
-
-        # Status indicator
-        status = gr.Markdown("Ready! Select a model and load it to get started.")
-
-        # Event handlers
-        def update_model_info(model_name):
-            return self.get_model_info(model_name)
-
-        def load_selected_model(model_name):
-            result = self.load_model(model_name)
-            return result, f"Current model: {self.available_models[model_name]['name']}"
-
-        def transcribe(audio):
-            if audio is None:
-                return "Please provide audio first.", "No audio to transcribe."
-
-            if self.model is None:
-                return "Please load a model first.", "No model loaded."
-
-            print(f"Transcribe called with model: {self.current_model_name}")
-            status_msg = f"🔄 Transcribing with {self.available_models[self.current_model_name]['name']}..."
-            transcription = self.transcribe_audio(audio)
-
-            if transcription and "Error" not in transcription and "No audio provided" not in transcription:
-                status_msg = "✅ Transcription completed!"
-            else:
-                status_msg = "❌ Transcription failed. Please try again."
-
-            return status_msg, transcription
-
-        def calculate_wer(transcription, reference):
-            """Calculate WER when reference text is provided."""
-            if not transcription or transcription.strip() == "":
-                return "No transcription available for WER calculation."
-
-            if not reference or reference.strip() == "":
-                return "Enter reference text to calculate WER."
-
-            try:
-                wer_details = self.calculate_wer_details(reference, transcription)
-
-                # Format WER results
-                wer_percent = wer_details['wer'] * 100
-
-                result = f"""
-                ## 📊 WER Analysis Results
-
-                **Word Error Rate:** {wer_percent:.2f}%
-
-                ### Word Statistics:
-                - **Correct Words:** {wer_details['correct_words']}
-                - **Total Words:** {wer_details['total_words']}
-                - **Accuracy:** {(wer_details['correct_words'] / wer_details['total_words'] * 100):.2f}%
-
-                ### Error Breakdown:
-                - **Insertions:** {wer_details['insertions']}
-                - **Deletions:** {wer_details['deletions']}
-                - **Substitutions:** {wer_details['substitutions']}
-                - **Total Errors:** {wer_details['total_errors']}
-
-                ### Normalized Texts:
-                **Reference:** `{wer_details['ref_normalized']}`
-                **Hypothesis:** `{wer_details['hyp_normalized']}`
-                """
-
-                return result
-
-            except Exception as e:
-                return f"Error calculating WER: {str(e)}"
-
-        def clear():
-            return None, "", "", "Ready! Record audio or upload a file to get started."
-
-        def copy_text(text):
-            if text and text.strip():
-                return gr.update(value="Text copied to clipboard!")
-            return gr.update(value="No text to copy.")
-
-        # Connect event handlers
-        model_dropdown.change(
-            fn=update_model_info,
-            inputs=model_dropdown,
-            outputs=model_info
-        )
-
-        load_btn.click(
-            fn=load_selected_model,
-            inputs=model_dropdown,
-            outputs=[model_status, status]
+
+# Initialize the ASR interface
+asr_interface = MultiModelASRInterface()
+
+def load_selected_model(model_name):
+    """Load the selected model."""
+    return asr_interface.load_model(model_name)
+
+def transcribe(audio):
+    """Transcribe audio."""
+    if audio is None:
+        return "Please provide audio first."
+
+    if asr_interface.model is None:
+        return "Please load a model first."
+
+    print(f"Transcribe called with model: {asr_interface.current_model_name}")
+    transcription = asr_interface.transcribe_audio(audio)
+
+    if transcription and "Error" not in transcription and "No audio provided" not in transcription:
+        return transcription
+    else:
+        return transcription
+
+def calculate_wer(transcription, reference):
+    """Calculate WER when reference text is provided."""
+    if not transcription or transcription.strip() == "":
+        return "No transcription available for WER calculation."
+
+    if not reference or reference.strip() == "":
+        return "Enter reference text to calculate WER."
+
+    try:
+        wer_details = asr_interface.calculate_wer_details(reference, transcription)
+
+        # Format WER results
+        wer_percent = wer_details['wer'] * 100
+
+        result = f"""
+        ## 📊 WER Analysis Results
+
+        **Word Error Rate:** {wer_percent:.2f}%
+
+        ### Word Statistics:
+        - **Correct Words:** {wer_details['correct_words']}
+        - **Total Words:** {wer_details['total_words']}
+        - **Accuracy:** {(wer_details['correct_words'] / wer_details['total_words'] * 100):.2f}%
+
+        ### Error Breakdown:
+        - **Insertions:** {wer_details['insertions']}
+        - **Deletions:** {wer_details['deletions']}
+        - **Substitutions:** {wer_details['substitutions']}
+        - **Total Errors:** {wer_details['total_errors']}
+
+        ### Normalized Texts:
+        **Reference:** `{wer_details['ref_normalized']}`
+        **Hypothesis:** `{wer_details['hyp_normalized']}`
+        """
+
+        return result
+
+    except Exception as e:
+        return f"Error calculating WER: {str(e)}"
+
+def clear():
+    """Clear all inputs."""
+    return None, "", ""
+
+# Create the Gradio interface
+with gr.Blocks(title="Multi-Model ASR", theme=gr.themes.Soft()) as interface:
+    gr.Markdown("# 🎤 Multi-Model Speech Recognition")
+    gr.Markdown("Select a model, then record or upload audio for transcription.")
+
+    with gr.Row():
+        with gr.Column(scale=1):
+            gr.Markdown("### 🤖 Model Selection")
+
+            # Model dropdown
+            model_dropdown = gr.Dropdown(
+                choices=list(asr_interface.available_models.keys()),
+                value="facebook/wav2vec2-base-960h",
+                label="Select ASR Model",
+                info="Choose the model based on your needs"
             )
 
-            inputs=audio_input,
-            outputs=[status, text_output]
-        )
+            # Model info display
+            model_info = gr.Markdown(asr_interface.get_model_info("facebook/wav2vec2-base-960h"))
 
-            outputs=[audio_input, text_output, wer_output, status]
-        )
+            # Load model button
+            load_btn = gr.Button("📥 Load Model", variant="primary")
 
-            inputs=text_output,
-            outputs=status
-        )
+            # Current model status
+            model_status = gr.Markdown("No model loaded. Please select and load a model.")
 
-        audio_input.
+            gr.Markdown("### 🔹 Audio Input")
+            audio_input = gr.Audio(
+                sources=["microphone", "upload"],
+                type="filepath",
+                label="Record or upload audio",
+                show_label=True
             )
 
+            with gr.Row():
+                transcribe_btn = gr.Button("🔄 Transcribe", variant="primary", size="lg")
+                clear_btn = gr.Button("🗑️ Clear", variant="secondary")
+
+        with gr.Column(scale=1):
+            gr.Markdown("### 📝 Transcription")
+            text_output = gr.Textbox(
+                label="Transcribed Text",
+                placeholder="Your transcribed text will appear here...",
+                lines=6,
+                max_lines=10
             )
 
+            gr.Markdown("### 📊 WER Analysis")
+            reference_input = gr.Textbox(
+                label="Reference Text (Optional)",
+                placeholder="Enter the correct/expected text to calculate WER...",
+                lines=3,
+                max_lines=5
             )
 
+            wer_output = gr.Markdown("Enter reference text to see WER analysis")
+
+    # Status indicator
+    status = gr.Markdown("Ready! Select a model and load it to get started.")
+
+    # Event handlers
+    def update_model_info(model_name):
+        return asr_interface.get_model_info(model_name)
+
+    # Connect event handlers
+    model_dropdown.change(
+        fn=update_model_info,
+        inputs=model_dropdown,
+        outputs=model_info
+    )
+
+    load_btn.click(
+        fn=load_selected_model,
+        inputs=model_dropdown,
+        outputs=model_status
+    )
+
+    transcribe_btn.click(
+        fn=transcribe,
+        inputs=audio_input,
+        outputs=text_output
+    )
+
+    clear_btn.click(
+        fn=clear,
+        outputs=[audio_input, text_output, wer_output]
+    )
+
+    # Auto-transcribe when audio changes
+    audio_input.change(
+        fn=transcribe,
+        inputs=audio_input,
+        outputs=text_output
+    )
+
+    # Calculate WER when reference text changes
+    reference_input.change(
+        fn=calculate_wer,
+        inputs=[text_output, reference_input],
+        outputs=wer_output
+    )
+
+    # Calculate WER when transcription changes
+    text_output.change(
+        fn=calculate_wer,
+        inputs=[text_output, reference_input],
+        outputs=wer_output
+    )
+
+    # Instructions
+    with gr.Accordion("ℹ️ Instructions", open=False):
+        gr.Markdown("""
+        ### How to use:
+        1. **Select Model**: Choose from available Wav2Vec2 and Whisper models
+        2. **Load Model**: Click 'Load Model' to load the selected model
+        3. **Record/Upload**: Record audio or upload an audio file
+        4. **Transcribe**: Click 'Transcribe' or wait for auto-transcription
+        5. **WER Analysis**: Enter reference text to calculate Word Error Rate
+        6. **Copy Text**: Use 'Copy Text' to copy the result
+
+        ### Model Comparison:
+        - **Wav2Vec2 Base (100h)**: Fastest, smallest memory (~300MB), good for basic tasks
+        - **Wav2Vec2 Base (960h)**: Balanced speed/accuracy (~1GB), recommended for most uses
+        - **Wav2Vec2 Large (960h)**: High accuracy (~3GB), best for difficult audio
+        - **Whisper Large V3 Turbo**: State-of-the-art accuracy (~5GB), multilingual support
+
+        ### Tips:
+        - Larger models are more accurate but slower
+        - Only one model is loaded at a time to save memory
+        - Switch models anytime by selecting and loading a new one
+        - WER calculation normalizes text (lowercase, no punctuation)
+        - Lower WER percentage indicates better transcription accuracy
+        """)
 
 # Launch the interface
 if __name__ == "__main__":
-    interface.launch(
+    interface.launch(
+        server_name="0.0.0.0",  # Allow external connections
+        server_port=7860,       # Default HF Spaces port
+        share=False,            # Don't create shareable link (HF handles this)
+        show_error=True,        # Show errors for debugging
+        quiet=False,            # Show startup messages
+        inbrowser=False         # Don't open browser (HF handles this)
+    )

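`calculate_wer_details` is called in the new `calculate_wer` handler but defined elsewhere in app.py, outside this diff. As a reference point, here is a hypothetical sketch of such a helper, producing the same keys the UI consumes (`wer`, `correct_words`, `total_words`, `insertions`, `deletions`, `substitutions`, `total_errors`, `ref_normalized`, `hyp_normalized`) via word-level Levenshtein alignment; the Space's actual implementation may differ:

```python
import re

def calculate_wer_details(reference: str, hypothesis: str) -> dict:
    """Word-level edit-distance WER; a sketch, not the app's actual implementation."""
    def normalize(text: str) -> str:
        # Lowercase and strip punctuation, as the diff's Tips section describes.
        return re.sub(r"[^\w\s]", "", text.lower()).strip()

    ref_norm, hyp_norm = normalize(reference), normalize(hypothesis)
    ref, hyp = ref_norm.split(), hyp_norm.split()

    # d[i][j] = edit distance between ref[:i] and hyp[:j]
    d = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        d[i][0] = i
    for j in range(len(hyp) + 1):
        d[0][j] = j
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,         # deletion
                          d[i][j - 1] + 1,         # insertion
                          d[i - 1][j - 1] + cost)  # substitution / match

    # Backtrace through the table to count error types.
    i, j = len(ref), len(hyp)
    subs = dels = ins = correct = 0
    while i > 0 or j > 0:
        if i > 0 and j > 0 and d[i][j] == d[i - 1][j - 1] + (ref[i - 1] != hyp[j - 1]):
            subs += ref[i - 1] != hyp[j - 1]
            correct += ref[i - 1] == hyp[j - 1]
            i, j = i - 1, j - 1
        elif i > 0 and d[i][j] == d[i - 1][j] + 1:
            dels += 1
            i -= 1
        else:
            ins += 1
            j -= 1

    total_errors = subs + dels + ins
    return {
        "wer": total_errors / max(len(ref), 1),
        "correct_words": correct,
        "total_words": len(ref),
        "insertions": ins,
        "deletions": dels,
        "substitutions": subs,
        "total_errors": total_errors,
        "ref_normalized": ref_norm,
        "hyp_normalized": hyp_norm,
    }

# Example: one substitution in three reference words -> WER = 1/3.
print(calculate_wer_details("the cat sat", "the cat sit")["wer"])  # 0.333...
```

On this sketch's convention, WER is (S + D + I) divided by the number of reference words, so it can exceed 100% when the hypothesis inserts many extra words.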