	Update app.py
app.py CHANGED
@@ -6,7 +6,6 @@ import numpy as np
 import os
 import gc
 import re
-from difflib import SequenceMatcher
 
 class MultiModelASRInterface:
     def __init__(self):
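The only change in this hunk is dropping `from difflib import SequenceMatcher`, which suggests the refactor below no longer relies on ratio-based string similarity anywhere in the WER path. For contrast, a minimal sketch (illustrative only, not code from this commit) of what `SequenceMatcher` measures versus what WER needs:

```python
from difflib import SequenceMatcher

# difflib gives a character-level similarity ratio in [0, 1].
# It is not a word error rate: a single substituted word barely moves it,
# whereas word-level WER counts it as a full error.
ref = "the cat sat on the mat"
hyp = "the cat sat on a mat"
print(SequenceMatcher(None, ref, hyp).ratio())  # high (~0.9+), despite 1 of 6 words wrong
```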
@@ -120,7 +119,7 @@ class MultiModelASRInterface:
 
     def preprocess_audio(self, audio):
         """
-        Preprocess audio for
+        Preprocess audio for ASR models.
 
         Args:
             audio: Audio data (numpy array or file path)
@@ -146,7 +145,7 @@ class MultiModelASRInterface:
             audio_data = np.mean(audio_data, axis=1)
             print(f"Converted to mono: shape={audio_data.shape}")
 
-        # Resample to 16kHz if needed (
+        # Resample to 16kHz if needed (models expect 16kHz)
         if sample_rate != 16000:
             print(f"Resampling from {sample_rate}Hz to 16000Hz")
             audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
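For context on this hunk: the models consume 16 kHz mono float audio, so preprocessing mixes channels down and resamples. A standalone sketch of the same two steps (the file name is a placeholder; `librosa.resample` takes `orig_sr`/`target_sr` as keyword arguments, which librosa ≥ 0.10 requires):

```python
import numpy as np
import librosa

def to_16k_mono(audio_data: np.ndarray, sample_rate: int) -> np.ndarray:
    """Mix multi-channel audio down to mono and resample to 16 kHz."""
    if audio_data.ndim > 1:
        audio_data = np.mean(audio_data, axis=1)  # average channels -> mono
    if sample_rate != 16000:
        audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
    return audio_data.astype(np.float32)

# Example: load at native rate, then normalize to what the ASR models expect.
# "sample.wav" is a placeholder path.
y, sr = librosa.load("sample.wav", sr=None, mono=False)
y = to_16k_mono(y.T if y.ndim > 1 else y, sr)  # librosa returns (channels, samples)
```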
@@ -361,229 +360,215 @@ class MultiModelASRInterface:
             info = self.available_models[model_name]
             return f"**{info['name']}**\n{info['description']}\nMemory: {info['size']}"
         return "Model information not available"
 
-                gr.Markdown("### 📊 WER Analysis")
-                reference_input = gr.Textbox(
-                    label="Reference Text (Optional)",
-                    placeholder="Enter the correct/expected text to calculate WER...",
-                    lines=3,
-                    max_lines=5
-                )
-
-                wer_output = gr.Markdown("Enter reference text to see WER analysis")
-
-        # Status indicator
-        status = gr.Markdown("Ready! Select a model and load it to get started.")
-
-        # Event handlers
-        def update_model_info(model_name):
-            return self.get_model_info(model_name)
-
-        def load_selected_model(model_name):
-            result = self.load_model(model_name)
-            return result, f"Current model: {self.available_models[model_name]['name']}"
-
-        def transcribe(audio):
-            if audio is None:
-                return "Please provide audio first.", "No audio to transcribe."
-
-            if self.model is None:
-                return "Please load a model first.", "No model loaded."
-
-            print(f"Transcribe called with model: {self.current_model_name}")
-            status_msg = f"🔄 Transcribing with {self.available_models[self.current_model_name]['name']}..."
-            transcription = self.transcribe_audio(audio)
-
-            if transcription and "Error" not in transcription and "No audio provided" not in transcription:
-                status_msg = "✅ Transcription completed!"
-            else:
-                status_msg = "❌ Transcription failed. Please try again."
-
-            return status_msg, transcription
-
-        def calculate_wer(transcription, reference):
-            """Calculate WER when reference text is provided."""
-            if not transcription or transcription.strip() == "":
-                return "No transcription available for WER calculation."
-
-            if not reference or reference.strip() == "":
-                return "Enter reference text to calculate WER."
-
-            try:
-                wer_details = self.calculate_wer_details(reference, transcription)
-
-                # Format WER results
-                wer_percent = wer_details['wer'] * 100
-
-                result = f"""
-                ## 📊 WER Analysis Results
-
-                **Word Error Rate:** {wer_percent:.2f}%
-
-                ### Word Statistics:
-                - **Correct Words:** {wer_details['correct_words']}
-                - **Total Words:** {wer_details['total_words']}
-                - **Accuracy:** {(wer_details['correct_words'] / wer_details['total_words'] * 100):.2f}%
-
-                ### Error Breakdown:
-                - **Insertions:** {wer_details['insertions']}
-                - **Deletions:** {wer_details['deletions']}
-                - **Substitutions:** {wer_details['substitutions']}
-                - **Total Errors:** {wer_details['total_errors']}
-
-                ### Normalized Texts:
-                **Reference:** `{wer_details['ref_normalized']}`
-                **Hypothesis:** `{wer_details['hyp_normalized']}`
-                """
-
-                return result
-
-            except Exception as e:
-                return f"Error calculating WER: {str(e)}"
-
-        def clear():
-            return None, "", "", "Ready! Record audio or upload a file to get started."
-
-        def copy_text(text):
-            if text and text.strip():
-                return gr.update(value="Text copied to clipboard!")
-            return gr.update(value="No text to copy.")
-
-        # Connect event handlers
-        model_dropdown.change(
-            fn=update_model_info,
-            inputs=model_dropdown,
-            outputs=model_info
-        )
-
-        load_btn.click(
-            fn=load_selected_model,
-            inputs=model_dropdown,
-            outputs=[model_status, status]
+
+# Initialize the ASR interface
+asr_interface = MultiModelASRInterface()
+
+def load_selected_model(model_name):
+    """Load the selected model."""
+    return asr_interface.load_model(model_name)
+
+def transcribe(audio):
+    """Transcribe audio."""
+    if audio is None:
+        return "Please provide audio first."
+
+    if asr_interface.model is None:
+        return "Please load a model first."
+
+    print(f"Transcribe called with model: {asr_interface.current_model_name}")
+    transcription = asr_interface.transcribe_audio(audio)
+
+    if transcription and "Error" not in transcription and "No audio provided" not in transcription:
+        return transcription
+    else:
+        return transcription
+
+def calculate_wer(transcription, reference):
+    """Calculate WER when reference text is provided."""
+    if not transcription or transcription.strip() == "":
+        return "No transcription available for WER calculation."
+
+    if not reference or reference.strip() == "":
+        return "Enter reference text to calculate WER."
+
+    try:
+        wer_details = asr_interface.calculate_wer_details(reference, transcription)
+
+        # Format WER results
+        wer_percent = wer_details['wer'] * 100
+
+        result = f"""
+        ## 📊 WER Analysis Results
+
+        **Word Error Rate:** {wer_percent:.2f}%
+
+        ### Word Statistics:
+        - **Correct Words:** {wer_details['correct_words']}
+        - **Total Words:** {wer_details['total_words']}
+        - **Accuracy:** {(wer_details['correct_words'] / wer_details['total_words'] * 100):.2f}%
+
+        ### Error Breakdown:
+        - **Insertions:** {wer_details['insertions']}
+        - **Deletions:** {wer_details['deletions']}
+        - **Substitutions:** {wer_details['substitutions']}
+        - **Total Errors:** {wer_details['total_errors']}
+
+        ### Normalized Texts:
+        **Reference:** `{wer_details['ref_normalized']}`
+        **Hypothesis:** `{wer_details['hyp_normalized']}`
+        """
+
+        return result
+
+    except Exception as e:
+        return f"Error calculating WER: {str(e)}"
+
+def clear():
+    """Clear all inputs."""
+    return None, "", ""
+
+# Create the Gradio interface
+with gr.Blocks(title="Multi-Model ASR", theme=gr.themes.Soft()) as interface:
+    gr.Markdown("# 🎤 Multi-Model Speech Recognition")
+    gr.Markdown("Select a model, then record or upload audio for transcription.")
+
+    with gr.Row():
+        with gr.Column(scale=1):
+            gr.Markdown("### 🤖 Model Selection")
+
+            # Model dropdown
+            model_dropdown = gr.Dropdown(
+                choices=list(asr_interface.available_models.keys()),
+                value="facebook/wav2vec2-base-960h",
+                label="Select ASR Model",
+                info="Choose the model based on your needs"
             )
 
-            inputs=audio_input,
-            outputs=[status, text_output]
-        )
+            # Model info display
+            model_info = gr.Markdown(asr_interface.get_model_info("facebook/wav2vec2-base-960h"))
 
-            outputs=[audio_input, text_output, wer_output, status]
-        )
+            # Load model button
+            load_btn = gr.Button("📥 Load Model", variant="primary")
 
-            inputs=text_output,
-            outputs=status
-        )
+            # Current model status
+            model_status = gr.Markdown("No model loaded. Please select and load a model.")
 
-        audio_input.
+            gr.Markdown("### 🔹 Audio Input")
+            audio_input = gr.Audio(
+                sources=["microphone", "upload"],
+                type="filepath",
+                label="Record or upload audio",
+                show_label=True
             )
 
+            with gr.Row():
+                transcribe_btn = gr.Button("🔄 Transcribe", variant="primary", size="lg")
+                clear_btn = gr.Button("🗑️ Clear", variant="secondary")
+
+        with gr.Column(scale=1):
+            gr.Markdown("### 📝 Transcription")
+            text_output = gr.Textbox(
+                label="Transcribed Text",
+                placeholder="Your transcribed text will appear here...",
+                lines=6,
+                max_lines=10
             )
 
+            gr.Markdown("### 📊 WER Analysis")
+            reference_input = gr.Textbox(
+                label="Reference Text (Optional)",
+                placeholder="Enter the correct/expected text to calculate WER...",
+                lines=3,
+                max_lines=5
             )
 
+            wer_output = gr.Markdown("Enter reference text to see WER analysis")
+
+    # Status indicator
+    status = gr.Markdown("Ready! Select a model and load it to get started.")
+
+    # Event handlers
+    def update_model_info(model_name):
+        return asr_interface.get_model_info(model_name)
+
+    # Connect event handlers
+    model_dropdown.change(
+        fn=update_model_info,
+        inputs=model_dropdown,
+        outputs=model_info
+    )
+
+    load_btn.click(
+        fn=load_selected_model,
+        inputs=model_dropdown,
+        outputs=model_status
+    )
+
+    transcribe_btn.click(
+        fn=transcribe,
+        inputs=audio_input,
+        outputs=text_output
+    )
+
+    clear_btn.click(
+        fn=clear,
+        outputs=[audio_input, text_output, wer_output]
+    )
+
+    # Auto-transcribe when audio changes
+    audio_input.change(
+        fn=transcribe,
+        inputs=audio_input,
+        outputs=text_output
+    )
+
+    # Calculate WER when reference text changes
+    reference_input.change(
+        fn=calculate_wer,
+        inputs=[text_output, reference_input],
+        outputs=wer_output
+    )
+
+    # Calculate WER when transcription changes
+    text_output.change(
+        fn=calculate_wer,
+        inputs=[text_output, reference_input],
+        outputs=wer_output
+    )
+
+    # Instructions
+    with gr.Accordion("ℹ️ Instructions", open=False):
+        gr.Markdown("""
+        ### How to use:
+        1. **Select Model**: Choose from available Wav2Vec2 and Whisper models
+        2. **Load Model**: Click 'Load Model' to load the selected model
+        3. **Record/Upload**: Record audio or upload an audio file
+        4. **Transcribe**: Click 'Transcribe' or wait for auto-transcription
+        5. **WER Analysis**: Enter reference text to calculate Word Error Rate
+        6. **Copy Text**: Use 'Copy Text' to copy the result
+
+        ### Model Comparison:
+        - **Wav2Vec2 Base (100h)**: Fastest, smallest memory (~300MB), good for basic tasks
+        - **Wav2Vec2 Base (960h)**: Balanced speed/accuracy (~1GB), recommended for most uses
+        - **Wav2Vec2 Large (960h)**: High accuracy (~3GB), best for difficult audio
+        - **Whisper Large V3 Turbo**: State-of-the-art accuracy (~5GB), multilingual support
+
+        ### Tips:
+        - Larger models are more accurate but slower
+        - Only one model is loaded at a time to save memory
+        - Switch models anytime by selecting and loading a new one
+        - WER calculation normalizes text (lowercase, no punctuation)
+        - Lower WER percentage indicates better transcription accuracy
+        """)
 
 # Launch the interface
 if __name__ == "__main__":
-    interface.launch(
+    interface.launch(
+        server_name="0.0.0.0",  # Allow external connections
+        server_port=7860,       # Default HF Spaces port
+        share=False,            # Don't create shareable link (HF handles this)
+        show_error=True,        # Show errors for debugging
+        quiet=False,            # Show startup messages
+        inbrowser=False         # Don't open browser (HF handles this)
+    )

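`calculate_wer_details` is called in the new `calculate_wer` handler but defined elsewhere in app.py, outside this diff. As a reference point, here is a hypothetical sketch of such a helper, producing the same keys the UI consumes (`wer`, `correct_words`, `total_words`, `insertions`, `deletions`, `substitutions`, `total_errors`, `ref_normalized`, `hyp_normalized`) via word-level Levenshtein alignment; the Space's actual implementation may differ:

```python
import re

def calculate_wer_details(reference: str, hypothesis: str) -> dict:
    """Word-level edit-distance WER; a sketch, not the app's actual implementation."""
    def normalize(text: str) -> str:
        # Lowercase and strip punctuation, as the diff's Tips section describes.
        return re.sub(r"[^\w\s]", "", text.lower()).strip()

    ref_norm, hyp_norm = normalize(reference), normalize(hypothesis)
    ref, hyp = ref_norm.split(), hyp_norm.split()

    # d[i][j] = edit distance between ref[:i] and hyp[:j]
    d = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        d[i][0] = i
    for j in range(len(hyp) + 1):
        d[0][j] = j
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,         # deletion
                          d[i][j - 1] + 1,         # insertion
                          d[i - 1][j - 1] + cost)  # substitution / match

    # Backtrace through the table to count error types.
    i, j = len(ref), len(hyp)
    subs = dels = ins = correct = 0
    while i > 0 or j > 0:
        if i > 0 and j > 0 and d[i][j] == d[i - 1][j - 1] + (ref[i - 1] != hyp[j - 1]):
            subs += ref[i - 1] != hyp[j - 1]
            correct += ref[i - 1] == hyp[j - 1]
            i, j = i - 1, j - 1
        elif i > 0 and d[i][j] == d[i - 1][j] + 1:
            dels += 1
            i -= 1
        else:
            ins += 1
            j -= 1

    total_errors = subs + dels + ins
    return {
        "wer": total_errors / max(len(ref), 1),
        "correct_words": correct,
        "total_words": len(ref),
        "insertions": ins,
        "deletions": dels,
        "substitutions": subs,
        "total_errors": total_errors,
        "ref_normalized": ref_norm,
        "hyp_normalized": hyp_norm,
    }

# Example: one substitution in three reference words -> WER = 1/3.
print(calculate_wer_details("the cat sat", "the cat sit")["wer"])  # 0.333...
```

On this sketch's convention, WER is (S + D + I) divided by the number of reference words, so it can exceed 100% when the hypothesis inserts many extra words.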