Spaces:

ABAO77
/

Run_code_api

Sleeping

App Files Files Community

ABAO77 committed on Sep 11

Commit

54a64d4

1 Parent(s): 225134a

update: new model xlsr

Browse files

Files changed (9) hide show

app.py +3 -27
example_model_usage.py +79 -0
src/AI_Models/wave2vec_inference.py +165 -64
src/apis/__pycache__/create_app.cpython-311.pyc +0 -0
src/apis/controllers/speaking_controller.py +339 -746
src/apis/create_app.py +13 -66
src/apis/routes/__pycache__/chat_route.cpython-311.pyc +0 -0
src/apis/routes/speaking_route.py +146 -269
test.py +456 -0

app.py CHANGED Viewed

@@ -1,36 +1,12 @@
-"""
-English Tutor API - Main Application
-Optimized with Whisper model preloading for faster pronunciation assessment
-"""
 from dotenv import load_dotenv
 load_dotenv()
 from src.apis.create_app import create_app, api_router
 import uvicorn
-from loguru import logger
-# Create FastAPI app with Whisper preloading
-app = create_app()
-app.include_router(api_router)
-# Add root endpoint
-@app.get("/")
-async def root():
-    return {
-        "message": "🎓 English Tutor API with Optimized Whisper",
-        "status": "ready",
-        "docs": "/docs",
-        "health": "/health"
-    }
 if __name__ == "__main__":
-    logger.info("🚀 Starting English Tutor API server...")
-    uvicorn.run(
-        "app:app",
-        host="0.0.0.0",
-        port=8000,
-        reload=False,  # Set to False to avoid reloading and losing preloaded model
-        log_level="info"
-    )

 from dotenv import load_dotenv
 load_dotenv()
 from src.apis.create_app import create_app, api_router
 import uvicorn
+app = create_app()
+app.include_router(api_router)
 if __name__ == "__main__":
+    uvicorn.run("app:app", host="0.0.0.0", port=8000, reload=False)

example_model_usage.py ADDED Viewed

	@@ -0,0 +1,79 @@

+#!/usr/bin/env python3
+"""
+Example usage of Wave2Vec2Inference with dynamic model switching
+"""
+from src.AI_Models.wave2vec_inference import (
+    create_inference,
+    get_available_models,
+    get_model_name,
+    DEFAULT_MODEL
+)
+def main():
+    print("=== Wave2Vec2 Model Selection Example ===\n")
+    # Show available models
+    print("Available models:")
+    models = get_available_models()
+    for key, model_name in models.items():
+        print(f"  {key}: {model_name}")
+    print(f"\nDefault model: {DEFAULT_MODEL}\n")
+    # Example 1: Using default model
+    print("1. Creating inference with default model:")
+    asr_default = create_inference()
+    print(f"   Loaded: {asr_default.model_name}\n")
+    # Example 2: Using model key
+    print("2. Creating inference with model key 'english_large':")
+    asr_key = create_inference("english_large")
+    print(f"   Loaded: {asr_key.model_name}\n")
+    # Example 3: Using full model name
+    print("3. Creating inference with full model name:")
+    asr_full = create_inference("facebook/wav2vec2-base-960h")
+    print(f"   Loaded: {asr_full.model_name}\n")
+    # Example 4: Dynamic model switching
+    print("4. Dynamic model switching:")
+    model_keys = ["english_large", "multilingual", "base_english"]
+    for model_key in model_keys:
+        print(f"   Switching to: {model_key}")
+        asr = create_inference(model_key)
+        print(f"   Active model: {asr.model_name}")
+        # Example transcription (if you have an audio file)
+        # result = asr.file_to_text("your_audio_file.wav")
+        # print(f"   Result: {result}")
+        print()
+    # Example 5: Using with ONNX
+    print("5. Creating ONNX inference with model selection:")
+    try:
+        asr_onnx = create_inference("english_large", use_onnx=True)
+        print(f"   ONNX model loaded: {asr_onnx.model_name}")
+    except Exception as e:
+        print(f"   ONNX conversion needed: {e}")
+    print("\n=== Usage Examples ===")
+    print("# Use default model")
+    print("asr = create_inference()")
+    print()
+    print("# Use model key")
+    print("asr = create_inference('english_large')")
+    print()
+    print("# Use full model name")
+    print("asr = create_inference('facebook/wav2vec2-base-960h')")
+    print()
+    print("# Use with ONNX")
+    print("asr = create_inference('english_large', use_onnx=True)")
+    print()
+    print("# Transcribe audio")
+    print("result = asr.file_to_text('audio.wav')")
+    print("# or")
+    print("result = asr.buffer_to_text(audio_array)")
+if __name__ == "__main__":
+    main()

src/AI_Models/wave2vec_inference.py CHANGED Viewed

@@ -1,15 +1,63 @@
 import torch
-from transformers import AutoModelForCTC, AutoProcessor, Wav2Vec2Processor, Wav2Vec2ForCTC
 import onnxruntime as rt
 import numpy as np
 import librosa
 import warnings
 import os
 warnings.filterwarnings("ignore")
 class Wave2Vec2Inference:
-    def __init__(self, model_name, use_gpu=True):
         # Auto-detect device
         if use_gpu:
             if torch.backends.mps.is_available():
@@ -20,15 +68,25 @@ class Wave2Vec2Inference:
                 self.device = "cpu"
         else:
             self.device = "cpu"
         print(f"Using device: {self.device}")
-        # Load model and processor
-        self.processor = AutoProcessor.from_pretrained(model_name)
-        self.model = AutoModelForCTC.from_pretrained(model_name)
         self.model.to(self.device)
         self.model.eval()
         # Disable gradients for inference
         torch.set_grad_enabled(False)
@@ -52,7 +110,11 @@ class Wave2Vec2Inference:
         # Move to device
         input_values = inputs.input_values.to(self.device)
-        attention_mask = inputs.attention_mask.to(self.device) if "attention_mask" in inputs else None
         # Inference
         with torch.no_grad():
@@ -65,7 +127,7 @@ class Wave2Vec2Inference:
         predicted_ids = torch.argmax(logits, dim=-1)
         if self.device != "cpu":
             predicted_ids = predicted_ids.cpu()
         transcription = self.processor.batch_decode(predicted_ids)[0]
         return transcription.lower().strip()
@@ -79,20 +141,25 @@ class Wave2Vec2Inference:
 class Wave2Vec2ONNXInference:
-    def __init__(self, model_name, onnx_path, use_gpu=True):
-        self.processor = Wav2Vec2Processor.from_pretrained(model_name)
         # Setup ONNX Runtime
         options = rt.SessionOptions()
         options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL
         # Choose providers based on GPU availability
         providers = []
         if use_gpu and rt.get_available_providers():
-            if 'CUDAExecutionProvider' in rt.get_available_providers():
-                providers.append('CUDAExecutionProvider')
-        providers.append('CPUExecutionProvider')
         self.model = rt.InferenceSession(onnx_path, options, providers=providers)
         self.input_name = self.model.get_inputs()[0].name
         print(f"ONNX model loaded with providers: {self.model.get_providers()}")
@@ -118,7 +185,7 @@ class Wave2Vec2ONNXInference:
         # ONNX inference
         input_values = inputs.input_values.astype(np.float32)
         onnx_outputs = self.model.run(None, {self.input_name: input_values})[0]
         # Decode
         prediction = np.argmax(onnx_outputs, axis=-1)
         transcription = self.processor.decode(prediction.squeeze().tolist())
@@ -138,7 +205,7 @@ def convert_to_onnx(model_id_or_path, onnx_model_name):
     print(f"Converting {model_id_or_path} to ONNX...")
     model = Wav2Vec2ForCTC.from_pretrained(model_id_or_path)
     model.eval()
     # Create dummy input
     audio_len = 250000
     dummy_input = torch.randn(1, audio_len, requires_grad=True)
@@ -166,9 +233,7 @@ def quantize_onnx_model(onnx_model_path, quantized_model_path):
     from onnxruntime.quantization import quantize_dynamic, QuantType
     quantize_dynamic(
-        onnx_model_path,
-        quantized_model_path,
-        weight_type=QuantType.QUInt8
     )
     print(f"Quantized model saved to: {quantized_model_path}")
@@ -176,52 +241,57 @@ def quantize_onnx_model(onnx_model_path, quantized_model_path):
 def export_to_onnx(model_name, quantize=False):
     """
     Export model to ONNX format with optional quantization
     Args:
         model_name: HuggingFace model name
         quantize: Whether to also create quantized version
     Returns:
         tuple: (onnx_path, quantized_path or None)
     """
     onnx_filename = f"{model_name.split('/')[-1]}.onnx"
     convert_to_onnx(model_name, onnx_filename)
     quantized_path = None
     if quantize:
-        quantized_path = onnx_filename.replace('.onnx', '.quantized.onnx')
         quantize_onnx_model(onnx_filename, quantized_path)
     return onnx_filename, quantized_path
-def create_inference(model_name, use_onnx=False, onnx_path=None, use_gpu=True, use_onnx_quantize=False):
     """
     Create optimized inference instance
     Args:
-        model_name: HuggingFace model name
         use_onnx: Whether to use ONNX runtime
         onnx_path: Path to ONNX model file
         use_gpu: Whether to use GPU if available
         use_onnx_quantize: Whether to use quantized ONNX model
     Returns:
         Inference instance
     """
     if use_onnx:
         if not onnx_path or not os.path.exists(onnx_path):
             # Convert to ONNX if path not provided or doesn't exist
-            onnx_filename = f"{model_name.split('/')[-1]}.onnx"
-            convert_to_onnx(model_name, onnx_filename)
             onnx_path = onnx_filename
         if use_onnx_quantize:
-            quantized_path = onnx_path.replace('.onnx', '.quantized.onnx')
             if not os.path.exists(quantized_path):
                 quantize_onnx_model(onnx_path, quantized_path)
             onnx_path = quantized_path
         print(f"Using ONNX model: {onnx_path}")
         return Wave2Vec2ONNXInference(model_name, onnx_path, use_gpu)
     else:
@@ -231,39 +301,70 @@ def create_inference(model_name, use_onnx=False, onnx_path=None, use_gpu=True, u
 if __name__ == "__main__":
     import time
-    model_name = "facebook/wav2vec2-large-960h-lv60-self"
     test_file = "test.wav"
     if not os.path.exists(test_file):
         print(f"Test file {test_file} not found. Please provide a valid audio file.")
-        exit(1)
-    # Test different configurations
-    configs = [
-        {"use_onnx": False, "use_gpu": True},
-        {"use_onnx": True, "use_gpu": True, "use_onnx_quantize": False},
-        {"use_onnx": True, "use_gpu": True, "use_onnx_quantize": True},
-    ]
-    for config in configs:
-        print(f"\n=== Testing config: {config} ===")
-        # Create inference instance
-        asr = create_inference(model_name, **config)
-        # Warm up
-        asr.file_to_text(test_file)
-        # Test performance
-        times = []
-        for i in range(5):
-            start_time = time.time()
-            text = asr.file_to_text(test_file)
-            end_time = time.time()
-            execution_time = end_time - start_time
-            times.append(execution_time)
-            print(f"Run {i+1}: {execution_time:.3f}s - {text[:50]}...")
-        avg_time = sum(times) / len(times)
-        print(f"Average time: {avg_time:.3f}s")

 import torch
+from transformers import (
+    AutoModelForCTC,
+    AutoProcessor,
+    Wav2Vec2Processor,
+    Wav2Vec2ForCTC,
+)
 import onnxruntime as rt
 import numpy as np
 import librosa
 import warnings
 import os
 warnings.filterwarnings("ignore")
+# Available Wave2Vec2 models
+WAVE2VEC2_MODELS = {
+    "english_large": "jonatasgrosman/wav2vec2-large-xlsr-53-english",
+    "multilingual": "facebook/wav2vec2-large-xlsr-53",
+    "english_960h": "facebook/wav2vec2-large-960h-lv60-self",
+    "base_english": "facebook/wav2vec2-base-960h",
+    "large_english": "facebook/wav2vec2-large-960h",
+    "xlsr_english": "jonatasgrosman/wav2vec2-large-xlsr-53-english",
+    "xlsr_multilingual": "facebook/wav2vec2-large-xlsr-53"
+}
+# Default model
+DEFAULT_MODEL = "jonatasgrosman/wav2vec2-large-xlsr-53-english"
+def get_available_models():
+    """Return dictionary of available Wave2Vec2 models"""
+    return WAVE2VEC2_MODELS.copy()
+def get_model_name(model_key=None):
+    """
+    Get model name from key or return default
+    Args:
+        model_key: Key from WAVE2VEC2_MODELS or full model name
+    Returns:
+        str: Full model name
+    """
+    if model_key is None:
+        return DEFAULT_MODEL
+    if model_key in WAVE2VEC2_MODELS:
+        return WAVE2VEC2_MODELS[model_key]
+    # If it's already a full model name, return as is
+    return model_key
 class Wave2Vec2Inference:
+    def __init__(self, model_name=None, use_gpu=True):
+        # Get the actual model name using helper function
+        self.model_name = get_model_name(model_name)
         # Auto-detect device
         if use_gpu:
             if torch.backends.mps.is_available():
                 self.device = "cpu"
         else:
             self.device = "cpu"
         print(f"Using device: {self.device}")
+        print(f"Loading model: {self.model_name}")
+        # Check if model is XLSR and use appropriate processor/model
+        is_xlsr = "xlsr" in self.model_name.lower()
+        if is_xlsr:
+            print("Using Wav2Vec2Processor and Wav2Vec2ForCTC for XLSR model")
+            self.processor = Wav2Vec2Processor.from_pretrained(self.model_name)
+            self.model = Wav2Vec2ForCTC.from_pretrained(self.model_name)
+        else:
+            print("Using AutoProcessor and AutoModelForCTC")
+            self.processor = AutoProcessor.from_pretrained(self.model_name)
+            self.model = AutoModelForCTC.from_pretrained(self.model_name)
         self.model.to(self.device)
         self.model.eval()
         # Disable gradients for inference
         torch.set_grad_enabled(False)
         # Move to device
         input_values = inputs.input_values.to(self.device)
+        attention_mask = (
+            inputs.attention_mask.to(self.device)
+            if "attention_mask" in inputs
+            else None
+        )
         # Inference
         with torch.no_grad():
         predicted_ids = torch.argmax(logits, dim=-1)
         if self.device != "cpu":
             predicted_ids = predicted_ids.cpu()
         transcription = self.processor.batch_decode(predicted_ids)[0]
         return transcription.lower().strip()
 class Wave2Vec2ONNXInference:
+    def __init__(self, model_name=None, onnx_path=None, use_gpu=True):
+        # Get the actual model name using helper function
+        self.model_name = get_model_name(model_name)
+        print(f"Loading ONNX model: {self.model_name}")
+        # Always use Wav2Vec2Processor for ONNX (works for all models)
+        self.processor = Wav2Vec2Processor.from_pretrained(self.model_name)
         # Setup ONNX Runtime
         options = rt.SessionOptions()
         options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL
         # Choose providers based on GPU availability
         providers = []
         if use_gpu and rt.get_available_providers():
+            if "CUDAExecutionProvider" in rt.get_available_providers():
+                providers.append("CUDAExecutionProvider")
+        providers.append("CPUExecutionProvider")
         self.model = rt.InferenceSession(onnx_path, options, providers=providers)
         self.input_name = self.model.get_inputs()[0].name
         print(f"ONNX model loaded with providers: {self.model.get_providers()}")
         # ONNX inference
         input_values = inputs.input_values.astype(np.float32)
         onnx_outputs = self.model.run(None, {self.input_name: input_values})[0]
         # Decode
         prediction = np.argmax(onnx_outputs, axis=-1)
         transcription = self.processor.decode(prediction.squeeze().tolist())
     print(f"Converting {model_id_or_path} to ONNX...")
     model = Wav2Vec2ForCTC.from_pretrained(model_id_or_path)
     model.eval()
     # Create dummy input
     audio_len = 250000
     dummy_input = torch.randn(1, audio_len, requires_grad=True)
     from onnxruntime.quantization import quantize_dynamic, QuantType
     quantize_dynamic(
+        onnx_model_path, quantized_model_path, weight_type=QuantType.QUInt8
     )
     print(f"Quantized model saved to: {quantized_model_path}")
 def export_to_onnx(model_name, quantize=False):
     """
     Export model to ONNX format with optional quantization
     Args:
         model_name: HuggingFace model name
         quantize: Whether to also create quantized version
     Returns:
         tuple: (onnx_path, quantized_path or None)
     """
     onnx_filename = f"{model_name.split('/')[-1]}.onnx"
     convert_to_onnx(model_name, onnx_filename)
     quantized_path = None
     if quantize:
+        quantized_path = onnx_filename.replace(".onnx", ".quantized.onnx")
         quantize_onnx_model(onnx_filename, quantized_path)
     return onnx_filename, quantized_path
+def create_inference(
+    model_name=None, use_onnx=False, onnx_path=None, use_gpu=True, use_onnx_quantize=False
+):
     """
     Create optimized inference instance
     Args:
+        model_name: Model key from WAVE2VEC2_MODELS or full HuggingFace model name (default: uses DEFAULT_MODEL)
         use_onnx: Whether to use ONNX runtime
         onnx_path: Path to ONNX model file
         use_gpu: Whether to use GPU if available
         use_onnx_quantize: Whether to use quantized ONNX model
     Returns:
         Inference instance
     """
+    # Get the actual model name
+    actual_model_name = get_model_name(model_name)
     if use_onnx:
         if not onnx_path or not os.path.exists(onnx_path):
             # Convert to ONNX if path not provided or doesn't exist
+            onnx_filename = f"{actual_model_name.split('/')[-1]}.onnx"
+            convert_to_onnx(actual_model_name, onnx_filename)
             onnx_path = onnx_filename
         if use_onnx_quantize:
+            quantized_path = onnx_path.replace(".onnx", ".quantized.onnx")
             if not os.path.exists(quantized_path):
                 quantize_onnx_model(onnx_path, quantized_path)
             onnx_path = quantized_path
         print(f"Using ONNX model: {onnx_path}")
         return Wave2Vec2ONNXInference(model_name, onnx_path, use_gpu)
     else:
 if __name__ == "__main__":
     import time
+    # Display available models
+    print("Available Wave2Vec2 models:")
+    for key, model_name in get_available_models().items():
+        print(f"  {key}: {model_name}")
+    print(f"\nDefault model: {DEFAULT_MODEL}")
+    print()
+    # Test with different models
+    test_models = ["english_large", "multilingual", "english_960h"]
     test_file = "test.wav"
     if not os.path.exists(test_file):
         print(f"Test file {test_file} not found. Please provide a valid audio file.")
+        print("Creating example usage without actual file...")
+        # Example usage without file
+        print("\n=== Example Usage ===")
+        # Using default model
+        print("1. Using default model:")
+        asr_default = create_inference()
+        print(f"   Model loaded: {asr_default.model_name}")
+        # Using model key
+        print("\n2. Using model key 'english_large':")
+        asr_key = create_inference("english_large")
+        print(f"   Model loaded: {asr_key.model_name}")
+        # Using full model name
+        print("\n3. Using full model name:")
+        asr_full = create_inference("facebook/wav2vec2-base-960h")
+        print(f"   Model loaded: {asr_full.model_name}")
+        exit(0)
+    # Test different model configurations
+    for model_key in test_models:
+        print(f"\n=== Testing model: {model_key} ===")
+        # Test different configurations
+        configs = [
+            {"use_onnx": False, "use_gpu": True},
+            {"use_onnx": True, "use_gpu": True, "use_onnx_quantize": False},
+        ]
+        for config in configs:
+            print(f"\nConfig: {config}")
+            # Create inference instance with model selection
+            asr = create_inference(model_key, **config)
+            # Warm up
+            asr.file_to_text(test_file)
+            # Test performance
+            times = []
+            for i in range(3):
+                start_time = time.time()
+                text = asr.file_to_text(test_file)
+                end_time = time.time()
+                execution_time = end_time - start_time
+                times.append(execution_time)
+                print(f"Run {i+1}: {execution_time:.3f}s - {text[:50]}...")
+            avg_time = sum(times) / len(times)
+            print(f"Average time: {avg_time:.3f}s")

src/apis/__pycache__/create_app.cpython-311.pyc CHANGED Viewed

Binary files a/src/apis/__pycache__/create_app.cpython-311.pyc and b/src/apis/__pycache__/create_app.cpython-311.pyc differ

src/apis/controllers/speaking_controller.py CHANGED Viewed

@@ -13,8 +13,10 @@ from loguru import logger
 import Levenshtein
 from dataclasses import dataclass
 from enum import Enum
-import whisper
-import os
 # Download required NLTK data
 try:
@@ -23,168 +25,6 @@ try:
 except:
     print("Warning: NLTK data not available")
-# Pre-computed phoneme mappings for instant lookup (Top 1000 English words)
-COMMON_WORD_PHONEMES = {
-    "the": ["ð", "ə"],
-    "be": ["b", "i"],
-    "to": ["t", "u"],
-    "of": ["ʌ", "v"],
-    "and": ["æ", "n", "d"],
-    "a": ["ə"],
-    "in": ["ɪ", "n"],
-    "that": ["ð", "æ", "t"],
-    "have": ["h", "æ", "v"],
-    "i": ["aɪ"],
-    "it": ["ɪ", "t"],
-    "for": ["f", "ɔr"],
-    "not": ["n", "ɑ", "t"],
-    "on": ["ɑ", "n"],
-    "with": ["w", "ɪ", "θ"],
-    "he": ["h", "i"],
-    "as": ["æ", "z"],
-    "you": ["j", "u"],
-    "do": ["d", "u"],
-    "at": ["æ", "t"],
-    "this": ["ð", "ɪ", "s"],
-    "but": ["b", "ʌ", "t"],
-    "his": ["h", "ɪ", "z"],
-    "by": ["b", "aɪ"],
-    "from": ["f", "r", "ʌ", "m"],
-    "they": ["ð", "eɪ"],
-    "we": ["w", "i"],
-    "say": ["s", "eɪ"],
-    "her": ["h", "ɝ"],
-    "she": ["ʃ", "i"],
-    "or": ["ɔr"],
-    "an": ["æ", "n"],
-    "will": ["w", "ɪ", "l"],
-    "my": ["m", "aɪ"],
-    "one": ["w", "ʌ", "n"],
-    "all": ["ɔ", "l"],
-    "would": ["w", "ʊ", "d"],
-    "there": ["ð", "ɛr"],
-    "their": ["ð", "ɛr"],
-    "what": ["w", "ʌ", "t"],
-    "so": ["s", "oʊ"],
-    "up": ["ʌ", "p"],
-    "out": ["aʊ", "t"],
-    "if": ["ɪ", "f"],
-    "about": ["ə", "b", "aʊ", "t"],
-    "who": ["h", "u"],
-    "get": ["ɡ", "ɛ", "t"],
-    "which": ["w", "ɪ", "tʃ"],
-    "go": ["ɡ", "oʊ"],
-    "me": ["m", "i"],
-    "when": ["w", "ɛ", "n"],
-    "make": ["m", "eɪ", "k"],
-    "can": ["k", "æ", "n"],
-    "like": ["l", "aɪ", "k"],
-    "time": ["t", "aɪ", "m"],
-    "no": ["n", "oʊ"],
-    "just": ["dʒ", "ʌ", "s", "t"],
-    "him": ["h", "ɪ", "m"],
-    "know": ["n", "oʊ"],
-    "take": ["t", "eɪ", "k"],
-    "people": ["p", "i", "p", "ə", "l"],
-    "into": ["ɪ", "n", "t", "u"],
-    "year": ["j", "ɪr"],
-    "your": ["j", "ʊr"],
-    "good": ["ɡ", "ʊ", "d"],
-    "some": ["s", "ʌ", "m"],
-    "could": ["k", "ʊ", "d"],
-    "them": ["ð", "ɛ", "m"],
-    "see": ["s", "i"],
-    "other": ["ʌ", "ð", "ər"],
-    "than": ["ð", "æ", "n"],
-    "then": ["ð", "ɛ", "n"],
-    "now": ["n", "aʊ"],
-    "look": ["l", "ʊ", "k"],
-    "only": ["oʊ", "n", "l", "i"],
-    "come": ["k", "ʌ", "m"],
-    "its": ["ɪ", "t", "s"],
-    "over": ["oʊ", "v", "ər"],
-    "think": ["θ", "ɪ", "ŋ", "k"],
-    "also": ["ɔ", "l", "s", "oʊ"],
-    "your": ["j", "ʊr"],
-    "work": ["w", "ɝ", "k"],
-    "life": ["l", "aɪ", "f"],
-    "only": ["oʊ", "n", "l", "i"],
-    "new": ["n", "u"],
-    "way": ["w", "eɪ"],
-    "may": ["m", "eɪ"],
-    "say": ["s", "eɪ"],
-    "first": ["f", "ɝ", "s", "t"],
-    "well": ["w", "ɛ", "l"],
-    "great": ["ɡ", "r", "eɪ", "t"],
-    "little": ["l", "ɪ", "t", "ə", "l"],
-    "own": ["oʊ", "n"],
-    "old": ["oʊ", "l", "d"],
-    "right": ["r", "aɪ", "t"],
-    "big": ["b", "ɪ", "ɡ"],
-    "high": ["h", "aɪ"],
-    "different": ["d", "ɪ", "f", "ər", "ə", "n", "t"],
-    "small": ["s", "m", "ɔ", "l"],
-    "large": ["l", "ɑr", "dʒ"],
-    "next": ["n", "ɛ", "k", "s", "t"],
-    "early": ["ɝ", "l", "i"],
-    "young": ["j", "ʌ", "ŋ"],
-    "important": ["ɪ", "m", "p", "ɔr", "t", "ə", "n", "t"],
-    "few": ["f", "j", "u"],
-    "public": ["p", "ʌ", "b", "l", "ɪ", "k"],
-    "bad": ["b", "æ", "d"],
-    "same": ["s", "eɪ", "m"],
-    "able": ["eɪ", "b", "ə", "l"],
-    "hello": ["h", "ə", "l", "oʊ"],
-    "world": ["w", "ɝ", "l", "d"],
-    "how": ["h", "aʊ"],
-    "are": ["ɑr"],
-    "today": ["t", "ə", "d", "eɪ"],
-    "pronunciation": ["p", "r", "ə", "n", "ʌ", "n", "s", "i", "eɪ", "ʃ", "ə", "n"]
-}
-class LazyImports:
-    """Lazy load heavy dependencies only when needed"""
-    @property
-    def psutil(self):
-        if not hasattr(self, '_psutil'):
-            try:
-                import psutil
-                self._psutil = psutil
-            except ImportError:
-                # Create a mock psutil if not available
-                class MockPsutil:
-                    def cpu_count(self): return 4
-                    def cpu_percent(self, interval=0.1): return 50
-                self._psutil = MockPsutil()
-        return self._psutil
-    @property
-    def librosa(self):
-        if not hasattr(self, '_librosa'):
-            import librosa
-            self._librosa = librosa
-        return self._librosa
-class ObjectPool:
-    """Object pool to avoid creating/destroying objects continuously"""
-    def __init__(self):
-        self.g2p_pool = []
-        self.comparator_pool = []
-    def get_g2p(self):
-        if self.g2p_pool:
-            return self.g2p_pool.pop()
-        return None  # Will create new if needed
-    def return_g2p(self, obj):
-        if len(self.g2p_pool) < 5:  # Limit pool size
-            self.g2p_pool.append(obj)
-# Global instances for optimization
-lazy_imports = LazyImports()
-object_pool = ObjectPool()
 class AssessmentMode(Enum):
     WORD = "word"
@@ -213,119 +53,56 @@ class CharacterError:
     color: str
-class EnhancedWhisperASR:
-    """Enhanced Whisper ASR with prosody analysis support"""
-    def __init__(self, whisper_model: str = "base.en"):
         self.sample_rate = 16000
-        self.whisper_model_name = whisper_model
-        # Load Whisper model
-        logger.info(f"Loading Whisper model: {whisper_model}")
-        self.whisper_model = whisper.load_model(whisper_model, in_memory=True)
-        logger.info("Whisper model loaded successfully")
-        # Initialize G2P once and reuse (optimization fix)
-        self.g2p = EnhancedG2P()
-        logger.info("G2P converter initialized and ready for reuse")
-    def _characters_to_phoneme_representation(self, text: str) -> str:
-        """Convert character-based transcript to phoneme representation - Optimized reuse"""
-        if not text:
-            return ""
-        # Reuse the initialized G2P converter instead of creating new instances
-        return self.g2p.get_phoneme_string(text)
-    @lru_cache(maxsize=100)
-    def _cached_audio_features(self, audio_path: str, file_mtime: float) -> Dict:
-        """Cache audio features based on file modification time"""
-        return self._extract_basic_audio_features_uncached(audio_path)
-    def _extract_basic_audio_features(self, audio_path: str) -> Dict:
-        """Extract audio features with caching optimization"""
-        import os
-        try:
-            file_mtime = os.path.getmtime(audio_path)
-            return self._cached_audio_features(audio_path, file_mtime)
-        except:
-            # Fallback to uncached version
-            return self._extract_basic_audio_features_uncached(audio_path)
-    def _extract_basic_audio_features_uncached(self, audio_path: str) -> Dict:
-        """Ultra-fast basic features using minimal librosa"""
-        try:
-            # Load with aggressive downsampling
-            y, sr = lazy_imports.librosa.load(audio_path, sr=8000)  # Very low sample rate
-            duration = len(y) / sr
-            if duration < 0.1:
-                return {"duration": duration, "error": "Audio too short"}
-            # Simple energy-based features
-            energy = y ** 2
-            # Basic "pitch" using zero-crossing rate as proxy
-            zcr = lazy_imports.librosa.feature.zero_crossing_rate(y, frame_length=1024,
-                                                hop_length=512)[0]
-            pseudo_pitch = sr / (2 * np.mean(zcr)) if np.mean(zcr) > 0 else 0
-            # Simple rhythm from energy peaks
-            frame_length = int(0.1 * sr)  # 100ms frames
-            energy_frames = [np.mean(energy[i:i+frame_length])
-                            for i in range(0, len(energy)-frame_length, frame_length)]
-            # Count energy peaks as beats
-            if len(energy_frames) > 2:
-                threshold = np.mean(energy_frames) + 0.5 * np.std(energy_frames)
-                beats = sum(1 for e in energy_frames if e > threshold)
-                tempo = (beats / duration) * 60 if duration > 0 else 120
-            else:
-                tempo = 120
-                beats = 2
-            # RMS from energy
-            rms = np.sqrt(np.mean(energy))
-            return {
-                "duration": duration,
-                "pseudo_pitch": pseudo_pitch,
-                "tempo": tempo,
-                "rms": rms,
-                "beats": beats,
-                "frame_count": len(energy_frames),
-            }
-        except Exception as e:
-            logger.warning(f"Audio feature extraction failed: {e}")
-            return {"duration": 0, "error": str(e)}
-    # Rest of the methods remain unchanged...
     def transcribe_with_features(self, audio_path: str) -> Dict:
-        """Enhanced transcription with audio features for prosody analysis - Whisper only"""
         try:
             start_time = time.time()
-            # Use Whisper for transcription
-            logger.info("Using Whisper for transcription")
-            result = self.whisper_model.transcribe(audio_path)
-            character_transcript = result["text"]
-            logger.info(f"transcript time: {time.time() - start_time:.2f}s")
-            clean_character_time = time.time()
-            character_transcript = self._clean_character_transcript(character_transcript)
-            logger.info(f"clean_character_time: {time.time() - clean_character_time:.2f}s")
-            phone_transform_time = time.time()
-            phoneme_representation = self._characters_to_phoneme_representation(character_transcript)
-            logger.info(f"phone_transform_time: {time.time() - phone_transform_time:.2f}s")
             # Basic audio features (simplified for speed)
-            time_feature_start = time.time()
             audio_features = self._extract_basic_audio_features(audio_path)
-            logger.info(f"time_feature_extraction: {time.time() - time_feature_start:.2f}s")
-            logger.info(f"Optimized transcription time: {time.time() - start_time:.2f}s")
             return {
                 "character_transcript": character_transcript,
@@ -338,82 +115,114 @@ class EnhancedWhisperASR:
             logger.error(f"Enhanced ASR error: {e}")
             return self._empty_result()
-    # All other methods remain exactly the same...
-    def _extract_basic_audio_features_uncached(self, audio_path: str) -> Dict:
-        """Ultra-fast basic features using minimal librosa"""
         try:
-            # Load with aggressive downsampling
-            y, sr = librosa.load(audio_path, sr=8000)  # Very low sample rate
             duration = len(y) / sr
-            if duration < 0.1:
-                return {"duration": duration, "error": "Audio too short"}
-            # Simple energy-based features
-            energy = y ** 2
-            # Basic "pitch" using zero-crossing rate as proxy
-            zcr = librosa.feature.zero_crossing_rate(y, frame_length=1024,
-                                                hop_length=512)[0]
-            pseudo_pitch = sr / (2 * np.mean(zcr)) if np.mean(zcr) > 0 else 0
-            # Simple rhythm from energy peaks
-            frame_length = int(0.1 * sr)  # 100ms frames
-            energy_frames = [np.mean(energy[i:i+frame_length])
-                            for i in range(0, len(energy)-frame_length, frame_length)]
-            # Count energy peaks as beats
-            if len(energy_frames) > 2:
-                threshold = np.mean(energy_frames) + 0.5 * np.std(energy_frames)
-                beats = sum(1 for e in energy_frames if e > threshold)
-                tempo = (beats / duration) * 60 if duration > 0 else 120
-            else:
-                tempo = 120
-                beats = 2
-            # RMS from energy
-            rms_mean = np.sqrt(np.mean(energy))
-            rms_std = np.sqrt(np.std(energy))
             return {
                 "duration": duration,
                 "pitch": {
-                    "values": [pseudo_pitch] if pseudo_pitch > 0 else [],
-                    "mean": pseudo_pitch,
-                    "std": 0,
-                    "range": 0,
-                    "cv": 0,
                 },
                 "rhythm": {
                     "tempo": tempo,
-                    "beats_per_second": beats / duration if duration > 0 else 0,
                 },
                 "intensity": {
-                    "rms_mean": rms_mean,
-                    "rms_std": rms_std,
-                }
             }
         except Exception as e:
-            logger.error(f"Ultra-fast audio feature extraction error: {e}")
             return {"duration": 0, "error": str(e)}
     def _clean_character_transcript(self, transcript: str) -> str:
-        """Clean and standardize character transcript - Remove punctuation for better scoring"""
         logger.info(f"Raw transcript before cleaning: {transcript}")
-        # Remove punctuation marks that can affect scoring
-        cleaned = re.sub(r'[.,!?;:"()[\]{}]', '', transcript)
-        # Normalize whitespace
-        cleaned = re.sub(r"\s+", " ", cleaned)
         return cleaned.strip().lower()
     def _simple_letter_to_phoneme(self, word: str) -> List[str]:
         """Fallback letter-to-phoneme conversion"""
         letter_to_phoneme = {
-            "a": "æ", "b": "b", "c": "k", "d": "d", "e": "ɛ", "f": "f", "g": "ɡ",
-            "h": "h", "i": "ɪ", "j": "dʒ", "k": "k", "l": "l", "m": "m", "n": "n",
-            "o": "ʌ", "p": "p", "q": "k", "r": "r", "s": "s", "t": "t", "u": "ʌ",
-            "v": "v", "w": "w", "x": "ks", "y": "j", "z": "z",
         }
         return [
@@ -439,8 +248,9 @@ class EnhancedWhisperASR:
             "confidence": 0.0,
         }
 class EnhancedG2P:
-    """Enhanced Grapheme-to-Phoneme converter with visualization support - Hybrid Optimized"""
     def __init__(self):
         try:
@@ -449,240 +259,70 @@ class EnhancedG2P:
             self.cmu_dict = {}
             logger.warning("CMU dictionary not available")
-        # Pre-build CMU to IPA mapping for faster access
-        self.cmu_to_ipa_map = {
-            "AA": "ɑ", "AE": "æ", "AH": "ʌ", "AO": "ɔ", "AW": "aʊ", "AY": "aɪ",
-            "EH": "ɛ", "ER": "ɝ", "EY": "eɪ", "IH": "ɪ", "IY": "i", "OW": "oʊ",
-            "OY": "ɔɪ", "UH": "ʊ", "UW": "u", "B": "b", "CH": "tʃ", "D": "d",
-            "DH": "ð", "F": "f", "G": "ɡ", "HH": "h", "JH": "dʒ", "K": "k",
-            "L": "l", "M": "m", "N": "n", "NG": "ŋ", "P": "p", "R": "r",
-            "S": "s", "SH": "ʃ", "T": "t", "TH": "θ", "V": "v", "W": "w",
-            "Y": "j", "Z": "z", "ZH": "ʒ",
-        }
-        # Fast pattern mapping for common combinations
-        self.fast_patterns = {
-            'th': 'θ', 'sh': 'ʃ', 'ch': 'tʃ', 'ng': 'ŋ', 'ck': 'k',
-            'ph': 'f', 'qu': 'kw', 'tion': 'ʃən', 'ing': 'ɪŋ', 'ed': 'd',
-            'er': 'ɝ', 'ar': 'ɑr', 'or': 'ɔr', 'oo': 'u', 'ee': 'i',
-            'oa': 'oʊ', 'ai': 'eɪ', 'ay': 'eɪ', 'ow': 'aʊ', 'oy': 'ɔɪ'
-        }
-        # Fast character mapping
-        self.char_to_phoneme_map = {
-            'a': 'æ', 'e': 'ɛ', 'i': 'ɪ', 'o': 'ʌ', 'u': 'ʌ',
-            'b': 'b', 'c': 'k', 'd': 'd', 'f': 'f', 'g': 'ɡ',
-            'h': 'h', 'j': 'dʒ', 'k': 'k', 'l': 'l', 'm': 'm',
-            'n': 'n', 'p': 'p', 'r': 'r', 's': 's', 't': 't',
-            'v': 'v', 'w': 'w', 'x': 'ks', 'y': 'j', 'z': 'z'
-        }
-        # Vietnamese speaker substitution patterns (unchanged)
         self.vn_substitutions = {
-            "θ": ["f", "s", "t", "d"], "ð": ["d", "z", "v", "t"],
-            "v": ["w", "f", "b"], "w": ["v", "b"], "r": ["l", "n"],
-            "l": ["r", "n"], "z": ["s", "j"], "ʒ": ["ʃ", "z", "s"],
-            "ʃ": ["s", "ʒ"], "ŋ": ["n", "m"], "tʃ": ["ʃ", "s", "k"],
-            "dʒ": ["ʒ", "j", "g"], "æ": ["ɛ", "a"], "ɪ": ["i"], "ʊ": ["u"],
         }
-        # Difficulty scores (unchanged)
         self.difficulty_scores = {
-            "θ": 0.9, "ð": 0.9, "v": 0.8, "z": 0.8, "ʒ": 0.9, "r": 0.7,
-            "l": 0.6, "w": 0.5, "æ": 0.7, "ɪ": 0.6, "ʊ": 0.6, "ŋ": 0.3,
-            "f": 0.2, "s": 0.2, "ʃ": 0.5, "tʃ": 0.4, "dʒ": 0.5,
         }
-    @lru_cache(maxsize=5000)  # Increased from 1000 for common words
     def word_to_phonemes(self, word: str) -> List[str]:
-        """Convert word to phoneme list - Optimized with hybrid approach"""
         word_lower = word.lower().strip()
-        # Check pre-computed dictionary first (instant lookup)
-        if word_lower in COMMON_WORD_PHONEMES:
-            return COMMON_WORD_PHONEMES[word_lower]
         if word_lower in self.cmu_dict:
             cmu_phonemes = self.cmu_dict[word_lower][0]
-            return self._convert_cmu_to_ipa_fast(cmu_phonemes)
         else:
-            return self._fast_estimate_phonemes(word_lower)
-    @lru_cache(maxsize=1000)  # Decreased from 2000 for text-level operations
     def get_phoneme_string(self, text: str) -> str:
-        """Get space-separated phoneme string - Hybrid optimized"""
-        return self._characters_to_phoneme_representation_optimized(text)
-    def _characters_to_phoneme_representation_optimized(self, text: str) -> str:
-        """Optimized phoneme conversion - Smart threading strategy"""
-        if not text:
-            return ""
         words = self._clean_text(text).split()
-        if not words:
-            return ""
-        # Smart threading strategy - avoid overhead for small texts
-        return self._smart_parallel_processing(words)
-    def _smart_parallel_processing(self, words: List[str]) -> str:
-        """Intelligent parallel processing based on system resources and text length"""
-        try:
-            # Only use parallel processing if:
-            # 1. Text is long enough (>10 words, increased threshold)
-            # 2. System has enough resources
-            try:
-                cpu_count = lazy_imports.psutil.cpu_count()
-                cpu_usage = lazy_imports.psutil.cpu_percent(interval=0.1)
-            except:
-                # Fallback if psutil not available
-                cpu_count = 4
-                cpu_usage = 50
-            if (len(words) > 10 and  # Increased threshold from 5
-                cpu_count >= 4 and
-                cpu_usage < 70):
-                return self._parallel_phoneme_processing(words)
-            else:
-                return self._batch_cmu_lookup(words)
-        except:
-            # Fallback to batch processing if anything fails
-            if len(words) > 10:
-                return self._parallel_phoneme_processing(words)
-            else:
-                return self._batch_cmu_lookup(words)
-    def _fast_short_text_phonemes(self, words: List[str]) -> str:
-        """Ultra-fast processing for 1-2 words"""
-        phonemes = []
         for word in words:
-            word_lower = word.lower()
-            if word_lower in self.cmu_dict:
-                # Direct CMU conversion
-                cmu_phonemes = self.cmu_dict[word_lower][0]
-                for phone in cmu_phonemes:
-                    clean_phone = re.sub(r"[0-9]", "", phone)
-                    ipa_phone = self.cmu_to_ipa_map.get(clean_phone, clean_phone.lower())
-                    phonemes.append(ipa_phone)
-            else:
-                phonemes.extend(self._ultra_fast_estimate(word_lower))
-        return " ".join(phonemes)
-    def _batch_cmu_lookup(self, words: List[str]) -> str:
-        """Batch CMU dictionary lookup with pre-computed optimization - 5x faster"""
-        phonemes = []
-        for word in words:
-            word_lower = word.lower()
-            # Check pre-computed dictionary first (instant lookup)
-            if word_lower in COMMON_WORD_PHONEMES:
-                phonemes.extend(COMMON_WORD_PHONEMES[word_lower])
-            elif word_lower in self.cmu_dict:
-                # Direct conversion without method overhead
-                cmu_phones = self.cmu_dict[word_lower][0]
-                for phone in cmu_phones:
-                    clean_phone = re.sub(r"[0-9]", "", phone)
-                    ipa_phone = self.cmu_to_ipa_map.get(clean_phone, clean_phone.lower())
-                    phonemes.append(ipa_phone)
-            else:
-                # Fast fallback
-                phonemes.extend(self._ultra_fast_estimate(word_lower))
-        return " ".join(phonemes)
-    def _parallel_phoneme_processing(self, words: List[str]) -> str:
-        """Parallel processing for longer texts - Optimized with larger chunks"""
-        # Use 3 chunks instead of 2 for better load balancing
-        chunk_size = max(5, len(words) // 3)  # Minimum 5 words per chunk
-        chunks = [words[i:i + chunk_size] for i in range(0, len(words), chunk_size)]
-        # Process chunks in parallel using thread pool
-        import concurrent.futures
-        with concurrent.futures.ThreadPoolExecutor(max_workers=min(3, len(chunks))) as executor:
-            futures = [executor.submit(self._process_word_chunk, chunk) for chunk in chunks]
-            all_phonemes = []
-            for future in concurrent.futures.as_completed(futures):
-                all_phonemes.extend(future.result())
         return " ".join(all_phonemes)
-    def _process_word_chunk(self, words: List[str]) -> List[str]:
-        """Process a chunk of words with pre-computed dictionary optimization"""
-        phonemes = []
-        for word in words:
-            word_lower = word.lower()
-            # Check pre-computed dictionary first (instant lookup)
-            if word_lower in COMMON_WORD_PHONEMES:
-                phonemes.extend(COMMON_WORD_PHONEMES[word_lower])
-            elif word_lower in self.cmu_dict:
-                cmu_phones = self.cmu_dict[word_lower][0]
-                for phone in cmu_phones:
-                    clean_phone = re.sub(r"[0-9]", "", phone)
-                    ipa_phone = self.cmu_to_ipa_map.get(clean_phone, clean_phone.lower())
-                    phonemes.append(ipa_phone)
-            else:
-                phonemes.extend(self._ultra_fast_estimate(word_lower))
-        return phonemes
-    def _ultra_fast_estimate(self, word: str) -> List[str]:
-        """Ultra-fast phoneme estimation using pattern matching"""
-        if not word:
-            return []
-        phonemes = []
-        i = 0
-        while i < len(word):
-            # Check for 4-char patterns first
-            if i <= len(word) - 4:
-                four_char = word[i:i+4]
-                if four_char in self.fast_patterns:
-                    phonemes.append(self.fast_patterns[four_char])
-                    i += 4
-                    continue
-            # Check for 3-char patterns
-            if i <= len(word) - 3:
-                three_char = word[i:i+3]
-                if three_char in self.fast_patterns:
-                    phonemes.append(self.fast_patterns[three_char])
-                    i += 3
-                    continue
-            # Check for 2-char patterns
-            if i <= len(word) - 2:
-                two_char = word[i:i+2]
-                if two_char in self.fast_patterns:
-                    phonemes.append(self.fast_patterns[two_char])
-                    i += 2
-                    continue
-            # Single character mapping
-            char = word[i]
-            if char in self.char_to_phoneme_map:
-                phonemes.append(self.char_to_phoneme_map[char])
-            i += 1
-        return phonemes
-    def _convert_cmu_to_ipa_fast(self, cmu_phonemes: List[str]) -> List[str]:
-        """Fast CMU to IPA conversion using pre-built mapping"""
-        ipa_phonemes = []
-        for phoneme in cmu_phonemes:
-            clean_phoneme = re.sub(r"[0-9]", "", phoneme)
-            ipa_phoneme = self.cmu_to_ipa_map.get(clean_phoneme, clean_phoneme.lower())
-            ipa_phonemes.append(ipa_phoneme)
-        return ipa_phonemes
-    def _fast_estimate_phonemes(self, word: str) -> List[str]:
-        """Optimized phoneme estimation - kept for backward compatibility"""
-        return self._ultra_fast_estimate(word)
-    # Rest of the methods remain unchanged for backward compatibility
     def text_to_phonemes(self, text: str) -> List[Dict]:
         """Convert text to phoneme sequence with visualization data"""
         words = self._clean_text(text).split()
@@ -703,12 +343,110 @@ class EnhancedG2P:
         return phoneme_sequence
     def _convert_cmu_to_ipa(self, cmu_phonemes: List[str]) -> List[str]:
-        """Original method - kept for backward compatibility"""
-        return self._convert_cmu_to_ipa_fast(cmu_phonemes)
     def _estimate_phonemes(self, word: str) -> List[str]:
-        """Original method - kept for backward compatibility"""
-        return self._ultra_fast_estimate(word)
     def _clean_text(self, text: str) -> str:
         """Clean text for processing"""
@@ -741,7 +479,21 @@ class EnhancedG2P:
     def _get_phoneme_color_category(self, phoneme: str) -> str:
         """Categorize phonemes by color for visualization"""
         vowel_phonemes = {
-            "ɑ", "æ", "ʌ", "ɔ", "aʊ", "aɪ", "ɛ", "ɝ", "eɪ", "ɪ", "i", "oʊ", "ɔɪ", "ʊ", "u",
         }
         difficult_consonants = {"θ", "ð", "v", "z", "ʒ", "r", "w"}
@@ -778,7 +530,6 @@ class EnhancedG2P:
         return self.difficulty_scores.get(phoneme, 0.3)
 class AdvancedPhonemeComparator:
     """Enhanced phoneme comparator using Levenshtein distance - Optimized"""
@@ -1547,28 +1298,33 @@ class EnhancedFeedbackGenerator:
 class ProductionPronunciationAssessor:
     """Production-ready pronunciation assessor - Enhanced version with optimizations"""
-    def __init__(
-        self,
-        whisper_model: str = "base.en",
-    ):
-        """Initialize the production-ready pronunciation assessment system"""
         logger.info(
-            "Initializing Optimized Production Pronunciation Assessment System with Whisper..."
         )
-        self.asr = EnhancedWhisperASR(
-            whisper_model=whisper_model,
-        )
         self.word_analyzer = EnhancedWordAnalyzer()
         self.prosody_analyzer = EnhancedProsodyAnalyzer()
         self.feedback_generator = EnhancedFeedbackGenerator()
-        # Reuse G2P from ASR to avoid duplicate initialization
-        self.g2p = self.asr.g2p
         # Thread pool for parallel processing
         self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=4)
         logger.info("Optimized production system initialization completed")
     def assess_pronunciation(
@@ -1664,10 +1420,8 @@ class ProductionPronunciationAssessor:
             result["processing_info"] = {
                 "processing_time": round(processing_time, 2),
                 "mode": assessment_mode.value,
-                "model_used": f"Whisper-{self.asr.whisper_model_name}-Enhanced-Optimized",
-                "model_type": "Whisper",
-                "use_whisper": True,
-                "onnx_enabled": False,
                 "confidence": asr_result["confidence"],
                 "enhanced_features": True,
                 "character_level_analysis": assessment_mode == AssessmentMode.WORD,
@@ -1843,9 +1597,7 @@ class ProductionPronunciationAssessor:
             "processing_info": {
                 "processing_time": 0,
                 "mode": "error",
-                "model_used": f"Whisper-{self.asr.whisper_model_name if hasattr(self, 'asr') else 'base.en'}-Enhanced-Optimized",
-                "model_type": "Whisper",
-                "use_whisper": True,
                 "confidence": 0.0,
                 "enhanced_features": False,
                 "optimized": True,
@@ -1855,105 +1607,38 @@ class ProductionPronunciationAssessor:
     def get_system_info(self) -> Dict:
         """Get comprehensive system information"""
         return {
-            "version": "2.2.0-production-optimized",
-            "name": "Ultra-Optimized Production Pronunciation Assessment System",
             "modes": [mode.value for mode in AssessmentMode],
             "features": [
-                "✅ Removed singleton pattern for thread safety",
-                "✅ G2P object reuse (no more redundant creation)",
-                "✅ Smart parallel processing (avoids overhead for small texts)",
-                "✅ Optimized LRU cache sizes (5000 words, 1000 texts)",
-                "✅ Pre-computed dictionary for top 1000 English words",
-                "✅ Object pooling for memory optimization",
-                "✅ Batch processing for multiple assessments",
-                "✅ Lazy loading of heavy dependencies",
-                "✅ Audio feature caching based on file modification time",
-                "✅ Intelligent threading strategy based on system resources",
-                "✅ Enhanced Levenshtein distance phoneme alignment",
-                "✅ Character-level error detection (word mode)",
-                "✅ Advanced prosody analysis (sentence mode)",
-                "✅ Vietnamese speaker-specific error patterns",
-                "✅ Real-time confidence scoring",
-                "✅ IPA phonetic representation with visualization",
-                "✅ Backward compatibility with legacy APIs",
-                "✅ Production-ready error handling",
             ],
-            "optimizations": {
-                "target_improvement": "60-70% faster processing",
-                "singleton_removed": True,
-                "g2p_reuse": True,
-                "smart_threading": True,
-                "pre_computed_words": len(COMMON_WORD_PHONEMES),
-                "cache_optimization": True,
-                "batch_processing": True,
-                "lazy_loading": True,
-                "audio_caching": True,
-            },
             "model_info": {
-                "asr_model": self.asr.whisper_model_name,
-                "model_type": "Whisper",
-                "use_whisper": True,
-                "onnx_enabled": False,
                 "sample_rate": self.asr.sample_rate,
             },
             "performance": {
-                "target_processing_time": "< 0.5s (vs original 2s)",
-                "expected_improvement": "70-80% faster",
-                "parallel_workers": 3,  # Updated to 3 chunks
                 "cached_operations": [
                     "G2P conversion",
-                    "phoneme strings",
                     "word mappings",
-                    "audio features",
-                    "common word phonemes",
                 ],
             },
         }
-    def assess_batch(self, requests: List[Dict]) -> List[Dict]:
-        """
-        Batch processing optimization - process multiple assessments efficiently
-        Args:
-            requests: List of dicts with 'audio_path', 'reference_text', 'mode'
-        Returns:
-            List of assessment results
-        """
-        # Group by reference text to maximize cache reuse
-        grouped = defaultdict(list)
-        for i, req in enumerate(requests):
-            req['_index'] = i  # Track original order
-            grouped[req['reference_text']].append(req)
-        results = [None] * len(requests)  # Maintain original order
-        for ref_text, group in grouped.items():
-            # Pre-compute reference phonemes once for the group
-            ref_phonemes = self.g2p.get_phoneme_string(ref_text)
-            for req in group:
-                try:
-                    # Use pre-computed reference to avoid redundant processing
-                    result = self._assess_single_with_ref_phonemes(
-                        req['audio_path'], req['reference_text'],
-                        req.get('mode', 'auto'), ref_phonemes
-                    )
-                    results[req['_index']] = result
-                except Exception as e:
-                    logger.error(f"Batch assessment failed for request {req['_index']}: {e}")
-                    results[req['_index']] = self._create_error_result(str(e))
-        return results
-    def _assess_single_with_ref_phonemes(
-        self, audio_path: str, reference_text: str, mode: str, ref_phonemes: str
-    ) -> Dict:
-        """Single assessment with pre-computed reference phonemes"""
-        # This is a simplified version that reuses reference phonemes
-        # For brevity, this calls the main method but could be optimized further
-        return self.assess_pronunciation(audio_path, reference_text, mode)
     def __del__(self):
         """Cleanup executor"""
         if hasattr(self, "executor"):
@@ -1964,13 +1649,10 @@ class ProductionPronunciationAssessor:
 class SimplePronunciationAssessor:
     """Backward compatible wrapper for the enhanced optimized system"""
-    def __init__(
-        self,
-        whisper_model: str = "base.en",
-    ):
-        print("Initializing Optimized Simple Pronunciation Assessor with Whisper...")
         self.enhanced_assessor = ProductionPronunciationAssessor(
-            whisper_model=whisper_model,
         )
         print(
             "Optimized Enhanced Simple Pronunciation Assessor initialization completed"
@@ -1999,7 +1681,7 @@ if __name__ == "__main__":
     import os
     # Initialize optimized production system with ONNX and quantization
-    system = ProductionPronunciationAssessor()
     # Performance test cases
     test_cases = [
@@ -2053,7 +1735,7 @@ if __name__ == "__main__":
     # Backward compatibility test
     print(f"\n=== BACKWARD COMPATIBILITY TEST ===")
-    legacy_assessor = SimplePronunciationAssessor(whisper_model="base.en")
     start_time = time.time()
     legacy_result = legacy_assessor.assess_pronunciation(
@@ -2101,43 +1783,24 @@ if __name__ == "__main__":
     for optimization in optimizations:
         print(optimization)
-    print(f"\n=== ULTRA-OPTIMIZED PERFORMANCE COMPARISON ===")
     print(f"Original system: ~2.0s total")
     print(f"  - ASR: 0.3s")
     print(f"  - Processing: 1.7s")
     print(f"")
-    print(f"Ultra-optimized system: ~0.4-0.6s total (achieved)")
     print(f"  - ASR: 0.3s (unchanged)")
-    print(f"  - Processing: 0.1-0.3s (80-85% improvement)")
     print(f"")
-    print(f"Revolutionary improvements:")
-    print(f"  • ✅ Singleton pattern removed - no more thread safety issues")
-    print(f"  • ✅ G2P object reuse - eliminated redundant object creation")
-    print(f"  • ✅ Smart parallel processing - avoids overhead for small texts")
-    print(f"  • ✅ Pre-computed dictionary - instant lookup for common words")
-    print(f"  • ✅ Optimized cache sizes - 5000 words, 1000 texts")
-    print(f"  • ✅ Audio feature caching - file modification time based")
-    print(f"  • ✅ Batch processing - efficient multiple assessments")
-    print(f"  • ✅ Lazy loading - heavy dependencies loaded on demand")
-    print(f"  • ✅ Object pooling - memory optimization")
-    print(f"  • ✅ Intelligent threading - system resource aware")
     print(f"  • Cached G2P conversions avoid repeated computation")
     print(f"  • Simplified audio analysis with strategic sampling")
     print(f"  • Fast alignment algorithms for phoneme comparison")
     print(f"  • ONNX quantized models for maximum ASR speed")
     print(f"  • Conditional feature extraction based on assessment mode")
-    print(f"\n=== ULTRA-OPTIMIZATION COMPLETE ===")
-    print(f"✅ All singleton patterns removed for thread safety")
-    print(f"✅ All redundant object creation eliminated")
-    print(f"✅ Smart parallel processing implemented")
-    print(f"✅ Pre-computed dictionary with {len(COMMON_WORD_PHONEMES)} common words")
-    print(f"✅ Optimized cache sizes and strategies")
-    print(f"✅ Audio feature caching with file modification tracking")
-    print(f"✅ Batch processing for multiple assessments")
-    print(f"✅ Lazy loading for heavy dependencies")
-    print(f"✅ Object pooling for memory optimization")
-    print(f"✅ Intelligent resource-aware threading")
     print(f"✅ All original class names preserved")
     print(f"✅ All original function signatures maintained")
     print(f"✅ All original output formats supported")
@@ -2145,74 +1808,4 @@ if __name__ == "__main__":
     print(f"✅ Original API completely functional")
     print(f"✅ Enhanced features are additive, not breaking")
-    print(f"\nUltra-optimization complete! Target: 80-85% faster processing achieved.")
-    print(f"From ~2.0s to ~0.4-0.6s total processing time!")
-    print(f"\n=== WHISPER MODEL USAGE EXAMPLES ===")
-    print(f"Example 1: Using Whisper with base.en model")
-    print(
-        f"""
-# Initialize with Whisper
-assessor = ProductionPronunciationAssessor(use_whisper=True, whisper_model="base.en")
-# Assess pronunciation
-result = assessor.assess_pronunciation(
-    audio_path="./hello_how_are_you_today.wav",
-    reference_text="Hello, how are you today?",
-    mode="sentence"
-)
-print(f"Transcript: {{result['transcript']}}")
-print(f"Score: {{result['overall_score']}}")
-"""
-    )
-    print(f"\nExample 2: Using SimplePronunciationAssessor with Whisper")
-    print(
-        f"""
-# Simple wrapper with Whisper
-simple_assessor = SimplePronunciationAssessor(
-    whisper_model="base.en"  # or "small.en", "medium.en", "large"
-)
-# Assess pronunciation
-result = simple_assessor.assess_pronunciation(
-    audio_path="./hello_world.wav",
-    reference_text="Hello world",
-    mode="word"
-)
-"""
-    )
-    print(f"\nExample 3: Batch Processing for Maximum Efficiency")
-    print(
-        f"""
-# Ultra-optimized batch processing
-assessor = ProductionPronunciationAssessor(whisper_model="base.en")
-# Process multiple assessments efficiently
-requests = [
-    {{"audio_path": "./audio1.wav", "reference_text": "Hello world", "mode": "word"}},
-    {{"audio_path": "./audio2.wav", "reference_text": "Hello world", "mode": "word"}},
-    {{"audio_path": "./audio3.wav", "reference_text": "How are you?", "mode": "sentence"}},
-]
-# Batch processing with reference text grouping for cache optimization
-results = assessor.assess_batch(requests)
-for i, result in enumerate(results):
-    print(f"Request {{i+1}}: Score {{result['overall_score']:.2f}}")
-"""
-    )
-    print(f"\nAvailable Whisper models:")
-    print(f"  • tiny.en (39 MB) - Fastest, least accurate")
-    print(f"  • base.en (74 MB) - Good balance of speed and accuracy")
-    print(f"  • small.en (244 MB) - Better accuracy")
-    print(f"  • medium.en (769 MB) - High accuracy")
-    print(f"  • large (1550 MB) - Highest accuracy")
-    print(f"\nWhisper advantages:")
-    print(f"  • Better general transcription accuracy")
-    print(f"  • More robust to background noise")
-    print(f"  • Handles various accents better")
-    print(f"  • Better punctuation handling (now cleaned for scoring)")
-    print(f"  • More reliable for real-world audio conditions")

 import Levenshtein
 from dataclasses import dataclass
 from enum import Enum
+from src.AI_Models.wave2vec_inference import (
+    create_inference,
+    export_to_onnx,
+)
 # Download required NLTK data
 try:
 except:
     print("Warning: NLTK data not available")
 class AssessmentMode(Enum):
     WORD = "word"
     color: str
+class EnhancedWav2Vec2CharacterASR:
+    """Enhanced Wav2Vec2 ASR with prosody analysis support - Optimized version"""
+    def __init__(
+        self,
+        # model_name: str = "facebook/wav2vec2-large-960h-lv60-self",
+        model_name: str = "jonatasgrosman/wav2vec2-large-xlsr-53-english",
+        onnx: bool = False,
+        quantized: bool = False,
+    ):
+        self.use_onnx = onnx
         self.sample_rate = 16000
+        self.model_name = model_name
+        if onnx:
+            import os
+            model_path = (
+                f"wav2vec2-large-960h-lv60-self{'.quant' if quantized else ''}.onnx"
+            )
+            if not os.path.exists(model_path):
+                export_to_onnx(model_name, quantize=quantized)
+        # Use optimized inference
+        self.model = create_inference(
+            model_name=model_name, use_onnx=onnx, use_onnx_quantize=quantized
+        )
     def transcribe_with_features(self, audio_path: str) -> Dict:
+        """Enhanced transcription with audio features for prosody analysis - Optimized"""
         try:
             start_time = time.time()
+            # Basic transcription (already fast - 0.3s)
+            character_transcript = self.model.file_to_text(audio_path)
+            character_transcript = self._clean_character_transcript(
+                character_transcript
+            )
+            # Fast phoneme conversion
+            phoneme_representation = self._characters_to_phoneme_representation(
+                character_transcript
+            )
             # Basic audio features (simplified for speed)
             audio_features = self._extract_basic_audio_features(audio_path)
+            logger.info(
+                f"Optimized transcription time: {time.time() - start_time:.2f}s"
+            )
             return {
                 "character_transcript": character_transcript,
             logger.error(f"Enhanced ASR error: {e}")
             return self._empty_result()
+    def _extract_basic_audio_features(self, audio_path: str) -> Dict:
+        """Extract basic audio features for prosody analysis - Optimized"""
         try:
+            y, sr = librosa.load(audio_path, sr=self.sample_rate)
             duration = len(y) / sr
+            # Simplified pitch analysis (sample fewer frames)
+            pitches, magnitudes = librosa.piptrack(y=y, sr=sr, threshold=0.1)
+            pitch_values = []
+            for t in range(0, pitches.shape[1], 10):  # Sample every 10th frame
+                index = magnitudes[:, t].argmax()
+                pitch = pitches[index, t]
+                if pitch > 80:  # Filter noise
+                    pitch_values.append(pitch)
+            # Basic rhythm
+            tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
+            # Basic intensity (reduced frame analysis)
+            rms = librosa.feature.rms(y=y, frame_length=2048, hop_length=512)[0]
             return {
                 "duration": duration,
                 "pitch": {
+                    "values": pitch_values,
+                    "mean": np.mean(pitch_values) if pitch_values else 0,
+                    "std": np.std(pitch_values) if pitch_values else 0,
+                    "range": (
+                        np.max(pitch_values) - np.min(pitch_values)
+                        if len(pitch_values) > 1
+                        else 0
+                    ),
+                    "cv": (
+                        np.std(pitch_values) / np.mean(pitch_values)
+                        if pitch_values and np.mean(pitch_values) > 0
+                        else 0
+                    ),
                 },
                 "rhythm": {
                     "tempo": tempo,
+                    "beats_per_second": len(beats) / duration if duration > 0 else 0,
                 },
                 "intensity": {
+                    "rms_mean": np.mean(rms),
+                    "rms_std": np.std(rms),
+                },
             }
         except Exception as e:
+            logger.error(f"Audio feature extraction error: {e}")
             return {"duration": 0, "error": str(e)}
     def _clean_character_transcript(self, transcript: str) -> str:
+        """Clean and standardize character transcript"""
         logger.info(f"Raw transcript before cleaning: {transcript}")
+        cleaned = re.sub(r"\s+", " ", transcript)
         return cleaned.strip().lower()
+    def _characters_to_phoneme_representation(self, text: str) -> str:
+        """Convert character-based transcript to phoneme representation - Optimized"""
+        if not text:
+            return ""
+        words = text.split()
+        phoneme_words = []
+        g2p = EnhancedG2P()
+        for word in words:
+            try:
+                if g2p:
+                    word_phonemes = g2p.word_to_phonemes(word)
+                    phoneme_words.extend(word_phonemes)
+                else:
+                    phoneme_words.extend(self._simple_letter_to_phoneme(word))
+            except:
+                phoneme_words.extend(self._simple_letter_to_phoneme(word))
+        return " ".join(phoneme_words)
     def _simple_letter_to_phoneme(self, word: str) -> List[str]:
         """Fallback letter-to-phoneme conversion"""
         letter_to_phoneme = {
+            "a": "æ",
+            "b": "b",
+            "c": "k",
+            "d": "d",
+            "e": "ɛ",
+            "f": "f",
+            "g": "ɡ",
+            "h": "h",
+            "i": "ɪ",
+            "j": "dʒ",
+            "k": "k",
+            "l": "l",
+            "m": "m",
+            "n": "n",
+            "o": "ʌ",
+            "p": "p",
+            "q": "k",
+            "r": "r",
+            "s": "s",
+            "t": "t",
+            "u": "ʌ",
+            "v": "v",
+            "w": "w",
+            "x": "ks",
+            "y": "j",
+            "z": "z",
         }
         return [
             "confidence": 0.0,
         }
 class EnhancedG2P:
+    """Enhanced Grapheme-to-Phoneme converter with visualization support - Optimized"""
     def __init__(self):
         try:
             self.cmu_dict = {}
             logger.warning("CMU dictionary not available")
+        # Vietnamese speaker substitution patterns
         self.vn_substitutions = {
+            "θ": ["f", "s", "t", "d"],
+            "ð": ["d", "z", "v", "t"],
+            "v": ["w", "f", "b"],
+            "w": ["v", "b"],
+            "r": ["l", "n"],
+            "l": ["r", "n"],
+            "z": ["s", "j"],
+            "ʒ": ["ʃ", "z", "s"],
+            "ʃ": ["s", "ʒ"],
+            "ŋ": ["n", "m"],
+            "tʃ": ["ʃ", "s", "k"],
+            "dʒ": ["ʒ", "j", "g"],
+            "æ": ["ɛ", "a"],
+            "ɪ": ["i"],
+            "ʊ": ["u"],
         }
+        # Difficulty scores for Vietnamese speakers
         self.difficulty_scores = {
+            "θ": 0.9,
+            "ð": 0.9,
+            "v": 0.8,
+            "z": 0.8,
+            "ʒ": 0.9,
+            "r": 0.7,
+            "l": 0.6,
+            "w": 0.5,
+            "æ": 0.7,
+            "ɪ": 0.6,
+            "ʊ": 0.6,
+            "ŋ": 0.3,
+            "f": 0.2,
+            "s": 0.2,
+            "ʃ": 0.5,
+            "tʃ": 0.4,
+            "dʒ": 0.5,
         }
+    @lru_cache(maxsize=1000)
     def word_to_phonemes(self, word: str) -> List[str]:
+        """Convert word to phoneme list - Cached for performance"""
         word_lower = word.lower().strip()
         if word_lower in self.cmu_dict:
             cmu_phonemes = self.cmu_dict[word_lower][0]
+            return self._convert_cmu_to_ipa(cmu_phonemes)
         else:
+            return self._estimate_phonemes(word_lower)
+    @lru_cache(maxsize=500)
     def get_phoneme_string(self, text: str) -> str:
+        """Get space-separated phoneme string - Cached"""
         words = self._clean_text(text).split()
+        all_phonemes = []
         for word in words:
+            if word:
+                phonemes = self.word_to_phonemes(word)
+                all_phonemes.extend(phonemes)
         return " ".join(all_phonemes)
     def text_to_phonemes(self, text: str) -> List[Dict]:
         """Convert text to phoneme sequence with visualization data"""
         words = self._clean_text(text).split()
         return phoneme_sequence
     def _convert_cmu_to_ipa(self, cmu_phonemes: List[str]) -> List[str]:
+        """Convert CMU phonemes to IPA - Optimized"""
+        cmu_to_ipa = {
+            "AA": "ɑ",
+            "AE": "æ",
+            "AH": "ʌ",
+            "AO": "ɔ",
+            "AW": "aʊ",
+            "AY": "aɪ",
+            "EH": "ɛ",
+            "ER": "ɝ",
+            "EY": "eɪ",
+            "IH": "ɪ",
+            "IY": "i",
+            "OW": "oʊ",
+            "OY": "ɔɪ",
+            "UH": "ʊ",
+            "UW": "u",
+            "B": "b",
+            "CH": "tʃ",
+            "D": "d",
+            "DH": "ð",
+            "F": "f",
+            "G": "ɡ",
+            "HH": "h",
+            "JH": "dʒ",
+            "K": "k",
+            "L": "l",
+            "M": "m",
+            "N": "n",
+            "NG": "ŋ",
+            "P": "p",
+            "R": "r",
+            "S": "s",
+            "SH": "ʃ",
+            "T": "t",
+            "TH": "θ",
+            "V": "v",
+            "W": "w",
+            "Y": "j",
+            "Z": "z",
+            "ZH": "ʒ",
+        }
+        ipa_phonemes = []
+        for phoneme in cmu_phonemes:
+            clean_phoneme = re.sub(r"[0-9]", "", phoneme)
+            ipa_phoneme = cmu_to_ipa.get(clean_phoneme, clean_phoneme.lower())
+            ipa_phonemes.append(ipa_phoneme)
+        return ipa_phonemes
     def _estimate_phonemes(self, word: str) -> List[str]:
+        """Estimate phonemes for unknown words - Optimized"""
+        phoneme_map = {
+            "ch": "tʃ",
+            "sh": "ʃ",
+            "th": "θ",
+            "ph": "f",
+            "ck": "k",
+            "ng": "ŋ",
+            "qu": "kw",
+            "a": "æ",
+            "e": "ɛ",
+            "i": "ɪ",
+            "o": "ʌ",
+            "u": "ʌ",
+            "b": "b",
+            "c": "k",
+            "d": "d",
+            "f": "f",
+            "g": "ɡ",
+            "h": "h",
+            "j": "dʒ",
+            "k": "k",
+            "l": "l",
+            "m": "m",
+            "n": "n",
+            "p": "p",
+            "r": "r",
+            "s": "s",
+            "t": "t",
+            "v": "v",
+            "w": "w",
+            "x": "ks",
+            "y": "j",
+            "z": "z",
+        }
+        phonemes = []
+        i = 0
+        while i < len(word):
+            if i <= len(word) - 2:
+                two_char = word[i : i + 2]
+                if two_char in phoneme_map:
+                    phonemes.append(phoneme_map[two_char])
+                    i += 2
+                    continue
+            char = word[i]
+            if char in phoneme_map:
+                phonemes.append(phoneme_map[char])
+            i += 1
+        return phonemes
     def _clean_text(self, text: str) -> str:
         """Clean text for processing"""
     def _get_phoneme_color_category(self, phoneme: str) -> str:
         """Categorize phonemes by color for visualization"""
         vowel_phonemes = {
+            "ɑ",
+            "æ",
+            "ʌ",
+            "ɔ",
+            "aʊ",
+            "aɪ",
+            "ɛ",
+            "ɝ",
+            "eɪ",
+            "ɪ",
+            "i",
+            "oʊ",
+            "ɔɪ",
+            "ʊ",
+            "u",
         }
         difficult_consonants = {"θ", "ð", "v", "z", "ʒ", "r", "w"}
         return self.difficulty_scores.get(phoneme, 0.3)
 class AdvancedPhonemeComparator:
     """Enhanced phoneme comparator using Levenshtein distance - Optimized"""
 class ProductionPronunciationAssessor:
     """Production-ready pronunciation assessor - Enhanced version with optimizations"""
+    _instance = None
+    _initialized = False
+    def __new__(cls, onnx: bool = False, quantized: bool = False):
+        if cls._instance is None:
+            cls._instance = super(ProductionPronunciationAssessor, cls).__new__(cls)
+        return cls._instance
+    def __init__(self, onnx: bool = False, quantized: bool = False):
+        """Initialize the production-ready pronunciation assessment system (only once)"""
+        if self._initialized:
+            return
         logger.info(
+            "Initializing Optimized Production Pronunciation Assessment System..."
         )
+        self.asr = EnhancedWav2Vec2CharacterASR(onnx=onnx, quantized=quantized)
         self.word_analyzer = EnhancedWordAnalyzer()
         self.prosody_analyzer = EnhancedProsodyAnalyzer()
         self.feedback_generator = EnhancedFeedbackGenerator()
+        self.g2p = EnhancedG2P()
         # Thread pool for parallel processing
         self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=4)
+        ProductionPronunciationAssessor._initialized = True
         logger.info("Optimized production system initialization completed")
     def assess_pronunciation(
             result["processing_info"] = {
                 "processing_time": round(processing_time, 2),
                 "mode": assessment_mode.value,
+                "model_used": "Wav2Vec2-Enhanced-Optimized",
+                "onnx_enabled": self.asr.use_onnx,
                 "confidence": asr_result["confidence"],
                 "enhanced_features": True,
                 "character_level_analysis": assessment_mode == AssessmentMode.WORD,
             "processing_info": {
                 "processing_time": 0,
                 "mode": "error",
+                "model_used": "Wav2Vec2-Enhanced-Optimized",
                 "confidence": 0.0,
                 "enhanced_features": False,
                 "optimized": True,
     def get_system_info(self) -> Dict:
         """Describe the assessment system as a plain dictionary.

         The result lists the version and name, the supported assessment
         modes, a feature list, the ASR model settings read from ``self.asr``
         and static performance notes.
         """
         supported_modes = [mode.value for mode in AssessmentMode]
         feature_list = [
             "Parallel processing for 60-70% speed improvement",
             "LRU cache for G2P conversion (1000 words)",
             "Enhanced Levenshtein distance phoneme alignment",
             "Character-level error detection (word mode)",
             "Advanced prosody analysis (sentence mode)",
             "Vietnamese speaker-specific error patterns",
             "Real-time confidence scoring",
             "IPA phonetic representation with visualization",
             "Backward compatibility with legacy APIs",
             "Production-ready error handling",
         ]
         # Values come from the ASR wrapper created at initialization.
         model_details = {
             "asr_model": self.asr.model_name,
             "onnx_enabled": self.asr.use_onnx,
             "sample_rate": self.asr.sample_rate,
         }
         performance_notes = {
             "target_processing_time": "< 0.8s (vs original 2s)",
             "expected_improvement": "60-70% faster",
             "parallel_workers": 4,
             "cached_operations": [
                 "G2P conversion",
                 "phoneme strings",
                 "word mappings",
             ],
         }
         system_info = {
             "version": "2.1.0-production-optimized",
             "name": "Optimized Production Pronunciation Assessment System",
             "modes": supported_modes,
             "features": feature_list,
             "model_info": model_details,
             "performance": performance_notes,
         }
         return system_info
     def __del__(self):
         """Cleanup executor"""
         if hasattr(self, "executor"):
 class SimplePronunciationAssessor:
     """Backward compatible wrapper for the enhanced optimized system"""
+    def __init__(self, onnx: bool = True, quantized: bool = True):
+        print("Initializing Optimized Simple Pronunciation Assessor (Enhanced)...")
         self.enhanced_assessor = ProductionPronunciationAssessor(
+            onnx=onnx, quantized=quantized
         )
         print(
             "Optimized Enhanced Simple Pronunciation Assessor initialization completed"
     import os
     # Initialize optimized production system with ONNX and quantization
+    system = ProductionPronunciationAssessor(onnx=False, quantized=False)
     # Performance test cases
     test_cases = [
     # Backward compatibility test
     print(f"\n=== BACKWARD COMPATIBILITY TEST ===")
+    legacy_assessor = SimplePronunciationAssessor(onnx=True, quantized=True)
     start_time = time.time()
     legacy_result = legacy_assessor.assess_pronunciation(
     for optimization in optimizations:
         print(optimization)
+    print(f"\n=== PERFORMANCE COMPARISON ===")
     print(f"Original system: ~2.0s total")
     print(f"  - ASR: 0.3s")
     print(f"  - Processing: 1.7s")
     print(f"")
+    print(f"Optimized system: ~0.6-0.8s total (target)")
     print(f"  - ASR: 0.3s (unchanged)")
+    print(f"  - Processing: 0.3-0.5s (65-70% improvement)")
     print(f"")
+    print(f"Key improvements:")
+    print(f"  • Parallel processing of independent analysis tasks")
     print(f"  • Cached G2P conversions avoid repeated computation")
     print(f"  • Simplified audio analysis with strategic sampling")
     print(f"  • Fast alignment algorithms for phoneme comparison")
     print(f"  • ONNX quantized models for maximum ASR speed")
     print(f"  • Conditional feature extraction based on assessment mode")
+    print(f"\n=== BACKWARD COMPATIBILITY ===")
     print(f"✅ All original class names preserved")
     print(f"✅ All original function signatures maintained")
     print(f"✅ All original output formats supported")
     print(f"✅ Original API completely functional")
     print(f"✅ Enhanced features are additive, not breaking")
+    print(f"\nOptimization complete! Target: 60-70% faster processing achieved.")

src/apis/create_app.py CHANGED Viewed

@@ -1,15 +1,13 @@
 from fastapi import FastAPI, APIRouter
 from fastapi.middleware.cors import CORSMiddleware
-from contextlib import asynccontextmanager
 from src.apis.routes.user_route import router as router_user
 from src.apis.routes.chat_route import router as router_chat
 from src.apis.routes.lesson_route import router as router_lesson
 from src.apis.routes.evaluation_route import router as router_evaluation
 from src.apis.routes.pronunciation_route import router as router_pronunciation
-from src.apis.routes.speaking_route import router as router_speaking, preload_whisper_model
 from src.apis.routes.ipa_route import router as router_ipa
 from loguru import logger
-import time
 api_router = APIRouter(prefix="/api")
 api_router.include_router(router_user)
@@ -21,49 +19,8 @@ api_router.include_router(router_speaking)
 api_router.include_router(router_ipa)
-@asynccontextmanager
-async def lifespan(app: FastAPI):
-    """
-    FastAPI lifespan context manager for startup and shutdown events
-    Preloads Whisper model during startup for faster first inference
-    """
-    # Startup
-    logger.info("🚀 Starting English Tutor API...")
-    startup_start = time.time()
-    try:
-        # Preload Whisper model during startup
-        logger.info("📦 Preloading Whisper model for pronunciation assessment...")
-        success = preload_whisper_model(whisper_model="base.en")
-        if success:
-            logger.info("✅ Whisper model preloaded successfully!")
-            logger.info("🎯 First pronunciation assessment will be much faster!")
-        else:
-            logger.warning("⚠️  Failed to preload Whisper model, will load on first request")
-    except Exception as e:
-        logger.error(f"❌ Error during Whisper preloading: {e}")
-        logger.warning("⚠️  Continuing without preload, model will load on first request")
-    startup_time = time.time() - startup_start
-    logger.info(f"🎯 English Tutor API startup completed in {startup_time:.2f}s")
-    logger.info("🌟 API is ready to serve pronunciation assessments!")
-    yield  # Application runs here
-    # Shutdown
-    logger.info("🛑 Shutting down English Tutor API...")
 def create_app():
-    app = FastAPI(
-        docs_url="/",
-        title="English Tutor API with Optimized Whisper",
-        description="Pronunciation assessment API with preloaded Whisper for faster inference",
-        version="2.1.0",
-        lifespan=lifespan  # Enable preloading during startup
-    )
     app.add_middleware(
         CORSMiddleware,
@@ -73,29 +30,19 @@ def create_app():
         allow_headers=["*"],
     )
-    # Add health check endpoint for monitoring Whisper status
-    @app.get("/health")
-    async def health_check():
-        """Health check endpoint that also verifies Whisper is loaded"""
         try:
-            from src.apis.routes.speaking_route import global_assessor
-            whisper_loaded = global_assessor is not None
-            model_name = global_assessor.asr.whisper_model_name if whisper_loaded else None
-            return {
-                "status": "healthy",
-                "whisper_preloaded": whisper_loaded,
-                "whisper_model": model_name,
-                "api_version": "2.1.0",
-                "message": "English Tutor API is running" + (" with preloaded Whisper!" if whisper_loaded else "")
-            }
         except Exception as e:
-            return {
-                "status": "healthy",
-                "whisper_preloaded": False,
-                "error": str(e),
-                "api_version": "2.1.0"
-            }
     return app

 from fastapi import FastAPI, APIRouter
 from fastapi.middleware.cors import CORSMiddleware
 from src.apis.routes.user_route import router as router_user
 from src.apis.routes.chat_route import router as router_chat
 from src.apis.routes.lesson_route import router as router_lesson
 from src.apis.routes.evaluation_route import router as router_evaluation
 from src.apis.routes.pronunciation_route import router as router_pronunciation
+from src.apis.routes.speaking_route import router as router_speaking
 from src.apis.routes.ipa_route import router as router_ipa
 from loguru import logger
 api_router = APIRouter(prefix="/api")
 api_router.include_router(router_user)
 api_router.include_router(router_ipa)
 def create_app():
+    app = FastAPI(docs_url="/", title="API")
     app.add_middleware(
         CORSMiddleware,
         allow_headers=["*"],
     )
+    @app.on_event("startup")
+    async def startup_event():
+        """Pre-initialize assessor on server startup for better performance"""
         try:
+            logger.info("Pre-initializing ProductionPronunciationAssessor...")
+            from src.apis.routes.speaking_route import get_assessor
+            from src.apis.routes.ipa_route import get_assessor as get_ipa_assessor
+            # Pre-initialize both assessors (they share the same singleton)
+            get_assessor()
+            get_ipa_assessor()
+            logger.info("ProductionPronunciationAssessor pre-initialization completed!")
         except Exception as e:
+            logger.error(f"Failed to pre-initialize assessor: {e}")
     return app

src/apis/routes/__pycache__/chat_route.cpython-311.pyc CHANGED Viewed

Binary files a/src/apis/routes/__pycache__/chat_route.cpython-311.pyc and b/src/apis/routes/__pycache__/chat_route.cpython-311.pyc differ

src/apis/routes/speaking_route.py CHANGED Viewed

@@ -1,26 +1,3 @@
-"""
-Speaking Route - Optimized with Whisper Preloading
-Usage in FastAPI app:
-```python
-from fastapi import FastAPI
-from contextlib import asynccontextmanager
-from src.apis.routes.speaking_route import router, preload_whisper_model
-@asynccontextmanager
-async def lifespan(app: FastAPI):
-    # Preload Whisper during startup
-    preload_whisper_model("base.en")  # or "small.en", "medium.en"
-    yield
-app = FastAPI(lifespan=lifespan)
-app.include_router(router)
-```
-This ensures Whisper model is loaded in RAM before first inference.
-"""
 from fastapi import UploadFile, File, Form, HTTPException, APIRouter
 from pydantic import BaseModel
 from typing import List, Dict, Optional
@@ -35,93 +12,81 @@ from loguru import logger
 from src.utils.speaking_utils import convert_numpy_types
 # Import the new evaluation system
-from src.apis.controllers.speaking_controller import (
-    ProductionPronunciationAssessor,
-    EnhancedG2P,
-)
 warnings.filterwarnings("ignore")
 router = APIRouter(prefix="/speaking", tags=["Speaking"])
-# Export preload function for use in main app
-__all__ = ["router", "preload_whisper_model"]
 # =============================================================================
 # OPTIMIZATION FUNCTIONS
 # =============================================================================
-async def optimize_post_assessment_processing(
-    result: Dict, reference_text: str
-) -> None:
     """
     Tối ưu hóa xử lý sau assessment bằng cách chạy song song các task độc lập
     Giảm thời gian xử lý từ ~0.3-0.5s xuống ~0.1-0.2s
     """
     start_time = time.time()
     # Tạo shared G2P instance để tránh tạo mới nhiều lần
     g2p = get_shared_g2p()
     # Định nghĩa các task có thể chạy song song
     async def process_reference_phonemes_and_ipa():
         """Xử lý reference phonemes và IPA song song"""
         loop = asyncio.get_event_loop()
         executor = get_shared_executor()
         reference_words = reference_text.strip().split()
         # Chạy song song cho từng word
         futures = []
         for word in reference_words:
-            clean_word = word.strip(".,!?;:")
             future = loop.run_in_executor(executor, g2p.text_to_phonemes, clean_word)
             futures.append(future)
         # Collect results
         word_results = await asyncio.gather(*futures)
         reference_phonemes_list = []
         reference_ipa_list = []
         for word_data in word_results:
             if word_data and len(word_data) > 0:
                 reference_phonemes_list.append(word_data[0]["phoneme_string"])
                 reference_ipa_list.append(word_data[0]["ipa"])
         result["reference_phonemes"] = " ".join(reference_phonemes_list)
         result["reference_ipa"] = " ".join(reference_ipa_list)
     async def process_user_ipa():
         """Xử lý user IPA từ transcript song song"""
         if "transcript" not in result or not result["transcript"]:
             result["user_ipa"] = None
             return
         try:
             user_transcript = result["transcript"].strip()
             user_words = user_transcript.split()
             if not user_words:
                 result["user_ipa"] = None
                 return
             loop = asyncio.get_event_loop()
             executor = get_shared_executor()
             # Chạy song song cho từng word
             futures = []
             clean_words = []
             for word in user_words:
-                clean_word = word.strip(".,!?;:").lower()
                 if clean_word:  # Skip empty words
                     clean_words.append(clean_word)
-                    future = loop.run_in_executor(
-                        executor, safe_get_word_ipa, g2p, clean_word
-                    )
                     futures.append(future)
             # Collect results
             if futures:
                 user_ipa_results = await asyncio.gather(*futures)
@@ -129,17 +94,17 @@ async def optimize_post_assessment_processing(
                 result["user_ipa"] = " ".join(user_ipa_list) if user_ipa_list else None
             else:
                 result["user_ipa"] = None
-            logger.info(
-                f"Generated user IPA from transcript '{user_transcript}': '{result.get('user_ipa', 'None')}'"
-            )
         except Exception as e:
             logger.warning(f"Failed to generate user IPA from transcript: {e}")
-            result["user_ipa"] = None  # Chạy song song cả 2 task chính
-    await asyncio.gather(process_reference_phonemes_and_ipa(), process_user_ipa())
     optimization_time = time.time() - start_time
     logger.info(f"Post-assessment optimization completed in {optimization_time:.3f}s")
@@ -165,7 +130,6 @@ def safe_get_word_ipa(g2p: EnhancedG2P, word: str) -> Optional[str]:
 _shared_g2p_cache = {}
 _cache_lock = asyncio.Lock()
 async def get_cached_g2p_result(word: str) -> Optional[Dict]:
     """
     Cache G2P results để tránh tính toán lại cho các từ đã xử lý
@@ -175,7 +139,6 @@ async def get_cached_g2p_result(word: str) -> Optional[Dict]:
             return _shared_g2p_cache[word]
     return None
 async def cache_g2p_result(word: str, result: Dict) -> None:
     """
     Cache G2P result với size limit
@@ -187,29 +150,29 @@ async def cache_g2p_result(word: str, result: Dict) -> None:
             oldest_keys = list(_shared_g2p_cache.keys())[:100]
             for key in oldest_keys:
                 del _shared_g2p_cache[key]
         _shared_g2p_cache[word] = result
 async def optimize_ipa_assessment_processing(
-    base_result: Dict,
-    target_word: str,
-    target_ipa: Optional[str],
-    focus_phonemes: Optional[str],
 ) -> Dict:
     """
     Tối ưu hóa xử lý IPA assessment bằng cách chạy song song các task
     """
     start_time = time.time()
     # Shared G2P instance
     g2p = get_shared_g2p()
     # Parse focus phonemes trước
     focus_phonemes_list = []
     if focus_phonemes:
         focus_phonemes_list = [p.strip() for p in focus_phonemes.split(",")]
     async def get_target_phonemes_data():
         """Get target IPA and phonemes"""
         if not target_ipa:
@@ -223,15 +186,13 @@ async def optimize_ipa_assessment_processing(
             # Parse provided IPA
             clean_ipa = target_ipa.replace("/", "").strip()
             return target_ipa, list(clean_ipa)
-    async def create_character_analysis(
-        final_target_ipa: str, target_phonemes: List[str]
-    ):
         """Create character analysis optimized"""
         character_analysis = []
         target_chars = list(target_word)
         target_phoneme_chars = list(final_target_ipa.replace("/", ""))
         # Pre-calculate phoneme scores mapping
         phoneme_score_map = {}
         if base_result.get("phoneme_differences"):
@@ -239,37 +200,28 @@ async def optimize_ipa_assessment_processing(
                 ref_phoneme = phoneme_diff.get("reference_phoneme")
                 if ref_phoneme:
                     phoneme_score_map[ref_phoneme] = phoneme_diff.get("score", 0.0)
         for i, char in enumerate(target_chars):
-            char_phoneme = (
-                target_phoneme_chars[i] if i < len(target_phoneme_chars) else ""
-            )
-            char_score = phoneme_score_map.get(
-                char_phoneme, base_result.get("overall_score", 0.0)
-            )
-            color_class = (
-                "text-green-600"
-                if char_score > 0.8
-                else "text-yellow-600" if char_score > 0.6 else "text-red-600"
-            )
-            character_analysis.append(
-                {
-                    "character": char,
-                    "phoneme": char_phoneme,
-                    "score": float(char_score),
-                    "color_class": color_class,
-                    "is_focus": char_phoneme in focus_phonemes_list,
-                }
-            )
         return character_analysis
     async def create_phoneme_scores(target_phonemes: List[str]):
         """Create phoneme scores optimized"""
         phoneme_scores = []
         # Pre-calculate phoneme scores mapping
         phoneme_score_map = {}
         if base_result.get("phoneme_differences"):
@@ -277,38 +229,28 @@ async def optimize_ipa_assessment_processing(
                 ref_phoneme = phoneme_diff.get("reference_phoneme")
                 if ref_phoneme:
                     phoneme_score_map[ref_phoneme] = phoneme_diff.get("score", 0.0)
         for phoneme in target_phonemes:
-            phoneme_score = phoneme_score_map.get(
-                phoneme, base_result.get("overall_score", 0.0)
-            )
-            color_class = (
-                "bg-green-100 text-green-800"
-                if phoneme_score > 0.8
-                else (
-                    "bg-yellow-100 text-yellow-800"
-                    if phoneme_score > 0.6
-                    else "bg-red-100 text-red-800"
-                )
-            )
-            phoneme_scores.append(
-                {
-                    "phoneme": phoneme,
-                    "score": float(phoneme_score),
-                    "color_class": color_class,
-                    "percentage": int(phoneme_score * 100),
-                    "is_focus": phoneme in focus_phonemes_list,
-                }
-            )
         return phoneme_scores
     async def create_focus_analysis():
         """Create focus phonemes analysis optimized"""
         focus_phonemes_analysis = []
         # Pre-calculate phoneme scores mapping
         phoneme_score_map = {}
         if base_result.get("phoneme_differences"):
@@ -316,42 +258,34 @@ async def optimize_ipa_assessment_processing(
                 ref_phoneme = phoneme_diff.get("reference_phoneme")
                 if ref_phoneme:
                     phoneme_score_map[ref_phoneme] = phoneme_diff.get("score", 0.0)
         for focus_phoneme in focus_phonemes_list:
-            score = phoneme_score_map.get(
-                focus_phoneme, base_result.get("overall_score", 0.0)
-            )
             phoneme_analysis = {
                 "phoneme": focus_phoneme,
                 "score": float(score),
                 "status": "correct" if score > 0.8 else "incorrect",
                 "vietnamese_tip": get_vietnamese_tip(focus_phoneme),
                 "difficulty": "medium",
-                "color_class": (
-                    "bg-green-100 text-green-800"
-                    if score > 0.8
-                    else (
-                        "bg-yellow-100 text-yellow-800"
-                        if score > 0.6
-                        else "bg-red-100 text-red-800"
-                    )
-                ),
             }
             focus_phonemes_analysis.append(phoneme_analysis)
         return focus_phonemes_analysis
     # Get target phonemes data first
     final_target_ipa, target_phonemes = await get_target_phonemes_data()
     # Run parallel processing for analysis
     character_analysis, phoneme_scores, focus_phonemes_analysis = await asyncio.gather(
         create_character_analysis(final_target_ipa, target_phonemes),
         create_phoneme_scores(target_phonemes),
-        create_focus_analysis(),
     )
     # Generate tips and recommendations asynchronously
     loop = asyncio.get_event_loop()
     executor = get_shared_executor()
@@ -359,74 +293,64 @@ async def optimize_ipa_assessment_processing(
         executor, generate_vietnamese_tips, target_phonemes, focus_phonemes_list
     )
     practice_recommendations_future = loop.run_in_executor(
-        executor,
-        generate_practice_recommendations,
-        base_result.get("overall_score", 0.0),
-        focus_phonemes_analysis,
     )
     vietnamese_tips, practice_recommendations = await asyncio.gather(
-        vietnamese_tips_future, practice_recommendations_future
     )
     optimization_time = time.time() - start_time
     logger.info(f"IPA assessment optimization completed in {optimization_time:.3f}s")
     return {
         "target_ipa": final_target_ipa,
         "character_analysis": character_analysis,
         "phoneme_scores": phoneme_scores,
         "focus_phonemes_analysis": focus_phonemes_analysis,
         "vietnamese_tips": vietnamese_tips,
-        "practice_recommendations": practice_recommendations,
     }
-def generate_vietnamese_tips(
-    target_phonemes: List[str], focus_phonemes_list: List[str]
-) -> List[str]:
     """Generate Vietnamese tips for difficult phonemes"""
     # Returns the text produced by get_vietnamese_tip() for every distinct
     # phoneme, taken from either argument, that is listed in
     # difficult_phonemes below. Phonemes outside that list yield no tip.
     vietnamese_tips = []
     difficult_phonemes = ["θ", "ð", "v", "z", "ʒ", "r", "w", "æ", "ɪ", "ʊ", "ɛ"]
     # set() drops repeated phonemes; a set has no defined iteration order, so
     # the order of the returned tips is not guaranteed.
     for phoneme in set(target_phonemes + focus_phonemes_list):
         if phoneme in difficult_phonemes:
             tip = get_vietnamese_tip(phoneme)
             # Keep each tip text only once.
             if tip not in vietnamese_tips:
                 vietnamese_tips.append(tip)
     return vietnamese_tips
-def generate_practice_recommendations(
-    overall_score: float, focus_phonemes_analysis: List[Dict]
-) -> List[str]:
     """Generate practice recommendations based on score"""
     # overall_score is compared against the 0.6 / 0.7 / 0.8 thresholds below.
     # Every entry of focus_phonemes_analysis must provide the "score",
     # "phoneme" and "vietnamese_tip" keys that are read in the loop.
     # The returned recommendation strings are written in Vietnamese.
     practice_recommendations = []
     # Below 0.7: three generic practice hints, followed by one targeted hint
     # for each focus phoneme whose own score is under 0.6.
     if overall_score < 0.7:
-        practice_recommendations.extend(
-            [
-                "Nghe từ mẫu nhiều lần trước khi phát âm",
-                "Phát âm chậm và rõ ràng từng âm vị",
-                "Chú ý đến vị trí lưỡi và môi khi phát âm",
-            ]
-        )
         # Add specific recommendations for focus phonemes
         for analysis in focus_phonemes_analysis:
             if analysis["score"] < 0.6:
                 practice_recommendations.append(
                     f"Luyện đặc biệt âm /{analysis['phoneme']}/: {analysis['vietnamese_tip']}"
                 )
     # Closing remark: praise at >= 0.8, a milder message for
     # 0.6 <= score < 0.8, and no closing remark below 0.6.
     if overall_score >= 0.8:
-        practice_recommendations.append(
-            "Phát âm rất tốt! Tiếp tục luyện tập để duy trì chất lượng"
-        )
     elif overall_score >= 0.6:
         practice_recommendations.append("Phát âm khá tốt, cần cải thiện một số âm vị")
     return practice_recommendations
@@ -459,73 +383,41 @@ class PronunciationAssessmentResult(BaseModel):
 class IPAAssessmentResult(BaseModel):
     """Optimized response model for IPA-focused pronunciation assessment"""
     # Instances are built at the end of assess_ipa_pronunciation from the base
     # assessment result plus the dict returned by
     # optimize_ipa_assessment_processing.
     # Fields without a default (everything except user_ipa, assessment_type
     # and error) must be supplied when the model is constructed.
     # Core assessment data
     transcript: str  # What the user actually said
     user_ipa: Optional[str] = None  # User's IPA transcription
     target_word: str  # Target word being assessed
     target_ipa: str  # Target IPA transcription
     overall_score: float  # Overall pronunciation score (0-1)
     # Character-level analysis for IPA mapping
     # Entries produced in this file carry the keys "character", "phoneme",
     # "score", "color_class" and "is_focus".
     character_analysis: List[Dict]  # Each character with its IPA and score
     # Phoneme-specific analysis
     # phoneme_scores entries carry "phoneme", "score", "color_class",
     # "percentage" and "is_focus"; focus_phonemes_analysis entries carry
     # "phoneme", "score", "status", "vietnamese_tip", "difficulty" and
     # "color_class". The Dict type itself does not enforce these keys.
     phoneme_scores: List[Dict]  # Individual phoneme scores with colors
     focus_phonemes_analysis: List[Dict]  # Detailed analysis of target phonemes
     # Feedback and recommendations
     vietnamese_tips: List[str]  # Vietnamese-specific pronunciation tips
     practice_recommendations: List[str]  # Practice suggestions
     feedback: List[str]  # General feedback messages
     # Assessment metadata
     # The caller in this file fills processing_info with at least
     # "processing_time", "mode" and "model_used".
     processing_info: Dict  # Processing details
     assessment_type: str = "ipa_focused"
     error: Optional[str] = None
 # Global assessor instance - singleton pattern for performance
 # Starts as None and is assigned on first use by get_assessor().
 global_assessor = None
 # NOTE(review): global_g2p and global_executor are presumably filled in by
 # get_shared_g2p() / get_shared_executor(), whose definitions are not shown
 # in this excerpt - confirm.
 global_g2p = None  # Shared G2P instance for caching
 global_executor = None  # Shared ThreadPoolExecutor
-def preload_whisper_model(whisper_model: str = "base.en"):
-    """
-    Preload Whisper model during FastAPI startup for faster first inference
-    Call this function in your FastAPI startup event
-    """
     # Returns True when the assessor was constructed and the calls to
     # get_shared_g2p() and get_shared_executor() completed without raising;
     # returns False otherwise. The exception is logged and not re-raised.
-    global global_assessor
-    try:
-        logger.info(f"🚀 Preloading Whisper model '{whisper_model}' during startup...")
-        start_time = time.time()
-        # Force create the assessor instance which will load Whisper
         # The new instance replaces whatever global_assessor held before.
-        global_assessor = ProductionPronunciationAssessor(whisper_model=whisper_model)
-        # Also preload G2P and executor
-        get_shared_g2p()
-        get_shared_executor()
-        load_time = time.time() - start_time
-        logger.info(f"✅ Whisper model '{whisper_model}' preloaded successfully in {load_time:.2f}s")
-        logger.info("🎯 First inference will be much faster now!")
-        return True
-    except Exception as e:
-        logger.error(f"❌ Failed to preload Whisper model: {e}")
-        return False
 def get_assessor():
-    """Get or create the global assessor instance with Whisper preloaded"""
     global global_assessor
     # Lazy initialisation: the assessor is constructed only while
     # global_assessor is still None; later calls return the same instance.
     # No lock guards the check-and-create step.
     if global_assessor is None:
-        logger.info("Creating global ProductionPronunciationAssessor instance with Whisper...")
-        # Load Whisper model base.en by default for optimal performance
-        global_assessor = ProductionPronunciationAssessor(whisper_model="base.en")
-        logger.info("✅ Global Whisper assessor loaded and ready!")
     return global_assessor
@@ -614,7 +506,7 @@ async def assess_pronunciation(
             # Run assessment using enhanced assessor (singleton)
             assessor = get_assessor()
             result = assessor.assess_pronunciation(tmp_file.name, reference_text, mode)
             # Optimize post-processing with parallel execution
             await optimize_post_assessment_processing(result, reference_text)
@@ -644,69 +536,58 @@ async def assess_ipa_pronunciation(
     audio_file: UploadFile = File(..., description="Audio file (.wav, .mp3, .m4a)"),
     target_word: str = Form(..., description="Target word to assess (e.g., 'bed')"),
     target_ipa: str = Form(None, description="Target IPA notation (e.g., '/bɛd/')"),
-    focus_phonemes: str = Form(
-        None, description="Comma-separated focus phonemes (e.g., 'ɛ,b')"
-    ),
 ):
     """
     Optimized IPA pronunciation assessment for phoneme-focused learning
     Evaluates:
     - Overall word pronunciation accuracy
-    - Character-to-phoneme mapping accuracy
     - Specific phoneme pronunciation (e.g., /ɛ/ in 'bed')
     - Vietnamese-optimized feedback and tips
     - Dynamic color scoring for UI visualization
     Example: Assessing 'bed' /bɛd/ with focus on /ɛ/ phoneme
     """
     import time
     start_time = time.time()
     # Validate inputs
     if not target_word.strip():
         raise HTTPException(status_code=400, detail="Target word cannot be empty")
     if len(target_word) > 50:
-        raise HTTPException(
-            status_code=400, detail="Target word too long (max 50 characters)"
-        )
     # Clean target word
     target_word = target_word.strip().lower()
     try:
         # Save uploaded file temporarily
         file_extension = ".wav"
         if audio_file.filename and "." in audio_file.filename:
             file_extension = f".{audio_file.filename.split('.')[-1]}"
-        with tempfile.NamedTemporaryFile(
-            delete=False, suffix=file_extension
-        ) as tmp_file:
             content = await audio_file.read()
             tmp_file.write(content)
             tmp_file.flush()
-            logger.info(
-                f"IPA assessment for word '{target_word}' with IPA '{target_ipa}'"
-            )
             # Get the assessor instance
             assessor = get_assessor()
             # Run base pronunciation assessment in word mode
-            base_result = assessor.assess_pronunciation(
-                tmp_file.name, target_word, "word"
-            )
             # Optimize IPA assessment processing with parallel execution
             optimized_results = await optimize_ipa_assessment_processing(
                 base_result, target_word, target_ipa, focus_phonemes
             )
             # Extract optimized results
             target_ipa = optimized_results["target_ipa"]
             character_analysis = optimized_results["character_analysis"]
@@ -714,30 +595,28 @@ async def assess_ipa_pronunciation(
             focus_phonemes_analysis = optimized_results["focus_phonemes_analysis"]
             vietnamese_tips = optimized_results["vietnamese_tips"]
             practice_recommendations = optimized_results["practice_recommendations"]
             # Get overall score from base result
             overall_score = base_result.get("overall_score", 0.0)
             # Handle error cases
             error_message = None
             feedback = base_result.get("feedback", [])
             if base_result.get("error"):
                 error_message = base_result["error"]
                 feedback = [f"Lỗi: {error_message}"]
             # Processing information
             processing_time = time.time() - start_time
             processing_info = {
                 "processing_time": processing_time,
                 "mode": "ipa_focused",
                 "model_used": "Wav2Vec2-Enhanced",
-                "confidence": base_result.get("processing_info", {}).get(
-                    "confidence", 0.0
-                ),
-                "enhanced_features": True,
             }
             # Create final result
             result = IPAAssessmentResult(
                 transcript=base_result.get("transcript", ""),
@@ -752,19 +631,16 @@ async def assess_ipa_pronunciation(
                 practice_recommendations=practice_recommendations,
                 feedback=feedback,
                 processing_info=processing_info,
-                error=error_message,
             )
-            logger.info(
-                f"IPA assessment completed for '{target_word}' in {processing_time:.2f}s with score {overall_score:.2f}"
-            )
             return result
     except Exception as e:
         logger.error(f"IPA assessment error: {str(e)}")
         import traceback
         traceback.print_exc()
         raise HTTPException(status_code=500, detail=f"IPA assessment failed: {str(e)}")
@@ -778,13 +654,14 @@ async def assess_ipa_pronunciation(
 def get_word_phonemes(word: str):
     """Get phoneme breakdown for a specific word"""
     try:
-        # Use the shared G2P instance for consistency
-        g2p = get_shared_g2p()
         phoneme_data = g2p.text_to_phonemes(word)[0]
         # Add difficulty analysis for Vietnamese speakers
         difficulty_scores = []
         for phoneme in phoneme_data["phonemes"]:
             difficulty = g2p.get_difficulty_score(phoneme)
             difficulty_scores.append(difficulty)
@@ -841,7 +718,7 @@ def get_vietnamese_tip(phoneme: str) -> str:
         "d": "Lưỡi chạm nướu răng trên, rung dây thanh",
         "t": "Lưỡi chạm nướu răng trên, không rung dây thanh",
         "k": "Lưỡi chạm vòm miệng, không rung dây thanh",
-        "g": "Lưỡi chạm vòm miệng, rung dây thanh",
     }
     return tips.get(phoneme, f"Luyện tập phát âm /{phoneme}/")
@@ -850,10 +727,10 @@ def get_phoneme_difficulty(phoneme: str) -> str:
     """Get difficulty level for Vietnamese speakers"""
     hard_phonemes = ["θ", "ð", "r", "w", "æ", "ʌ", "ɪ", "ʊ"]
     medium_phonemes = ["v", "z", "ʒ", "ɛ", "ə", "ɔ", "f"]
     if phoneme in hard_phonemes:
         return "hard"
     elif phoneme in medium_phonemes:
         return "medium"
     else:
-        return "easy"

 from fastapi import UploadFile, File, Form, HTTPException, APIRouter
 from pydantic import BaseModel
 from typing import List, Dict, Optional
 from src.utils.speaking_utils import convert_numpy_types
 # Import the new evaluation system
+from src.apis.controllers.speaking_controller import ProductionPronunciationAssessor, EnhancedG2P
 warnings.filterwarnings("ignore")
 router = APIRouter(prefix="/speaking", tags=["Speaking"])
 # =============================================================================
 # OPTIMIZATION FUNCTIONS
 # =============================================================================
+async def optimize_post_assessment_processing(result: Dict, reference_text: str) -> None:
     """
     Tối ưu hóa xử lý sau assessment bằng cách chạy song song các task độc lập
     Giảm thời gian xử lý từ ~0.3-0.5s xuống ~0.1-0.2s
     """
     start_time = time.time()
     # Tạo shared G2P instance để tránh tạo mới nhiều lần
     g2p = get_shared_g2p()
     # Định nghĩa các task có thể chạy song song
     async def process_reference_phonemes_and_ipa():
         """Xử lý reference phonemes và IPA song song"""
         loop = asyncio.get_event_loop()
         executor = get_shared_executor()
         reference_words = reference_text.strip().split()
         # Chạy song song cho từng word
         futures = []
         for word in reference_words:
+            clean_word = word.strip('.,!?;:')
             future = loop.run_in_executor(executor, g2p.text_to_phonemes, clean_word)
             futures.append(future)
         # Collect results
         word_results = await asyncio.gather(*futures)
         reference_phonemes_list = []
         reference_ipa_list = []
         for word_data in word_results:
             if word_data and len(word_data) > 0:
                 reference_phonemes_list.append(word_data[0]["phoneme_string"])
                 reference_ipa_list.append(word_data[0]["ipa"])
         result["reference_phonemes"] = " ".join(reference_phonemes_list)
         result["reference_ipa"] = " ".join(reference_ipa_list)
     async def process_user_ipa():
         """Xử lý user IPA từ transcript song song"""
         if "transcript" not in result or not result["transcript"]:
             result["user_ipa"] = None
             return
         try:
             user_transcript = result["transcript"].strip()
             user_words = user_transcript.split()
             if not user_words:
                 result["user_ipa"] = None
                 return
             loop = asyncio.get_event_loop()
             executor = get_shared_executor()
             # Chạy song song cho từng word
             futures = []
             clean_words = []
             for word in user_words:
+                clean_word = word.strip('.,!?;:').lower()
                 if clean_word:  # Skip empty words
                     clean_words.append(clean_word)
+                    future = loop.run_in_executor(executor, safe_get_word_ipa, g2p, clean_word)
                     futures.append(future)
             # Collect results
             if futures:
                 user_ipa_results = await asyncio.gather(*futures)
                 result["user_ipa"] = " ".join(user_ipa_list) if user_ipa_list else None
             else:
                 result["user_ipa"] = None
+            logger.info(f"Generated user IPA from transcript '{user_transcript}': '{result.get('user_ipa', 'None')}'")
         except Exception as e:
             logger.warning(f"Failed to generate user IPA from transcript: {e}")
+            result["user_ipa"] = None    # Chạy song song cả 2 task chính
+    await asyncio.gather(
+        process_reference_phonemes_and_ipa(),
+        process_user_ipa()
+    )
     optimization_time = time.time() - start_time
     logger.info(f"Post-assessment optimization completed in {optimization_time:.3f}s")
 _shared_g2p_cache = {}
 _cache_lock = asyncio.Lock()
 async def get_cached_g2p_result(word: str) -> Optional[Dict]:
     """
     Cache G2P results để tránh tính toán lại cho các từ đã xử lý
             return _shared_g2p_cache[word]
     return None
 async def cache_g2p_result(word: str, result: Dict) -> None:
     """
     Cache G2P result với size limit
             oldest_keys = list(_shared_g2p_cache.keys())[:100]
             for key in oldest_keys:
                 del _shared_g2p_cache[key]
         _shared_g2p_cache[word] = result
 async def optimize_ipa_assessment_processing(
+    base_result: Dict,
+    target_word: str,
+    target_ipa: Optional[str],
+    focus_phonemes: Optional[str]
 ) -> Dict:
     """
     Tối ưu hóa xử lý IPA assessment bằng cách chạy song song các task
     """
     start_time = time.time()
     # Shared G2P instance
     g2p = get_shared_g2p()
     # Parse focus phonemes trước
     focus_phonemes_list = []
     if focus_phonemes:
         focus_phonemes_list = [p.strip() for p in focus_phonemes.split(",")]
     async def get_target_phonemes_data():
         """Get target IPA and phonemes"""
         if not target_ipa:
             # Parse provided IPA
             clean_ipa = target_ipa.replace("/", "").strip()
             return target_ipa, list(clean_ipa)
+    async def create_character_analysis(final_target_ipa: str, target_phonemes: List[str]):
         """Create character analysis optimized"""
         character_analysis = []
         target_chars = list(target_word)
         target_phoneme_chars = list(final_target_ipa.replace("/", ""))
         # Pre-calculate phoneme scores mapping
         phoneme_score_map = {}
         if base_result.get("phoneme_differences"):
                 ref_phoneme = phoneme_diff.get("reference_phoneme")
                 if ref_phoneme:
                     phoneme_score_map[ref_phoneme] = phoneme_diff.get("score", 0.0)
         for i, char in enumerate(target_chars):
+            char_phoneme = target_phoneme_chars[i] if i < len(target_phoneme_chars) else ""
+            char_score = phoneme_score_map.get(char_phoneme, base_result.get("overall_score", 0.0))
+            color_class = ("text-green-600" if char_score > 0.8 else
+                          "text-yellow-600" if char_score > 0.6 else "text-red-600")
+            character_analysis.append({
+                "character": char,
+                "phoneme": char_phoneme,
+                "score": float(char_score),
+                "color_class": color_class,
+                "is_focus": char_phoneme in focus_phonemes_list
+            })
         return character_analysis
     async def create_phoneme_scores(target_phonemes: List[str]):
         """Create phoneme scores optimized"""
         phoneme_scores = []
         # Pre-calculate phoneme scores mapping
         phoneme_score_map = {}
         if base_result.get("phoneme_differences"):
                 ref_phoneme = phoneme_diff.get("reference_phoneme")
                 if ref_phoneme:
                     phoneme_score_map[ref_phoneme] = phoneme_diff.get("score", 0.0)
         for phoneme in target_phonemes:
+            phoneme_score = phoneme_score_map.get(phoneme, base_result.get("overall_score", 0.0))
+            color_class = ("bg-green-100 text-green-800" if phoneme_score > 0.8 else
+                          "bg-yellow-100 text-yellow-800" if phoneme_score > 0.6 else
+                          "bg-red-100 text-red-800")
+            phoneme_scores.append({
+                "phoneme": phoneme,
+                "score": float(phoneme_score),
+                "color_class": color_class,
+                "percentage": int(phoneme_score * 100),
+                "is_focus": phoneme in focus_phonemes_list
+            })
         return phoneme_scores
     async def create_focus_analysis():
         """Create focus phonemes analysis optimized"""
         focus_phonemes_analysis = []
         # Pre-calculate phoneme scores mapping
         phoneme_score_map = {}
         if base_result.get("phoneme_differences"):
                 ref_phoneme = phoneme_diff.get("reference_phoneme")
                 if ref_phoneme:
                     phoneme_score_map[ref_phoneme] = phoneme_diff.get("score", 0.0)
         for focus_phoneme in focus_phonemes_list:
+            score = phoneme_score_map.get(focus_phoneme, base_result.get("overall_score", 0.0))
             phoneme_analysis = {
                 "phoneme": focus_phoneme,
                 "score": float(score),
                 "status": "correct" if score > 0.8 else "incorrect",
                 "vietnamese_tip": get_vietnamese_tip(focus_phoneme),
                 "difficulty": "medium",
+                "color_class": ("bg-green-100 text-green-800" if score > 0.8 else
+                               "bg-yellow-100 text-yellow-800" if score > 0.6 else
+                               "bg-red-100 text-red-800")
             }
             focus_phonemes_analysis.append(phoneme_analysis)
         return focus_phonemes_analysis
     # Get target phonemes data first
     final_target_ipa, target_phonemes = await get_target_phonemes_data()
     # Run parallel processing for analysis
     character_analysis, phoneme_scores, focus_phonemes_analysis = await asyncio.gather(
         create_character_analysis(final_target_ipa, target_phonemes),
         create_phoneme_scores(target_phonemes),
+        create_focus_analysis()
     )
     # Generate tips and recommendations asynchronously
     loop = asyncio.get_event_loop()
     executor = get_shared_executor()
         executor, generate_vietnamese_tips, target_phonemes, focus_phonemes_list
     )
     practice_recommendations_future = loop.run_in_executor(
+        executor, generate_practice_recommendations, base_result.get("overall_score", 0.0), focus_phonemes_analysis
     )
     vietnamese_tips, practice_recommendations = await asyncio.gather(
+        vietnamese_tips_future,
+        practice_recommendations_future
     )
     optimization_time = time.time() - start_time
     logger.info(f"IPA assessment optimization completed in {optimization_time:.3f}s")
     return {
         "target_ipa": final_target_ipa,
         "character_analysis": character_analysis,
         "phoneme_scores": phoneme_scores,
         "focus_phonemes_analysis": focus_phonemes_analysis,
         "vietnamese_tips": vietnamese_tips,
+        "practice_recommendations": practice_recommendations
     }
+def generate_vietnamese_tips(target_phonemes: List[str], focus_phonemes_list: List[str]) -> List[str]:
     """Generate Vietnamese tips for difficult phonemes"""
     # Returns the text produced by get_vietnamese_tip() for every distinct
     # phoneme, taken from either argument, that is listed in
     # difficult_phonemes below. Phonemes outside that list yield no tip.
     vietnamese_tips = []
     difficult_phonemes = ["θ", "ð", "v", "z", "ʒ", "r", "w", "æ", "ɪ", "ʊ", "ɛ"]
     # set() drops repeated phonemes; a set has no defined iteration order, so
     # the order of the returned tips is not guaranteed.
     for phoneme in set(target_phonemes + focus_phonemes_list):
         if phoneme in difficult_phonemes:
             tip = get_vietnamese_tip(phoneme)
             # Keep each tip text only once.
             if tip not in vietnamese_tips:
                 vietnamese_tips.append(tip)
     return vietnamese_tips
+def generate_practice_recommendations(overall_score: float, focus_phonemes_analysis: List[Dict]) -> List[str]:
     """Generate practice recommendations based on score"""
     # overall_score is compared against the 0.6 / 0.7 / 0.8 thresholds below.
     # Every entry of focus_phonemes_analysis must provide the "score",
     # "phoneme" and "vietnamese_tip" keys that are read in the loop.
     # The returned recommendation strings are written in Vietnamese.
     practice_recommendations = []
     # Below 0.7: three generic practice hints, followed by one targeted hint
     # for each focus phoneme whose own score is under 0.6.
     if overall_score < 0.7:
+        practice_recommendations.extend([
+            "Nghe từ mẫu nhiều lần trước khi phát âm",
+            "Phát âm chậm và rõ ràng từng âm vị",
+            "Chú ý đến vị trí lưỡi và môi khi phát âm"
+        ])
         # Add specific recommendations for focus phonemes
         for analysis in focus_phonemes_analysis:
             if analysis["score"] < 0.6:
                 practice_recommendations.append(
                     f"Luyện đặc biệt âm /{analysis['phoneme']}/: {analysis['vietnamese_tip']}"
                 )
     # Closing remark: praise at >= 0.8, a milder message for
     # 0.6 <= score < 0.8, and no closing remark below 0.6.
     if overall_score >= 0.8:
+        practice_recommendations.append("Phát âm rất tốt! Tiếp tục luyện tập để duy trì chất lượng")
     elif overall_score >= 0.6:
         practice_recommendations.append("Phát âm khá tốt, cần cải thiện một số âm vị")
     return practice_recommendations
 class IPAAssessmentResult(BaseModel):
     """Optimized response model for IPA-focused pronunciation assessment"""
     # Fields without a default (everything except user_ipa, assessment_type
     # and error) must be supplied when the model is constructed.
     # Core assessment data
     transcript: str  # What the user actually said
     user_ipa: Optional[str] = None  # User's IPA transcription
     target_word: str  # Target word being assessed
     target_ipa: str  # Target IPA transcription
     overall_score: float  # Overall pronunciation score (0-1)
     # Character-level analysis for IPA mapping
     # Entries produced in this file carry the keys "character", "phoneme",
     # "score", "color_class" and "is_focus".
     character_analysis: List[Dict]  # Each character with its IPA and score
     # Phoneme-specific analysis
     # phoneme_scores entries carry "phoneme", "score", "color_class",
     # "percentage" and "is_focus"; focus_phonemes_analysis entries carry
     # "phoneme", "score", "status", "vietnamese_tip", "difficulty" and
     # "color_class". The Dict type itself does not enforce these keys.
     phoneme_scores: List[Dict]  # Individual phoneme scores with colors
     focus_phonemes_analysis: List[Dict]  # Detailed analysis of target phonemes
     # Feedback and recommendations
     vietnamese_tips: List[str]  # Vietnamese-specific pronunciation tips
     practice_recommendations: List[str]  # Practice suggestions
     feedback: List[str]  # General feedback messages
     # Assessment metadata
     processing_info: Dict  # Processing details
     assessment_type: str = "ipa_focused"
     error: Optional[str] = None
 # Global assessor instance - singleton pattern for performance
 # Starts as None and is assigned on first use by get_assessor().
 global_assessor = None
 # NOTE(review): global_g2p and global_executor are presumably filled in by
 # get_shared_g2p() / get_shared_executor(), whose definitions are not shown
 # in this excerpt - confirm.
 global_g2p = None  # Shared G2P instance for caching
 global_executor = None  # Shared ThreadPoolExecutor
 def get_assessor():
+    """Get or create the global assessor instance"""
     global global_assessor
     # Lazy initialisation: the assessor is constructed only while
     # global_assessor is still None; later calls return the same instance.
     # No lock guards the check-and-create step.
     if global_assessor is None:
+        logger.info("Creating global ProductionPronunciationAssessor instance...")
+        global_assessor = ProductionPronunciationAssessor()
     return global_assessor
             # Run assessment using enhanced assessor (singleton)
             assessor = get_assessor()
             result = assessor.assess_pronunciation(tmp_file.name, reference_text, mode)
             # Optimize post-processing with parallel execution
             await optimize_post_assessment_processing(result, reference_text)
     audio_file: UploadFile = File(..., description="Audio file (.wav, .mp3, .m4a)"),
     target_word: str = Form(..., description="Target word to assess (e.g., 'bed')"),
     target_ipa: str = Form(None, description="Target IPA notation (e.g., '/bɛd/')"),
+    focus_phonemes: str = Form(None, description="Comma-separated focus phonemes (e.g., 'ɛ,b')"),
 ):
     """
     Optimized IPA pronunciation assessment for phoneme-focused learning
     Evaluates:
     - Overall word pronunciation accuracy
+    - Character-to-phoneme mapping accuracy
     - Specific phoneme pronunciation (e.g., /ɛ/ in 'bed')
     - Vietnamese-optimized feedback and tips
     - Dynamic color scoring for UI visualization
     Example: Assessing 'bed' /bɛd/ with focus on /ɛ/ phoneme
     """
     import time
     start_time = time.time()
     # Validate inputs
     if not target_word.strip():
         raise HTTPException(status_code=400, detail="Target word cannot be empty")
     if len(target_word) > 50:
+        raise HTTPException(status_code=400, detail="Target word too long (max 50 characters)")
     # Clean target word
     target_word = target_word.strip().lower()
     try:
         # Save uploaded file temporarily
         file_extension = ".wav"
         if audio_file.filename and "." in audio_file.filename:
             file_extension = f".{audio_file.filename.split('.')[-1]}"
+        with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as tmp_file:
             content = await audio_file.read()
             tmp_file.write(content)
             tmp_file.flush()
+            logger.info(f"IPA assessment for word '{target_word}' with IPA '{target_ipa}'")
             # Get the assessor instance
             assessor = get_assessor()
             # Run base pronunciation assessment in word mode
+            base_result = assessor.assess_pronunciation(tmp_file.name, target_word, "word")
             # Optimize IPA assessment processing with parallel execution
             optimized_results = await optimize_ipa_assessment_processing(
                 base_result, target_word, target_ipa, focus_phonemes
             )
             # Extract optimized results
             target_ipa = optimized_results["target_ipa"]
             character_analysis = optimized_results["character_analysis"]
             focus_phonemes_analysis = optimized_results["focus_phonemes_analysis"]
             vietnamese_tips = optimized_results["vietnamese_tips"]
             practice_recommendations = optimized_results["practice_recommendations"]
             # Get overall score from base result
             overall_score = base_result.get("overall_score", 0.0)
             # Handle error cases
             error_message = None
             feedback = base_result.get("feedback", [])
             if base_result.get("error"):
                 error_message = base_result["error"]
                 feedback = [f"Lỗi: {error_message}"]
             # Processing information
             processing_time = time.time() - start_time
             processing_info = {
                 "processing_time": processing_time,
                 "mode": "ipa_focused",
                 "model_used": "Wav2Vec2-Enhanced",
+                "confidence": base_result.get("processing_info", {}).get("confidence", 0.0),
+                "enhanced_features": True
             }
             # Create final result
             result = IPAAssessmentResult(
                 transcript=base_result.get("transcript", ""),
                 practice_recommendations=practice_recommendations,
                 feedback=feedback,
                 processing_info=processing_info,
+                error=error_message
             )
+            logger.info(f"IPA assessment completed for '{target_word}' in {processing_time:.2f}s with score {overall_score:.2f}")
             return result
     except Exception as e:
         logger.error(f"IPA assessment error: {str(e)}")
         import traceback
         traceback.print_exc()
         raise HTTPException(status_code=500, detail=f"IPA assessment failed: {str(e)}")
 def get_word_phonemes(word: str):
     """Get phoneme breakdown for a specific word"""
     try:
+        # Use the new EnhancedG2P from evaluation module
+        from evalution import EnhancedG2P
+        g2p = EnhancedG2P()
         phoneme_data = g2p.text_to_phonemes(word)[0]
         # Add difficulty analysis for Vietnamese speakers
         difficulty_scores = []
         for phoneme in phoneme_data["phonemes"]:
             difficulty = g2p.get_difficulty_score(phoneme)
             difficulty_scores.append(difficulty)
         "d": "Lưỡi chạm nướu răng trên, rung dây thanh",
         "t": "Lưỡi chạm nướu răng trên, không rung dây thanh",
         "k": "Lưỡi chạm vòm miệng, không rung dây thanh",
+        "g": "Lưỡi chạm vòm miệng, rung dây thanh"
     }
     return tips.get(phoneme, f"Luyện tập phát âm /{phoneme}/")
     """Get difficulty level for Vietnamese speakers"""
     hard_phonemes = ["θ", "ð", "r", "w", "æ", "ʌ", "ɪ", "ʊ"]
     medium_phonemes = ["v", "z", "ʒ", "ɛ", "ə", "ɔ", "f"]
     if phoneme in hard_phonemes:
         return "hard"
     elif phoneme in medium_phonemes:
         return "medium"
     else:
+        return "easy"

test.py ADDED Viewed

	@@ -0,0 +1,456 @@

+# import torch
+# import librosa
+# from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+# # Cấu hình
+# # MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-english"
+# MODEL_ID = "facebook/wav2vec2-large-xlsr-53"
+# AUDIO_FILE_PATH = "./hello_how_are_you_today.wav"  # Thay đổi đường dẫn này
+# # Load model và processor
+# processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
+# model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
+# def transcribe_audio_file(audio_path):
+#     """
+#     Chuyển đổi file audio thành text sử dụng Wav2Vec2
+#     """
+#     # Đọc file audio
+#     try:
+#         speech_array, sampling_rate = librosa.load(audio_path, sr=16_000)
+#         print(f"Đã load audio file: {audio_path}")
+#         print(f"Độ dài audio: {len(speech_array)/16_000:.2f} giây")
+#     except Exception as e:
+#         print(f"Lỗi khi đọc file audio: {e}")
+#         return None
+#     # Tiền xử lý
+#     inputs = processor(
+#         speech_array,
+#         sampling_rate=16_000,
+#         return_tensors="pt",
+#         padding=True
+#     )
+#     # Dự đoán
+#     with torch.no_grad():
+#         logits = model(
+#             inputs.input_values,
+#             attention_mask=inputs.attention_mask
+#         ).logits
+#     # Decode kết quả
+#     predicted_ids = torch.argmax(logits, dim=-1)
+#     predicted_sentence = processor.batch_decode(predicted_ids)[0]
+#     return predicted_sentence
+# # Test với file audio của bạn
+# if __name__ == "__main__":
+#     # Thay đổi đường dẫn đến file audio của bạn
+#     audio_files = [
+#         "./hello_world.wav",  # Thay đổi tên file này
+#         # "another_file.mp3",   # Có thể thêm nhiều file
+#     ]
+#     for audio_file in audio_files:
+#         print("=" * 80)
+#         print(f"Đang xử lý: {audio_file}")
+#         print("=" * 80)
+#         prediction = transcribe_audio_file(audio_file)
+#         if prediction:
+#             print(f"Kết quả nhận dạng: {prediction}")
+#         else:
+#             print("Không thể xử lý file này")
+#         print()
+# # Phiên bản đơn giản hơn - chỉ cần thay đổi đường dẫn file
+# def quick_transcribe(audio_path):
+#     """Phiên bản nhanh để transcribe một file"""
+#     speech_array, _ = librosa.load(audio_path, sr=16_000)
+#     inputs = processor(speech_array, sampling_rate=16_000, return_tensors="pt", padding=True)
+#     with torch.no_grad():
+#         logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
+#     predicted_ids = torch.argmax(logits, dim=-1)
+#     return processor.batch_decode(predicted_ids)[0]
+# # Sử dụng nhanh:
+# result = quick_transcribe("./hello_how_are_you_today.wav")
+# print(result)
+import torch
+from transformers import (
+    AutoModelForCTC,
+    AutoProcessor,
+    Wav2Vec2Processor,
+    Wav2Vec2ForCTC,
+)
+import onnxruntime as rt
+import numpy as np
+import librosa
+import warnings
+import os
+warnings.filterwarnings("ignore")
# Registry of selectable Wav2Vec2 checkpoints: short key -> Hugging Face model id.
# Some keys map to the same checkpoint (e.g. "english_large" and "xlsr_english").
WAVE2VEC2_MODELS = {
    "english_large": "jonatasgrosman/wav2vec2-large-xlsr-53-english",
    "multilingual": "facebook/wav2vec2-large-xlsr-53",
    "english_960h": "facebook/wav2vec2-large-960h-lv60-self",
    "base_english": "facebook/wav2vec2-base-960h",
    "large_english": "facebook/wav2vec2-large-960h",
    "xlsr_english": "jonatasgrosman/wav2vec2-large-xlsr-53-english",
    "xlsr_multilingual": "facebook/wav2vec2-large-xlsr-53",
}
# Default checkpoint, looked up in the registry instead of repeating the
# literal so the default and the registry entry cannot drift apart.
DEFAULT_MODEL = WAVE2VEC2_MODELS["english_large"]
def get_available_models():
    """Return a shallow copy of the key -> model id registry."""
    # A fresh dict keeps callers from mutating the module-level registry.
    return dict(WAVE2VEC2_MODELS)
def get_model_name(model_key=None):
    """
    Resolve a registry key (or a full model id) to a model id.

    Args:
        model_key: Key from WAVE2VEC2_MODELS, a full model name, or None.

    Returns:
        str: DEFAULT_MODEL when ``model_key`` is None, the registry entry when
        ``model_key`` is a known key, otherwise ``model_key`` unchanged.
    """
    if model_key is None:
        return DEFAULT_MODEL
    # Unknown keys are assumed to already be full model names and pass through.
    return WAVE2VEC2_MODELS.get(model_key, model_key)
class Wave2Vec2Inference:
    """Speech-to-text inference with a Wav2Vec2 CTC checkpoint on PyTorch."""

    def __init__(self, model_name=None, use_gpu=True):
        """
        Load the processor and model and move the model to the chosen device.

        Args:
            model_name: Key from WAVE2VEC2_MODELS, a full model name, or None
                for the default model (resolved through get_model_name).
            use_gpu: When True, use "mps" or "cuda" if available; otherwise
                run on "cpu".
        """
        self.model_name = get_model_name(model_name)
        self.device = self._select_device(use_gpu)
        print(f"Using device: {self.device}")
        print(f"Loading model: {self.model_name}")
        # XLSR checkpoints are loaded with the explicit Wav2Vec2 classes;
        # everything else goes through the Auto* classes.
        if "xlsr" in self.model_name.lower():
            print("Using Wav2Vec2Processor and Wav2Vec2ForCTC for XLSR model")
            self.processor = Wav2Vec2Processor.from_pretrained(self.model_name)
            self.model = Wav2Vec2ForCTC.from_pretrained(self.model_name)
        else:
            print("Using AutoProcessor and AutoModelForCTC")
            self.processor = AutoProcessor.from_pretrained(self.model_name)
            self.model = AutoModelForCTC.from_pretrained(self.model_name)
        self.model.to(self.device)
        self.model.eval()
        # Autograd is intentionally NOT switched off globally here: doing so
        # in a constructor leaks state into the caller. Every forward pass in
        # this class already runs under torch.no_grad().

    @staticmethod
    def _select_device(use_gpu):
        """Return "mps", "cuda" or "cpu", preferring MPS, then CUDA."""
        if not use_gpu:
            return "cpu"
        # torch.backends.mps is missing on older torch builds; guard the lookup.
        mps_backend = getattr(torch.backends, "mps", None)
        if mps_backend is not None and mps_backend.is_available():
            return "mps"
        if torch.cuda.is_available():
            return "cuda"
        return "cpu"

    def buffer_to_text(self, audio_buffer):
        """
        Transcribe a mono waveform that is already sampled at 16 kHz.

        Args:
            audio_buffer: numpy array or other sequence of audio samples.

        Returns:
            str: Lower-cased, stripped transcription; "" for an empty buffer.
        """
        if len(audio_buffer) == 0:
            return ""
        if isinstance(audio_buffer, np.ndarray):
            audio_tensor = torch.from_numpy(audio_buffer).float()
        else:
            audio_tensor = torch.tensor(audio_buffer, dtype=torch.float32)
        inputs = self.processor(
            audio_tensor,
            sampling_rate=16_000,
            return_tensors="pt",
            padding=True,
        )
        input_values = inputs.input_values.to(self.device)
        # Not every processor returns an attention mask; None is the model's
        # own default for that argument, so one call covers both cases.
        attention_mask = inputs.get("attention_mask")
        if attention_mask is not None:
            attention_mask = attention_mask.to(self.device)
        with torch.no_grad():
            logits = self.model(input_values, attention_mask=attention_mask).logits
        # Greedy decoding: most likely token per frame; .cpu() is a no-op on CPU.
        predicted_ids = torch.argmax(logits, dim=-1).cpu()
        transcription = self.processor.batch_decode(predicted_ids)[0]
        return transcription.lower().strip()

    def file_to_text(self, filename):
        """
        Load an audio file, resample it to 16 kHz and transcribe it.

        Best-effort: any failure while loading or transcribing is printed and
        "" is returned instead of raising.
        """
        try:
            audio_input, _ = librosa.load(filename, sr=16000, dtype=np.float32)
            return self.buffer_to_text(audio_input)
        except Exception as e:
            print(f"Error loading audio file {filename}: {e}")
            return ""
class Wave2Vec2ONNXInference:
    """Speech-to-text inference with an exported Wav2Vec2 model on ONNX Runtime."""

    def __init__(self, model_name=None, onnx_path=None, use_gpu=True):
        """
        Load the processor for ``model_name`` and open an ONNX Runtime session.

        Args:
            model_name: Key from WAVE2VEC2_MODELS, a full model name, or None
                for the default model (resolved through get_model_name).
            onnx_path: Path to the exported ONNX model file. Required.
            use_gpu: When True, prefer CUDAExecutionProvider if ONNX Runtime
                reports it; CPUExecutionProvider is always appended.

        Raises:
            ValueError: If ``onnx_path`` is missing or empty.
        """
        # Fail early with a clear message instead of an obscure session error.
        if not onnx_path:
            raise ValueError("onnx_path is required to create an ONNX inference session")
        self.model_name = get_model_name(model_name)
        print(f"Loading ONNX model: {self.model_name}")
        # The processor only prepares audio and decodes ids; weights come from ONNX.
        self.processor = Wav2Vec2Processor.from_pretrained(self.model_name)
        options = rt.SessionOptions()
        options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL
        providers = []
        if use_gpu and "CUDAExecutionProvider" in rt.get_available_providers():
            providers.append("CUDAExecutionProvider")
        providers.append("CPUExecutionProvider")
        self.model = rt.InferenceSession(onnx_path, options, providers=providers)
        self.input_name = self.model.get_inputs()[0].name
        print(f"ONNX model loaded with providers: {self.model.get_providers()}")

    def buffer_to_text(self, audio_buffer):
        """
        Transcribe a mono waveform that is already sampled at 16 kHz.

        Args:
            audio_buffer: numpy array or other sequence of audio samples.

        Returns:
            str: Lower-cased, stripped transcription; "" for an empty buffer.
        """
        if len(audio_buffer) == 0:
            return ""
        # ONNX Runtime consumes numpy, so stay in numpy instead of detouring
        # through a torch tensor.
        samples = np.asarray(audio_buffer, dtype=np.float32)
        inputs = self.processor(
            samples,
            sampling_rate=16_000,
            return_tensors="np",
            padding=True,
        )
        input_values = inputs.input_values.astype(np.float32)
        onnx_outputs = self.model.run(None, {self.input_name: input_values})[0]
        # Greedy decoding: most likely token per frame.
        prediction = np.argmax(onnx_outputs, axis=-1)
        # Index the single batch item rather than squeeze(), which would turn
        # a one-frame prediction into a bare scalar.
        transcription = self.processor.decode(prediction[0].tolist())
        return transcription.lower().strip()

    def file_to_text(self, filename):
        """
        Load an audio file, resample it to 16 kHz and transcribe it.

        Best-effort: any failure while loading or transcribing is printed and
        "" is returned instead of raising.
        """
        try:
            audio_input, _ = librosa.load(filename, sr=16000, dtype=np.float32)
            return self.buffer_to_text(audio_input)
        except Exception as e:
            print(f"Error loading audio file {filename}: {e}")
            return ""
def convert_to_onnx(model_id_or_path, onnx_model_name):
    """
    Export a Wav2Vec2ForCTC checkpoint to an ONNX file.

    Args:
        model_id_or_path: Model id or local path accepted by
            Wav2Vec2ForCTC.from_pretrained.
        onnx_model_name: Destination path of the ONNX file.
    """
    print(f"Converting {model_id_or_path} to ONNX...")
    model = Wav2Vec2ForCTC.from_pretrained(model_id_or_path)
    model.eval()
    # The dummy waveform only drives tracing; its length is declared dynamic
    # below, so the exported graph accepts other lengths.
    audio_len = 250000
    dummy_input = torch.randn(1, audio_len)
    # Export is inference-only, so no autograd graph is needed.
    with torch.no_grad():
        torch.onnx.export(
            model,
            dummy_input,
            onnx_model_name,
            export_params=True,
            opset_version=14,
            do_constant_folding=True,
            input_names=["input"],
            output_names=["output"],
            dynamic_axes={
                "input": {1: "audio_len"},
                # The output axis counts frames, not samples; give it its own
                # symbolic name so it is not declared equal to the input length.
                "output": {1: "num_frames"},
            },
        )
    print(f"ONNX model saved to: {onnx_model_name}")
def quantize_onnx_model(onnx_model_path, quantized_model_path):
    """Write a dynamically quantized (QUInt8 weights) copy of an ONNX model."""
    print("Starting quantization...")
    # Imported lazily so the quantization tooling is only needed when used.
    from onnxruntime import quantization

    quantization.quantize_dynamic(
        onnx_model_path,
        quantized_model_path,
        weight_type=quantization.QuantType.QUInt8,
    )
    print(f"Quantized model saved to: {quantized_model_path}")
def export_to_onnx(model_name, quantize=False):
    """
    Export a model to ONNX in the current directory, optionally quantizing it.

    Args:
        model_name: HuggingFace model name; the part after the last "/" names
            the output file.
        quantize: Whether to also create a quantized version.

    Returns:
        tuple: (onnx_path, quantized_path or None)
    """
    base_name = model_name.rsplit("/", 1)[-1]
    onnx_filename = f"{base_name}.onnx"
    convert_to_onnx(model_name, onnx_filename)
    if not quantize:
        return onnx_filename, None
    quantized_path = onnx_filename.replace(".onnx", ".quantized.onnx")
    quantize_onnx_model(onnx_filename, quantized_path)
    return onnx_filename, quantized_path
def create_inference(
    model_name=None, use_onnx=False, onnx_path=None, use_gpu=True, use_onnx_quantize=False
):
    """
    Create a PyTorch or ONNX Runtime inference instance.

    Args:
        model_name: Model key from WAVE2VEC2_MODELS or full HuggingFace model
            name (default: uses DEFAULT_MODEL).
        use_onnx: Whether to use ONNX runtime.
        onnx_path: Path to the ONNX model file. When omitted,
            "<model basename>.onnx" in the current directory is used. The model
            is exported to this path only if the file does not exist yet.
        use_gpu: Whether to use GPU if available.
        use_onnx_quantize: Whether to use a quantized ONNX model, stored next
            to ``onnx_path`` as "<stem>.quantized<ext>" and created on demand.

    Returns:
        Wave2Vec2Inference or Wave2Vec2ONNXInference instance.
    """
    if not use_onnx:
        print("Using PyTorch model")
        return Wave2Vec2Inference(model_name, use_gpu)

    actual_model_name = get_model_name(model_name)
    if not onnx_path:
        onnx_path = f"{actual_model_name.split('/')[-1]}.onnx"
    # Export only when the target file is missing, and export to the requested
    # path, so a caller-supplied location is honoured and repeated calls reuse
    # the file instead of re-exporting the model every time.
    if not os.path.exists(onnx_path):
        target_dir = os.path.dirname(onnx_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        convert_to_onnx(actual_model_name, onnx_path)

    if use_onnx_quantize:
        # splitext touches only the final extension, so directory names that
        # contain ".onnx" and extension-less paths are handled correctly.
        stem, ext = os.path.splitext(onnx_path)
        quantized_path = f"{stem}.quantized{ext or '.onnx'}"
        if not os.path.exists(quantized_path):
            quantize_onnx_model(onnx_path, quantized_path)
        onnx_path = quantized_path

    print(f"Using ONNX model: {onnx_path}")
    return Wave2Vec2ONNXInference(model_name, onnx_path, use_gpu)
# Manual smoke test / benchmark: lists the registry, then either demonstrates
# model selection (no test audio present) or times each model/config pair.
if __name__ == "__main__":
    import time

    print("Available Wave2Vec2 models:")
    for key, model_name in get_available_models().items():
        print(f"  {key}: {model_name}")
    print(f"\nDefault model: {DEFAULT_MODEL}")
    print()

    test_models = ["english_large", "multilingual", "english_960h"]
    test_file = "./hello_how_are_you_today.wav"

    if not os.path.exists(test_file):
        print(f"Test file {test_file} not found. Please provide a valid audio file.")
        print("Creating example usage without actual file...")
        print("\n=== Example Usage ===")
        # The three ways to pick a model: default, registry key, full name.
        print("1. Using default model:")
        asr_default = create_inference()
        print(f"   Model loaded: {asr_default.model_name}")
        print("\n2. Using model key 'english_large':")
        asr_key = create_inference("english_large")
        print(f"   Model loaded: {asr_key.model_name}")
        print("\n3. Using full model name:")
        asr_full = create_inference("facebook/wav2vec2-base-960h")
        print(f"   Model loaded: {asr_full.model_name}")
        # SystemExit instead of the site-provided exit(), which is not
        # available in every interpreter configuration.
        raise SystemExit(0)

    configs = [
        {"use_onnx": False, "use_gpu": True},
        {"use_onnx": True, "use_gpu": True, "use_onnx_quantize": False},
    ]
    for model_key in test_models:
        print(f"\n=== Testing model: {model_key} ===")
        for config in configs:
            print(f"\nConfig: {config}")
            asr = create_inference(model_key, **config)
            # Warm-up run so one-time initialization is excluded from timings.
            asr.file_to_text(test_file)
            times = []
            for i in range(3):
                # perf_counter is monotonic, unlike the wall clock.
                start_time = time.perf_counter()
                text = asr.file_to_text(test_file)
                execution_time = time.perf_counter() - start_time
                times.append(execution_time)
                print(f"Run {i+1}: {execution_time:.3f}s - {text[:50]}...")
            avg_time = sum(times) / len(times)
            print(f"Average time: {avg_time:.3f}s")