Spaces:

ABAO77
/

Run_code_api

Sleeping

App Files Files Community

ABAO77 committed on Sep 21

Commit

9537fdb

1 Parent(s): 043359f

remove evaluation, no need function

Browse files

Files changed (3) hide show

evalution.py +0 -1675
src/apis/controllers/speaking_controller.py +20 -30
src/utils/speaking_utils.py +0 -559

evalution.py DELETED Viewed

@@ -1,1675 +0,0 @@
-import asyncio
-import concurrent.futures
-from functools import lru_cache
-import time
-from typing import List, Dict, Optional, Tuple
-import numpy as np
-import librosa
-import nltk
-import eng_to_ipa as ipa
-import re
-from collections import defaultdict
-from loguru import logger
-import Levenshtein
-from dataclasses import dataclass
-from enum import Enum
-from src.AI_Models.wave2vec_inference import (
-    create_inference,
-    export_to_onnx,
-)
-# Download required NLTK data
-try:
-    nltk.download("cmudict", quiet=True)
-    from nltk.corpus import cmudict
-except:
-    print("Warning: NLTK data not available")
-class AssessmentMode(Enum):
-    WORD = "word"
-    SENTENCE = "sentence"
-    AUTO = "auto"
-class ErrorType(Enum):
-    CORRECT = "correct"
-    SUBSTITUTION = "substitution"
-    DELETION = "deletion"
-    INSERTION = "insertion"
-    ACCEPTABLE = "acceptable"
-@dataclass
-class CharacterError:
-    """Character-level error information for UI mapping"""
-    character: str
-    position: int
-    error_type: str
-    expected_sound: str
-    actual_sound: str
-    severity: float
-    color: str
-class EnhancedWav2Vec2CharacterASR:
-    """Enhanced Wav2Vec2 ASR with prosody analysis support - Optimized version"""
-    def __init__(
-        self,
-        model_name: str = "facebook/wav2vec2-large-960h-lv60-self",
-        onnx: bool = False,
-        quantized: bool = False,
-    ):
-        self.use_onnx = onnx
-        self.sample_rate = 16000
-        self.model_name = model_name
-        if onnx:
-            import os
-            model_path = (
-                f"wav2vec2-large-960h-lv60-self{'.quant' if quantized else ''}.onnx"
-            )
-            if not os.path.exists(model_path):
-                export_to_onnx(model_name, quantize=quantized)
-        # Use optimized inference
-        self.model = create_inference(
-            model_name=model_name, use_onnx=onnx, use_onnx_quantize=quantized, use_gpu=True
-        )
-    def transcribe_with_features(self, audio_path: str) -> Dict:
-        """Enhanced transcription with audio features for prosody analysis - Optimized"""
-        try:
-            start_time = time.time()
-            # Basic transcription (already fast - 0.3s)
-            character_transcript = self.model.file_to_text(audio_path)
-            character_transcript = self._clean_character_transcript(
-                character_transcript
-            )
-            # Fast phoneme conversion
-            phoneme_representation = self._characters_to_phoneme_representation(
-                character_transcript
-            )
-            # Basic audio features (simplified for speed)
-            audio_features = self._extract_basic_audio_features(audio_path)
-            logger.info(f"Optimized transcription time: {time.time() - start_time:.2f}s")
-            return {
-                "character_transcript": character_transcript,
-                "phoneme_representation": phoneme_representation,
-                "audio_features": audio_features,
-                "confidence": self._estimate_confidence(character_transcript),
-            }
-        except Exception as e:
-            logger.error(f"Enhanced ASR error: {e}")
-            return self._empty_result()
-    def _extract_basic_audio_features(self, audio_path: str) -> Dict:
-        """Extract basic audio features for prosody analysis - Optimized"""
-        try:
-            y, sr = librosa.load(audio_path, sr=self.sample_rate)
-            duration = len(y) / sr
-            # Simplified pitch analysis (sample fewer frames)
-            pitches, magnitudes = librosa.piptrack(y=y, sr=sr, threshold=0.1)
-            pitch_values = []
-            for t in range(0, pitches.shape[1], 10):  # Sample every 10th frame
-                index = magnitudes[:, t].argmax()
-                pitch = pitches[index, t]
-                if pitch > 80:  # Filter noise
-                    pitch_values.append(pitch)
-            # Basic rhythm
-            tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
-            # Basic intensity (reduced frame analysis)
-            rms = librosa.feature.rms(y=y, frame_length=2048, hop_length=512)[0]
-            return {
-                "duration": duration,
-                "pitch": {
-                    "values": pitch_values,
-                    "mean": np.mean(pitch_values) if pitch_values else 0,
-                    "std": np.std(pitch_values) if pitch_values else 0,
-                    "range": (
-                        np.max(pitch_values) - np.min(pitch_values)
-                        if len(pitch_values) > 1 else 0
-                    ),
-                    "cv": (
-                        np.std(pitch_values) / np.mean(pitch_values)
-                        if pitch_values and np.mean(pitch_values) > 0
-                        else 0
-                    ),
-                },
-                "rhythm": {
-                    "tempo": tempo,
-                    "beats_per_second": len(beats) / duration if duration > 0 else 0,
-                },
-                "intensity": {
-                    "rms_mean": np.mean(rms),
-                    "rms_std": np.std(rms),
-                },
-            }
-        except Exception as e:
-            logger.error(f"Audio feature extraction error: {e}")
-            return {"duration": 0, "error": str(e)}
-    def _clean_character_transcript(self, transcript: str) -> str:
-        """Clean and standardize character transcript"""
-        logger.info(f"Raw transcript before cleaning: {transcript}")
-        cleaned = re.sub(r"\s+", " ", transcript)
-        return cleaned.strip().lower()
-    def _characters_to_phoneme_representation(self, text: str) -> str:
-        """Convert character-based transcript to phoneme representation - Optimized"""
-        if not text:
-            return ""
-        words = text.split()
-        phoneme_words = []
-        g2p = EnhancedG2P()
-        for word in words:
-            try:
-                if g2p:
-                    word_phonemes = g2p.word_to_phonemes(word)
-                    phoneme_words.extend(word_phonemes)
-                else:
-                    phoneme_words.extend(self._simple_letter_to_phoneme(word))
-            except:
-                phoneme_words.extend(self._simple_letter_to_phoneme(word))
-        return " ".join(phoneme_words)
-    def _simple_letter_to_phoneme(self, word: str) -> List[str]:
-        """Fallback letter-to-phoneme conversion"""
-        letter_to_phoneme = {
-            "a": "æ", "b": "b", "c": "k", "d": "d", "e": "ɛ", "f": "f",
-            "g": "ɡ", "h": "h", "i": "ɪ", "j": "dʒ", "k": "k", "l": "l",
-            "m": "m", "n": "n", "o": "ʌ", "p": "p", "q": "k", "r": "r",
-            "s": "s", "t": "t", "u": "ʌ", "v": "v", "w": "w", "x": "ks",
-            "y": "j", "z": "z",
-        }
-        return [
-            letter_to_phoneme.get(letter, letter)
-            for letter in word.lower()
-            if letter in letter_to_phoneme
-        ]
-    def _estimate_confidence(self, transcript: str) -> float:
-        """Estimate transcription confidence"""
-        if not transcript or len(transcript.strip()) < 2:
-            return 0.0
-        repeated_chars = len(re.findall(r"(.)\1{2,}", transcript))
-        return max(0.0, 1.0 - (repeated_chars * 0.2))
-    def _empty_result(self) -> Dict:
-        """Empty result for error cases"""
-        return {
-            "character_transcript": "",
-            "phoneme_representation": "",
-            "audio_features": {"duration": 0},
-            "confidence": 0.0,
-        }
-class EnhancedG2P:
-    """Enhanced Grapheme-to-Phoneme converter with visualization support - Optimized"""
-    def __init__(self):
-        try:
-            self.cmu_dict = cmudict.dict()
-        except:
-            self.cmu_dict = {}
-            logger.warning("CMU dictionary not available")
-        # Vietnamese speaker substitution patterns
-        self.vn_substitutions = {
-            "θ": ["f", "s", "t", "d"],
-            "ð": ["d", "z", "v", "t"],
-            "v": ["w", "f", "b"],
-            "w": ["v", "b"],
-            "r": ["l", "n"],
-            "l": ["r", "n"],
-            "z": ["s", "j"],
-            "ʒ": ["ʃ", "z", "s"],
-            "ʃ": ["s", "ʒ"],
-            "ŋ": ["n", "m"],
-            "tʃ": ["ʃ", "s", "k"],
-            "dʒ": ["ʒ", "j", "g"],
-            "æ": ["ɛ", "a"],
-            "ɪ": ["i"],
-            "ʊ": ["u"],
-        }
-        # Difficulty scores for Vietnamese speakers
-        self.difficulty_scores = {
-            "θ": 0.9, "ð": 0.9, "v": 0.8, "z": 0.8, "ʒ": 0.9,
-            "r": 0.7, "l": 0.6, "w": 0.5, "æ": 0.7, "ɪ": 0.6, "ʊ": 0.6,
-            "ŋ": 0.3, "f": 0.2, "s": 0.2, "ʃ": 0.5, "tʃ": 0.4, "dʒ": 0.5,
-        }
-    @lru_cache(maxsize=1000)
-    def word_to_phonemes(self, word: str) -> List[str]:
-        """Convert word to phoneme list - Cached for performance"""
-        word_lower = word.lower().strip()
-        if word_lower in self.cmu_dict:
-            cmu_phonemes = self.cmu_dict[word_lower][0]
-            return self._convert_cmu_to_ipa(cmu_phonemes)
-        else:
-            return self._estimate_phonemes(word_lower)
-    @lru_cache(maxsize=500)
-    def get_phoneme_string(self, text: str) -> str:
-        """Get space-separated phoneme string - Cached"""
-        words = self._clean_text(text).split()
-        all_phonemes = []
-        for word in words:
-            if word:
-                phonemes = self.word_to_phonemes(word)
-                all_phonemes.extend(phonemes)
-        return " ".join(all_phonemes)
-    def text_to_phonemes(self, text: str) -> List[Dict]:
-        """Convert text to phoneme sequence with visualization data"""
-        words = self._clean_text(text).split()
-        phoneme_sequence = []
-        for word in words:
-            word_phonemes = self.word_to_phonemes(word)
-            phoneme_sequence.append(
-                {
-                    "word": word,
-                    "phonemes": word_phonemes,
-                    "ipa": self._get_ipa(word),
-                    "phoneme_string": " ".join(word_phonemes),
-                    "visualization": self._create_phoneme_visualization(word_phonemes),
-                }
-            )
-        return phoneme_sequence
-    def _convert_cmu_to_ipa(self, cmu_phonemes: List[str]) -> List[str]:
-        """Convert CMU phonemes to IPA - Optimized"""
-        cmu_to_ipa = {
-            "AA": "ɑ", "AE": "æ", "AH": "ʌ", "AO": "ɔ", "AW": "aʊ", "AY": "aɪ",
-            "EH": "ɛ", "ER": "ɝ", "EY": "eɪ", "IH": "ɪ", "IY": "i", "OW": "oʊ",
-            "OY": "ɔɪ", "UH": "ʊ", "UW": "u", "B": "b", "CH": "tʃ", "D": "d",
-            "DH": "ð", "F": "f", "G": "ɡ", "HH": "h", "JH": "dʒ", "K": "k",
-            "L": "l", "M": "m", "N": "n", "NG": "ŋ", "P": "p", "R": "r",
-            "S": "s", "SH": "ʃ", "T": "t", "TH": "θ", "V": "v", "W": "w",
-            "Y": "j", "Z": "z", "ZH": "ʒ",
-        }
-        ipa_phonemes = []
-        for phoneme in cmu_phonemes:
-            clean_phoneme = re.sub(r"[0-9]", "", phoneme)
-            ipa_phoneme = cmu_to_ipa.get(clean_phoneme, clean_phoneme.lower())
-            ipa_phonemes.append(ipa_phoneme)
-        return ipa_phonemes
-    def _estimate_phonemes(self, word: str) -> List[str]:
-        """Estimate phonemes for unknown words - Optimized"""
-        phoneme_map = {
-            "ch": "tʃ", "sh": "ʃ", "th": "θ", "ph": "f", "ck": "k", "ng": "ŋ", "qu": "kw",
-            "a": "æ", "e": "ɛ", "i": "ɪ", "o": "ʌ", "u": "ʌ", "b": "b", "c": "k",
-            "d": "d", "f": "f", "g": "ɡ", "h": "h", "j": "dʒ", "k": "k", "l": "l",
-            "m": "m", "n": "n", "p": "p", "r": "r", "s": "s", "t": "t", "v": "v",
-            "w": "w", "x": "ks", "y": "j", "z": "z",
-        }
-        phonemes = []
-        i = 0
-        while i < len(word):
-            if i <= len(word) - 2:
-                two_char = word[i : i + 2]
-                if two_char in phoneme_map:
-                    phonemes.append(phoneme_map[two_char])
-                    i += 2
-                    continue
-            char = word[i]
-            if char in phoneme_map:
-                phonemes.append(phoneme_map[char])
-            i += 1
-        return phonemes
-    def _clean_text(self, text: str) -> str:
-        """Clean text for processing"""
-        text = re.sub(r"[^\w\s']", " ", text)
-        text = re.sub(r"\s+", " ", text)
-        return text.lower().strip()
-    def _get_ipa(self, word: str) -> str:
-        """Get IPA transcription"""
-        try:
-            return ipa.convert(word)
-        except:
-            return f"/{word}/"
-    def _create_phoneme_visualization(self, phonemes: List[str]) -> List[Dict]:
-        """Create visualization data for phonemes"""
-        visualization = []
-        for phoneme in phonemes:
-            color_category = self._get_phoneme_color_category(phoneme)
-            visualization.append(
-                {
-                    "phoneme": phoneme,
-                    "color_category": color_category,
-                    "description": self._get_phoneme_description(phoneme),
-                    "difficulty": self.difficulty_scores.get(phoneme, 0.3),
-                }
-            )
-        return visualization
-    def _get_phoneme_color_category(self, phoneme: str) -> str:
-        """Categorize phonemes by color for visualization"""
-        vowel_phonemes = {
-            "ɑ", "æ", "ʌ", "ɔ", "aʊ", "aɪ", "ɛ", "ɝ", "eɪ", "ɪ", "i", "oʊ", "ɔɪ", "ʊ", "u",
-        }
-        difficult_consonants = {"θ", "ð", "v", "z", "ʒ", "r", "w"}
-        if phoneme in vowel_phonemes:
-            return "vowel"
-        elif phoneme in difficult_consonants:
-            return "difficult"
-        else:
-            return "consonant"
-    def _get_phoneme_description(self, phoneme: str) -> str:
-        """Get description for a phoneme"""
-        descriptions = {
-            "θ": "Voiceless dental fricative (like 'th' in 'think')",
-            "ð": "Voiced dental fricative (like 'th' in 'this')",
-            "v": "Voiced labiodental fricative (like 'v' in 'van')",
-            "z": "Voiced alveolar fricative (like 'z' in 'zip')",
-            "ʒ": "Voiced postalveolar fricative (like 's' in 'measure')",
-            "r": "Alveolar approximant (like 'r' in 'red')",
-            "w": "Labial-velar approximant (like 'w' in 'wet')",
-            "æ": "Near-open front unrounded vowel (like 'a' in 'cat')",
-            "ɪ": "Near-close near-front unrounded vowel (like 'i' in 'sit')",
-            "ʊ": "Near-close near-back rounded vowel (like 'u' in 'put')",
-        }
-        return descriptions.get(phoneme, f"Phoneme: {phoneme}")
-    def is_acceptable_substitution(self, reference: str, predicted: str) -> bool:
-        """Check if substitution is acceptable for Vietnamese speakers"""
-        acceptable = self.vn_substitutions.get(reference, [])
-        return predicted in acceptable
-    def get_difficulty_score(self, phoneme: str) -> float:
-        """Get difficulty score for phoneme"""
-        return self.difficulty_scores.get(phoneme, 0.3)
-class AdvancedPhonemeComparator:
-    """Enhanced phoneme comparator using Levenshtein distance - Optimized"""
-    def __init__(self):
-        self.g2p = EnhancedG2P()
-    def compare_with_levenshtein(self, reference: str, predicted: str) -> List[Dict]:
-        """Compare phonemes using Levenshtein distance for accurate alignment - Optimized"""
-        ref_phones = reference.split() if reference else []
-        pred_phones = predicted.split() if predicted else []
-        if not ref_phones:
-            return []
-        # Use Levenshtein editops for precise alignment
-        ops = Levenshtein.editops(ref_phones, pred_phones)
-        comparisons = []
-        ref_idx = 0
-        pred_idx = 0
-        # Process equal parts first
-        for op_type, ref_pos, pred_pos in ops:
-            # Add equal characters before this operation
-            while ref_idx < ref_pos and pred_idx < pred_pos:
-                comparison = self._create_comparison(
-                    ref_phones[ref_idx],
-                    pred_phones[pred_idx],
-                    ErrorType.CORRECT,
-                    1.0,
-                    len(comparisons),
-                )
-                comparisons.append(comparison)
-                ref_idx += 1
-                pred_idx += 1
-            # Process the operation
-            if op_type == "replace":
-                ref_phoneme = ref_phones[ref_pos]
-                pred_phoneme = pred_phones[pred_pos]
-                if self.g2p.is_acceptable_substitution(ref_phoneme, pred_phoneme):
-                    error_type = ErrorType.ACCEPTABLE
-                    score = 0.7
-                else:
-                    error_type = ErrorType.SUBSTITUTION
-                    score = 0.2
-                comparison = self._create_comparison(
-                    ref_phoneme, pred_phoneme, error_type, score, len(comparisons)
-                )
-                comparisons.append(comparison)
-                ref_idx = ref_pos + 1
-                pred_idx = pred_pos + 1
-            elif op_type == "delete":
-                comparison = self._create_comparison(
-                    ref_phones[ref_pos], "", ErrorType.DELETION, 0.0, len(comparisons)
-                )
-                comparisons.append(comparison)
-                ref_idx = ref_pos + 1
-            elif op_type == "insert":
-                comparison = self._create_comparison(
-                    "",
-                    pred_phones[pred_pos],
-                    ErrorType.INSERTION,
-                    0.0,
-                    len(comparisons),
-                )
-                comparisons.append(comparison)
-                pred_idx = pred_pos + 1
-        # Add remaining equal characters
-        while ref_idx < len(ref_phones) and pred_idx < len(pred_phones):
-            comparison = self._create_comparison(
-                ref_phones[ref_idx],
-                pred_phones[pred_idx],
-                ErrorType.CORRECT,
-                1.0,
-                len(comparisons),
-            )
-            comparisons.append(comparison)
-            ref_idx += 1
-            pred_idx += 1
-        return comparisons
-    def _create_comparison(
-        self,
-        ref_phoneme: str,
-        pred_phoneme: str,
-        error_type: ErrorType,
-        score: float,
-        position: int,
-    ) -> Dict:
-        """Create comparison dictionary"""
-        return {
-            "position": position,
-            "reference_phoneme": ref_phoneme,
-            "learner_phoneme": pred_phoneme,
-            "status": error_type.value,
-            "score": score,
-            "difficulty": self.g2p.get_difficulty_score(ref_phoneme),
-            "error_type": error_type.value,
-        }
-class EnhancedWordAnalyzer:
-    """Enhanced word analyzer with character-level error mapping - Optimized"""
-    def __init__(self):
-        self.g2p = EnhancedG2P()
-        self.comparator = AdvancedPhonemeComparator()
-        # Thread pool for parallel processing
-        self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=3)
-    def analyze_words_enhanced(
-        self, reference_text: str, learner_phonemes: str, mode: AssessmentMode
-    ) -> Dict:
-        """Enhanced word analysis with character-level mapping - Parallelized"""
-        # Start parallel tasks
-        future_ref_phonemes = self.executor.submit(
-            self.g2p.text_to_phonemes, reference_text
-        )
-        future_ref_phoneme_string = self.executor.submit(
-            self.g2p.get_phoneme_string, reference_text
-        )
-        # Get results
-        reference_words = future_ref_phonemes.result()
-        reference_phoneme_string = future_ref_phoneme_string.result()
-        # Phoneme comparison
-        phoneme_comparisons = self.comparator.compare_with_levenshtein(
-            reference_phoneme_string, learner_phonemes
-        )
-        # Parallel final processing
-        future_highlights = self.executor.submit(
-            self._create_enhanced_word_highlights,
-            reference_words, phoneme_comparisons, mode
-        )
-        future_pairs = self.executor.submit(
-            self._create_phoneme_pairs, reference_phoneme_string, learner_phonemes
-        )
-        word_highlights = future_highlights.result()
-        phoneme_pairs = future_pairs.result()
-        # Quick wrong words identification
-        wrong_words = self._identify_wrong_words_enhanced(
-            word_highlights, phoneme_comparisons
-        )
-        return {
-            "word_highlights": word_highlights,
-            "phoneme_differences": phoneme_comparisons,
-            "wrong_words": wrong_words,
-            "reference_phonemes": reference_phoneme_string,
-            "phoneme_pairs": phoneme_pairs,
-        }
-    def _create_enhanced_word_highlights(
-        self,
-        reference_words: List[Dict],
-        phoneme_comparisons: List[Dict],
-        mode: AssessmentMode,
-    ) -> List[Dict]:
-        """Create enhanced word highlights with character-level error mapping - Optimized"""
-        word_highlights = []
-        phoneme_index = 0
-        for word_data in reference_words:
-            word = word_data["word"]
-            word_phonemes = word_data["phonemes"]
-            num_phonemes = len(word_phonemes)
-            # Get phoneme scores for this word
-            word_phoneme_scores = []
-            word_comparisons = []
-            for j in range(num_phonemes):
-                if phoneme_index + j < len(phoneme_comparisons):
-                    comparison = phoneme_comparisons[phoneme_index + j]
-                    word_phoneme_scores.append(comparison["score"])
-                    word_comparisons.append(comparison)
-            # Calculate word score
-            word_score = np.mean(word_phoneme_scores) if word_phoneme_scores else 0.0
-            # Map phoneme errors to character positions (enhanced for word mode)
-            character_errors = []
-            if mode == AssessmentMode.WORD:
-                character_errors = self._map_phonemes_to_characters(
-                    word, word_comparisons
-                )
-            # Create enhanced word highlight
-            highlight = {
-                "word": word,
-                "score": float(word_score),
-                "status": self._get_word_status(word_score),
-                "color": self._get_word_color(word_score),
-                "phonemes": word_phonemes,
-                "ipa": word_data["ipa"],
-                "phoneme_scores": word_phoneme_scores,
-                "phoneme_start_index": phoneme_index,
-                "phoneme_end_index": phoneme_index + num_phonemes - 1,
-                "phoneme_visualization": word_data["visualization"],
-                "character_errors": character_errors,
-                "detailed_analysis": mode == AssessmentMode.WORD,
-            }
-            word_highlights.append(highlight)
-            phoneme_index += num_phonemes
-        return word_highlights
-    def _map_phonemes_to_characters(
-        self, word: str, phoneme_comparisons: List[Dict]
-    ) -> List[CharacterError]:
-        """Map phoneme errors to character positions in word"""
-        character_errors = []
-        if not phoneme_comparisons or not word:
-            return character_errors
-        chars_per_phoneme = len(word) / len(phoneme_comparisons)
-        for i, comparison in enumerate(phoneme_comparisons):
-            if comparison["status"] in ["substitution", "deletion", "wrong"]:
-                char_pos = min(int(i * chars_per_phoneme), len(word) - 1)
-                severity = 1.0 - comparison["score"]
-                color = self._get_error_color(severity)
-                error = CharacterError(
-                    character=word[char_pos],
-                    position=char_pos,
-                    error_type=comparison["status"],
-                    expected_sound=comparison["reference_phoneme"],
-                    actual_sound=comparison["learner_phoneme"],
-                    severity=severity,
-                    color=color,
-                )
-                character_errors.append(error)
-        return character_errors
-    def _get_error_color(self, severity: float) -> str:
-        """Get color code for character errors"""
-        if severity >= 0.8:
-            return "#ef4444"  # Red - severe error
-        elif severity >= 0.6:
-            return "#f97316"  # Orange - moderate error
-        elif severity >= 0.4:
-            return "#eab308"  # Yellow - mild error
-        else:
-            return "#84cc16"  # Light green - minor error
-    def _identify_wrong_words_enhanced(
-        self, word_highlights: List[Dict], phoneme_comparisons: List[Dict]
-    ) -> List[Dict]:
-        """Enhanced wrong word identification with detailed error analysis"""
-        wrong_words = []
-        for word_highlight in word_highlights:
-            if word_highlight["score"] < 0.6:
-                start_idx = word_highlight["phoneme_start_index"]
-                end_idx = word_highlight["phoneme_end_index"]
-                wrong_phonemes = []
-                missing_phonemes = []
-                for i in range(start_idx, min(end_idx + 1, len(phoneme_comparisons))):
-                    comparison = phoneme_comparisons[i]
-                    if comparison["status"] in ["wrong", "substitution"]:
-                        wrong_phonemes.append(
-                            {
-                                "expected": comparison["reference_phoneme"],
-                                "actual": comparison["learner_phoneme"],
-                                "difficulty": comparison["difficulty"],
-                                "description": self.g2p._get_phoneme_description(
-                                    comparison["reference_phoneme"]
-                                ),
-                            }
-                        )
-                    elif comparison["status"] in ["missing", "deletion"]:
-                        missing_phonemes.append(
-                            {
-                                "phoneme": comparison["reference_phoneme"],
-                                "difficulty": comparison["difficulty"],
-                                "description": self.g2p._get_phoneme_description(
-                                    comparison["reference_phoneme"]
-                                ),
-                            }
-                        )
-                wrong_word = {
-                    "word": word_highlight["word"],
-                    "score": word_highlight["score"],
-                    "expected_phonemes": word_highlight["phonemes"],
-                    "ipa": word_highlight["ipa"],
-                    "wrong_phonemes": wrong_phonemes,
-                    "missing_phonemes": missing_phonemes,
-                    "tips": self._get_enhanced_vietnamese_tips(
-                        wrong_phonemes, missing_phonemes
-                    ),
-                    "phoneme_visualization": word_highlight["phoneme_visualization"],
-                    "character_errors": word_highlight.get("character_errors", []),
-                }
-                wrong_words.append(wrong_word)
-        return wrong_words
-    def _create_phoneme_pairs(self, reference: str, learner: str) -> List[Dict]:
-        """Create phoneme pairs for visualization - Optimized"""
-        ref_phones = reference.split() if reference else []
-        learner_phones = learner.split() if learner else []
-        pairs = []
-        min_len = min(len(ref_phones), len(learner_phones))
-        # Quick alignment for most cases
-        for i in range(min_len):
-            pairs.append(
-                {
-                    "reference": ref_phones[i],
-                    "learner": learner_phones[i],
-                    "match": ref_phones[i] == learner_phones[i],
-                    "type": "correct" if ref_phones[i] == learner_phones[i] else "substitution",
-                }
-            )
-        # Handle extra phonemes
-        for i in range(min_len, len(ref_phones)):
-            pairs.append(
-                {
-                    "reference": ref_phones[i],
-                    "learner": "",
-                    "match": False,
-                    "type": "deletion",
-                }
-            )
-        for i in range(min_len, len(learner_phones)):
-            pairs.append(
-                {
-                    "reference": "",
-                    "learner": learner_phones[i],
-                    "match": False,
-                    "type": "insertion",
-                }
-            )
-        return pairs
-    def _get_word_status(self, score: float) -> str:
-        """Get word status from score"""
-        if score >= 0.8:
-            return "excellent"
-        elif score >= 0.6:
-            return "good"
-        elif score >= 0.4:
-            return "needs_practice"
-        else:
-            return "poor"
-    def _get_word_color(self, score: float) -> str:
-        """Get color for word highlighting"""
-        if score >= 0.8:
-            return "#22c55e"  # Green
-        elif score >= 0.6:
-            return "#84cc16"  # Light green
-        elif score >= 0.4:
-            return "#eab308"  # Yellow
-        else:
-            return "#ef4444"  # Red
-    def _get_enhanced_vietnamese_tips(
-        self, wrong_phonemes: List[Dict], missing_phonemes: List[Dict]
-    ) -> List[str]:
-        """Enhanced Vietnamese-specific pronunciation tips"""
-        tips = []
-        vietnamese_tips = {
-            "θ": "Đặt lưỡi giữa răng trên và dưới, thổi nhẹ (think, three)",
-            "ð": "Giống θ nhưng rung dây thanh âm (this, that)",
-            "v": "Chạm môi dưới vào răng trên, không dùng cả hai môi như tiếng Việt",
-            "r": "Cuộn lưỡi nhưng không chạm vào vòm miệng, không lăn lưỡi",
-            "l": "Đầu lưỡi chạm vào vòm miệng sau răng",
-            "z": "Giống âm 's' nhưng có rung dây thanh âm",
-            "ʒ": "Giống âm 'ʃ' (sh) nhưng có rung dây thanh âm",
-            "w": "Tròn môi như âm 'u', không dùng răng như âm 'v'",
-            "æ": "Mở miệng rộng hơn khi phát âm 'a'",
-            "ɪ": "Âm 'i' ngắn, không kéo dài như tiếng Việt",
-        }
-        for wrong in wrong_phonemes:
-            expected = wrong["expected"]
-            if expected in vietnamese_tips:
-                tips.append(f"Âm /{expected}/: {vietnamese_tips[expected]}")
-        for missing in missing_phonemes:
-            phoneme = missing["phoneme"]
-            if phoneme in vietnamese_tips:
-                tips.append(f"Thiếu âm /{phoneme}/: {vietnamese_tips[phoneme]}")
-        return tips
-    def __del__(self):
-        """Cleanup executor"""
-        if hasattr(self, 'executor'):
-            self.executor.shutdown(wait=False)
-class EnhancedProsodyAnalyzer:
-    """Enhanced prosody analyzer for sentence-level assessment - Optimized"""
-    def __init__(self):
-        # Expected values for English prosody
-        self.expected_speech_rate = 4.0  # syllables per second
-        self.expected_pitch_range = 100  # Hz
-        self.expected_pitch_cv = 0.3  # coefficient of variation
-    def analyze_prosody_enhanced(
-        self, audio_features: Dict, reference_text: str
-    ) -> Dict:
-        """Enhanced prosody analysis with detailed scoring - Optimized"""
-        if "error" in audio_features:
-            return self._empty_prosody_result()
-        duration = audio_features.get("duration", 1)
-        pitch_data = audio_features.get("pitch", {})
-        rhythm_data = audio_features.get("rhythm", {})
-        intensity_data = audio_features.get("intensity", {})
-        # Calculate syllables (simplified)
-        num_syllables = self._estimate_syllables(reference_text)
-        actual_speech_rate = num_syllables / duration if duration > 0 else 0
-        # Calculate individual prosody scores
-        pace_score = self._calculate_pace_score(actual_speech_rate)
-        intonation_score = self._calculate_intonation_score(pitch_data)
-        rhythm_score = self._calculate_rhythm_score(rhythm_data, intensity_data)
-        stress_score = self._calculate_stress_score(pitch_data, intensity_data)
-        # Overall prosody score
-        overall_prosody = (
-            pace_score + intonation_score + rhythm_score + stress_score
-        ) / 4
-        # Generate prosody feedback
-        feedback = self._generate_prosody_feedback(
-            pace_score,
-            intonation_score,
-            rhythm_score,
-            stress_score,
-            actual_speech_rate,
-            pitch_data,
-        )
-        return {
-            "pace_score": pace_score,
-            "intonation_score": intonation_score,
-            "rhythm_score": rhythm_score,
-            "stress_score": stress_score,
-            "overall_prosody": overall_prosody,
-            "details": {
-                "speech_rate": actual_speech_rate,
-                "expected_speech_rate": self.expected_speech_rate,
-                "syllable_count": num_syllables,
-                "duration": duration,
-                "pitch_analysis": pitch_data,
-                "rhythm_analysis": rhythm_data,
-                "intensity_analysis": intensity_data,
-            },
-            "feedback": feedback,
-        }
-    def _calculate_pace_score(self, actual_rate: float) -> float:
-        """Calculate pace score based on speech rate"""
-        if self.expected_speech_rate == 0:
-            return 0.5
-        ratio = actual_rate / self.expected_speech_rate
-        if 0.8 <= ratio <= 1.2:
-            return 1.0
-        elif 0.6 <= ratio < 0.8 or 1.2 < ratio <= 1.5:
-            return 0.7
-        elif 0.4 <= ratio < 0.6 or 1.5 < ratio <= 2.0:
-            return 0.4
-        else:
-            return 0.1
-    def _calculate_intonation_score(self, pitch_data: Dict) -> float:
-        """Calculate intonation score based on pitch variation"""
-        pitch_range = pitch_data.get("range", 0)
-        if self.expected_pitch_range == 0:
-            return 0.5
-        ratio = pitch_range / self.expected_pitch_range
-        if 0.7 <= ratio <= 1.3:
-            return 1.0
-        elif 0.5 <= ratio < 0.7 or 1.3 < ratio <= 1.8:
-            return 0.7
-        elif 0.3 <= ratio < 0.5 or 1.8 < ratio <= 2.5:
-            return 0.4
-        else:
-            return 0.2
-    def _calculate_rhythm_score(self, rhythm_data: Dict, intensity_data: Dict) -> float:
-        """Calculate rhythm score based on tempo and intensity patterns"""
-        tempo = rhythm_data.get("tempo", 120)
-        intensity_std = intensity_data.get("rms_std", 0)
-        intensity_mean = intensity_data.get("rms_mean", 0)
-        # Tempo score (60-180 BPM is good for speech)
-        if 60 <= tempo <= 180:
-            tempo_score = 1.0
-        elif 40 <= tempo < 60 or 180 < tempo <= 220:
-            tempo_score = 0.6
-        else:
-            tempo_score = 0.3
-        # Intensity consistency score
-        if intensity_mean > 0:
-            intensity_consistency = max(0, 1.0 - (intensity_std / intensity_mean))
-        else:
-            intensity_consistency = 0.5
-        return (tempo_score + intensity_consistency) / 2
-    def _calculate_stress_score(self, pitch_data: Dict, intensity_data: Dict) -> float:
-        """Calculate stress score based on pitch and intensity variation"""
-        pitch_cv = pitch_data.get("cv", 0)
-        intensity_std = intensity_data.get("rms_std", 0)
-        intensity_mean = intensity_data.get("rms_mean", 0)
-        # Pitch coefficient of variation score
-        if 0.2 <= pitch_cv <= 0.4:
-            pitch_score = 1.0
-        elif 0.1 <= pitch_cv < 0.2 or 0.4 < pitch_cv <= 0.6:
-            pitch_score = 0.7
-        else:
-            pitch_score = 0.4
-        # Intensity variation score
-        if intensity_mean > 0:
-            intensity_cv = intensity_std / intensity_mean
-            if 0.1 <= intensity_cv <= 0.3:
-                intensity_score = 1.0
-            elif 0.05 <= intensity_cv < 0.1 or 0.3 < intensity_cv <= 0.5:
-                intensity_score = 0.7
-            else:
-                intensity_score = 0.4
-        else:
-            intensity_score = 0.5
-        return (pitch_score + intensity_score) / 2
-    def _generate_prosody_feedback(
-        self,
-        pace_score: float,
-        intonation_score: float,
-        rhythm_score: float,
-        stress_score: float,
-        speech_rate: float,
-        pitch_data: Dict,
-    ) -> List[str]:
-        """Generate detailed prosody feedback"""
-        feedback = []
-        if pace_score < 0.5:
-            if speech_rate < self.expected_speech_rate * 0.8:
-                feedback.append("Tốc độ nói hơi chậm, thử nói nhanh hơn một chút")
-            else:
-                feedback.append("Tốc độ nói hơi nhanh, thử nói chậm lại để rõ ràng hơn")
-        elif pace_score >= 0.8:
-            feedback.append("Tốc độ nói rất tự nhiên")
-        if intonation_score < 0.5:
-            feedback.append("Cần cải thiện ngữ điệu - thay đổi cao độ giọng nhiều hơn")
-        elif intonation_score >= 0.8:
-            feedback.append("Ngữ điệu rất tự nhiên và sinh động")
-        if rhythm_score < 0.5:
-            feedback.append("Nhịp điệu cần đều hơn - chú ý đến trọng âm của từ")
-        elif rhythm_score >= 0.8:
-            feedback.append("Nhịp điệu rất tốt")
-        if stress_score < 0.5:
-            feedback.append("Cần nhấn mạnh trọng âm rõ ràng hơn")
-        elif stress_score >= 0.8:
-            feedback.append("Trọng âm được nhấn rất tốt")
-        return feedback
-    def _estimate_syllables(self, text: str) -> int:
-        """Estimate number of syllables in text - Optimized"""
-        vowels = "aeiouy"
-        text = text.lower()
-        syllable_count = 0
-        prev_was_vowel = False
-        for char in text:
-            if char in vowels:
-                if not prev_was_vowel:
-                    syllable_count += 1
-                prev_was_vowel = True
-            else:
-                prev_was_vowel = False
-        if text.endswith("e"):
-            syllable_count -= 1
-        return max(1, syllable_count)
-    def _empty_prosody_result(self) -> Dict:
-        """Return empty prosody result for error cases"""
-        return {
-            "pace_score": 0.5,
-            "intonation_score": 0.5,
-            "rhythm_score": 0.5,
-            "stress_score": 0.5,
-            "overall_prosody": 0.5,
-            "details": {},
-            "feedback": ["Không thể phân tích ngữ điệu"],
-        }
-class EnhancedFeedbackGenerator:
-    """Enhanced feedback generator with detailed analysis - Optimized"""
-    def generate_enhanced_feedback(
-        self,
-        overall_score: float,
-        wrong_words: List[Dict],
-        phoneme_comparisons: List[Dict],
-        mode: AssessmentMode,
-        prosody_analysis: Dict = None,
-    ) -> List[str]:
-        """Generate comprehensive feedback based on assessment mode"""
-        feedback = []
-        # Overall score feedback
-        if overall_score >= 0.9:
-            feedback.append("Phát âm xuất sắc! Bạn đã làm rất tốt.")
-        elif overall_score >= 0.8:
-            feedback.append("Phát âm rất tốt! Chỉ còn một vài điểm nhỏ cần cải thiện.")
-        elif overall_score >= 0.6:
-            feedback.append("Phát âm khá tốt, còn một số điểm cần luyện tập thêm.")
-        elif overall_score >= 0.4:
-            feedback.append("Cần luyện tập thêm. Tập trung vào những từ được đánh dấu.")
-        else:
-            feedback.append("Hãy luyện tập chậm rãi và rõ ràng hơn.")
-        # Mode-specific feedback
-        if mode == AssessmentMode.WORD:
-            feedback.extend(
-                self._generate_word_mode_feedback(wrong_words, phoneme_comparisons)
-            )
-        elif mode == AssessmentMode.SENTENCE:
-            feedback.extend(
-                self._generate_sentence_mode_feedback(wrong_words, prosody_analysis)
-            )
-        # Common error patterns
-        error_patterns = self._analyze_error_patterns(phoneme_comparisons)
-        if error_patterns:
-            feedback.extend(error_patterns)
-        return feedback
-    def _generate_word_mode_feedback(
-        self, wrong_words: List[Dict], phoneme_comparisons: List[Dict]
-    ) -> List[str]:
-        """Generate feedback specific to word mode"""
-        feedback = []
-        if wrong_words:
-            if len(wrong_words) == 1:
-                word = wrong_words[0]["word"]
-                feedback.append(f"Từ '{word}' cần luyện tập thêm")
-                # Character-level feedback
-                char_errors = wrong_words[0].get("character_errors", [])
-                if char_errors:
-                    error_chars = [err.character for err in char_errors[:3]]
-                    feedback.append(f"Chú ý các âm: {', '.join(error_chars)}")
-            else:
-                word_list = [w["word"] for w in wrong_words[:3]]
-                feedback.append(f"Các từ cần luyện: {', '.join(word_list)}")
-        return feedback
-    def _generate_sentence_mode_feedback(
-        self, wrong_words: List[Dict], prosody_analysis: Dict
-    ) -> List[str]:
-        """Generate feedback specific to sentence mode"""
-        feedback = []
-        # Word-level feedback
-        if wrong_words:
-            if len(wrong_words) <= 2:
-                word_list = [w["word"] for w in wrong_words]
-                feedback.append(f"Cần cải thiện: {', '.join(word_list)}")
-            else:
-                feedback.append(f"Có {len(wrong_words)} từ cần luyện tập")
-        # Prosody feedback
-        if prosody_analysis and "feedback" in prosody_analysis:
-            feedback.extend(prosody_analysis["feedback"][:2])  # Limit prosody feedback
-        return feedback
-    def _analyze_error_patterns(self, phoneme_comparisons: List[Dict]) -> List[str]:
-        """Analyze common error patterns across phonemes"""
-        feedback = []
-        # Count error types
-        error_counts = defaultdict(int)
-        difficult_phonemes = defaultdict(int)
-        for comparison in phoneme_comparisons:
-            if comparison["status"] in ["wrong", "substitution"]:
-                phoneme = comparison["reference_phoneme"]
-                difficult_phonemes[phoneme] += 1
-                error_counts[comparison["status"]] += 1
-        # Most problematic phoneme
-        if difficult_phonemes:
-            most_difficult = max(difficult_phonemes.items(), key=lambda x: x[1])
-            if most_difficult[1] >= 2:
-                phoneme = most_difficult[0]
-                phoneme_tips = {
-                    "θ": "Lưỡi giữa răng, thổi nhẹ",
-                    "ð": "Lưỡi giữa răng, rung dây thanh",
-                    "v": "Môi dưới chạm răng trên",
-                    "r": "Cuộn lưỡi nhẹ",
-                    "z": "Như 's' nhưng rung dây thanh",
-                }
-                if phoneme in phoneme_tips:
-                    feedback.append(f"Âm khó nhất /{phoneme}/: {phoneme_tips[phoneme]}")
-        return feedback
-class ProductionPronunciationAssessor:
-    """Production-ready pronunciation assessor - Enhanced version with optimizations"""
-    _instance = None
-    _initialized = False
-    def __new__(cls, onnx: bool = False, quantized: bool = False):
-        if cls._instance is None:
-            cls._instance = super(ProductionPronunciationAssessor, cls).__new__(cls)
-        return cls._instance
-    def __init__(self, onnx: bool = False, quantized: bool = False):
-        """Initialize the production-ready pronunciation assessment system (only once)"""
-        if self._initialized:
-            return
-        logger.info("Initializing Optimized Production Pronunciation Assessment System...")
-        self.asr = EnhancedWav2Vec2CharacterASR(onnx=onnx, quantized=quantized)
-        self.word_analyzer = EnhancedWordAnalyzer()
-        self.prosody_analyzer = EnhancedProsodyAnalyzer()
-        self.feedback_generator = EnhancedFeedbackGenerator()
-        self.g2p = EnhancedG2P()
-        # Thread pool for parallel processing
-        self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=4)
-        ProductionPronunciationAssessor._initialized = True
-        logger.info("Optimized production system initialization completed")
-    def assess_pronunciation(
-        self, audio_path: str, reference_text: str, mode: str = "auto"
-    ) -> Dict:
-        """
-        Main assessment function with enhanced features and optimizations
-        Args:
-            audio_path: Path to audio file
-            reference_text: Reference text to compare against
-            mode: Assessment mode ("word", "sentence", "auto", or legacy modes)
-        Returns:
-            Enhanced assessment results with backward compatibility
-        """
-        logger.info(f"Starting optimized production assessment in {mode} mode...")
-        start_time = time.time()
-        try:
-            # Normalize and validate mode
-            assessment_mode = self._normalize_mode(mode, reference_text)
-            logger.info(f"Using assessment mode: {assessment_mode.value}")
-            # Step 1: Enhanced ASR transcription with features (0.3s)
-            asr_result = self.asr.transcribe_with_features(audio_path)
-            if not asr_result["character_transcript"]:
-                return self._create_error_result("No speech detected in audio")
-            # Step 2: Parallel analysis processing
-            future_word_analysis = self.executor.submit(
-                self.word_analyzer.analyze_words_enhanced,
-                reference_text, asr_result["phoneme_representation"], assessment_mode
-            )
-            # Step 3: Conditional prosody analysis (only for sentence mode)
-            future_prosody = None
-            if assessment_mode == AssessmentMode.SENTENCE:
-                future_prosody = self.executor.submit(
-                    self.prosody_analyzer.analyze_prosody_enhanced,
-                    asr_result["audio_features"], reference_text
-                )
-            # Get analysis results
-            analysis_result = future_word_analysis.result()
-            # Step 4: Parallel final processing
-            future_overall_score = self.executor.submit(
-                self._calculate_overall_score, analysis_result["phoneme_differences"]
-            )
-            future_phoneme_summary = self.executor.submit(
-                self._create_phoneme_comparison_summary, analysis_result["phoneme_pairs"]
-            )
-            # Get prosody analysis if needed
-            prosody_analysis = {}
-            if future_prosody:
-                prosody_analysis = future_prosody.result()
-            # Get final results
-            overall_score = future_overall_score.result()
-            phoneme_comparison_summary = future_phoneme_summary.result()
-            # Step 5: Generate enhanced feedback
-            feedback = self.feedback_generator.generate_enhanced_feedback(
-                overall_score,
-                analysis_result["wrong_words"],
-                analysis_result["phoneme_differences"],
-                assessment_mode,
-                prosody_analysis,
-            )
-            # Step 6: Assemble result with backward compatibility
-            result = self._create_enhanced_result(
-                asr_result,
-                analysis_result,
-                overall_score,
-                feedback,
-                prosody_analysis,
-                phoneme_comparison_summary,
-                assessment_mode,
-            )
-            # Add processing metadata
-            processing_time = time.time() - start_time
-            result["processing_info"] = {
-                "processing_time": round(processing_time, 2),
-                "mode": assessment_mode.value,
-                "model_used": "Wav2Vec2-Enhanced-Optimized",
-                "onnx_enabled": self.asr.use_onnx,
-                "confidence": asr_result["confidence"],
-                "enhanced_features": True,
-                "character_level_analysis": assessment_mode == AssessmentMode.WORD,
-                "prosody_analysis": assessment_mode == AssessmentMode.SENTENCE,
-                "optimized": True,
-            }
-            logger.info(f"Optimized production assessment completed in {processing_time:.2f}s")
-            return result
-        except Exception as e:
-            logger.error(f"Production assessment error: {e}")
-            return self._create_error_result(f"Assessment failed: {str(e)}")
-    def _normalize_mode(self, mode: str, reference_text: str) -> AssessmentMode:
-        """Normalize mode parameter with backward compatibility"""
-        # Legacy mode mapping
-        legacy_mapping = {
-            "normal": AssessmentMode.AUTO,
-            "advanced": AssessmentMode.AUTO,
-        }
-        if mode in legacy_mapping:
-            normalized_mode = legacy_mapping[mode]
-            logger.info(f"Mapped legacy mode '{mode}' to '{normalized_mode.value}'")
-            mode = normalized_mode.value
-        # Validate mode
-        try:
-            assessment_mode = AssessmentMode(mode)
-        except ValueError:
-            logger.warning(f"Invalid mode '{mode}', defaulting to AUTO")
-            assessment_mode = AssessmentMode.AUTO
-        # Auto-detect mode based on text length
-        if assessment_mode == AssessmentMode.AUTO:
-            word_count = len(reference_text.strip().split())
-            assessment_mode = (
-                AssessmentMode.WORD if word_count <= 3 else AssessmentMode.SENTENCE
-            )
-            logger.info(
-                f"Auto-detected mode: {assessment_mode.value} (word count: {word_count})"
-            )
-        return assessment_mode
-    def _calculate_overall_score(self, phoneme_comparisons: List[Dict]) -> float:
-        """Calculate weighted overall score"""
-        if not phoneme_comparisons:
-            return 0.0
-        total_weighted_score = 0.0
-        total_weight = 0.0
-        for comparison in phoneme_comparisons:
-            weight = comparison.get("difficulty", 0.5)  # Use difficulty as weight
-            score = comparison["score"]
-            total_weighted_score += score * weight
-            total_weight += weight
-        return total_weighted_score / total_weight if total_weight > 0 else 0.0
-    def _create_phoneme_comparison_summary(self, phoneme_pairs: List[Dict]) -> Dict:
-        """Create phoneme comparison summary statistics"""
-        total = len(phoneme_pairs)
-        if total == 0:
-            return {"total_phonemes": 0, "accuracy_percentage": 0}
-        correct = sum(1 for pair in phoneme_pairs if pair["match"])
-        substitutions = sum(
-            1 for pair in phoneme_pairs if pair["type"] == "substitution"
-        )
-        deletions = sum(1 for pair in phoneme_pairs if pair["type"] == "deletion")
-        insertions = sum(1 for pair in phoneme_pairs if pair["type"] == "insertion")
-        return {
-            "total_phonemes": total,
-            "correct": correct,
-            "substitutions": substitutions,
-            "deletions": deletions,
-            "insertions": insertions,
-            "accuracy_percentage": round((correct / total) * 100, 1),
-            "error_rate": round(
-                ((substitutions + deletions + insertions) / total) * 100, 1
-            ),
-        }
-    def _create_enhanced_result(
-        self,
-        asr_result: Dict,
-        analysis_result: Dict,
-        overall_score: float,
-        feedback: List[str],
-        prosody_analysis: Dict,
-        phoneme_summary: Dict,
-        assessment_mode: AssessmentMode,
-    ) -> Dict:
-        """Create enhanced result with backward compatibility"""
-        # Base result structure (backward compatible)
-        result = {
-            "transcript": asr_result["character_transcript"],
-            "transcript_phonemes": asr_result["phoneme_representation"],
-            "user_phonemes": asr_result["phoneme_representation"],
-            "character_transcript": asr_result["character_transcript"],
-            "overall_score": overall_score,
-            "word_highlights": analysis_result["word_highlights"],
-            "phoneme_differences": analysis_result["phoneme_differences"],
-            "wrong_words": analysis_result["wrong_words"],
-            "feedback": feedback,
-        }
-        # Enhanced features
-        result.update(
-            {
-                "reference_phonemes": analysis_result["reference_phonemes"],
-                "phoneme_pairs": analysis_result["phoneme_pairs"],
-                "phoneme_comparison": phoneme_summary,
-                "assessment_mode": assessment_mode.value,
-            }
-        )
-        # Add prosody analysis for sentence mode
-        if prosody_analysis:
-            result["prosody_analysis"] = prosody_analysis
-        # Add character-level analysis for word mode
-        if assessment_mode == AssessmentMode.WORD:
-            result["character_level_analysis"] = True
-            # Add character errors to word highlights if available
-            for word_highlight in result["word_highlights"]:
-                if "character_errors" in word_highlight:
-                    # Convert CharacterError objects to dicts for JSON serialization
-                    char_errors = []
-                    for error in word_highlight["character_errors"]:
-                        if isinstance(error, CharacterError):
-                            char_errors.append(
-                                {
-                                    "character": error.character,
-                                    "position": error.position,
-                                    "error_type": error.error_type,
-                                    "expected_sound": error.expected_sound,
-                                    "actual_sound": error.actual_sound,
-                                    "severity": error.severity,
-                                    "color": error.color,
-                                }
-                            )
-                        else:
-                            char_errors.append(error)
-                    word_highlight["character_errors"] = char_errors
-        return result
-    def _create_error_result(self, error_message: str) -> Dict:
-        """Create error result structure"""
-        return {
-            "transcript": "",
-            "transcript_phonemes": "",
-            "user_phonemes": "",
-            "character_transcript": "",
-            "overall_score": 0.0,
-            "word_highlights": [],
-            "phoneme_differences": [],
-            "wrong_words": [],
-            "feedback": [f"Lỗi: {error_message}"],
-            "error": error_message,
-            "assessment_mode": "error",
-            "processing_info": {
-                "processing_time": 0,
-                "mode": "error",
-                "model_used": "Wav2Vec2-Enhanced-Optimized",
-                "confidence": 0.0,
-                "enhanced_features": False,
-                "optimized": True,
-            },
-        }
-    def get_system_info(self) -> Dict:
-        """Get comprehensive system information"""
-        return {
-            "version": "2.1.0-production-optimized",
-            "name": "Optimized Production Pronunciation Assessment System",
-            "modes": [mode.value for mode in AssessmentMode],
-            "features": [
-                "Parallel processing for 60-70% speed improvement",
-                "LRU cache for G2P conversion (1000 words)",
-                "Enhanced Levenshtein distance phoneme alignment",
-                "Character-level error detection (word mode)",
-                "Advanced prosody analysis (sentence mode)",
-                "Vietnamese speaker-specific error patterns",
-                "Real-time confidence scoring",
-                "IPA phonetic representation with visualization",
-                "Backward compatibility with legacy APIs",
-                "Production-ready error handling",
-            ],
-            "model_info": {
-                "asr_model": self.asr.model_name,
-                "onnx_enabled": self.asr.use_onnx,
-                "sample_rate": self.asr.sample_rate,
-            },
-            "performance": {
-                "target_processing_time": "< 0.8s (vs original 2s)",
-                "expected_improvement": "60-70% faster",
-                "parallel_workers": 4,
-                "cached_operations": ["G2P conversion", "phoneme strings", "word mappings"],
-            },
-        }
-    def __del__(self):
-        """Cleanup executor"""
-        if hasattr(self, 'executor'):
-            self.executor.shutdown(wait=False)
-# Backward compatibility wrapper
-class SimplePronunciationAssessor:
-    """Backward compatible wrapper for the enhanced optimized system"""
-    def __init__(self, onnx: bool = True, quantized: bool = True):
-        print("Initializing Optimized Simple Pronunciation Assessor (Enhanced)...")
-        self.enhanced_assessor = ProductionPronunciationAssessor(onnx=onnx, quantized=quantized)
-        print("Optimized Enhanced Simple Pronunciation Assessor initialization completed")
-    def assess_pronunciation(
-        self, audio_path: str, reference_text: str, mode: str = "normal"
-    ) -> Dict:
-        """
-        Backward compatible assessment function with optimizations
-        Args:
-            audio_path: Path to audio file
-            reference_text: Reference text to compare
-            mode: Assessment mode (supports legacy modes)
-        """
-        return self.enhanced_assessor.assess_pronunciation(
-            audio_path, reference_text, mode
-        )
-# Example usage and performance testing
-if __name__ == "__main__":
-    import time
-    import psutil
-    import os
-    # Initialize optimized production system with ONNX and quantization
-    system = ProductionPronunciationAssessor(onnx=False, quantized=False)
-    # Performance test cases
-    test_cases = [
-        ("./hello_world.wav", "hello", "word"),
-        ("./hello_how_are_you_today.wav", "Hello, how are you today?", "sentence"),
-        ("./pronunciation.wav", "pronunciation", "auto"),
-    ]
-    print("=== OPTIMIZED PERFORMANCE TESTING ===")
-    for audio_path, reference_text, mode in test_cases:
-        print(f"\n--- Testing {mode.upper()} mode: '{reference_text}' ---")
-        if not os.path.exists(audio_path):
-            print(f"Warning: Test file {audio_path} not found, skipping...")
-            continue
-        # Multiple runs to test consistency
-        times = []
-        scores = []
-        for i in range(5):
-            start_time = time.time()
-            result = system.assess_pronunciation(audio_path, reference_text, mode)
-            end_time = time.time()
-            processing_time = end_time - start_time
-            times.append(processing_time)
-            scores.append(result.get('overall_score', 0))
-            print(f"Run {i+1}: {processing_time:.3f}s - Score: {scores[-1]:.2f}")
-        avg_time = sum(times) / len(times)
-        avg_score = sum(scores) / len(scores)
-        min_time = min(times)
-        max_time = max(times)
-        print(f"Average time: {avg_time:.3f}s")
-        print(f"Min time: {min_time:.3f}s")
-        print(f"Max time: {max_time:.3f}s")
-        print(f"Average score: {avg_score:.2f}")
-        print(f"Speed improvement vs 2s baseline: {((2.0 - avg_time) / 2.0 * 100):.1f}%")
-        # Check if target is met
-        if avg_time <= 0.8:
-            print("✅ TARGET ACHIEVED: < 0.8s")
-        else:
-            print("❌ Target missed: > 0.8s")
-    # Backward compatibility test
-    print(f"\n=== BACKWARD COMPATIBILITY TEST ===")
-    legacy_assessor = SimplePronunciationAssessor(onnx=True, quantized=True)
-    start_time = time.time()
-    legacy_result = legacy_assessor.assess_pronunciation(
-        "./hello_world.wav", "pronunciation", "normal"
-    )
-    processing_time = time.time() - start_time
-    print(f"Legacy API time: {processing_time:.3f}s")
-    print(f"Legacy result keys: {list(legacy_result.keys())}")
-    print(f"Legacy score: {legacy_result.get('overall_score', 0):.2f}")
-    print(f"Legacy mode mapped to: {legacy_result.get('assessment_mode', 'N/A')}")
-    # Memory usage test
-    process = psutil.Process(os.getpid())
-    memory_usage = process.memory_info().rss / 1024 / 1024  # MB
-    print(f"\nMemory usage: {memory_usage:.1f}MB")
-    # System info
-    print(f"\n=== SYSTEM INFORMATION ===")
-    system_info = system.get_system_info()
-    print(f"System version: {system_info['version']}")
-    print(f"Available modes: {system_info['modes']}")
-    print(f"Model info: {system_info['model_info']}")
-    print(f"Performance targets: {system_info['performance']}")
-    print(f"\n=== OPTIMIZATION SUMMARY ===")
-    optimizations = [
-        "✅ Parallel processing with ThreadPoolExecutor (4 workers)",
-        "✅ LRU cache for G2P conversion (1000 words cache)",
-        "✅ LRU cache for phoneme strings (500 phrases cache)",
-        "✅ Simplified audio feature extraction (10x frame sampling)",
-        "✅ Fast Levenshtein alignment algorithm",
-        "✅ ONNX + Quantization for fastest ASR inference",
-        "✅ Concurrent futures for independent tasks",
-        "✅ Reduced librosa computation overhead",
-        "✅ Quick phoneme pair alignment",
-        "✅ Minimal object creation in hot paths",
-        "✅ Conditional prosody analysis (sentence mode only)",
-        "✅ Optimized error pattern analysis",
-        "✅ Fast syllable counting algorithm",
-        "✅ Simplified phoneme mapping fallbacks",
-        "✅ Cached CMU dictionary lookups",
-    ]
-    for optimization in optimizations:
-        print(optimization)
-    print(f"\n=== PERFORMANCE COMPARISON ===")
-    print(f"Original system: ~2.0s total")
-    print(f"  - ASR: 0.3s")
-    print(f"  - Processing: 1.7s")
-    print(f"")
-    print(f"Optimized system: ~0.6-0.8s total (target)")
-    print(f"  - ASR: 0.3s (unchanged)")
-    print(f"  - Processing: 0.3-0.5s (65-70% improvement)")
-    print(f"")
-    print(f"Key improvements:")
-    print(f"  • Parallel processing of independent analysis tasks")
-    print(f"  • Cached G2P conversions avoid repeated computation")
-    print(f"  • Simplified audio analysis with strategic sampling")
-    print(f"  • Fast alignment algorithms for phoneme comparison")
-    print(f"  • ONNX quantized models for maximum ASR speed")
-    print(f"  • Conditional feature extraction based on assessment mode")
-    print(f"\n=== BACKWARD COMPATIBILITY ===")
-    print(f"✅ All original class names preserved")
-    print(f"✅ All original function signatures maintained")
-    print(f"✅ All original output formats supported")
-    print(f"✅ Legacy mode mapping (normal -> auto)")
-    print(f"✅ Original API completely functional")
-    print(f"✅ Enhanced features are additive, not breaking")
-    print(f"\nOptimization complete! Target: 60-70% faster processing achieved.")

src/apis/controllers/speaking_controller.py CHANGED Viewed

@@ -513,6 +513,24 @@ class EnhancedG2P:
         return phoneme_sequence
     def _convert_cmu_to_ipa(self, cmu_phonemes: List[str]) -> List[str]:
         """Convert CMU phonemes to IPA - Optimized"""
         cmu_to_ipa = {
@@ -641,7 +659,6 @@ class EnhancedG2P:
                 {
                     "phoneme": phoneme,
                     "color_category": color_category,
-                    "description": self._get_phoneme_description(phoneme),
                     "difficulty": self.difficulty_scores.get(phoneme, 0.3),
                 }
             )
@@ -825,7 +842,7 @@ class EnhancedWordAnalyzer:
         # Start parallel tasks
         future_ref_phonemes = self.executor.submit(
-            self.g2p.text_to_phonemes, reference_text
         )
         future_ref_phoneme_string = self.executor.submit(
             self.g2p.get_phoneme_string, reference_text
@@ -914,7 +931,7 @@ class EnhancedWordAnalyzer:
                 "phoneme_scores": word_phoneme_scores,
                 "phoneme_start_index": phoneme_index,
                 "phoneme_end_index": phoneme_index + num_phonemes - 1,
-                "phoneme_visualization": word_data["visualization"],
                 "character_errors": character_errors,
                 "detailed_analysis": mode == AssessmentMode.WORD,
             }
@@ -989,9 +1006,6 @@ class EnhancedWordAnalyzer:
                                 "expected": comparison["reference_phoneme"],
                                 "actual": comparison["learner_phoneme"],
                                 "difficulty": comparison["difficulty"],
-                                "description": self.g2p._get_phoneme_description(
-                                    comparison["reference_phoneme"]
-                                ),
                             }
                         )
                     elif comparison["status"] in ["missing", "deletion"]:
@@ -999,9 +1013,6 @@ class EnhancedWordAnalyzer:
                             {
                                 "phoneme": comparison["reference_phoneme"],
                                 "difficulty": comparison["difficulty"],
-                                "description": self.g2p._get_phoneme_description(
-                                    comparison["reference_phoneme"]
-                                ),
                             }
                         )
@@ -1015,7 +1026,6 @@ class EnhancedWordAnalyzer:
                     "tips": self._get_enhanced_vietnamese_tips(
                         wrong_phonemes, missing_phonemes
                     ),
-                    "phoneme_visualization": word_highlight["phoneme_visualization"],
                     "character_errors": word_highlight.get("character_errors", []),
                 }
@@ -1650,17 +1660,6 @@ class ProductionPronunciationAssessor:
             # Add processing metadata
             processing_time = time.time() - start_time
-            result["processing_info"] = {
-                "processing_time": round(processing_time, 2),
-                "mode": assessment_mode.value,
-                "model_used": "Wav2Vec2-Enhanced-Optimized",
-                "onnx_enabled": self.asr.use_onnx,
-                "confidence": asr_result["confidence"],
-                "enhanced_features": True,
-                "character_level_analysis": assessment_mode == AssessmentMode.WORD,
-                "prosody_analysis": assessment_mode == AssessmentMode.SENTENCE,
-                "optimized": True,
-            }
             logger.info(
                 f"Optimized production assessment completed in {processing_time:.2f}s"
@@ -1865,15 +1864,6 @@ class ProductionPronunciationAssessor:
             "audio_quality": audio_quality,
             "retry_suggestions": suggestions,
             "assessment_mode": "error",
-            "processing_info": {
-                "processing_time": 0,
-                "mode": "error",
-                "model_used": "Wav2Vec2-Enhanced-Optimized",
-                "confidence": 0.0,
-                "enhanced_features": False,
-                "optimized": True,
-                "error_handled": True,
-            },
         }
     def get_system_info(self) -> Dict:

         return phoneme_sequence
+    def text_to_phonemes_basic(self, text: str) -> List[Dict]:
+        """Convert text to phoneme sequence without visualization for speed"""
+        words = self._clean_text(text).split()
+        phoneme_sequence = []
+        for word in words:
+            phonemes = self.word_to_phonemes(word)
+            phoneme_sequence.append(
+                {
+                    "word": word,
+                    "phonemes": phonemes,
+                    "ipa": self._get_ipa(word),
+                    "phoneme_string": " ".join(phonemes),
+                }
+            )
+        return phoneme_sequence
     def _convert_cmu_to_ipa(self, cmu_phonemes: List[str]) -> List[str]:
         """Convert CMU phonemes to IPA - Optimized"""
         cmu_to_ipa = {
                 {
                     "phoneme": phoneme,
                     "color_category": color_category,
                     "difficulty": self.difficulty_scores.get(phoneme, 0.3),
                 }
             )
         # Start parallel tasks
         future_ref_phonemes = self.executor.submit(
+            self.g2p.text_to_phonemes_basic, reference_text
         )
         future_ref_phoneme_string = self.executor.submit(
             self.g2p.get_phoneme_string, reference_text
                 "phoneme_scores": word_phoneme_scores,
                 "phoneme_start_index": phoneme_index,
                 "phoneme_end_index": phoneme_index + num_phonemes - 1,
+                # Visualization removed for performance
                 "character_errors": character_errors,
                 "detailed_analysis": mode == AssessmentMode.WORD,
             }
                                 "expected": comparison["reference_phoneme"],
                                 "actual": comparison["learner_phoneme"],
                                 "difficulty": comparison["difficulty"],
                             }
                         )
                     elif comparison["status"] in ["missing", "deletion"]:
                             {
                                 "phoneme": comparison["reference_phoneme"],
                                 "difficulty": comparison["difficulty"],
                             }
                         )
                     "tips": self._get_enhanced_vietnamese_tips(
                         wrong_phonemes, missing_phonemes
                     ),
                     "character_errors": word_highlight.get("character_errors", []),
                 }
             # Add processing metadata
             processing_time = time.time() - start_time
             logger.info(
                 f"Optimized production assessment completed in {processing_time:.2f}s"
             "audio_quality": audio_quality,
             "retry_suggestions": suggestions,
             "assessment_mode": "error",
         }
     def get_system_info(self) -> Dict:

src/utils/speaking_utils.py CHANGED Viewed

@@ -1,564 +1,5 @@
-from typing import List, Dict
 import numpy as np
 import nltk
-import eng_to_ipa as ipa
-import re
-from collections import defaultdict
-try:
-    nltk.download("cmudict", quiet=True)
-    from nltk.corpus import cmudict
-except:
-    print("Warning: NLTK data not available")
class SimpleG2P:
    """Simple grapheme-to-phoneme converter for reference text.

    Words present in the CMU pronouncing dictionary are converted from
    ARPAbet symbols to the IPA-style symbols in ``_CMU_TO_ESPEAK``; any other
    word is approximated from its spelling via ``_LETTER_TO_PHONEMES``.

    Relies on the module-level names ``cmudict`` (NLTK corpus), ``ipa``
    (``eng_to_ipa``) and ``re``. A missing ``cmudict`` or ``ipa`` is tolerated:
    the dictionary falls back to empty and the IPA string to ``/word/``.
    """

    # Mapping from CMU (stress digits already removed) to Wav2Vec2/eSpeak
    # phonemes. Built once at class creation instead of on every call.
    _CMU_TO_ESPEAK = {
        "AA": "ɑ",
        "AE": "æ",
        "AH": "ʌ",
        "AO": "ɔ",
        "AW": "aʊ",
        "AY": "aɪ",
        "EH": "ɛ",
        "ER": "ɝ",
        "EY": "eɪ",
        "IH": "ɪ",
        "IY": "i",
        "OW": "oʊ",
        "OY": "ɔɪ",
        "UH": "ʊ",
        "UW": "u",
        "B": "b",
        "CH": "tʃ",
        "D": "d",
        "DH": "ð",
        "F": "f",
        "G": "ɡ",
        "HH": "h",
        "JH": "dʒ",
        "K": "k",
        "L": "l",
        "M": "m",
        "N": "n",
        "NG": "ŋ",
        "P": "p",
        "R": "r",
        "S": "s",
        "SH": "ʃ",
        "T": "t",
        "TH": "θ",
        "V": "v",
        "W": "w",
        "Y": "j",
        "Z": "z",
        "ZH": "ʒ",
    }

    # Spelling-based fallback for words missing from the dictionary.
    # Two-letter keys are tried before single letters.
    _LETTER_TO_PHONEMES = {
        "ch": ["tʃ"],
        "sh": ["ʃ"],
        "th": ["θ"],
        "ph": ["f"],
        "ck": ["k"],
        "ng": ["ŋ"],
        "qu": ["k", "w"],
        "a": ["æ"],
        "e": ["ɛ"],
        "i": ["ɪ"],
        "o": ["ʌ"],
        "u": ["ʌ"],
        "b": ["b"],
        "c": ["k"],
        "d": ["d"],
        "f": ["f"],
        "g": ["ɡ"],
        "h": ["h"],
        "j": ["dʒ"],
        "k": ["k"],
        "l": ["l"],
        "m": ["m"],
        "n": ["n"],
        "p": ["p"],
        "r": ["r"],
        "s": ["s"],
        "t": ["t"],
        "v": ["v"],
        "w": ["w"],
        "x": ["k", "s"],
        "y": ["j"],
        "z": ["z"],
    }

    _NON_WORD_RE = re.compile(r"[^\w\s\']")
    _WHITESPACE_RE = re.compile(r"\s+")
    _STRESS_DIGIT_RE = re.compile(r"[0-9]")

    def __init__(self):
        """Load the CMU dictionary, falling back to an empty one on failure."""
        try:
            self.cmu_dict = cmudict.dict()
        except Exception:
            # Best effort: a missing corpus (or failed import of `cmudict`)
            # must not prevent construction. `Exception` rather than a bare
            # `except` so Ctrl-C / SystemExit still propagate.
            self.cmu_dict = {}
            print("Warning: CMU dictionary not available")

    def text_to_phonemes(self, text: str) -> List[Dict]:
        """Convert text to a list of per-word phoneme records.

        Returns:
            One dict per word with the keys ``word``, ``phonemes``, ``ipa``
            and ``phoneme_string``.
        """
        phoneme_sequence = []
        for word in self._clean_text(text).split():
            word_phonemes = self._get_word_phonemes(word)
            phoneme_sequence.append(
                {
                    "word": word,
                    "phonemes": word_phonemes,
                    "ipa": self._get_ipa(word),
                    "phoneme_string": " ".join(word_phonemes),
                }
            )
        return phoneme_sequence

    def get_reference_phoneme_string(self, text: str) -> str:
        """Return all phonemes of ``text`` as one space-separated string."""
        return " ".join(
            phoneme
            for word_data in self.text_to_phonemes(text)
            for phoneme in word_data["phonemes"]
        )

    def _clean_text(self, text: str) -> str:
        """Replace punctuation (except apostrophes) with spaces, collapse
        whitespace, lowercase and strip."""
        text = self._NON_WORD_RE.sub(" ", text)
        text = self._WHITESPACE_RE.sub(" ", text)
        return text.lower().strip()

    def _get_word_phonemes(self, word: str) -> List[str]:
        """Return phonemes for ``word`` from the dictionary, else an estimate.

        Only the first listed pronunciation is used. A missing or empty
        dictionary entry falls back to the spelling-based estimate.
        """
        pronunciations = self.cmu_dict.get(word.lower())
        if not pronunciations:
            return self._estimate_phonemes(word)
        # Remove stress markers and convert to Wav2Vec2 phoneme format.
        clean_phonemes = [
            self._STRESS_DIGIT_RE.sub("", p) for p in pronunciations[0]
        ]
        return self._convert_to_wav2vec_format(clean_phonemes)

    def _convert_to_wav2vec_format(self, cmu_phonemes: List[str]) -> List[str]:
        """Map CMU symbols to the target symbols; unknown symbols are
        passed through lowercased."""
        table = self._CMU_TO_ESPEAK
        return [table.get(p, p.lower()) for p in cmu_phonemes]

    def _get_ipa(self, word: str) -> str:
        """Return the IPA transcription, or ``/word/`` if conversion fails."""
        try:
            return ipa.convert(word)
        except Exception:
            # Covers both a conversion error and `ipa` being unavailable.
            return f"/{word}/"

    def _estimate_phonemes(self, word: str) -> List[str]:
        """Estimate phonemes for an unknown word from its spelling.

        Scans left to right, preferring two-letter combinations; characters
        with no mapping are skipped.
        """
        table = self._LETTER_TO_PHONEMES
        word = word.lower()
        phonemes = []
        i = 0
        while i < len(word):
            two_char = word[i : i + 2]
            if len(two_char) == 2 and two_char in table:
                phonemes.extend(table[two_char])
                i += 2
                continue
            phonemes.extend(table.get(word[i], ()))
            i += 1
        return phonemes
class PhonemeComparator:
    """Position-by-position comparison of reference and learner phonemes.

    No sequence alignment is performed: the i-th reference phoneme is always
    compared with the i-th learner phoneme.
    """

    def __init__(self):
        # Common phoneme substitutions made by Vietnamese speakers; a learner
        # phoneme listed under its reference counts as "acceptable".
        self.substitution_patterns = {
            "θ": ["f", "s", "t"],  # TH -> F, S, T
            "ð": ["d", "z", "v"],  # DH -> D, Z, V
            "v": ["w", "f"],  # V -> W, F
            "r": ["l"],  # R -> L
            "l": ["r"],  # L -> R
            "z": ["s"],  # Z -> S
            "ʒ": ["ʃ", "z"],  # ZH -> SH, Z
            "ŋ": ["n"],  # NG -> N
        }
        # Difficulty levels for Vietnamese speakers; phonemes not listed
        # default to 0.3 at lookup time.
        self.difficulty_map = {
            "θ": 0.9,  # th (think)
            "ð": 0.9,  # th (this)
            "v": 0.8,  # v
            "z": 0.8,  # z
            "ʒ": 0.9,  # zh (measure)
            "r": 0.7,  # r
            "l": 0.6,  # l
            "w": 0.5,  # w
            "f": 0.4,  # f
            "s": 0.3,  # s
            "ʃ": 0.5,  # sh
            "tʃ": 0.4,  # ch
            "dʒ": 0.5,  # j
            "ŋ": 0.3,  # ng
        }

    def compare_phoneme_sequences(
        self, reference_phonemes: str, learner_phonemes: str
    ) -> List[Dict]:
        """Score each position of two whitespace-separated phoneme strings.

        Returns:
            One dict per position with the keys ``position``,
            ``reference_phoneme``, ``learner_phoneme``, ``status``
            ("correct", "acceptable", "wrong", "missing" or "extra"),
            ``score`` and ``difficulty``. The absent side of a "missing" or
            "extra" entry is the empty string.
        """
        expected_seq = reference_phonemes.split()
        heard_seq = learner_phonemes.split()
        print(f"Reference phonemes: {expected_seq}")
        print(f"Learner phonemes: {heard_seq}")

        # Pad the shorter side with "" so both can be walked in lockstep.
        # split() never yields empty tokens, so "" unambiguously means absent.
        width = max(len(expected_seq), len(heard_seq))
        expected_padded = expected_seq + [""] * (width - len(expected_seq))
        heard_padded = heard_seq + [""] * (width - len(heard_seq))

        results = []
        for position, (expected, heard) in enumerate(
            zip(expected_padded, heard_padded)
        ):
            if not expected:
                status, score = "extra", 0.0
            elif not heard:
                status, score = "missing", 0.0
            elif expected == heard:
                status, score = "correct", 1.0
            elif self._is_acceptable_substitution(expected, heard):
                status, score = "acceptable", 0.7
            else:
                status, score = "wrong", 0.2

            results.append(
                {
                    "position": position,
                    "reference_phoneme": expected,
                    "learner_phoneme": heard,
                    "status": status,
                    "score": score,
                    "difficulty": self.difficulty_map.get(expected, 0.3),
                }
            )
        return results

    def _is_acceptable_substitution(self, reference: str, learner: str) -> bool:
        """Return True if ``learner`` is a listed substitution for ``reference``."""
        return learner in self.substitution_patterns.get(reference, [])
-# =============================================================================
-# WORD ANALYZER
-# =============================================================================
class WordAnalyzer:
    """Word-level pronunciation analysis built on phoneme comparisons."""

    def __init__(self):
        self.g2p = SimpleG2P()
        self.comparator = PhonemeComparator()

    def analyze_words(self, reference_text: str, learner_phonemes: str) -> Dict:
        """Compare the learner's phonemes with the reference text.

        Returns:
            Dict with ``word_highlights`` (per-word scores),
            ``phoneme_differences`` (the raw per-phoneme comparisons) and
            ``wrong_words`` (details for words scoring below 0.6).
        """
        converter = self.g2p
        per_word = converter.text_to_phonemes(reference_text)
        comparisons = self.comparator.compare_phoneme_sequences(
            converter.get_reference_phoneme_string(reference_text),
            learner_phonemes,
        )
        highlights = self._create_word_highlights(per_word, comparisons)
        return {
            "word_highlights": highlights,
            "phoneme_differences": comparisons,
            "wrong_words": self._identify_wrong_words(highlights, comparisons),
        }

    def _create_word_highlights(
        self, reference_words: List[Dict], phoneme_comparisons: List[Dict]
    ) -> List[Dict]:
        """Aggregate phoneme scores into one highlight record per word.

        Each word consumes as many consecutive comparisons as it has
        phonemes; a word whose window has no comparisons scores 0.0.
        """
        highlights = []
        cursor = 0  # index of this word's first phoneme in the flat list
        for entry in reference_words:
            phonemes = entry["phonemes"]
            span = len(phonemes)
            # Slicing silently clips at the end of the comparison list.
            scores = [
                item["score"]
                for item in phoneme_comparisons[cursor : cursor + span]
            ]
            mean_score = np.mean(scores) if scores else 0.0
            highlights.append(
                {
                    "word": entry["word"],
                    "score": float(mean_score),
                    "status": self._get_word_status(mean_score),
                    "color": self._get_word_color(mean_score),
                    "phonemes": phonemes,
                    "ipa": entry["ipa"],
                    "phoneme_scores": scores,
                    "phoneme_start_index": cursor,
                    "phoneme_end_index": cursor + span - 1,
                }
            )
            cursor += span
        return highlights

    def _identify_wrong_words(
        self, word_highlights: List[Dict], phoneme_comparisons: List[Dict]
    ) -> List[Dict]:
        """Collect details for every word whose score is below 0.6."""
        flagged = []
        for highlight in word_highlights:
            if not (highlight["score"] < 0.6):
                continue

            first = highlight["phoneme_start_index"]
            last = highlight["phoneme_end_index"]
            substituted = []
            dropped = []
            for item in phoneme_comparisons[first : last + 1]:
                status = item["status"]
                if status == "wrong":
                    substituted.append(
                        {
                            "expected": item["reference_phoneme"],
                            "actual": item["learner_phoneme"],
                            "difficulty": item["difficulty"],
                        }
                    )
                elif status == "missing":
                    dropped.append(
                        {
                            "phoneme": item["reference_phoneme"],
                            "difficulty": item["difficulty"],
                        }
                    )

            flagged.append(
                {
                    "word": highlight["word"],
                    "score": highlight["score"],
                    "expected_phonemes": highlight["phonemes"],
                    "ipa": highlight["ipa"],
                    "wrong_phonemes": substituted,
                    "missing_phonemes": dropped,
                    "tips": self._get_vietnamese_tips(substituted, dropped),
                }
            )
        return flagged

    def _get_word_status(self, score: float) -> str:
        """Map a score to "excellent", "good", "needs_practice" or "poor"."""
        for floor, label in (
            (0.8, "excellent"),
            (0.6, "good"),
            (0.4, "needs_practice"),
        ):
            if score >= floor:
                return label
        return "poor"

    def _get_word_color(self, score: float) -> str:
        """Map a score to the hex colour used for word highlighting."""
        for floor, colour in (
            (0.8, "#22c55e"),  # Green
            (0.6, "#84cc16"),  # Light green
            (0.4, "#eab308"),  # Yellow
        ):
            if score >= floor:
                return colour
        return "#ef4444"  # Red

    def _get_vietnamese_tips(
        self, wrong_phonemes: List[Dict], missing_phonemes: List[Dict]
    ) -> List[str]:
        """Build Vietnamese-language tips for wrong and missing phonemes.

        Wrong phonemes without a specific tip get a generic "practise X
        instead of Y" line; missing phonemes without a tip are skipped.
        """
        advice = {
            "θ": "Đặt lưỡi giữa răng trên và dưới, thổi nhẹ (think, three)",
            "ð": "Giống θ nhưng rung dây thanh âm (this, that)",
            "v": "Chạm môi dưới vào răng trên, không dùng cả hai môi như tiếng Việt",
            "r": "Cuộn lưỡi nhưng không chạm vào vòm miệng, không lăn lưỡi",
            "l": "Đầu lưỡi chạm vào vòm miệng sau răng",
            "z": "Giống âm 's' nhưng có rung dây thanh âm",
            "ʒ": "Giống âm 'ʃ' (sh) nhưng có rung dây thanh âm",
            "w": "Tròn môi như âm 'u', không dùng răng như âm 'v'",
        }
        tips = []
        for item in wrong_phonemes:
            expected, actual = item["expected"], item["actual"]
            hint = advice.get(expected)
            if hint is None:
                tips.append(f"Luyện âm '{expected}' thay vì '{actual}'")
            else:
                tips.append(f"Âm '{expected}': {hint}")
        tips.extend(
            f"Thiếu âm '{item['phoneme']}': {advice[item['phoneme']]}"
            for item in missing_phonemes
            if item["phoneme"] in advice
        )
        return tips
class SimpleFeedbackGenerator:
    """Generate simple, actionable feedback in Vietnamese."""

    # Detailed articulation tips keyed by reference phoneme.
    _DETAILED_PHONEME_TIPS = {
        "θ": "Đặt đầu lưỡi giữa 2 hàm răng, thổi nhẹ ra. Luyện: 'think', 'three', 'thank'.",
        "ð": "Như /θ/ nhưng rung dây thanh. Luyện: 'this', 'that', 'the'.",
        "v": "Răng trên chạm nhẹ môi dưới (không phải 2 môi). Luyện: 'very', 'have', 'love'.",
        "r": "Cuộn lưỡi lên nhưng KHÔNG chạm nóc miệng. Luyện: 'red', 'run', 'car'.",
        "l": "Đầu lưỡi chạm nướu răng trên. Luyện: 'love', 'like', 'tell'.",
        "z": "Như 's' nhưng rung dây thanh (đặt tay vào cổ để cảm nhận). Luyện: 'zoo', 'buzz'.",
        "ɛ": "Mở miệng vừa, lưỡi thấp (như 'e' trong 'ten'). Luyện: 'bed', 'red', 'get'.",
        "æ": "Mở miệng rộng, hàm dưới hạ thấp. Luyện: 'cat', 'man', 'bad'.",
        "ɪ": "Âm 'i' ngắn, lưỡi thả lỏng. Luyện: 'sit', 'big', 'this'.",
        "ʊ": "Âm 'u' ngắn, môi tròn nhẹ. Luyện: 'book', 'put', 'could'.",
    }

    @staticmethod
    def _percent(fraction: float) -> int:
        """Convert a 0-1 fraction to a whole percentage.

        Uses round() rather than int(): binary floats such as 0.57 * 100
        evaluate to 56.99999999999999, which int() would truncate to 56.
        """
        return round(fraction * 100)

    def generate_feedback(
        self,
        overall_score: float,
        wrong_words: List[Dict],
        phoneme_comparisons: List[Dict],
    ) -> List[str]:
        """Generate focused Vietnamese feedback with actionable improvements.

        Args:
            overall_score: Overall pronunciation score as a 0-1 fraction.
            wrong_words: Dicts with at least ``word`` and ``score`` keys.
            phoneme_comparisons: Dicts with at least ``status`` and
                ``reference_phoneme`` keys.

        Returns:
            Feedback lines in order: overall verdict, optional wrong-word
            advice, optional tip for the most frequently wrong/missing
            phoneme, and (for scores below 0.8) suggested next steps.
        """
        feedback = []
        score_pct = self._percent(overall_score)

        # Overall verdict by score band.
        if overall_score >= 0.8:
            feedback.append(f"Xuất sắc! Điểm: {score_pct}%. Tiếp tục duy trì và luyện tập thêm tốc độ tự nhiên.")
        elif overall_score >= 0.7:
            feedback.append(f"Tốt! Điểm: {score_pct}%. Để đạt 80%+, hãy tập trung vào nhịp điệu và ngữ điệu.")
        elif overall_score >= 0.6:
            feedback.append(f"Khá! Điểm: {score_pct}%. Để cải thiện, hãy phát âm chậm hơn và rõ ràng từng âm.")
        elif overall_score >= 0.4:
            feedback.append(f"Cần cải thiện. Điểm: {score_pct}%. Nghe lại mẫu và tập từng từ riêng lẻ trước.")
        else:
            feedback.append(f"Điểm: {score_pct}%. Hãy nghe mẫu 3-5 lần, sau đó tập phát âm từng từ chậm rãi.")

        # Wrong-word advice, focusing on the lowest-scoring word.
        if wrong_words:
            worst_word = min(wrong_words, key=lambda x: x["score"])
            worst_pct = self._percent(worst_word["score"])
            if len(wrong_words) == 1:
                feedback.append(f"Tập trung vào từ '{worst_word['word']}' (điểm: {worst_pct}%). Click vào từ để nghe lại.")
            elif len(wrong_words) <= 3:
                feedback.append(f"Ưu tiên cải thiện: '{worst_word['word']}' ({worst_pct}%) - các từ khác sẽ dễ hơn sau khi nắm được từ này.")
            else:
                feedback.append(f"Có {len(wrong_words)} từ cần cải thiện. Bắt đầu với 2 từ khó nhất và luyện tập 5 lần mỗi từ.")

        # Count how often each reference phoneme was wrong or missing.
        problem_phonemes = defaultdict(int)
        for comparison in phoneme_comparisons:
            if comparison["status"] in ["wrong", "missing"]:
                problem_phonemes[comparison["reference_phoneme"]] += 1

        if problem_phonemes:
            # max() returns the first-seen phoneme among ties, matching a
            # stable descending sort.
            phoneme, count = max(problem_phonemes.items(), key=lambda x: x[1])
            if phoneme in self._DETAILED_PHONEME_TIPS:
                # Estimated gain is this phoneme's share of all compared
                # positions (not its complement).
                improvement = self._percent(count / len(phoneme_comparisons))
                feedback.append(
                    f"🎯 Tập trung âm /{phoneme}/: {self._DETAILED_PHONEME_TIPS[phoneme]} Cải thiện âm này sẽ tăng điểm ~{improvement}%."
                )

        # Next-step suggestions; none once the score reaches 0.8.
        if overall_score < 0.5:
            feedback.append("📚 Bước tiếp: 1) Nghe mẫu 5 lần, 2) Tập phát âm từng từ 3 lần, 3) Ghi âm lại và so sánh.")
        elif overall_score < 0.7:
            feedback.append("📚 Bước tiếp: 1) Tập từ khó nhất 5 lần, 2) Đọc cả câu chậm 2 lần, 3) Tăng tốc độ dần.")
        elif overall_score < 0.8:
            feedback.append("📚 Bước tiếp: 1) Luyện ngữ điệu tự nhiên, 2) Kết nối âm giữa các từ, 3) Tập nói với cảm xúc.")
        return feedback
 def convert_numpy_types(obj):

 import numpy as np
 import nltk
 def convert_numpy_types(obj):