ABAO77 committed on
Commit
9537fdb
·
1 Parent(s): 043359f

Remove the evaluation module; its functions are no longer needed

Browse files
evalution.py DELETED
@@ -1,1675 +0,0 @@
1
- import asyncio
2
- import concurrent.futures
3
- from functools import lru_cache
4
- import time
5
- from typing import List, Dict, Optional, Tuple
6
- import numpy as np
7
- import librosa
8
- import nltk
9
- import eng_to_ipa as ipa
10
- import re
11
- from collections import defaultdict
12
- from loguru import logger
13
- import Levenshtein
14
- from dataclasses import dataclass
15
- from enum import Enum
16
- from src.AI_Models.wave2vec_inference import (
17
- create_inference,
18
- export_to_onnx,
19
- )
20
-
21
# Fetch the CMU Pronouncing Dictionary used for grapheme-to-phoneme lookup.
# This is best-effort: the module must still import when the corpus cannot be
# downloaded or loaded, so ``cmudict`` is bound to ``None`` in that case
# instead of being left undefined.
try:
    nltk.download("cmudict", quiet=True)
    from nltk.corpus import cmudict
except Exception as exc:  # deliberately broad, but lets Ctrl-C / SystemExit through
    cmudict = None
    print(f"Warning: NLTK data not available ({exc})")
27
-
28
-
29
class AssessmentMode(Enum):
    """Granularity requested for a pronunciation assessment."""

    # Word mode is the one for which the word analyzer additionally maps
    # phoneme errors onto individual characters.
    WORD = "word"
    SENTENCE = "sentence"
    # NOTE(review): presumably resolved to WORD or SENTENCE by the caller;
    # the resolution logic is not visible in this part of the file.
    AUTO = "auto"
33
-
34
-
35
class ErrorType(Enum):
    """Outcome of aligning one reference phoneme with the learner's phoneme.

    The ``value`` strings are what ends up in the ``status`` / ``error_type``
    fields of the comparison dictionaries.
    """

    CORRECT = "correct"            # phonemes match
    SUBSTITUTION = "substitution"  # a different phoneme was produced
    DELETION = "deletion"          # reference phoneme has no learner counterpart
    INSERTION = "insertion"        # learner phoneme has no reference counterpart
    ACCEPTABLE = "acceptable"      # substitution listed as tolerated for the speaker group
41
-
42
-
43
@dataclass
class CharacterError:
    """A pronunciation error pinned to one character of the written word.

    Instances are produced when phoneme-level errors are projected back onto
    the spelling so a UI can highlight the offending letter.
    """

    character: str       # the letter of the word the error is attached to
    position: int        # index of that letter inside the word
    error_type: str      # status string of the underlying phoneme comparison
    expected_sound: str  # reference phoneme
    actual_sound: str    # phoneme the learner produced ("" when none)
    severity: float      # 1.0 minus the phoneme score; higher is worse
    color: str           # hex colour code chosen from the severity
54
-
55
-
56
class EnhancedWav2Vec2CharacterASR:
    """Character-level Wav2Vec2 recogniser that also returns prosody features.

    Wraps the inference object returned by ``create_inference`` and enriches
    its transcript with a phoneme rendering, simple audio statistics (pitch,
    rhythm, intensity) and a heuristic confidence value.
    """

    # One-letter-one-sound fallback used when the G2P converter is unavailable
    # or fails for a word.  Built once instead of on every call.
    _LETTER_TO_PHONEME = {
        "a": "æ", "b": "b", "c": "k", "d": "d", "e": "ɛ", "f": "f",
        "g": "ɡ", "h": "h", "i": "ɪ", "j": "dʒ", "k": "k", "l": "l",
        "m": "m", "n": "n", "o": "ʌ", "p": "p", "q": "k", "r": "r",
        "s": "s", "t": "t", "u": "ʌ", "v": "v", "w": "w", "x": "ks",
        "y": "j", "z": "z",
    }

    def __init__(
        self,
        model_name: str = "facebook/wav2vec2-large-960h-lv60-self",
        onnx: bool = False,
        quantized: bool = False,
    ):
        """Create the underlying inference object.

        Args:
            model_name: Model identifier forwarded to ``create_inference``
                (and to ``export_to_onnx`` when an export is needed).
            onnx: Use the ONNX runtime path; exports the model first when the
                expected ``.onnx`` file is not present in the working directory.
            quantized: Use / export the quantized ONNX variant.
        """
        self.use_onnx = onnx
        self.sample_rate = 16000
        self.model_name = model_name
        # Grapheme-to-phoneme converter, built lazily on first use and then
        # reused: constructing it loads the whole CMU dictionary.
        self._g2p = None

        if onnx:
            import os

            # NOTE(review): the file name is fixed and does not follow
            # ``model_name`` - confirm this matches what export_to_onnx writes.
            model_path = (
                f"wav2vec2-large-960h-lv60-self{'.quant' if quantized else ''}.onnx"
            )
            if not os.path.exists(model_path):
                export_to_onnx(model_name, quantize=quantized)

        self.model = create_inference(
            model_name=model_name,
            use_onnx=onnx,
            use_onnx_quantize=quantized,
            use_gpu=True,
        )

    def transcribe_with_features(self, audio_path: str) -> Dict:
        """Transcribe an audio file and attach phonemes, features and confidence.

        Args:
            audio_path: Path of the audio file to analyse.

        Returns:
            Dict with ``character_transcript``, ``phoneme_representation``,
            ``audio_features`` and ``confidence``.  Any failure is logged and
            the empty result from :meth:`_empty_result` is returned instead.
        """
        try:
            start_time = time.time()

            character_transcript = self._clean_character_transcript(
                self.model.file_to_text(audio_path)
            )
            phoneme_representation = self._characters_to_phoneme_representation(
                character_transcript
            )
            audio_features = self._extract_basic_audio_features(audio_path)

            logger.info(
                f"Optimized transcription time: {time.time() - start_time:.2f}s"
            )

            return {
                "character_transcript": character_transcript,
                "phoneme_representation": phoneme_representation,
                "audio_features": audio_features,
                "confidence": self._estimate_confidence(character_transcript),
            }

        except Exception as e:
            logger.error(f"Enhanced ASR error: {e}")
            return self._empty_result()

    def _extract_basic_audio_features(self, audio_path: str) -> Dict:
        """Compute duration, pitch, rhythm and intensity statistics.

        The audio is loaded at ``self.sample_rate``.  Pitch is taken from every
        10th ``piptrack`` frame (strongest bin per frame) and values of 80 or
        below are discarded as noise.

        Returns:
            Dict with ``duration``, ``pitch``, ``rhythm`` and ``intensity``
            entries, or ``{"duration": 0, "error": <message>}`` on failure.
        """
        try:
            y, sr = librosa.load(audio_path, sr=self.sample_rate)
            duration = len(y) / sr

            pitches, magnitudes = librosa.piptrack(y=y, sr=sr, threshold=0.1)
            pitch_values = []
            for t in range(0, pitches.shape[1], 10):  # sample every 10th frame
                pitch = pitches[magnitudes[:, t].argmax(), t]
                if pitch > 80:  # filter noise
                    pitch_values.append(pitch)

            tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
            rms = librosa.feature.rms(y=y, frame_length=2048, hop_length=512)[0]

            # Compute each statistic once; empty input yields neutral zeros.
            pitch_mean = np.mean(pitch_values) if pitch_values else 0
            pitch_std = np.std(pitch_values) if pitch_values else 0
            pitch_range = (
                np.max(pitch_values) - np.min(pitch_values)
                if len(pitch_values) > 1
                else 0
            )
            pitch_cv = pitch_std / pitch_mean if pitch_values and pitch_mean > 0 else 0

            return {
                "duration": duration,
                "pitch": {
                    "values": pitch_values,
                    "mean": pitch_mean,
                    "std": pitch_std,
                    "range": pitch_range,
                    "cv": pitch_cv,
                },
                "rhythm": {
                    "tempo": tempo,
                    "beats_per_second": len(beats) / duration if duration > 0 else 0,
                },
                "intensity": {
                    "rms_mean": np.mean(rms),
                    "rms_std": np.std(rms),
                },
            }

        except Exception as e:
            logger.error(f"Audio feature extraction error: {e}")
            return {"duration": 0, "error": str(e)}

    def _clean_character_transcript(self, transcript: str) -> str:
        """Collapse whitespace runs, trim and lower-case the raw transcript."""
        logger.info(f"Raw transcript before cleaning: {transcript}")
        cleaned = re.sub(r"\s+", " ", transcript)
        return cleaned.strip().lower()

    def _get_g2p(self):
        """Return the shared G2P converter, creating it on first use."""
        g2p = getattr(self, "_g2p", None)
        if g2p is None:
            g2p = EnhancedG2P()
            self._g2p = g2p
        return g2p

    def _characters_to_phoneme_representation(self, text: str) -> str:
        """Render a transcript as a space-separated phoneme string.

        Each word goes through the G2P converter; when the converter cannot be
        built or fails for a word, that word falls back to the naive
        letter-to-phoneme table.
        """
        if not text:
            return ""

        try:
            g2p = self._get_g2p()
        except Exception as exc:
            logger.warning(f"G2P unavailable, using letter fallback: {exc}")
            g2p = None

        phonemes: List[str] = []
        for word in text.split():
            if g2p is not None:
                try:
                    phonemes.extend(g2p.word_to_phonemes(word))
                    continue
                except Exception as exc:
                    logger.warning(f"G2P failed for '{word}': {exc}")
            phonemes.extend(self._simple_letter_to_phoneme(word))

        return " ".join(phonemes)

    def _simple_letter_to_phoneme(self, word: str) -> List[str]:
        """Map each ASCII letter to one phoneme; other characters are dropped."""
        table = self._LETTER_TO_PHONEME
        return [table[letter] for letter in word.lower() if letter in table]

    def _estimate_confidence(self, transcript: str) -> float:
        """Heuristic confidence in [0, 1].

        Starts at 1.0 and loses 0.2 for every run of three or more identical
        characters; transcripts shorter than two characters score 0.0.
        """
        if not transcript or len(transcript.strip()) < 2:
            return 0.0

        repeated_chars = len(re.findall(r"(.)\1{2,}", transcript))
        return max(0.0, 1.0 - (repeated_chars * 0.2))

    def _empty_result(self) -> Dict:
        """Result returned when transcription fails."""
        return {
            "character_transcript": "",
            "phoneme_representation": "",
            "audio_features": {"duration": 0},
            "confidence": 0.0,
        }
225
-
226
-
227
class EnhancedG2P:
    """Grapheme-to-phoneme converter with visualisation metadata.

    Words are looked up in the CMU Pronouncing Dictionary and converted to
    IPA; unknown words fall back to a simple spelling-based estimate.  The
    class also carries the substitution and difficulty tables used to judge
    errors made by Vietnamese speakers.
    """

    # CMU (ARPAbet) symbol -> IPA symbol.
    _CMU_TO_IPA = {
        "AA": "ɑ", "AE": "æ", "AH": "ʌ", "AO": "ɔ", "AW": "aʊ", "AY": "aɪ",
        "EH": "ɛ", "ER": "ɝ", "EY": "eɪ", "IH": "ɪ", "IY": "i", "OW": "oʊ",
        "OY": "ɔɪ", "UH": "ʊ", "UW": "u", "B": "b", "CH": "tʃ", "D": "d",
        "DH": "ð", "F": "f", "G": "ɡ", "HH": "h", "JH": "dʒ", "K": "k",
        "L": "l", "M": "m", "N": "n", "NG": "ŋ", "P": "p", "R": "r",
        "S": "s", "SH": "ʃ", "T": "t", "TH": "θ", "V": "v", "W": "w",
        "Y": "j", "Z": "z", "ZH": "ʒ",
    }

    # Spelling -> phoneme guesses for words missing from the dictionary.
    # Two-letter entries are tried before single letters.
    _SPELLING_TO_PHONEME = {
        "ch": "tʃ", "sh": "ʃ", "th": "θ", "ph": "f", "ck": "k", "ng": "ŋ", "qu": "kw",
        "a": "æ", "e": "ɛ", "i": "ɪ", "o": "ʌ", "u": "ʌ", "b": "b", "c": "k",
        "d": "d", "f": "f", "g": "ɡ", "h": "h", "j": "dʒ", "k": "k", "l": "l",
        "m": "m", "n": "n", "p": "p", "r": "r", "s": "s", "t": "t", "v": "v",
        "w": "w", "x": "ks", "y": "j", "z": "z",
    }

    _VOWEL_PHONEMES = frozenset({
        "ɑ", "æ", "ʌ", "ɔ", "aʊ", "aɪ", "ɛ", "ɝ", "eɪ", "ɪ", "i", "oʊ", "ɔɪ", "ʊ", "u",
    })
    _DIFFICULT_CONSONANTS = frozenset({"θ", "ð", "v", "z", "ʒ", "r", "w"})

    _PHONEME_DESCRIPTIONS = {
        "θ": "Voiceless dental fricative (like 'th' in 'think')",
        "ð": "Voiced dental fricative (like 'th' in 'this')",
        "v": "Voiced labiodental fricative (like 'v' in 'van')",
        "z": "Voiced alveolar fricative (like 'z' in 'zip')",
        "ʒ": "Voiced postalveolar fricative (like 's' in 'measure')",
        "r": "Alveolar approximant (like 'r' in 'red')",
        "w": "Labial-velar approximant (like 'w' in 'wet')",
        "æ": "Near-open front unrounded vowel (like 'a' in 'cat')",
        "ɪ": "Near-close near-front unrounded vowel (like 'i' in 'sit')",
        "ʊ": "Near-close near-back rounded vowel (like 'u' in 'put')",
    }

    # Reference phoneme -> substitutions tolerated for Vietnamese speakers.
    _VN_SUBSTITUTIONS = {
        "θ": ["f", "s", "t", "d"],
        "ð": ["d", "z", "v", "t"],
        "v": ["w", "f", "b"],
        "w": ["v", "b"],
        "r": ["l", "n"],
        "l": ["r", "n"],
        "z": ["s", "j"],
        "ʒ": ["ʃ", "z", "s"],
        "ʃ": ["s", "ʒ"],
        "ŋ": ["n", "m"],
        "tʃ": ["ʃ", "s", "k"],
        # Both the ASCII "g" and the IPA "ɡ" (U+0261) are listed: every
        # phoneme table in this class emits the IPA form, so the ASCII entry
        # alone could never match.
        "dʒ": ["ʒ", "j", "g", "ɡ"],
        "æ": ["ɛ", "a"],
        "ɪ": ["i"],
        "ʊ": ["u"],
    }

    # Per-phoneme difficulty for Vietnamese speakers (0 = easy, 1 = hard).
    _DIFFICULTY_SCORES = {
        "θ": 0.9, "ð": 0.9, "v": 0.8, "z": 0.8, "ʒ": 0.9,
        "r": 0.7, "l": 0.6, "w": 0.5, "æ": 0.7, "ɪ": 0.6, "ʊ": 0.6,
        "ŋ": 0.3, "f": 0.2, "s": 0.2, "ʃ": 0.5, "tʃ": 0.4, "dʒ": 0.5,
    }

    _DIGITS = re.compile(r"[0-9]")  # CMU stress markers

    def __init__(self):
        """Load the CMU dictionary (empty on failure) and the lookup tables."""
        try:
            self.cmu_dict = cmudict.dict()
        except Exception as exc:
            # Covers a failed NLTK import as well as a missing corpus.
            self.cmu_dict = {}
            logger.warning(f"CMU dictionary not available: {exc}")

        # Instance-level copies so callers may tweak them without affecting
        # other converters.
        self.vn_substitutions = {
            ref: list(options) for ref, options in self._VN_SUBSTITUTIONS.items()
        }
        self.difficulty_scores = dict(self._DIFFICULTY_SCORES)
        self._init_caches()

    def _init_caches(self) -> None:
        """Create per-instance LRU caches.

        Decorating the methods themselves with ``lru_cache`` would key the
        cache on ``self`` at class level and keep every instance alive.
        """
        self._cached_word_lookup = lru_cache(maxsize=1000)(self._lookup_word)
        self._cached_text_lookup = lru_cache(maxsize=500)(self._build_phoneme_string)

    def _ensure_caches(self) -> None:
        """Build the caches for instances that bypassed ``__init__``."""
        if "_cached_word_lookup" not in self.__dict__:
            self._init_caches()

    def _lookup_word(self, word: str) -> tuple:
        """Uncached word conversion; returns an immutable tuple of phonemes."""
        key = word.lower().strip()
        pronunciations = self.cmu_dict.get(key)
        if pronunciations:
            # Use the first listed pronunciation.
            return tuple(self._convert_cmu_to_ipa(pronunciations[0]))
        return tuple(self._estimate_phonemes(key))

    def _build_phoneme_string(self, text: str) -> str:
        """Uncached text conversion to a space-separated phoneme string."""
        phonemes: List[str] = []
        for word in self._clean_text(text).split():
            if word:
                phonemes.extend(self.word_to_phonemes(word))
        return " ".join(phonemes)

    def word_to_phonemes(self, word: str) -> List[str]:
        """Convert one word to a list of IPA phonemes (cached per instance).

        A fresh list is returned on every call, so callers may mutate it
        without corrupting the cache.
        """
        self._ensure_caches()
        return list(self._cached_word_lookup(word))

    def get_phoneme_string(self, text: str) -> str:
        """Return the space-separated phonemes of ``text`` (cached per instance)."""
        self._ensure_caches()
        return self._cached_text_lookup(text)

    def text_to_phonemes(self, text: str) -> List[Dict]:
        """Convert text to one record per word.

        Each record holds the word, its phoneme list, an IPA transcription,
        the space-joined phoneme string and per-phoneme visualisation data.
        """
        sequence = []
        for word in self._clean_text(text).split():
            word_phonemes = self.word_to_phonemes(word)
            sequence.append(
                {
                    "word": word,
                    "phonemes": word_phonemes,
                    "ipa": self._get_ipa(word),
                    "phoneme_string": " ".join(word_phonemes),
                    "visualization": self._create_phoneme_visualization(word_phonemes),
                }
            )
        return sequence

    def _convert_cmu_to_ipa(self, cmu_phonemes: List[str]) -> List[str]:
        """Strip stress digits and map CMU symbols to IPA.

        Symbols missing from the table are passed through lower-cased.
        """
        converted = []
        for symbol in cmu_phonemes:
            bare = self._DIGITS.sub("", symbol)
            converted.append(self._CMU_TO_IPA.get(bare, bare.lower()))
        return converted

    def _estimate_phonemes(self, word: str) -> List[str]:
        """Guess phonemes from spelling for words not in the dictionary.

        Digraphs are matched greedily before single letters; characters with
        no mapping are skipped.
        """
        table = self._SPELLING_TO_PHONEME
        phonemes = []
        i = 0
        while i < len(word):
            digraph = word[i : i + 2]
            if len(digraph) == 2 and digraph in table:
                phonemes.append(table[digraph])
                i += 2
                continue

            char = word[i]
            if char in table:
                phonemes.append(table[char])
            i += 1

        return phonemes

    def _clean_text(self, text: str) -> str:
        """Replace punctuation (except apostrophes) with spaces and lower-case."""
        text = re.sub(r"[^\w\s']", " ", text)
        text = re.sub(r"\s+", " ", text)
        return text.lower().strip()

    def _get_ipa(self, word: str) -> str:
        """Return the ``eng_to_ipa`` transcription, or ``/word/`` on failure."""
        try:
            return ipa.convert(word)
        except Exception:
            return f"/{word}/"

    def _create_phoneme_visualization(self, phonemes: List[str]) -> List[Dict]:
        """Build colour category, description and difficulty for each phoneme."""
        return [
            {
                "phoneme": phoneme,
                "color_category": self._get_phoneme_color_category(phoneme),
                "description": self._get_phoneme_description(phoneme),
                "difficulty": self.difficulty_scores.get(phoneme, 0.3),
            }
            for phoneme in phonemes
        ]

    def _get_phoneme_color_category(self, phoneme: str) -> str:
        """Classify a phoneme as ``vowel``, ``difficult`` or ``consonant``."""
        if phoneme in self._VOWEL_PHONEMES:
            return "vowel"
        if phoneme in self._DIFFICULT_CONSONANTS:
            return "difficult"
        return "consonant"

    def _get_phoneme_description(self, phoneme: str) -> str:
        """Return a human-readable description, or a generic label."""
        return self._PHONEME_DESCRIPTIONS.get(phoneme, f"Phoneme: {phoneme}")

    def is_acceptable_substitution(self, reference: str, predicted: str) -> bool:
        """Tell whether ``predicted`` is a tolerated stand-in for ``reference``."""
        return predicted in self.vn_substitutions.get(reference, [])

    def get_difficulty_score(self, phoneme: str) -> float:
        """Return the difficulty of a phoneme (0.3 when unlisted)."""
        return self.difficulty_scores.get(phoneme, 0.3)
419
-
420
-
421
class AdvancedPhonemeComparator:
    """Aligns reference and learner phonemes with Levenshtein edit operations."""

    def __init__(self):
        # Supplies the acceptable-substitution check and difficulty scores.
        self.g2p = EnhancedG2P()

    def compare_with_levenshtein(self, reference: str, predicted: str) -> List[Dict]:
        """Align two space-separated phoneme strings and grade every position.

        Matching phonemes score 1.0, tolerated substitutions 0.7, other
        substitutions 0.2, deletions and insertions 0.0.

        Args:
            reference: Expected phonemes, space separated.
            predicted: Learner phonemes, space separated.

        Returns:
            Comparison dictionaries in alignment order; empty when the
            reference has no phonemes.
        """
        ref_phones = reference.split() if reference else []
        pred_phones = predicted.split() if predicted else []

        if not ref_phones:
            return []

        rows: List[Dict] = []

        def emit(expected: str, heard: str, kind, score: float) -> None:
            # The position is simply the row's index in the output list.
            rows.append(
                self._create_comparison(expected, heard, kind, score, len(rows))
            )

        ref_cursor = 0
        pred_cursor = 0

        for operation, ref_at, pred_at in Levenshtein.editops(ref_phones, pred_phones):
            # Everything between the cursors and this edit is an untouched,
            # matching run on both sides.
            run = max(0, min(ref_at - ref_cursor, pred_at - pred_cursor))
            for offset in range(run):
                emit(
                    ref_phones[ref_cursor + offset],
                    pred_phones[pred_cursor + offset],
                    ErrorType.CORRECT,
                    1.0,
                )
            ref_cursor += run
            pred_cursor += run

            if operation == "replace":
                expected = ref_phones[ref_at]
                heard = pred_phones[pred_at]
                if self.g2p.is_acceptable_substitution(expected, heard):
                    emit(expected, heard, ErrorType.ACCEPTABLE, 0.7)
                else:
                    emit(expected, heard, ErrorType.SUBSTITUTION, 0.2)
                ref_cursor, pred_cursor = ref_at + 1, pred_at + 1

            elif operation == "delete":
                emit(ref_phones[ref_at], "", ErrorType.DELETION, 0.0)
                ref_cursor = ref_at + 1

            elif operation == "insert":
                emit("", pred_phones[pred_at], ErrorType.INSERTION, 0.0)
                pred_cursor = pred_at + 1

        # Whatever is left after the last edit matches pairwise.
        for expected, heard in zip(ref_phones[ref_cursor:], pred_phones[pred_cursor:]):
            emit(expected, heard, ErrorType.CORRECT, 1.0)

        return rows

    def _create_comparison(
        self,
        ref_phoneme: str,
        pred_phoneme: str,
        error_type: ErrorType,
        score: float,
        position: int,
    ) -> Dict:
        """Assemble one comparison record.

        ``status`` and ``error_type`` both carry the enum's string value.
        """
        label = error_type.value
        return {
            "position": position,
            "reference_phoneme": ref_phoneme,
            "learner_phoneme": pred_phoneme,
            "status": label,
            "score": score,
            "difficulty": self.g2p.get_difficulty_score(ref_phoneme),
            "error_type": label,
        }
527
-
528
-
529
class EnhancedWordAnalyzer:
    """Word-level pronunciation analysis with character-level error mapping."""

    # Vietnamese-language articulation tips, keyed by reference phoneme.
    _VIETNAMESE_TIPS = {
        "θ": "Đặt lưỡi giữa răng trên và dưới, thổi nhẹ (think, three)",
        "ð": "Giống θ nhưng rung dây thanh âm (this, that)",
        "v": "Chạm môi dưới vào răng trên, không dùng cả hai môi như tiếng Việt",
        "r": "Cuộn lưỡi nhưng không chạm vào vòm miệng, không lăn lưỡi",
        "l": "Đầu lưỡi chạm vào vòm miệng sau răng",
        "z": "Giống âm 's' nhưng có rung dây thanh âm",
        "ʒ": "Giống âm 'ʃ' (sh) nhưng có rung dây thanh âm",
        "w": "Tròn môi như âm 'u', không dùng răng như âm 'v'",
        "æ": "Mở miệng rộng hơn khi phát âm 'a'",
        "ɪ": "Âm 'i' ngắn, không kéo dài như tiếng Việt",
    }

    def __init__(self):
        """Create the G2P converter, the comparator and a small thread pool."""
        self.g2p = EnhancedG2P()
        self.comparator = AdvancedPhonemeComparator()
        # Thread pool for parallel processing
        self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=3)

    def analyze_words_enhanced(
        self, reference_text: str, learner_phonemes: str, mode: "AssessmentMode"
    ) -> Dict:
        """Compare the learner's phonemes with the reference text.

        Args:
            reference_text: Text the learner was asked to read.
            learner_phonemes: Space-separated phonemes recognised from audio.
            mode: Assessment mode; word mode adds character-level errors.

        Returns:
            Dict with ``word_highlights``, ``phoneme_differences``,
            ``wrong_words``, ``reference_phonemes`` and ``phoneme_pairs``.
        """
        # Both reference conversions are independent, so run them together.
        future_ref_phonemes = self.executor.submit(
            self.g2p.text_to_phonemes, reference_text
        )
        future_ref_phoneme_string = self.executor.submit(
            self.g2p.get_phoneme_string, reference_text
        )
        reference_words = future_ref_phonemes.result()
        reference_phoneme_string = future_ref_phoneme_string.result()

        phoneme_comparisons = self.comparator.compare_with_levenshtein(
            reference_phoneme_string, learner_phonemes
        )

        future_highlights = self.executor.submit(
            self._create_enhanced_word_highlights,
            reference_words,
            phoneme_comparisons,
            mode,
        )
        future_pairs = self.executor.submit(
            self._create_phoneme_pairs, reference_phoneme_string, learner_phonemes
        )
        word_highlights = future_highlights.result()
        phoneme_pairs = future_pairs.result()

        wrong_words = self._identify_wrong_words_enhanced(
            word_highlights, phoneme_comparisons
        )

        return {
            "word_highlights": word_highlights,
            "phoneme_differences": phoneme_comparisons,
            "wrong_words": wrong_words,
            "reference_phonemes": reference_phoneme_string,
            "phoneme_pairs": phoneme_pairs,
        }

    @staticmethod
    def _reference_aligned(phoneme_comparisons: List[Dict]) -> List[Dict]:
        """Drop insertion rows so row ``i`` belongs to reference phoneme ``i``.

        Insertions have no reference phoneme; leaving them in would shift
        every later word onto the wrong comparison rows.
        """
        return [
            row for row in phoneme_comparisons if row.get("status") != "insertion"
        ]

    def _create_enhanced_word_highlights(
        self,
        reference_words: List[Dict],
        phoneme_comparisons: List[Dict],
        mode: "AssessmentMode",
    ) -> List[Dict]:
        """Build one highlight record per reference word.

        The word score is the mean of the scores of its reference phonemes
        (0.0 when none are available).  ``phoneme_start_index`` and
        ``phoneme_end_index`` are positions in the reference phoneme sequence.
        """
        aligned = self._reference_aligned(phoneme_comparisons)
        word_highlights = []
        phoneme_index = 0

        for word_data in reference_words:
            word = word_data["word"]
            word_phonemes = word_data["phonemes"]
            num_phonemes = len(word_phonemes)

            # Slicing tolerates a comparison list shorter than the reference.
            word_comparisons = aligned[phoneme_index : phoneme_index + num_phonemes]
            word_phoneme_scores = [row["score"] for row in word_comparisons]
            word_score = np.mean(word_phoneme_scores) if word_phoneme_scores else 0.0

            # Character-level mapping is only produced in word mode.
            character_errors = []
            if mode == AssessmentMode.WORD:
                character_errors = self._map_phonemes_to_characters(
                    word, word_comparisons
                )

            word_highlights.append(
                {
                    "word": word,
                    "score": float(word_score),
                    "status": self._get_word_status(word_score),
                    "color": self._get_word_color(word_score),
                    "phonemes": word_phonemes,
                    "ipa": word_data["ipa"],
                    "phoneme_scores": word_phoneme_scores,
                    "phoneme_start_index": phoneme_index,
                    "phoneme_end_index": phoneme_index + num_phonemes - 1,
                    "phoneme_visualization": word_data["visualization"],
                    "character_errors": character_errors,
                    "detailed_analysis": mode == AssessmentMode.WORD,
                }
            )
            phoneme_index += num_phonemes

        return word_highlights

    def _map_phonemes_to_characters(
        self, word: str, phoneme_comparisons: List[Dict]
    ) -> "List[CharacterError]":
        """Project phoneme errors onto characters of ``word``.

        Phonemes are spread evenly over the letters, so the position is an
        approximation.  Only substitutions and deletions are reported.
        """
        character_errors = []

        if not phoneme_comparisons or not word:
            return character_errors

        chars_per_phoneme = len(word) / len(phoneme_comparisons)

        for i, comparison in enumerate(phoneme_comparisons):
            if comparison["status"] not in ("substitution", "deletion", "wrong"):
                continue

            char_pos = min(int(i * chars_per_phoneme), len(word) - 1)
            severity = 1.0 - comparison["score"]
            character_errors.append(
                CharacterError(
                    character=word[char_pos],
                    position=char_pos,
                    error_type=comparison["status"],
                    expected_sound=comparison["reference_phoneme"],
                    actual_sound=comparison["learner_phoneme"],
                    severity=severity,
                    color=self._get_error_color(severity),
                )
            )

        return character_errors

    def _get_error_color(self, severity: float) -> str:
        """Map an error severity to a hex colour (red = worst)."""
        if severity >= 0.8:
            return "#ef4444"  # Red - severe error
        if severity >= 0.6:
            return "#f97316"  # Orange - moderate error
        if severity >= 0.4:
            return "#eab308"  # Yellow - mild error
        return "#84cc16"  # Light green - minor error

    def _identify_wrong_words_enhanced(
        self, word_highlights: List[Dict], phoneme_comparisons: List[Dict]
    ) -> List[Dict]:
        """Collect words scoring below 0.6 together with their phoneme errors.

        Each entry lists the substituted and missing phonemes and the
        matching pronunciation tips.
        """
        aligned = self._reference_aligned(phoneme_comparisons)
        wrong_words = []

        for word_highlight in word_highlights:
            if word_highlight["score"] >= 0.6:
                continue

            start_idx = word_highlight["phoneme_start_index"]
            end_idx = word_highlight["phoneme_end_index"]

            wrong_phonemes = []
            missing_phonemes = []

            for comparison in aligned[start_idx : end_idx + 1]:
                status = comparison["status"]
                reference = comparison["reference_phoneme"]

                if status in ("wrong", "substitution"):
                    wrong_phonemes.append(
                        {
                            "expected": reference,
                            "actual": comparison["learner_phoneme"],
                            "difficulty": comparison["difficulty"],
                            "description": self.g2p._get_phoneme_description(reference),
                        }
                    )
                elif status in ("missing", "deletion"):
                    missing_phonemes.append(
                        {
                            "phoneme": reference,
                            "difficulty": comparison["difficulty"],
                            "description": self.g2p._get_phoneme_description(reference),
                        }
                    )

            wrong_words.append(
                {
                    "word": word_highlight["word"],
                    "score": word_highlight["score"],
                    "expected_phonemes": word_highlight["phonemes"],
                    "ipa": word_highlight["ipa"],
                    "wrong_phonemes": wrong_phonemes,
                    "missing_phonemes": missing_phonemes,
                    "tips": self._get_enhanced_vietnamese_tips(
                        wrong_phonemes, missing_phonemes
                    ),
                    "phoneme_visualization": word_highlight["phoneme_visualization"],
                    "character_errors": word_highlight.get("character_errors", []),
                }
            )

        return wrong_words

    def _create_phoneme_pairs(self, reference: str, learner: str) -> List[Dict]:
        """Pair phonemes position by position for visualisation.

        This is a plain index-wise pairing (no edit-distance alignment):
        surplus reference phonemes become deletions and surplus learner
        phonemes become insertions.
        """
        ref_phones = reference.split() if reference else []
        learner_phones = learner.split() if learner else []
        shared = min(len(ref_phones), len(learner_phones))

        pairs = []
        for expected, heard in zip(ref_phones, learner_phones):
            same = expected == heard
            pairs.append(
                {
                    "reference": expected,
                    "learner": heard,
                    "match": same,
                    "type": "correct" if same else "substitution",
                }
            )

        pairs.extend(
            {"reference": expected, "learner": "", "match": False, "type": "deletion"}
            for expected in ref_phones[shared:]
        )
        pairs.extend(
            {"reference": "", "learner": heard, "match": False, "type": "insertion"}
            for heard in learner_phones[shared:]
        )

        return pairs

    def _get_word_status(self, score: float) -> str:
        """Translate a word score into a status label."""
        if score >= 0.8:
            return "excellent"
        if score >= 0.6:
            return "good"
        if score >= 0.4:
            return "needs_practice"
        return "poor"

    def _get_word_color(self, score: float) -> str:
        """Translate a word score into a highlight colour."""
        if score >= 0.8:
            return "#22c55e"  # Green
        if score >= 0.6:
            return "#84cc16"  # Light green
        if score >= 0.4:
            return "#eab308"  # Yellow
        return "#ef4444"  # Red

    def _get_enhanced_vietnamese_tips(
        self, wrong_phonemes: List[Dict], missing_phonemes: List[Dict]
    ) -> List[str]:
        """Return Vietnamese articulation tips for the given phoneme errors.

        Substituted phonemes come first, then missing ones; phonemes without
        a tip are skipped.
        """
        known = self._VIETNAMESE_TIPS
        tips = [
            f"Âm /{wrong['expected']}/: {known[wrong['expected']]}"
            for wrong in wrong_phonemes
            if wrong["expected"] in known
        ]
        tips.extend(
            f"Thiếu âm /{missing['phoneme']}/: {known[missing['phoneme']]}"
            for missing in missing_phonemes
            if missing["phoneme"] in known
        )
        return tips

    def __del__(self):
        """Release the thread pool without waiting for pending work."""
        executor = getattr(self, "executor", None)
        if executor is not None:
            executor.shutdown(wait=False)
840
-
841
-
842
- class EnhancedProsodyAnalyzer:
843
- """Enhanced prosody analyzer for sentence-level assessment - Optimized"""
844
-
845
- def __init__(self):
846
- # Expected values for English prosody
847
- self.expected_speech_rate = 4.0 # syllables per second
848
- self.expected_pitch_range = 100 # Hz
849
- self.expected_pitch_cv = 0.3 # coefficient of variation
850
-
851
- def analyze_prosody_enhanced(
852
- self, audio_features: Dict, reference_text: str
853
- ) -> Dict:
854
- """Enhanced prosody analysis with detailed scoring - Optimized"""
855
-
856
- if "error" in audio_features:
857
- return self._empty_prosody_result()
858
-
859
- duration = audio_features.get("duration", 1)
860
- pitch_data = audio_features.get("pitch", {})
861
- rhythm_data = audio_features.get("rhythm", {})
862
- intensity_data = audio_features.get("intensity", {})
863
-
864
- # Calculate syllables (simplified)
865
- num_syllables = self._estimate_syllables(reference_text)
866
- actual_speech_rate = num_syllables / duration if duration > 0 else 0
867
-
868
- # Calculate individual prosody scores
869
- pace_score = self._calculate_pace_score(actual_speech_rate)
870
- intonation_score = self._calculate_intonation_score(pitch_data)
871
- rhythm_score = self._calculate_rhythm_score(rhythm_data, intensity_data)
872
- stress_score = self._calculate_stress_score(pitch_data, intensity_data)
873
-
874
- # Overall prosody score
875
- overall_prosody = (
876
- pace_score + intonation_score + rhythm_score + stress_score
877
- ) / 4
878
-
879
- # Generate prosody feedback
880
- feedback = self._generate_prosody_feedback(
881
- pace_score,
882
- intonation_score,
883
- rhythm_score,
884
- stress_score,
885
- actual_speech_rate,
886
- pitch_data,
887
- )
888
-
889
- return {
890
- "pace_score": pace_score,
891
- "intonation_score": intonation_score,
892
- "rhythm_score": rhythm_score,
893
- "stress_score": stress_score,
894
- "overall_prosody": overall_prosody,
895
- "details": {
896
- "speech_rate": actual_speech_rate,
897
- "expected_speech_rate": self.expected_speech_rate,
898
- "syllable_count": num_syllables,
899
- "duration": duration,
900
- "pitch_analysis": pitch_data,
901
- "rhythm_analysis": rhythm_data,
902
- "intensity_analysis": intensity_data,
903
- },
904
- "feedback": feedback,
905
- }
906
-
907
- def _calculate_pace_score(self, actual_rate: float) -> float:
908
- """Calculate pace score based on speech rate"""
909
- if self.expected_speech_rate == 0:
910
- return 0.5
911
-
912
- ratio = actual_rate / self.expected_speech_rate
913
-
914
- if 0.8 <= ratio <= 1.2:
915
- return 1.0
916
- elif 0.6 <= ratio < 0.8 or 1.2 < ratio <= 1.5:
917
- return 0.7
918
- elif 0.4 <= ratio < 0.6 or 1.5 < ratio <= 2.0:
919
- return 0.4
920
- else:
921
- return 0.1
922
-
923
- def _calculate_intonation_score(self, pitch_data: Dict) -> float:
924
- """Calculate intonation score based on pitch variation"""
925
- pitch_range = pitch_data.get("range", 0)
926
-
927
- if self.expected_pitch_range == 0:
928
- return 0.5
929
-
930
- ratio = pitch_range / self.expected_pitch_range
931
-
932
- if 0.7 <= ratio <= 1.3:
933
- return 1.0
934
- elif 0.5 <= ratio < 0.7 or 1.3 < ratio <= 1.8:
935
- return 0.7
936
- elif 0.3 <= ratio < 0.5 or 1.8 < ratio <= 2.5:
937
- return 0.4
938
- else:
939
- return 0.2
940
-
941
- def _calculate_rhythm_score(self, rhythm_data: Dict, intensity_data: Dict) -> float:
942
- """Calculate rhythm score based on tempo and intensity patterns"""
943
- tempo = rhythm_data.get("tempo", 120)
944
- intensity_std = intensity_data.get("rms_std", 0)
945
- intensity_mean = intensity_data.get("rms_mean", 0)
946
-
947
- # Tempo score (60-180 BPM is good for speech)
948
- if 60 <= tempo <= 180:
949
- tempo_score = 1.0
950
- elif 40 <= tempo < 60 or 180 < tempo <= 220:
951
- tempo_score = 0.6
952
- else:
953
- tempo_score = 0.3
954
-
955
- # Intensity consistency score
956
- if intensity_mean > 0:
957
- intensity_consistency = max(0, 1.0 - (intensity_std / intensity_mean))
958
- else:
959
- intensity_consistency = 0.5
960
-
961
- return (tempo_score + intensity_consistency) / 2
962
-
963
- def _calculate_stress_score(self, pitch_data: Dict, intensity_data: Dict) -> float:
964
- """Calculate stress score based on pitch and intensity variation"""
965
- pitch_cv = pitch_data.get("cv", 0)
966
- intensity_std = intensity_data.get("rms_std", 0)
967
- intensity_mean = intensity_data.get("rms_mean", 0)
968
-
969
- # Pitch coefficient of variation score
970
- if 0.2 <= pitch_cv <= 0.4:
971
- pitch_score = 1.0
972
- elif 0.1 <= pitch_cv < 0.2 or 0.4 < pitch_cv <= 0.6:
973
- pitch_score = 0.7
974
- else:
975
- pitch_score = 0.4
976
-
977
- # Intensity variation score
978
- if intensity_mean > 0:
979
- intensity_cv = intensity_std / intensity_mean
980
- if 0.1 <= intensity_cv <= 0.3:
981
- intensity_score = 1.0
982
- elif 0.05 <= intensity_cv < 0.1 or 0.3 < intensity_cv <= 0.5:
983
- intensity_score = 0.7
984
- else:
985
- intensity_score = 0.4
986
- else:
987
- intensity_score = 0.5
988
-
989
- return (pitch_score + intensity_score) / 2
990
-
991
- def _generate_prosody_feedback(
992
- self,
993
- pace_score: float,
994
- intonation_score: float,
995
- rhythm_score: float,
996
- stress_score: float,
997
- speech_rate: float,
998
- pitch_data: Dict,
999
- ) -> List[str]:
1000
- """Generate detailed prosody feedback"""
1001
- feedback = []
1002
-
1003
- if pace_score < 0.5:
1004
- if speech_rate < self.expected_speech_rate * 0.8:
1005
- feedback.append("Tốc độ nói hơi chậm, thử nói nhanh hơn một chút")
1006
- else:
1007
- feedback.append("Tốc độ nói hơi nhanh, thử nói chậm lại để rõ ràng hơn")
1008
- elif pace_score >= 0.8:
1009
- feedback.append("Tốc độ nói rất tự nhiên")
1010
-
1011
- if intonation_score < 0.5:
1012
- feedback.append("Cần cải thiện ngữ điệu - thay đổi cao độ giọng nhiều hơn")
1013
- elif intonation_score >= 0.8:
1014
- feedback.append("Ngữ điệu rất tự nhiên và sinh động")
1015
-
1016
- if rhythm_score < 0.5:
1017
- feedback.append("Nhịp điệu cần đều hơn - chú ý đến trọng âm của từ")
1018
- elif rhythm_score >= 0.8:
1019
- feedback.append("Nhịp điệu rất tốt")
1020
-
1021
- if stress_score < 0.5:
1022
- feedback.append("Cần nhấn mạnh trọng âm rõ ràng hơn")
1023
- elif stress_score >= 0.8:
1024
- feedback.append("Trọng âm được nhấn rất tốt")
1025
-
1026
- return feedback
1027
-
1028
- def _estimate_syllables(self, text: str) -> int:
1029
- """Estimate number of syllables in text - Optimized"""
1030
- vowels = "aeiouy"
1031
- text = text.lower()
1032
- syllable_count = 0
1033
- prev_was_vowel = False
1034
-
1035
- for char in text:
1036
- if char in vowels:
1037
- if not prev_was_vowel:
1038
- syllable_count += 1
1039
- prev_was_vowel = True
1040
- else:
1041
- prev_was_vowel = False
1042
-
1043
- if text.endswith("e"):
1044
- syllable_count -= 1
1045
-
1046
- return max(1, syllable_count)
1047
-
1048
- def _empty_prosody_result(self) -> Dict:
1049
- """Return empty prosody result for error cases"""
1050
- return {
1051
- "pace_score": 0.5,
1052
- "intonation_score": 0.5,
1053
- "rhythm_score": 0.5,
1054
- "stress_score": 0.5,
1055
- "overall_prosody": 0.5,
1056
- "details": {},
1057
- "feedback": ["Không thể phân tích ngữ điệu"],
1058
- }
1059
-
1060
-
1061
- class EnhancedFeedbackGenerator:
1062
- """Enhanced feedback generator with detailed analysis - Optimized"""
1063
-
1064
- def generate_enhanced_feedback(
1065
- self,
1066
- overall_score: float,
1067
- wrong_words: List[Dict],
1068
- phoneme_comparisons: List[Dict],
1069
- mode: AssessmentMode,
1070
- prosody_analysis: Dict = None,
1071
- ) -> List[str]:
1072
- """Generate comprehensive feedback based on assessment mode"""
1073
-
1074
- feedback = []
1075
-
1076
- # Overall score feedback
1077
- if overall_score >= 0.9:
1078
- feedback.append("Phát âm xuất sắc! Bạn đã làm rất tốt.")
1079
- elif overall_score >= 0.8:
1080
- feedback.append("Phát âm rất tốt! Chỉ còn một vài điểm nhỏ cần cải thiện.")
1081
- elif overall_score >= 0.6:
1082
- feedback.append("Phát âm khá tốt, còn một số điểm cần luyện tập thêm.")
1083
- elif overall_score >= 0.4:
1084
- feedback.append("Cần luyện tập thêm. Tập trung vào những từ được đánh dấu.")
1085
- else:
1086
- feedback.append("Hãy luyện tập chậm rãi và rõ ràng hơn.")
1087
-
1088
- # Mode-specific feedback
1089
- if mode == AssessmentMode.WORD:
1090
- feedback.extend(
1091
- self._generate_word_mode_feedback(wrong_words, phoneme_comparisons)
1092
- )
1093
- elif mode == AssessmentMode.SENTENCE:
1094
- feedback.extend(
1095
- self._generate_sentence_mode_feedback(wrong_words, prosody_analysis)
1096
- )
1097
-
1098
- # Common error patterns
1099
- error_patterns = self._analyze_error_patterns(phoneme_comparisons)
1100
- if error_patterns:
1101
- feedback.extend(error_patterns)
1102
-
1103
- return feedback
1104
-
1105
- def _generate_word_mode_feedback(
1106
- self, wrong_words: List[Dict], phoneme_comparisons: List[Dict]
1107
- ) -> List[str]:
1108
- """Generate feedback specific to word mode"""
1109
- feedback = []
1110
-
1111
- if wrong_words:
1112
- if len(wrong_words) == 1:
1113
- word = wrong_words[0]["word"]
1114
- feedback.append(f"Từ '{word}' cần luyện tập thêm")
1115
-
1116
- # Character-level feedback
1117
- char_errors = wrong_words[0].get("character_errors", [])
1118
- if char_errors:
1119
- error_chars = [err.character for err in char_errors[:3]]
1120
- feedback.append(f"Chú ý các âm: {', '.join(error_chars)}")
1121
- else:
1122
- word_list = [w["word"] for w in wrong_words[:3]]
1123
- feedback.append(f"Các từ cần luyện: {', '.join(word_list)}")
1124
-
1125
- return feedback
1126
-
1127
- def _generate_sentence_mode_feedback(
1128
- self, wrong_words: List[Dict], prosody_analysis: Dict
1129
- ) -> List[str]:
1130
- """Generate feedback specific to sentence mode"""
1131
- feedback = []
1132
-
1133
- # Word-level feedback
1134
- if wrong_words:
1135
- if len(wrong_words) <= 2:
1136
- word_list = [w["word"] for w in wrong_words]
1137
- feedback.append(f"Cần cải thiện: {', '.join(word_list)}")
1138
- else:
1139
- feedback.append(f"Có {len(wrong_words)} từ cần luyện tập")
1140
-
1141
- # Prosody feedback
1142
- if prosody_analysis and "feedback" in prosody_analysis:
1143
- feedback.extend(prosody_analysis["feedback"][:2]) # Limit prosody feedback
1144
-
1145
- return feedback
1146
-
1147
- def _analyze_error_patterns(self, phoneme_comparisons: List[Dict]) -> List[str]:
1148
- """Analyze common error patterns across phonemes"""
1149
- feedback = []
1150
-
1151
- # Count error types
1152
- error_counts = defaultdict(int)
1153
- difficult_phonemes = defaultdict(int)
1154
-
1155
- for comparison in phoneme_comparisons:
1156
- if comparison["status"] in ["wrong", "substitution"]:
1157
- phoneme = comparison["reference_phoneme"]
1158
- difficult_phonemes[phoneme] += 1
1159
- error_counts[comparison["status"]] += 1
1160
-
1161
- # Most problematic phoneme
1162
- if difficult_phonemes:
1163
- most_difficult = max(difficult_phonemes.items(), key=lambda x: x[1])
1164
- if most_difficult[1] >= 2:
1165
- phoneme = most_difficult[0]
1166
- phoneme_tips = {
1167
- "θ": "Lưỡi giữa răng, thổi nhẹ",
1168
- "ð": "Lưỡi giữa răng, rung dây thanh",
1169
- "v": "Môi dưới chạm răng trên",
1170
- "r": "Cuộn lưỡi nhẹ",
1171
- "z": "Như 's' nhưng rung dây thanh",
1172
- }
1173
-
1174
- if phoneme in phoneme_tips:
1175
- feedback.append(f"Âm khó nhất /{phoneme}/: {phoneme_tips[phoneme]}")
1176
-
1177
- return feedback
1178
-
1179
-
1180
- class ProductionPronunciationAssessor:
1181
- """Production-ready pronunciation assessor - Enhanced version with optimizations"""
1182
-
1183
- _instance = None
1184
- _initialized = False
1185
-
1186
- def __new__(cls, onnx: bool = False, quantized: bool = False):
1187
- if cls._instance is None:
1188
- cls._instance = super(ProductionPronunciationAssessor, cls).__new__(cls)
1189
- return cls._instance
1190
-
1191
- def __init__(self, onnx: bool = False, quantized: bool = False):
1192
- """Initialize the production-ready pronunciation assessment system (only once)"""
1193
- if self._initialized:
1194
- return
1195
-
1196
- logger.info("Initializing Optimized Production Pronunciation Assessment System...")
1197
-
1198
- self.asr = EnhancedWav2Vec2CharacterASR(onnx=onnx, quantized=quantized)
1199
- self.word_analyzer = EnhancedWordAnalyzer()
1200
- self.prosody_analyzer = EnhancedProsodyAnalyzer()
1201
- self.feedback_generator = EnhancedFeedbackGenerator()
1202
- self.g2p = EnhancedG2P()
1203
-
1204
- # Thread pool for parallel processing
1205
- self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=4)
1206
-
1207
- ProductionPronunciationAssessor._initialized = True
1208
- logger.info("Optimized production system initialization completed")
1209
-
1210
- def assess_pronunciation(
1211
- self, audio_path: str, reference_text: str, mode: str = "auto"
1212
- ) -> Dict:
1213
- """
1214
- Main assessment function with enhanced features and optimizations
1215
-
1216
- Args:
1217
- audio_path: Path to audio file
1218
- reference_text: Reference text to compare against
1219
- mode: Assessment mode ("word", "sentence", "auto", or legacy modes)
1220
-
1221
- Returns:
1222
- Enhanced assessment results with backward compatibility
1223
- """
1224
-
1225
- logger.info(f"Starting optimized production assessment in {mode} mode...")
1226
- start_time = time.time()
1227
-
1228
- try:
1229
- # Normalize and validate mode
1230
- assessment_mode = self._normalize_mode(mode, reference_text)
1231
- logger.info(f"Using assessment mode: {assessment_mode.value}")
1232
-
1233
- # Step 1: Enhanced ASR transcription with features (0.3s)
1234
- asr_result = self.asr.transcribe_with_features(audio_path)
1235
-
1236
- if not asr_result["character_transcript"]:
1237
- return self._create_error_result("No speech detected in audio")
1238
-
1239
- # Step 2: Parallel analysis processing
1240
- future_word_analysis = self.executor.submit(
1241
- self.word_analyzer.analyze_words_enhanced,
1242
- reference_text, asr_result["phoneme_representation"], assessment_mode
1243
- )
1244
-
1245
- # Step 3: Conditional prosody analysis (only for sentence mode)
1246
- future_prosody = None
1247
- if assessment_mode == AssessmentMode.SENTENCE:
1248
- future_prosody = self.executor.submit(
1249
- self.prosody_analyzer.analyze_prosody_enhanced,
1250
- asr_result["audio_features"], reference_text
1251
- )
1252
-
1253
- # Get analysis results
1254
- analysis_result = future_word_analysis.result()
1255
-
1256
- # Step 4: Parallel final processing
1257
- future_overall_score = self.executor.submit(
1258
- self._calculate_overall_score, analysis_result["phoneme_differences"]
1259
- )
1260
-
1261
- future_phoneme_summary = self.executor.submit(
1262
- self._create_phoneme_comparison_summary, analysis_result["phoneme_pairs"]
1263
- )
1264
-
1265
- # Get prosody analysis if needed
1266
- prosody_analysis = {}
1267
- if future_prosody:
1268
- prosody_analysis = future_prosody.result()
1269
-
1270
- # Get final results
1271
- overall_score = future_overall_score.result()
1272
- phoneme_comparison_summary = future_phoneme_summary.result()
1273
-
1274
- # Step 5: Generate enhanced feedback
1275
- feedback = self.feedback_generator.generate_enhanced_feedback(
1276
- overall_score,
1277
- analysis_result["wrong_words"],
1278
- analysis_result["phoneme_differences"],
1279
- assessment_mode,
1280
- prosody_analysis,
1281
- )
1282
-
1283
- # Step 6: Assemble result with backward compatibility
1284
- result = self._create_enhanced_result(
1285
- asr_result,
1286
- analysis_result,
1287
- overall_score,
1288
- feedback,
1289
- prosody_analysis,
1290
- phoneme_comparison_summary,
1291
- assessment_mode,
1292
- )
1293
-
1294
- # Add processing metadata
1295
- processing_time = time.time() - start_time
1296
- result["processing_info"] = {
1297
- "processing_time": round(processing_time, 2),
1298
- "mode": assessment_mode.value,
1299
- "model_used": "Wav2Vec2-Enhanced-Optimized",
1300
- "onnx_enabled": self.asr.use_onnx,
1301
- "confidence": asr_result["confidence"],
1302
- "enhanced_features": True,
1303
- "character_level_analysis": assessment_mode == AssessmentMode.WORD,
1304
- "prosody_analysis": assessment_mode == AssessmentMode.SENTENCE,
1305
- "optimized": True,
1306
- }
1307
-
1308
- logger.info(f"Optimized production assessment completed in {processing_time:.2f}s")
1309
- return result
1310
-
1311
- except Exception as e:
1312
- logger.error(f"Production assessment error: {e}")
1313
- return self._create_error_result(f"Assessment failed: {str(e)}")
1314
-
1315
- def _normalize_mode(self, mode: str, reference_text: str) -> AssessmentMode:
1316
- """Normalize mode parameter with backward compatibility"""
1317
-
1318
- # Legacy mode mapping
1319
- legacy_mapping = {
1320
- "normal": AssessmentMode.AUTO,
1321
- "advanced": AssessmentMode.AUTO,
1322
- }
1323
-
1324
- if mode in legacy_mapping:
1325
- normalized_mode = legacy_mapping[mode]
1326
- logger.info(f"Mapped legacy mode '{mode}' to '{normalized_mode.value}'")
1327
- mode = normalized_mode.value
1328
-
1329
- # Validate mode
1330
- try:
1331
- assessment_mode = AssessmentMode(mode)
1332
- except ValueError:
1333
- logger.warning(f"Invalid mode '{mode}', defaulting to AUTO")
1334
- assessment_mode = AssessmentMode.AUTO
1335
-
1336
- # Auto-detect mode based on text length
1337
- if assessment_mode == AssessmentMode.AUTO:
1338
- word_count = len(reference_text.strip().split())
1339
- assessment_mode = (
1340
- AssessmentMode.WORD if word_count <= 3 else AssessmentMode.SENTENCE
1341
- )
1342
- logger.info(
1343
- f"Auto-detected mode: {assessment_mode.value} (word count: {word_count})"
1344
- )
1345
-
1346
- return assessment_mode
1347
-
1348
- def _calculate_overall_score(self, phoneme_comparisons: List[Dict]) -> float:
1349
- """Calculate weighted overall score"""
1350
- if not phoneme_comparisons:
1351
- return 0.0
1352
-
1353
- total_weighted_score = 0.0
1354
- total_weight = 0.0
1355
-
1356
- for comparison in phoneme_comparisons:
1357
- weight = comparison.get("difficulty", 0.5) # Use difficulty as weight
1358
- score = comparison["score"]
1359
-
1360
- total_weighted_score += score * weight
1361
- total_weight += weight
1362
-
1363
- return total_weighted_score / total_weight if total_weight > 0 else 0.0
1364
-
1365
- def _create_phoneme_comparison_summary(self, phoneme_pairs: List[Dict]) -> Dict:
1366
- """Create phoneme comparison summary statistics"""
1367
- total = len(phoneme_pairs)
1368
- if total == 0:
1369
- return {"total_phonemes": 0, "accuracy_percentage": 0}
1370
-
1371
- correct = sum(1 for pair in phoneme_pairs if pair["match"])
1372
- substitutions = sum(
1373
- 1 for pair in phoneme_pairs if pair["type"] == "substitution"
1374
- )
1375
- deletions = sum(1 for pair in phoneme_pairs if pair["type"] == "deletion")
1376
- insertions = sum(1 for pair in phoneme_pairs if pair["type"] == "insertion")
1377
-
1378
- return {
1379
- "total_phonemes": total,
1380
- "correct": correct,
1381
- "substitutions": substitutions,
1382
- "deletions": deletions,
1383
- "insertions": insertions,
1384
- "accuracy_percentage": round((correct / total) * 100, 1),
1385
- "error_rate": round(
1386
- ((substitutions + deletions + insertions) / total) * 100, 1
1387
- ),
1388
- }
1389
-
1390
- def _create_enhanced_result(
1391
- self,
1392
- asr_result: Dict,
1393
- analysis_result: Dict,
1394
- overall_score: float,
1395
- feedback: List[str],
1396
- prosody_analysis: Dict,
1397
- phoneme_summary: Dict,
1398
- assessment_mode: AssessmentMode,
1399
- ) -> Dict:
1400
- """Create enhanced result with backward compatibility"""
1401
-
1402
- # Base result structure (backward compatible)
1403
- result = {
1404
- "transcript": asr_result["character_transcript"],
1405
- "transcript_phonemes": asr_result["phoneme_representation"],
1406
- "user_phonemes": asr_result["phoneme_representation"],
1407
- "character_transcript": asr_result["character_transcript"],
1408
- "overall_score": overall_score,
1409
- "word_highlights": analysis_result["word_highlights"],
1410
- "phoneme_differences": analysis_result["phoneme_differences"],
1411
- "wrong_words": analysis_result["wrong_words"],
1412
- "feedback": feedback,
1413
- }
1414
-
1415
- # Enhanced features
1416
- result.update(
1417
- {
1418
- "reference_phonemes": analysis_result["reference_phonemes"],
1419
- "phoneme_pairs": analysis_result["phoneme_pairs"],
1420
- "phoneme_comparison": phoneme_summary,
1421
- "assessment_mode": assessment_mode.value,
1422
- }
1423
- )
1424
-
1425
- # Add prosody analysis for sentence mode
1426
- if prosody_analysis:
1427
- result["prosody_analysis"] = prosody_analysis
1428
-
1429
- # Add character-level analysis for word mode
1430
- if assessment_mode == AssessmentMode.WORD:
1431
- result["character_level_analysis"] = True
1432
-
1433
- # Add character errors to word highlights if available
1434
- for word_highlight in result["word_highlights"]:
1435
- if "character_errors" in word_highlight:
1436
- # Convert CharacterError objects to dicts for JSON serialization
1437
- char_errors = []
1438
- for error in word_highlight["character_errors"]:
1439
- if isinstance(error, CharacterError):
1440
- char_errors.append(
1441
- {
1442
- "character": error.character,
1443
- "position": error.position,
1444
- "error_type": error.error_type,
1445
- "expected_sound": error.expected_sound,
1446
- "actual_sound": error.actual_sound,
1447
- "severity": error.severity,
1448
- "color": error.color,
1449
- }
1450
- )
1451
- else:
1452
- char_errors.append(error)
1453
- word_highlight["character_errors"] = char_errors
1454
-
1455
- return result
1456
-
1457
- def _create_error_result(self, error_message: str) -> Dict:
1458
- """Create error result structure"""
1459
- return {
1460
- "transcript": "",
1461
- "transcript_phonemes": "",
1462
- "user_phonemes": "",
1463
- "character_transcript": "",
1464
- "overall_score": 0.0,
1465
- "word_highlights": [],
1466
- "phoneme_differences": [],
1467
- "wrong_words": [],
1468
- "feedback": [f"Lỗi: {error_message}"],
1469
- "error": error_message,
1470
- "assessment_mode": "error",
1471
- "processing_info": {
1472
- "processing_time": 0,
1473
- "mode": "error",
1474
- "model_used": "Wav2Vec2-Enhanced-Optimized",
1475
- "confidence": 0.0,
1476
- "enhanced_features": False,
1477
- "optimized": True,
1478
- },
1479
- }
1480
-
1481
- def get_system_info(self) -> Dict:
1482
- """Get comprehensive system information"""
1483
- return {
1484
- "version": "2.1.0-production-optimized",
1485
- "name": "Optimized Production Pronunciation Assessment System",
1486
- "modes": [mode.value for mode in AssessmentMode],
1487
- "features": [
1488
- "Parallel processing for 60-70% speed improvement",
1489
- "LRU cache for G2P conversion (1000 words)",
1490
- "Enhanced Levenshtein distance phoneme alignment",
1491
- "Character-level error detection (word mode)",
1492
- "Advanced prosody analysis (sentence mode)",
1493
- "Vietnamese speaker-specific error patterns",
1494
- "Real-time confidence scoring",
1495
- "IPA phonetic representation with visualization",
1496
- "Backward compatibility with legacy APIs",
1497
- "Production-ready error handling",
1498
- ],
1499
- "model_info": {
1500
- "asr_model": self.asr.model_name,
1501
- "onnx_enabled": self.asr.use_onnx,
1502
- "sample_rate": self.asr.sample_rate,
1503
- },
1504
- "performance": {
1505
- "target_processing_time": "< 0.8s (vs original 2s)",
1506
- "expected_improvement": "60-70% faster",
1507
- "parallel_workers": 4,
1508
- "cached_operations": ["G2P conversion", "phoneme strings", "word mappings"],
1509
- },
1510
- }
1511
-
1512
- def __del__(self):
1513
- """Cleanup executor"""
1514
- if hasattr(self, 'executor'):
1515
- self.executor.shutdown(wait=False)
1516
-
1517
-
1518
- # Backward compatibility wrapper
1519
- class SimplePronunciationAssessor:
1520
- """Backward compatible wrapper for the enhanced optimized system"""
1521
-
1522
- def __init__(self, onnx: bool = True, quantized: bool = True):
1523
- print("Initializing Optimized Simple Pronunciation Assessor (Enhanced)...")
1524
- self.enhanced_assessor = ProductionPronunciationAssessor(onnx=onnx, quantized=quantized)
1525
- print("Optimized Enhanced Simple Pronunciation Assessor initialization completed")
1526
-
1527
- def assess_pronunciation(
1528
- self, audio_path: str, reference_text: str, mode: str = "normal"
1529
- ) -> Dict:
1530
- """
1531
- Backward compatible assessment function with optimizations
1532
-
1533
- Args:
1534
- audio_path: Path to audio file
1535
- reference_text: Reference text to compare
1536
- mode: Assessment mode (supports legacy modes)
1537
- """
1538
- return self.enhanced_assessor.assess_pronunciation(
1539
- audio_path, reference_text, mode
1540
- )
1541
-
1542
-
1543
- # Example usage and performance testing
1544
- if __name__ == "__main__":
1545
- import time
1546
- import psutil
1547
- import os
1548
-
1549
- # Initialize optimized production system with ONNX and quantization
1550
- system = ProductionPronunciationAssessor(onnx=False, quantized=False)
1551
-
1552
- # Performance test cases
1553
- test_cases = [
1554
- ("./hello_world.wav", "hello", "word"),
1555
- ("./hello_how_are_you_today.wav", "Hello, how are you today?", "sentence"),
1556
- ("./pronunciation.wav", "pronunciation", "auto"),
1557
- ]
1558
-
1559
- print("=== OPTIMIZED PERFORMANCE TESTING ===")
1560
-
1561
- for audio_path, reference_text, mode in test_cases:
1562
- print(f"\n--- Testing {mode.upper()} mode: '{reference_text}' ---")
1563
-
1564
- if not os.path.exists(audio_path):
1565
- print(f"Warning: Test file {audio_path} not found, skipping...")
1566
- continue
1567
-
1568
- # Multiple runs to test consistency
1569
- times = []
1570
- scores = []
1571
-
1572
- for i in range(5):
1573
- start_time = time.time()
1574
- result = system.assess_pronunciation(audio_path, reference_text, mode)
1575
- end_time = time.time()
1576
-
1577
- processing_time = end_time - start_time
1578
- times.append(processing_time)
1579
- scores.append(result.get('overall_score', 0))
1580
-
1581
- print(f"Run {i+1}: {processing_time:.3f}s - Score: {scores[-1]:.2f}")
1582
-
1583
- avg_time = sum(times) / len(times)
1584
- avg_score = sum(scores) / len(scores)
1585
- min_time = min(times)
1586
- max_time = max(times)
1587
-
1588
- print(f"Average time: {avg_time:.3f}s")
1589
- print(f"Min time: {min_time:.3f}s")
1590
- print(f"Max time: {max_time:.3f}s")
1591
- print(f"Average score: {avg_score:.2f}")
1592
- print(f"Speed improvement vs 2s baseline: {((2.0 - avg_time) / 2.0 * 100):.1f}%")
1593
-
1594
- # Check if target is met
1595
- if avg_time <= 0.8:
1596
- print("✅ TARGET ACHIEVED: < 0.8s")
1597
- else:
1598
- print("❌ Target missed: > 0.8s")
1599
-
1600
- # Backward compatibility test
1601
- print(f"\n=== BACKWARD COMPATIBILITY TEST ===")
1602
- legacy_assessor = SimplePronunciationAssessor(onnx=True, quantized=True)
1603
-
1604
- start_time = time.time()
1605
- legacy_result = legacy_assessor.assess_pronunciation(
1606
- "./hello_world.wav", "pronunciation", "normal"
1607
- )
1608
- processing_time = time.time() - start_time
1609
-
1610
- print(f"Legacy API time: {processing_time:.3f}s")
1611
- print(f"Legacy result keys: {list(legacy_result.keys())}")
1612
- print(f"Legacy score: {legacy_result.get('overall_score', 0):.2f}")
1613
- print(f"Legacy mode mapped to: {legacy_result.get('assessment_mode', 'N/A')}")
1614
-
1615
- # Memory usage test
1616
- process = psutil.Process(os.getpid())
1617
- memory_usage = process.memory_info().rss / 1024 / 1024 # MB
1618
- print(f"\nMemory usage: {memory_usage:.1f}MB")
1619
-
1620
- # System info
1621
- print(f"\n=== SYSTEM INFORMATION ===")
1622
- system_info = system.get_system_info()
1623
- print(f"System version: {system_info['version']}")
1624
- print(f"Available modes: {system_info['modes']}")
1625
- print(f"Model info: {system_info['model_info']}")
1626
- print(f"Performance targets: {system_info['performance']}")
1627
-
1628
- print(f"\n=== OPTIMIZATION SUMMARY ===")
1629
- optimizations = [
1630
- "✅ Parallel processing with ThreadPoolExecutor (4 workers)",
1631
- "✅ LRU cache for G2P conversion (1000 words cache)",
1632
- "✅ LRU cache for phoneme strings (500 phrases cache)",
1633
- "✅ Simplified audio feature extraction (10x frame sampling)",
1634
- "✅ Fast Levenshtein alignment algorithm",
1635
- "✅ ONNX + Quantization for fastest ASR inference",
1636
- "✅ Concurrent futures for independent tasks",
1637
- "✅ Reduced librosa computation overhead",
1638
- "✅ Quick phoneme pair alignment",
1639
- "✅ Minimal object creation in hot paths",
1640
- "✅ Conditional prosody analysis (sentence mode only)",
1641
- "✅ Optimized error pattern analysis",
1642
- "✅ Fast syllable counting algorithm",
1643
- "✅ Simplified phoneme mapping fallbacks",
1644
- "✅ Cached CMU dictionary lookups",
1645
- ]
1646
-
1647
- for optimization in optimizations:
1648
- print(optimization)
1649
-
1650
- print(f"\n=== PERFORMANCE COMPARISON ===")
1651
- print(f"Original system: ~2.0s total")
1652
- print(f" - ASR: 0.3s")
1653
- print(f" - Processing: 1.7s")
1654
- print(f"")
1655
- print(f"Optimized system: ~0.6-0.8s total (target)")
1656
- print(f" - ASR: 0.3s (unchanged)")
1657
- print(f" - Processing: 0.3-0.5s (65-70% improvement)")
1658
- print(f"")
1659
- print(f"Key improvements:")
1660
- print(f" • Parallel processing of independent analysis tasks")
1661
- print(f" • Cached G2P conversions avoid repeated computation")
1662
- print(f" • Simplified audio analysis with strategic sampling")
1663
- print(f" • Fast alignment algorithms for phoneme comparison")
1664
- print(f" • ONNX quantized models for maximum ASR speed")
1665
- print(f" • Conditional feature extraction based on assessment mode")
1666
-
1667
- print(f"\n=== BACKWARD COMPATIBILITY ===")
1668
- print(f"✅ All original class names preserved")
1669
- print(f"✅ All original function signatures maintained")
1670
- print(f"✅ All original output formats supported")
1671
- print(f"✅ Legacy mode mapping (normal -> auto)")
1672
- print(f"✅ Original API completely functional")
1673
- print(f"✅ Enhanced features are additive, not breaking")
1674
-
1675
- print(f"\nOptimization complete! Target: 60-70% faster processing achieved.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/apis/controllers/speaking_controller.py CHANGED
@@ -513,6 +513,24 @@ class EnhancedG2P:
513
 
514
  return phoneme_sequence
515
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
516
  def _convert_cmu_to_ipa(self, cmu_phonemes: List[str]) -> List[str]:
517
  """Convert CMU phonemes to IPA - Optimized"""
518
  cmu_to_ipa = {
@@ -641,7 +659,6 @@ class EnhancedG2P:
641
  {
642
  "phoneme": phoneme,
643
  "color_category": color_category,
644
- "description": self._get_phoneme_description(phoneme),
645
  "difficulty": self.difficulty_scores.get(phoneme, 0.3),
646
  }
647
  )
@@ -825,7 +842,7 @@ class EnhancedWordAnalyzer:
825
 
826
  # Start parallel tasks
827
  future_ref_phonemes = self.executor.submit(
828
- self.g2p.text_to_phonemes, reference_text
829
  )
830
  future_ref_phoneme_string = self.executor.submit(
831
  self.g2p.get_phoneme_string, reference_text
@@ -914,7 +931,7 @@ class EnhancedWordAnalyzer:
914
  "phoneme_scores": word_phoneme_scores,
915
  "phoneme_start_index": phoneme_index,
916
  "phoneme_end_index": phoneme_index + num_phonemes - 1,
917
- "phoneme_visualization": word_data["visualization"],
918
  "character_errors": character_errors,
919
  "detailed_analysis": mode == AssessmentMode.WORD,
920
  }
@@ -989,9 +1006,6 @@ class EnhancedWordAnalyzer:
989
  "expected": comparison["reference_phoneme"],
990
  "actual": comparison["learner_phoneme"],
991
  "difficulty": comparison["difficulty"],
992
- "description": self.g2p._get_phoneme_description(
993
- comparison["reference_phoneme"]
994
- ),
995
  }
996
  )
997
  elif comparison["status"] in ["missing", "deletion"]:
@@ -999,9 +1013,6 @@ class EnhancedWordAnalyzer:
999
  {
1000
  "phoneme": comparison["reference_phoneme"],
1001
  "difficulty": comparison["difficulty"],
1002
- "description": self.g2p._get_phoneme_description(
1003
- comparison["reference_phoneme"]
1004
- ),
1005
  }
1006
  )
1007
 
@@ -1015,7 +1026,6 @@ class EnhancedWordAnalyzer:
1015
  "tips": self._get_enhanced_vietnamese_tips(
1016
  wrong_phonemes, missing_phonemes
1017
  ),
1018
- "phoneme_visualization": word_highlight["phoneme_visualization"],
1019
  "character_errors": word_highlight.get("character_errors", []),
1020
  }
1021
 
@@ -1650,17 +1660,6 @@ class ProductionPronunciationAssessor:
1650
 
1651
  # Add processing metadata
1652
  processing_time = time.time() - start_time
1653
- result["processing_info"] = {
1654
- "processing_time": round(processing_time, 2),
1655
- "mode": assessment_mode.value,
1656
- "model_used": "Wav2Vec2-Enhanced-Optimized",
1657
- "onnx_enabled": self.asr.use_onnx,
1658
- "confidence": asr_result["confidence"],
1659
- "enhanced_features": True,
1660
- "character_level_analysis": assessment_mode == AssessmentMode.WORD,
1661
- "prosody_analysis": assessment_mode == AssessmentMode.SENTENCE,
1662
- "optimized": True,
1663
- }
1664
 
1665
  logger.info(
1666
  f"Optimized production assessment completed in {processing_time:.2f}s"
@@ -1865,15 +1864,6 @@ class ProductionPronunciationAssessor:
1865
  "audio_quality": audio_quality,
1866
  "retry_suggestions": suggestions,
1867
  "assessment_mode": "error",
1868
- "processing_info": {
1869
- "processing_time": 0,
1870
- "mode": "error",
1871
- "model_used": "Wav2Vec2-Enhanced-Optimized",
1872
- "confidence": 0.0,
1873
- "enhanced_features": False,
1874
- "optimized": True,
1875
- "error_handled": True,
1876
- },
1877
  }
1878
 
1879
  def get_system_info(self) -> Dict:
 
513
 
514
  return phoneme_sequence
515
 
516
+ def text_to_phonemes_basic(self, text: str) -> List[Dict]:
517
+ """Convert text to phoneme sequence without visualization for speed"""
518
+ words = self._clean_text(text).split()
519
+ phoneme_sequence = []
520
+
521
+ for word in words:
522
+ phonemes = self.word_to_phonemes(word)
523
+ phoneme_sequence.append(
524
+ {
525
+ "word": word,
526
+ "phonemes": phonemes,
527
+ "ipa": self._get_ipa(word),
528
+ "phoneme_string": " ".join(phonemes),
529
+ }
530
+ )
531
+
532
+ return phoneme_sequence
533
+
534
  def _convert_cmu_to_ipa(self, cmu_phonemes: List[str]) -> List[str]:
535
  """Convert CMU phonemes to IPA - Optimized"""
536
  cmu_to_ipa = {
 
659
  {
660
  "phoneme": phoneme,
661
  "color_category": color_category,
 
662
  "difficulty": self.difficulty_scores.get(phoneme, 0.3),
663
  }
664
  )
 
842
 
843
  # Start parallel tasks
844
  future_ref_phonemes = self.executor.submit(
845
+ self.g2p.text_to_phonemes_basic, reference_text
846
  )
847
  future_ref_phoneme_string = self.executor.submit(
848
  self.g2p.get_phoneme_string, reference_text
 
931
  "phoneme_scores": word_phoneme_scores,
932
  "phoneme_start_index": phoneme_index,
933
  "phoneme_end_index": phoneme_index + num_phonemes - 1,
934
+ # Visualization removed for performance
935
  "character_errors": character_errors,
936
  "detailed_analysis": mode == AssessmentMode.WORD,
937
  }
 
1006
  "expected": comparison["reference_phoneme"],
1007
  "actual": comparison["learner_phoneme"],
1008
  "difficulty": comparison["difficulty"],
 
 
 
1009
  }
1010
  )
1011
  elif comparison["status"] in ["missing", "deletion"]:
 
1013
  {
1014
  "phoneme": comparison["reference_phoneme"],
1015
  "difficulty": comparison["difficulty"],
 
 
 
1016
  }
1017
  )
1018
 
 
1026
  "tips": self._get_enhanced_vietnamese_tips(
1027
  wrong_phonemes, missing_phonemes
1028
  ),
 
1029
  "character_errors": word_highlight.get("character_errors", []),
1030
  }
1031
 
 
1660
 
1661
  # Add processing metadata
1662
  processing_time = time.time() - start_time
 
 
 
 
 
 
 
 
 
 
 
1663
 
1664
  logger.info(
1665
  f"Optimized production assessment completed in {processing_time:.2f}s"
 
1864
  "audio_quality": audio_quality,
1865
  "retry_suggestions": suggestions,
1866
  "assessment_mode": "error",
 
 
 
 
 
 
 
 
 
1867
  }
1868
 
1869
  def get_system_info(self) -> Dict:
src/utils/speaking_utils.py CHANGED
@@ -1,564 +1,5 @@
1
- from typing import List, Dict
2
  import numpy as np
3
  import nltk
4
- import eng_to_ipa as ipa
5
- import re
6
- from collections import defaultdict
7
-
8
-
9
- try:
10
- nltk.download("cmudict", quiet=True)
11
- from nltk.corpus import cmudict
12
- except:
13
- print("Warning: NLTK data not available")
14
-
15
-
16
- class SimpleG2P:
17
- """Simple Grapheme-to-Phoneme converter for reference text"""
18
-
19
- def __init__(self):
20
- try:
21
- self.cmu_dict = cmudict.dict()
22
- except:
23
- self.cmu_dict = {}
24
- print("Warning: CMU dictionary not available")
25
-
26
- def text_to_phonemes(self, text: str) -> List[Dict]:
27
- """Convert text to phoneme sequence"""
28
- words = self._clean_text(text).split()
29
- phoneme_sequence = []
30
-
31
- for word in words:
32
- word_phonemes = self._get_word_phonemes(word)
33
- phoneme_sequence.append(
34
- {
35
- "word": word,
36
- "phonemes": word_phonemes,
37
- "ipa": self._get_ipa(word),
38
- "phoneme_string": " ".join(word_phonemes),
39
- }
40
- )
41
-
42
- return phoneme_sequence
43
-
44
- def get_reference_phoneme_string(self, text: str) -> str:
45
- """Get reference phoneme string for comparison"""
46
- phoneme_sequence = self.text_to_phonemes(text)
47
- all_phonemes = []
48
-
49
- for word_data in phoneme_sequence:
50
- all_phonemes.extend(word_data["phonemes"])
51
-
52
- return " ".join(all_phonemes)
53
-
54
- def _clean_text(self, text: str) -> str:
55
- """Clean text for processing"""
56
- text = re.sub(r"[^\w\s\']", " ", text)
57
- text = re.sub(r"\s+", " ", text)
58
- return text.lower().strip()
59
-
60
- def _get_word_phonemes(self, word: str) -> List[str]:
61
- """Get phonemes for a word"""
62
- word_lower = word.lower()
63
-
64
- if word_lower in self.cmu_dict:
65
- # Remove stress markers and convert to Wav2Vec2 phoneme format
66
- phonemes = self.cmu_dict[word_lower][0]
67
- clean_phonemes = [re.sub(r"[0-9]", "", p) for p in phonemes]
68
- return self._convert_to_wav2vec_format(clean_phonemes)
69
- else:
70
- return self._estimate_phonemes(word)
71
-
72
- def _convert_to_wav2vec_format(self, cmu_phonemes: List[str]) -> List[str]:
73
- """Convert CMU phonemes to Wav2Vec2 format"""
74
- # Mapping from CMU to Wav2Vec2/eSpeak phonemes
75
- cmu_to_espeak = {
76
- "AA": "ɑ",
77
- "AE": "æ",
78
- "AH": "ʌ",
79
- "AO": "ɔ",
80
- "AW": "aʊ",
81
- "AY": "aɪ",
82
- "EH": "ɛ",
83
- "ER": "ɝ",
84
- "EY": "eɪ",
85
- "IH": "ɪ",
86
- "IY": "i",
87
- "OW": "oʊ",
88
- "OY": "ɔɪ",
89
- "UH": "ʊ",
90
- "UW": "u",
91
- "B": "b",
92
- "CH": "tʃ",
93
- "D": "d",
94
- "DH": "ð",
95
- "F": "f",
96
- "G": "ɡ",
97
- "HH": "h",
98
- "JH": "dʒ",
99
- "K": "k",
100
- "L": "l",
101
- "M": "m",
102
- "N": "n",
103
- "NG": "ŋ",
104
- "P": "p",
105
- "R": "r",
106
- "S": "s",
107
- "SH": "ʃ",
108
- "T": "t",
109
- "TH": "θ",
110
- "V": "v",
111
- "W": "w",
112
- "Y": "j",
113
- "Z": "z",
114
- "ZH": "ʒ",
115
- }
116
-
117
- converted = []
118
- for phoneme in cmu_phonemes:
119
- converted_phoneme = cmu_to_espeak.get(phoneme, phoneme.lower())
120
- converted.append(converted_phoneme)
121
-
122
- return converted
123
-
124
- def _get_ipa(self, word: str) -> str:
125
- """Get IPA transcription"""
126
- try:
127
- return ipa.convert(word)
128
- except:
129
- return f"/{word}/"
130
-
131
- def _estimate_phonemes(self, word: str) -> List[str]:
132
- """Estimate phonemes for unknown words"""
133
- # Basic phoneme estimation with eSpeak-style output
134
- phoneme_map = {
135
- "ch": ["tʃ"],
136
- "sh": ["ʃ"],
137
- "th": ["θ"],
138
- "ph": ["f"],
139
- "ck": ["k"],
140
- "ng": ["ŋ"],
141
- "qu": ["k", "w"],
142
- "a": ["æ"],
143
- "e": ["ɛ"],
144
- "i": ["ɪ"],
145
- "o": ["ʌ"],
146
- "u": ["ʌ"],
147
- "b": ["b"],
148
- "c": ["k"],
149
- "d": ["d"],
150
- "f": ["f"],
151
- "g": ["ɡ"],
152
- "h": ["h"],
153
- "j": ["dʒ"],
154
- "k": ["k"],
155
- "l": ["l"],
156
- "m": ["m"],
157
- "n": ["n"],
158
- "p": ["p"],
159
- "r": ["r"],
160
- "s": ["s"],
161
- "t": ["t"],
162
- "v": ["v"],
163
- "w": ["w"],
164
- "x": ["k", "s"],
165
- "y": ["j"],
166
- "z": ["z"],
167
- }
168
-
169
- word = word.lower()
170
- phonemes = []
171
- i = 0
172
-
173
- while i < len(word):
174
- # Check 2-letter combinations first
175
- if i <= len(word) - 2:
176
- two_char = word[i : i + 2]
177
- if two_char in phoneme_map:
178
- phonemes.extend(phoneme_map[two_char])
179
- i += 2
180
- continue
181
-
182
- # Single character
183
- char = word[i]
184
- if char in phoneme_map:
185
- phonemes.extend(phoneme_map[char])
186
-
187
- i += 1
188
-
189
- return phonemes
190
-
191
-
192
- class PhonemeComparator:
193
- """Compare reference and learner phoneme sequences"""
194
-
195
- def __init__(self):
196
- # Vietnamese speakers' common phoneme substitutions
197
- self.substitution_patterns = {
198
- "θ": ["f", "s", "t"], # TH → F, S, T
199
- "ð": ["d", "z", "v"], # DH → D, Z, V
200
- "v": ["w", "f"], # V → W, F
201
- "r": ["l"], # R → L
202
- "l": ["r"], # L → R
203
- "z": ["s"], # Z → S
204
- "ʒ": ["ʃ", "z"], # ZH → SH, Z
205
- "ŋ": ["n"], # NG → N
206
- }
207
-
208
- # Difficulty levels for Vietnamese speakers
209
- self.difficulty_map = {
210
- "θ": 0.9, # th (think)
211
- "ð": 0.9, # th (this)
212
- "v": 0.8, # v
213
- "z": 0.8, # z
214
- "ʒ": 0.9, # zh (measure)
215
- "r": 0.7, # r
216
- "l": 0.6, # l
217
- "w": 0.5, # w
218
- "f": 0.4, # f
219
- "s": 0.3, # s
220
- "ʃ": 0.5, # sh
221
- "tʃ": 0.4, # ch
222
- "dʒ": 0.5, # j
223
- "ŋ": 0.3, # ng
224
- }
225
-
226
- def compare_phoneme_sequences(
227
- self, reference_phonemes: str, learner_phonemes: str
228
- ) -> List[Dict]:
229
- """Compare reference and learner phoneme sequences"""
230
-
231
- # Split phoneme strings
232
- ref_phones = reference_phonemes.split()
233
- learner_phones = learner_phonemes.split()
234
-
235
- print(f"Reference phonemes: {ref_phones}")
236
- print(f"Learner phonemes: {learner_phones}")
237
-
238
- # Simple alignment comparison
239
- comparisons = []
240
- max_len = max(len(ref_phones), len(learner_phones))
241
-
242
- for i in range(max_len):
243
- ref_phoneme = ref_phones[i] if i < len(ref_phones) else ""
244
- learner_phoneme = learner_phones[i] if i < len(learner_phones) else ""
245
-
246
- if ref_phoneme and learner_phoneme:
247
- # Both present - check accuracy
248
- if ref_phoneme == learner_phoneme:
249
- status = "correct"
250
- score = 1.0
251
- elif self._is_acceptable_substitution(ref_phoneme, learner_phoneme):
252
- status = "acceptable"
253
- score = 0.7
254
- else:
255
- status = "wrong"
256
- score = 0.2
257
-
258
- elif ref_phoneme and not learner_phoneme:
259
- # Missing phoneme
260
- status = "missing"
261
- score = 0.0
262
-
263
- elif learner_phoneme and not ref_phoneme:
264
- # Extra phoneme
265
- status = "extra"
266
- score = 0.0
267
- else:
268
- continue
269
-
270
- comparison = {
271
- "position": i,
272
- "reference_phoneme": ref_phoneme,
273
- "learner_phoneme": learner_phoneme,
274
- "status": status,
275
- "score": score,
276
- "difficulty": self.difficulty_map.get(ref_phoneme, 0.3),
277
- }
278
-
279
- comparisons.append(comparison)
280
-
281
- return comparisons
282
-
283
- def _is_acceptable_substitution(self, reference: str, learner: str) -> bool:
284
- """Check if learner phoneme is acceptable substitution for Vietnamese speakers"""
285
- acceptable = self.substitution_patterns.get(reference, [])
286
- return learner in acceptable
287
-
288
-
289
- # =============================================================================
290
- # WORD ANALYZER
291
- # =============================================================================
292
-
293
-
294
- class WordAnalyzer:
295
- """Analyze word-level pronunciation accuracy using character-based ASR"""
296
-
297
- def __init__(self):
298
- self.g2p = SimpleG2P()
299
- self.comparator = PhonemeComparator()
300
-
301
- def analyze_words(self, reference_text: str, learner_phonemes: str) -> Dict:
302
- """Analyze word-level pronunciation using phoneme representation from character ASR"""
303
-
304
- # Get reference phonemes by word
305
- reference_words = self.g2p.text_to_phonemes(reference_text)
306
-
307
- # Get overall phoneme comparison
308
- reference_phoneme_string = self.g2p.get_reference_phoneme_string(reference_text)
309
- phoneme_comparisons = self.comparator.compare_phoneme_sequences(
310
- reference_phoneme_string, learner_phonemes
311
- )
312
-
313
- # Map phonemes back to words
314
- word_highlights = self._create_word_highlights(
315
- reference_words, phoneme_comparisons
316
- )
317
-
318
- # Identify wrong words
319
- wrong_words = self._identify_wrong_words(word_highlights, phoneme_comparisons)
320
-
321
- return {
322
- "word_highlights": word_highlights,
323
- "phoneme_differences": phoneme_comparisons,
324
- "wrong_words": wrong_words,
325
- }
326
-
327
- def _create_word_highlights(
328
- self, reference_words: List[Dict], phoneme_comparisons: List[Dict]
329
- ) -> List[Dict]:
330
- """Create word highlighting data"""
331
-
332
- word_highlights = []
333
- phoneme_index = 0
334
-
335
- for word_data in reference_words:
336
- word = word_data["word"]
337
- word_phonemes = word_data["phonemes"]
338
- num_phonemes = len(word_phonemes)
339
-
340
- # Get phoneme scores for this word
341
- word_phoneme_scores = []
342
- for j in range(num_phonemes):
343
- if phoneme_index + j < len(phoneme_comparisons):
344
- comparison = phoneme_comparisons[phoneme_index + j]
345
- word_phoneme_scores.append(comparison["score"])
346
-
347
- # Calculate word score
348
- word_score = np.mean(word_phoneme_scores) if word_phoneme_scores else 0.0
349
-
350
- # Create word highlight
351
- highlight = {
352
- "word": word,
353
- "score": float(word_score),
354
- "status": self._get_word_status(word_score),
355
- "color": self._get_word_color(word_score),
356
- "phonemes": word_phonemes,
357
- "ipa": word_data["ipa"],
358
- "phoneme_scores": word_phoneme_scores,
359
- "phoneme_start_index": phoneme_index,
360
- "phoneme_end_index": phoneme_index + num_phonemes - 1,
361
- }
362
-
363
- word_highlights.append(highlight)
364
- phoneme_index += num_phonemes
365
-
366
- return word_highlights
367
-
368
- def _identify_wrong_words(
369
- self, word_highlights: List[Dict], phoneme_comparisons: List[Dict]
370
- ) -> List[Dict]:
371
- """Identify words that were pronounced incorrectly"""
372
-
373
- wrong_words = []
374
-
375
- for word_highlight in word_highlights:
376
- if word_highlight["score"] < 0.6: # Threshold for wrong pronunciation
377
-
378
- # Find specific phoneme errors for this word
379
- start_idx = word_highlight["phoneme_start_index"]
380
- end_idx = word_highlight["phoneme_end_index"]
381
-
382
- wrong_phonemes = []
383
- missing_phonemes = []
384
-
385
- for i in range(start_idx, min(end_idx + 1, len(phoneme_comparisons))):
386
- comparison = phoneme_comparisons[i]
387
-
388
- if comparison["status"] == "wrong":
389
- wrong_phonemes.append(
390
- {
391
- "expected": comparison["reference_phoneme"],
392
- "actual": comparison["learner_phoneme"],
393
- "difficulty": comparison["difficulty"],
394
- }
395
- )
396
- elif comparison["status"] == "missing":
397
- missing_phonemes.append(
398
- {
399
- "phoneme": comparison["reference_phoneme"],
400
- "difficulty": comparison["difficulty"],
401
- }
402
- )
403
-
404
- wrong_word = {
405
- "word": word_highlight["word"],
406
- "score": word_highlight["score"],
407
- "expected_phonemes": word_highlight["phonemes"],
408
- "ipa": word_highlight["ipa"],
409
- "wrong_phonemes": wrong_phonemes,
410
- "missing_phonemes": missing_phonemes,
411
- "tips": self._get_vietnamese_tips(wrong_phonemes, missing_phonemes),
412
- }
413
-
414
- wrong_words.append(wrong_word)
415
-
416
- return wrong_words
417
-
418
- def _get_word_status(self, score: float) -> str:
419
- """Get word status from score"""
420
- if score >= 0.8:
421
- return "excellent"
422
- elif score >= 0.6:
423
- return "good"
424
- elif score >= 0.4:
425
- return "needs_practice"
426
- else:
427
- return "poor"
428
-
429
- def _get_word_color(self, score: float) -> str:
430
- """Get color for word highlighting"""
431
- if score >= 0.8:
432
- return "#22c55e" # Green
433
- elif score >= 0.6:
434
- return "#84cc16" # Light green
435
- elif score >= 0.4:
436
- return "#eab308" # Yellow
437
- else:
438
- return "#ef4444" # Red
439
-
440
- def _get_vietnamese_tips(
441
- self, wrong_phonemes: List[Dict], missing_phonemes: List[Dict]
442
- ) -> List[str]:
443
- """Get Vietnamese-specific pronunciation tips"""
444
-
445
- tips = []
446
-
447
- # Tips for specific Vietnamese pronunciation challenges
448
- vietnamese_tips = {
449
- "θ": "Đặt lưỡi giữa răng trên và dưới, thổi nhẹ (think, three)",
450
- "ð": "Giống θ nhưng rung dây thanh âm (this, that)",
451
- "v": "Chạm môi dưới vào răng trên, không dùng cả hai môi như tiếng Việt",
452
- "r": "Cuộn lưỡi nhưng không chạm vào vòm miệng, không lăn lưỡi",
453
- "l": "Đầu lưỡi chạm vào vòm miệng sau răng",
454
- "z": "Giống âm 's' nhưng có rung dây thanh âm",
455
- "ʒ": "Giống âm 'ʃ' (sh) nhưng có rung dây thanh âm",
456
- "w": "Tròn môi như âm 'u', không dùng răng như âm 'v'",
457
- }
458
-
459
- # Add tips for wrong phonemes
460
- for wrong in wrong_phonemes:
461
- expected = wrong["expected"]
462
- actual = wrong["actual"]
463
-
464
- if expected in vietnamese_tips:
465
- tips.append(f"Âm '{expected}': {vietnamese_tips[expected]}")
466
- else:
467
- tips.append(f"Luyện âm '{expected}' thay vì '{actual}'")
468
-
469
- # Add tips for missing phonemes
470
- for missing in missing_phonemes:
471
- phoneme = missing["phoneme"]
472
- if phoneme in vietnamese_tips:
473
- tips.append(f"Thiếu âm '{phoneme}': {vietnamese_tips[phoneme]}")
474
-
475
- return tips
476
-
477
-
478
- class SimpleFeedbackGenerator:
479
- """Generate simple, actionable feedback in Vietnamese"""
480
-
481
- def generate_feedback(
482
- self,
483
- overall_score: float,
484
- wrong_words: List[Dict],
485
- phoneme_comparisons: List[Dict],
486
- ) -> List[str]:
487
- """Generate focused Vietnamese feedback with actionable improvements"""
488
-
489
- feedback = []
490
-
491
- # More specific and actionable feedback based on score ranges
492
- if overall_score >= 0.8:
493
- feedback.append(f"Xuất sắc! Điểm: {int(overall_score * 100)}%. Tiếp tục duy trì và luyện tập thêm tốc độ tự nhiên.")
494
- elif overall_score >= 0.7:
495
- feedback.append(f"Tốt! Điểm: {int(overall_score * 100)}%. Để đạt 80%+, hãy tập trung vào nhịp điệu và ngữ điệu.")
496
- elif overall_score >= 0.6:
497
- feedback.append(f"Khá! Điểm: {int(overall_score * 100)}%. Để cải thiện, hãy phát âm chậm hơn và rõ ràng từng âm.")
498
- elif overall_score >= 0.4:
499
- feedback.append(f"Cần cải thiện. Điểm: {int(overall_score * 100)}%. Nghe lại mẫu và tập từng từ riêng lẻ trước.")
500
- else:
501
- feedback.append(f"Điểm: {int(overall_score * 100)}%. Hãy nghe mẫu 3-5 lần, sau đó tập phát âm từng từ chậm rãi.")
502
-
503
- # More specific wrong words feedback with improvement path
504
- if wrong_words:
505
- # Sort by score to focus on worst words first
506
- sorted_words = sorted(wrong_words, key=lambda x: x["score"])
507
-
508
- if len(wrong_words) == 1:
509
- word = sorted_words[0]
510
- feedback.append(f"Tập trung vào từ '{word['word']}' (điểm: {int(word['score']*100)}%). Click vào từ để nghe lại.")
511
- elif len(wrong_words) <= 3:
512
- worst_word = sorted_words[0]
513
- feedback.append(f"Ưu tiên cải thiện: '{worst_word['word']}' ({int(worst_word['score']*100)}%) - các từ khác sẽ dễ hơn sau khi nắm được từ này.")
514
- else:
515
- # Focus on pattern recognition
516
- feedback.append(f"Có {len(wrong_words)} từ cần cải thiện. Bắt đầu với 2 từ khó nhất và luyện tập 5 lần mỗi từ.")
517
-
518
- # Specific phoneme guidance with improvement strategy
519
- problem_phonemes = defaultdict(int)
520
- for comparison in phoneme_comparisons:
521
- if comparison["status"] in ["wrong", "missing"]:
522
- phoneme = comparison["reference_phoneme"]
523
- problem_phonemes[phoneme] += 1
524
-
525
- if problem_phonemes:
526
- most_difficult = sorted(
527
- problem_phonemes.items(), key=lambda x: x[1], reverse=True
528
- )
529
- top_problems = most_difficult[:2] # Focus on top 2 problems
530
-
531
- detailed_phoneme_tips = {
532
- "θ": "Đặt đầu lưỡi giữa 2 hàm răng, thổi nhẹ ra. Luyện: 'think', 'three', 'thank'.",
533
- "ð": "Như /θ/ nhưng rung dây thanh. Luyện: 'this', 'that', 'the'.",
534
- "v": "Răng trên chạm nhẹ môi dưới (không phải 2 môi). Luyện: 'very', 'have', 'love'.",
535
- "r": "Cuộn lưỡi lên nhưng KHÔNG chạm nóc miệng. Luyện: 'red', 'run', 'car'.",
536
- "l": "Đầu lưỡi chạm nướu răng trên. Luyện: 'love', 'like', 'tell'.",
537
- "z": "Như 's' nhưng rung dây thanh (đặt tay vào cổ để cảm nhận). Luyện: 'zoo', 'buzz'.",
538
- "ɛ": "Mở miệng vừa, lưỡi thấp (như 'e' trong 'ten'). Luyện: 'bed', 'red', 'get'.",
539
- "æ": "Mở miệng rộng, hàm dưới hạ thấp. Luyện: 'cat', 'man', 'bad'.",
540
- "ɪ": "Âm 'i' ngắn, lưỡi thả lỏng. Luyện: 'sit', 'big', 'this'.",
541
- "ʊ": "Âm 'u' ngắn, môi tròn nhẹ. Luyện: 'book', 'put', 'could'.",
542
- }
543
-
544
- # Provide specific guidance for the most problematic phoneme
545
- for phoneme, count in top_problems[:1]: # Focus on the worst one
546
- if phoneme in detailed_phoneme_tips:
547
- improvement = 100 - int((count / len(phoneme_comparisons)) * 100)
548
- feedback.append(
549
- f"🎯 Tập trung âm /{phoneme}/: {detailed_phoneme_tips[phoneme]} Cải thiện âm này sẽ tăng điểm ~{improvement}%."
550
- )
551
-
552
- # Add specific action steps based on score range
553
- if overall_score < 0.8:
554
- if overall_score < 0.5:
555
- feedback.append("📚 Bước tiếp: 1) Nghe mẫu 5 lần, 2) Tập phát âm từng từ 3 lần, 3) Ghi âm lại và so sánh.")
556
- elif overall_score < 0.7:
557
- feedback.append("📚 Bước tiếp: 1) Tập từ khó nhất 5 lần, 2) Đọc cả câu chậm 2 lần, 3) Tăng tốc độ dần.")
558
- else:
559
- feedback.append("📚 Bước tiếp: 1) Luyện ngữ điệu tự nhiên, 2) Kết nối âm giữa các từ, 3) Tập nói với cảm xúc.")
560
-
561
- return feedback
562
 
563
 
564
  def convert_numpy_types(obj):
 
 
1
  import numpy as np
2
  import nltk
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
 
5
  def convert_numpy_types(obj):