# translation.py
from transformers import pipeline
import logging
import re
from collections import Counter

logger = logging.getLogger("translation-agent")
logging.basicConfig(
    level=logging.INFO,  # Change INFO to DEBUG for more verbose logging
    format="%(asctime)s — %(name)s — %(levelname)s — %(message)s",
    force=True,
)

# Lazily loaded translation models
vi_en = None
zh_en = None


def _dedupe_repeats(s: str, n_min: int = 3, n_max: int = 7) -> str:
    """Collapse excessive repeated n-grams and repeated phrases."""
    if not s:
        return s

    # Collapse repeated spaces/newlines
    s = re.sub(r"\s+", " ", s).strip()

    # More aggressive repetition detection:
    # check for repeated word sequences (e.g. "a lot of people do not" repeated)
    words = s.split()
    if len(words) > 20:  # Only check if the text is long enough
        # Look for repeated sequences of 3-8 words, longest first
        for seq_len in range(8, 2, -1):
            if len(words) < seq_len * 3:  # Need at least 3 repetitions
                continue
            # Check each possible starting position
            for start in range(len(words) - seq_len * 2):
                sequence = words[start:start + seq_len]
                # Count how many times this sequence repeats back-to-back
                repeat_count = 1
                pos = start + seq_len
                while pos + seq_len <= len(words):
                    if words[pos:pos + seq_len] == sequence:
                        repeat_count += 1
                        pos += seq_len
                    else:
                        break
                # If we found 3+ repetitions, remove the excess and
                # keep only the first occurrence
                if repeat_count >= 3:
                    new_words = words[:start + seq_len] + words[start + seq_len * repeat_count:]
                    s = " ".join(new_words)
                    words = s.split()
                    break
            else:
                continue
            break  # Break the outer loop once a repetition was found and fixed

    # Additional cleanup for remaining patterns:
    # remove consecutive identical words (case-insensitive)
    tokens = s.split()
    out = []
    last = None
    for t in tokens:
        if last is None or t.lower() != last.lower():
            out.append(t)
        last = t
    s = " ".join(out)

    # Limit consecutive duplicate n-grams (n_max down to n_min)
    for n in range(n_max, n_min - 1, -1):
        pattern = re.compile(
            r"(\b(?:\w+\s+){%d}\w+\b)(?:\s+\1){2,}" % (n - 1),
            flags=re.IGNORECASE,
        )
        s = pattern.sub(r"\1", s)

    return s


def _normalize_and_cap(s: str, cap: int = 512) -> str:
    if not s:
        return s
    s = s.strip()
    if len(s) > cap:
        s = s[:cap]
    return s


def _is_too_repetitive(s: str, threshold: float = 0.4) -> bool:
    if not s:
        return False
    tokens = [t.lower() for t in s.split()]
    if len(tokens) < 10:
        return False
    counts = Counter(tokens)
    top = counts.most_common(1)[0][1]
    return (top / max(1, len(tokens))) >= threshold


def translate_query(text: str, lang_code: str) -> str:
    global vi_en, zh_en
    if not text or not text.strip():
        return text
    try:
        if lang_code == "vi":
            if vi_en is None:
                logger.info("[Translation] Loading Vietnamese-English model...")
                vi_en = pipeline(
                    "translation",
                    model="VietAI/envit5-translation",
                    src_lang="vi",
                    tgt_lang="en",
                    device=-1,
                )
            # Limit input length to prevent model issues
            input_text = text[:1000] if len(text) > 1000 else text
            raw = vi_en(input_text, max_length=512)[0]["translation_text"]
            cleaned = _dedupe_repeats(raw)
            norm = _normalize_and_cap(cleaned, cap=512)
            if _is_too_repetitive(norm) or len(norm.strip()) < 10:
                logger.warning("[Vi-En] Translation repetitive or too short; falling back to original text")
                return text
            logger.info(f"[Vi-En] Query in `{lang_code}` translated to: {norm[:100]}...")
            return norm
        elif lang_code == "zh":
            if zh_en is None:
                logger.info("[Translation] Loading Chinese-English model...")
                zh_en = pipeline(
                    "translation",
                    model="Helsinki-NLP/opus-mt-zh-en",
                    device=-1,
                )
            # Limit input length to prevent model issues
            input_text = text[:1000] if len(text) > 1000 else text
            raw = zh_en(input_text, max_length=512)[0]["translation_text"]
            cleaned = _dedupe_repeats(raw)
            norm = _normalize_and_cap(cleaned, cap=512)
            if _is_too_repetitive(norm) or len(norm.strip()) < 10:
                logger.warning("[Zh-En] Translation repetitive or too short; falling back to original text")
                return text
            logger.info(f"[Zh-En] Query in `{lang_code}` translated to: {norm[:100]}...")
            return norm
    except Exception as e:
        logger.error(f"[Translation] Translation failed for {lang_code}: {e}")
        return text  # Fall back to the original text
    return text
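

# Minimal usage sketch (an illustrative addition, not part of the original
# module): exercising translate_query for each supported language code. The
# first call per language downloads/loads the model, so expect a delay on a
# cold start; any lang_code other than "vi" or "zh" passes through unchanged.
if __name__ == "__main__":
    print(translate_query("Xin chào, bạn khỏe không?", "vi"))
    print(translate_query("你好，今天天气怎么样？", "zh"))
    print(translate_query("already English", "en"))  # Returned as-is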