|
|
""" |
|
|
title_matcher.py

Title-based matching system for medical templates.
|
|
""" |
|
|
|
|
|
import re |
|
|
import logging |
|
|
from typing import Optional, Tuple, List |
|
|
from dataclasses import dataclass |
|
|
from difflib import SequenceMatcher |
|
|
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
@dataclass
class TitleMatchResult:
    """Outcome of one title-based matching attempt.

    Field order is part of the constructor signature and must not change.
    """

    # Title exactly as it was passed to the matcher (typically a file name).
    transcription_title: str
    # Id of the selected template; empty string when nothing matched.
    template_id: str
    # Strategy that produced the match. The matcher in this file emits
    # 'exact', 'normalized', 'fuzzy' or 'none'.
    match_type: str
    # Score attached to the match: 1.0 for an exact match, 0.0 for no match.
    confidence: float
    # Normalized form of ``transcription_title``.
    normalized_transcription: str
    # Normalized form of ``template_id``; empty string when nothing matched.
    normalized_template: str
|
|
|
|
|
|
|
|
class TitleBasedMatcher:
    """Match transcription files to templates by comparing their titles.

    Three strategies are tried from strictest to loosest: equality of the
    normalized titles, overlap of the extracted key identifiers, and finally
    a fuzzy similarity ratio. The first strategy that succeeds wins.
    """

    # Compiled once: every lookup re-normalizes each template title, so these
    # patterns are applied many times per call.
    _EXTENSION_RE = re.compile(r'\.(txt|rtf|docx|doc)$', re.IGNORECASE)
    _PREFIX_RE = re.compile(r'^(default\.|mod\.)', re.IGNORECASE)
    _NUMBERED_SUFFIX_RE = re.compile(r'_\d+_radiologie$', re.IGNORECASE)
    _SUFFIX_RE = re.compile(r'_radiologie$', re.IGNORECASE)
    _SEPARATOR_RE = re.compile(r'[_\-\s]+')
    _KEY_RE = re.compile(r'(\d+\.[a-z]+(?:\.[A-Z]+)?)', re.IGNORECASE)

    # Minimum similarity required for a key-identifier match to be accepted.
    _KEY_MATCH_MIN_SCORE = 0.7

    def __init__(self, parser_instance):
        """Store the parser and snapshot the available template ids.

        Args:
            parser_instance: Object exposing a ``templates`` mapping whose
                keys are template ids (the original documentation names a
                MedicalTemplateParser with its database loaded).
        """
        self.parser = parser_instance
        self.template_titles = self._extract_all_template_titles()
        logger.info(f"📋 {len(self.template_titles)} titres de templates chargés")

    def _extract_all_template_titles(self) -> List[str]:
        """Return the ids of all templates known to the parser, as a list."""
        return list(self.parser.templates.keys())

    def normalize_title(self, title: str) -> str:
        """Reduce a raw title to a canonical, comparable form.

        Steps, in order: trim surrounding whitespace, drop a trailing
        ``.txt``/``.rtf``/``.docx``/``.doc`` extension, drop a leading
        ``default.`` or ``mod.`` prefix, drop a trailing ``_<digits>_radiologie``
        or ``_radiologie`` suffix, collapse runs of ``_``, ``-`` and whitespace
        into a single dot, lowercase, and strip leading/trailing dots.

        Args:
            title: Raw title (transcription file name or template id).

        Returns:
            The normalized title; may be empty.
        """
        # Trim first: the extension and prefix patterns are anchored, so stray
        # surrounding whitespace would otherwise stop them from matching.
        title = title.strip()
        title = self._EXTENSION_RE.sub('', title)
        title = self._PREFIX_RE.sub('', title)
        title = self._NUMBERED_SUFFIX_RE.sub('', title)
        title = self._SUFFIX_RE.sub('', title)
        title = self._SEPARATOR_RE.sub('.', title)
        return title.lower().strip('.')

    def extract_key_identifier(self, title: str) -> str:
        """Extract the key identifier of a title (e.g. ``6260.cherry.expertise``).

        The identifier is the first occurrence, in the normalized title, of
        ``<digits>.<letters>`` optionally followed by ``.<letters>``.

        Args:
            title: Title to analyse.

        Returns:
            The lowercased identifier, or the whole normalized title when no
            such pattern is present.
        """
        normalized = self.normalize_title(title)
        found = self._KEY_RE.search(normalized)
        if found:
            return found.group(1).lower()
        return normalized

    def calculate_similarity(self, str1: str, str2: str) -> float:
        """Return the ``difflib.SequenceMatcher`` ratio of two strings.

        Args:
            str1: First string.
            str2: Second string.

        Returns:
            Similarity score in the range [0, 1].
        """
        return SequenceMatcher(None, str1, str2).ratio()

    def find_exact_match(self, transcription_title: str) -> Optional[str]:
        """Find a template whose normalized id equals the normalized title.

        Args:
            transcription_title: Title of the transcription.

        Returns:
            The first matching template id, or None.
        """
        normalized_trans = self.normalize_title(transcription_title)
        # An empty title carries no information; never let it "equal" a
        # template whose id also normalizes to nothing.
        if not normalized_trans:
            return None

        for template_id in self.template_titles:
            if self.normalize_title(template_id) == normalized_trans:
                logger.info(f"✅ Match EXACT trouvé: {template_id}")
                return template_id
        return None

    def find_key_match(self, transcription_title: str) -> Optional[Tuple[str, float]]:
        """Find the template whose key identifier best overlaps the title's.

        A template is a candidate when one key identifier is a substring of
        the other; candidates are ranked by similarity ratio and the best one
        is accepted only if it scores at least 0.7.

        Args:
            transcription_title: Title of the transcription.

        Returns:
            ``(template_id, confidence)`` or None.
        """
        trans_key = self.extract_key_identifier(transcription_title)
        # "" is a substring of every string, so an empty key would make every
        # template a candidate; bail out instead.
        if not trans_key:
            return None

        best_match = None
        best_score = 0.0
        for template_id in self.template_titles:
            template_key = self.extract_key_identifier(template_id)
            if trans_key not in template_key and template_key not in trans_key:
                continue
            similarity = self.calculate_similarity(trans_key, template_key)
            if similarity > best_score:
                best_score = similarity
                best_match = template_id

        if best_match and best_score >= self._KEY_MATCH_MIN_SCORE:
            logger.info(f"✅ Match par CLÉ trouvé: {best_match} (score: {best_score:.3f})")
            return best_match, best_score
        return None

    def find_fuzzy_match(self, transcription_title: str, threshold: float = 0.8) -> Optional[Tuple[str, float]]:
        """Find the template whose normalized id is most similar to the title.

        Args:
            transcription_title: Title of the transcription.
            threshold: Minimum similarity ratio for a template to qualify.

        Returns:
            ``(template_id, confidence)`` or None.
        """
        normalized_trans = self.normalize_title(transcription_title)
        # Two empty strings have a ratio of 1.0; do not treat that as a match.
        if not normalized_trans:
            return None

        best_match = None
        best_score = 0.0
        for template_id in self.template_titles:
            similarity = self.calculate_similarity(
                normalized_trans, self.normalize_title(template_id)
            )
            if similarity > best_score and similarity >= threshold:
                best_score = similarity
                best_match = template_id

        if best_match:
            logger.info(f"✅ Match FUZZY trouvé: {best_match} (score: {best_score:.3f})")
            return best_match, best_score
        return None

    def match_by_title(self, transcription_title: str,
                       fuzzy_threshold: float = 0.8) -> "TitleMatchResult":
        """Run the exact -> key -> fuzzy cascade and report the outcome.

        Args:
            transcription_title: Title of the transcription file.
            fuzzy_threshold: Minimum ratio for the fuzzy stage.

        Returns:
            A TitleMatchResult. ``match_type`` is 'exact', 'normalized' (key
            identifier match), 'fuzzy', or 'none' when every stage failed; in
            the 'none' case ``template_id`` is empty and ``confidence`` is 0.0.
        """
        logger.info(f"\n{'='*80}")
        logger.info(f"🔍 MATCHING PAR TITRE: {transcription_title}")
        logger.info(f"{'='*80}")

        normalized_trans = self.normalize_title(transcription_title)
        logger.info(f"📝 Titre normalisé: {normalized_trans}")

        def build(template_id: str, match_type: str, confidence: float):
            # All four outcomes share everything except these three values.
            return TitleMatchResult(
                transcription_title=transcription_title,
                template_id=template_id,
                match_type=match_type,
                confidence=confidence,
                normalized_transcription=normalized_trans,
                normalized_template=self.normalize_title(template_id) if template_id else '',
            )

        exact_match = self.find_exact_match(transcription_title)
        if exact_match:
            return build(exact_match, 'exact', 1.0)

        key_match = self.find_key_match(transcription_title)
        if key_match:
            template_id, confidence = key_match
            return build(template_id, 'normalized', confidence)

        fuzzy_match = self.find_fuzzy_match(transcription_title, fuzzy_threshold)
        if fuzzy_match:
            template_id, confidence = fuzzy_match
            return build(template_id, 'fuzzy', confidence)

        logger.warning(f"⚠️ Aucun match par titre trouvé pour: {transcription_title}")
        return build('', 'none', 0.0)

    def display_match_result(self, result: "TitleMatchResult") -> None:
        """Print a formatted summary of a matching result to stdout.

        Args:
            result: Result to display.
        """
        separator = '=' * 80
        print(f"\n{separator}")
        print("📋 RÉSULTAT MATCHING PAR TITRE")
        print(separator)
        print(f"📄 Transcription: {result.transcription_title}")
        print(f"📝 Normalisé: {result.normalized_transcription}")
        print(f"\n{'─'*80}")

        if result.match_type != 'none':
            print(f"✅ Template trouvé: {result.template_id}")
            print(f"📝 Normalisé: {result.normalized_template}")
            print(f"🎯 Type de match: {result.match_type.upper()}")
            print(f"📊 Confiance: {result.confidence:.2%}")
        else:
            print("❌ Aucun template correspondant trouvé")
            print("💡 Le matching sémantique sera utilisé")

        print(f"{separator}\n")
|
|
|
|
|
|
|
|
class HybridMatcher:
    """Pick a template by title when possible, otherwise by semantic matching.

    A title match that is confident enough selects the template directly and
    the semantic matcher is only used to fill it; otherwise the whole job is
    delegated to the semantic matcher.
    """

    def __init__(self, parser_instance, semantic_matcher_instance):
        """Wire together the parser, the semantic matcher and a title matcher.

        Args:
            parser_instance: Template parser; must provide what
                TitleBasedMatcher needs plus ``get_template_info``.
            semantic_matcher_instance: Object providing ``match_and_fill``,
                ``fill_template_with_gpt`` and ``format_filled_template``
                (the original documentation names a TranscriptionMatcher).
        """
        self.parser = parser_instance
        self.semantic_matcher = semantic_matcher_instance
        self.title_matcher = TitleBasedMatcher(parser_instance)

        logger.info("🔄 Matcher hybride initialisé")

    def match_and_fill(self, transcription: str,
                       transcription_filename: Optional[str] = None,
                       title_confidence_threshold: float = 0.8) -> list:
        """Select a template for a transcription and fill it.

        Args:
            transcription: Text content of the transcription.
            transcription_filename: File name used for title matching; when
                None or empty the title stage is skipped.
            title_confidence_threshold: Minimum title-match confidence needed
                to use the title match instead of semantic matching.

        Returns:
            A list of MatchResult objects: the semantic matcher's results when
            the semantic path is taken, a single-element list when the title
            path is taken, or an empty list when no result could be produced.
        """
        # Kept as a function-scope import, as in the original; presumably this
        # avoids a circular import at module load — confirm before moving it.
        from transcription_matcher import MatchResult

        logger.info("\n" + "="*80)
        logger.info("🚀 MATCHING HYBRIDE (Titre + Sémantique)")
        logger.info("="*80)

        template_id = None
        match_method = "semantic"
        if transcription_filename:
            title_choice = self._select_template_by_title(
                transcription_filename, title_confidence_threshold
            )
            if title_choice is not None:
                template_id, match_method = title_choice

        if not template_id:
            return self._semantic_fallback(transcription, match_method)

        logger.info("\n📝 Étape 3: REMPLISSAGE du template")
        logger.info("─" * 80)

        template_info = self.parser.get_template_info(template_id)
        if not template_info:
            logger.error(f"❌ Template {template_id} introuvable")
            return []

        filled_data = self.semantic_matcher.fill_template_with_gpt(
            template_info.content,
            transcription
        )
        sections_filled = filled_data.get("sections", {})
        confidence = filled_data.get("confidence", 0.0)

        filled_template = self.semantic_matcher.format_filled_template(
            template_info.content,
            sections_filled
        )

        result = MatchResult(
            template_id=template_id,
            template_content=template_info.content,
            # The template was chosen by title, not by semantic similarity,
            # so the similarity slot is reported as a fixed 1.0.
            similarity_score=1.0,
            filled_template=filled_template,
            sections_filled=sections_filled,
            confidence_score=confidence,
            match_method=match_method
        )

        logger.info(f"✅ Template rempli via {match_method}")
        logger.info("="*80 + "\n")

        return [result]

    def _select_template_by_title(self, transcription_filename: str,
                                  title_confidence_threshold: float):
        """Run title matching; return (template_id, match_method) or None."""
        logger.info("\n📋 Étape 1: Matching par TITRE")
        logger.info("─" * 80)

        title_result = self.title_matcher.match_by_title(transcription_filename)
        self.title_matcher.display_match_result(title_result)

        accepted = (
            title_result.match_type != 'none'
            and title_result.confidence >= title_confidence_threshold
        )
        if not accepted:
            logger.info(f"⚠️ Match par titre insuffisant (confiance: {title_result.confidence:.2%})")
            logger.info("🔄 Passage au matching sémantique...")
            return None

        logger.info(f"✅ Utilisation du match par titre: {title_result.template_id}")
        return title_result.template_id, f"title ({title_result.match_type})"

    def _semantic_fallback(self, transcription: str, match_method: str) -> list:
        """Delegate matching and filling to the semantic matcher and tag results."""
        logger.info("\n🧠 Étape 2: Matching SÉMANTIQUE")
        logger.info("─" * 80)

        results = self.semantic_matcher.match_and_fill(transcription, return_top_k=1)
        if not results:
            logger.error("❌ Aucun résultat du matching sémantique")
            return []

        for result in results:
            result.match_method = match_method
        return results