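# Extracted from the Hugging Face Space "Nourhenem/pipeline2" (file view: pipeline2 / title_matcher.py).
# Commit f92da22 ("initial commit", verified) by Nourhenem, about 1 month before capture; file size 14.5 kB.
# The other page text (Sleeping; App / Files / Community; raw / history / blame / contribute / delete)
# was site navigation and is not part of the module.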

	"""
	title_matcher.py
	Système de matching par titre pour les templates médicaux
	"""

	import re
	import logging
	from typing import Optional, Tuple, List
	from dataclasses import dataclass
	from difflib import SequenceMatcher

	# Module-level logger named after this module; the matcher classes below log through it.
	logger = logging.getLogger(__name__)

	@dataclass
	class TitleMatchResult:
	    """Outcome of matching one transcription title against the template titles."""

	    # Title of the transcription exactly as passed in by the caller.
	    transcription_title: str
	    # Id of the matched template; '' when no template matched.
	    template_id: str
	    # Strategy that produced the match: 'exact', 'normalized', 'fuzzy' or 'none'.
	    match_type: str
	    # Match confidence: 1.0 for an exact match, the similarity score for
	    # key/fuzzy matches, 0.0 when nothing matched.
	    confidence: float
	    # Normalized form of the transcription title used for the comparison.
	    normalized_transcription: str
	    # Normalized form of the matched template id; '' when no template matched.
	    normalized_template: str


	class TitleBasedMatcher:
	    """Match transcription file names to templates by comparing titles.

	    Strategies are tried from strictest to loosest (exact, key identifier,
	    fuzzy), so an exact title match always wins over an approximate one.
	    """

	    # Compiled once: normalize_title() runs for every template on every lookup.
	    # The alternation bars must NOT be escaped, otherwise the patterns only
	    # match a literal '|' and nothing is ever stripped.
	    _EXTENSION_RE = re.compile(r'\.(txt|rtf|docx|doc)$', re.IGNORECASE)
	    _PREFIX_RE = re.compile(r'^(default\.|mod\.)', re.IGNORECASE)
	    _NUMBERED_SUFFIX_RE = re.compile(r'_\d+_radiologie$', re.IGNORECASE)
	    _SUFFIX_RE = re.compile(r'_radiologie$', re.IGNORECASE)
	    _SEPARATOR_RE = re.compile(r'[_\-\s]+')
	    _KEY_RE = re.compile(r'(\d+\.[a-z]+(?:\.[A-Z]+)?)', re.IGNORECASE)

	    def __init__(self, parser_instance):
	        """Store the parser and collect the titles of all its templates.

	        Args:
	            parser_instance: Object exposing a ``templates`` mapping keyed by
	                template id (presumably a MedicalTemplateParser with its
	                database loaded -- confirm against the caller).
	        """
	        self.parser = parser_instance
	        self.template_titles = self._extract_all_template_titles()
	        logger.info(f"📋 {len(self.template_titles)} titres de templates chargés")

	    def _extract_all_template_titles(self) -> List[str]:
	        """Return the ids of all templates known to the parser; ids double as titles."""
	        return list(self.parser.templates.keys())

	    def normalize_title(self, title: str) -> str:
	        """Reduce a raw title to a canonical, comparable form.

	        Steps, in order: drop a trailing document extension, drop a leading
	        ``default.``/``mod.`` prefix, drop trailing ``_<digits>_radiologie``
	        and ``_radiologie`` suffixes, collapse runs of ``_``, ``-`` and
	        whitespace into a single dot, lowercase, and trim surrounding dots.

	        Args:
	            title: Raw title (transcription file name or template id).

	        Returns:
	            The normalized title; may be empty.
	        """
	        title = self._EXTENSION_RE.sub('', title)
	        title = self._PREFIX_RE.sub('', title)
	        # Two separate passes on purpose: the plain suffix may still be
	        # present after the numbered one has been removed.
	        title = self._NUMBERED_SUFFIX_RE.sub('', title)
	        title = self._SUFFIX_RE.sub('', title)
	        title = self._SEPARATOR_RE.sub('.', title)
	        return title.lower().strip('.')

	    def extract_key_identifier(self, title: str) -> str:
	        """Extract the key identifier of a title (e.g. ``6260.cherry.expertise``).

	        The key is the first ``<digits>.<letters>[.<letters>]`` run found in
	        the normalized title.

	        Args:
	            title: Title to analyse.

	        Returns:
	            The lowercase key identifier, or the whole normalized title when
	            no such pattern is present.
	        """
	        normalized = self.normalize_title(title)
	        found = self._KEY_RE.search(normalized)
	        if found:
	            return found.group(1).lower()
	        return normalized

	    def calculate_similarity(self, str1: str, str2: str) -> float:
	        """Return the ``difflib.SequenceMatcher`` ratio of two strings.

	        Args:
	            str1: First string.
	            str2: Second string.

	        Returns:
	            Similarity score in [0, 1].
	        """
	        return SequenceMatcher(None, str1, str2).ratio()

	    def find_exact_match(self, transcription_title: str) -> Optional[str]:
	        """Find the first template whose normalized title equals the given one.

	        Args:
	            transcription_title: Title of the transcription.

	        Returns:
	            The matching template id, or None when there is none or when the
	            transcription title normalizes to an empty string.
	        """
	        normalized_trans = self.normalize_title(transcription_title)
	        if not normalized_trans:
	            # An empty title carries no information to match on.
	            return None

	        for template_id in self.template_titles:
	            if normalized_trans == self.normalize_title(template_id):
	                logger.info(f"✅ Match EXACT trouvé: {template_id}")
	                return template_id

	        return None

	    def find_key_match(self, transcription_title: str,
	                       min_score: float = 0.7) -> Optional[Tuple[str, float]]:
	        """Find the best template whose key identifier contains, or is contained
	        in, the transcription's key identifier.

	        Args:
	            transcription_title: Title of the transcription.
	            min_score: Minimum similarity between the two keys for the match
	                to be accepted.

	        Returns:
	            ``(template_id, confidence)`` for the best candidate, or None.
	        """
	        trans_key = self.extract_key_identifier(transcription_title)
	        if not trans_key:
	            # '' is a substring of every string; refuse the vacuous match.
	            return None

	        best_match = None
	        best_score = 0.0

	        for template_id in self.template_titles:
	            template_key = self.extract_key_identifier(template_id)
	            if not template_key:
	                continue

	            if trans_key in template_key or template_key in trans_key:
	                similarity = self.calculate_similarity(trans_key, template_key)
	                if similarity > best_score:
	                    best_score = similarity
	                    best_match = template_id

	        if best_match is not None and best_score >= min_score:
	            logger.info(f"✅ Match par CLÉ trouvé: {best_match} (score: {best_score:.3f})")
	            return best_match, best_score

	        return None

	    def find_fuzzy_match(self, transcription_title: str,
	                         threshold: float = 0.8) -> Optional[Tuple[str, float]]:
	        """Find the template whose normalized title is most similar to the given one.

	        Args:
	            transcription_title: Title of the transcription.
	            threshold: Minimum similarity for a template to be considered.

	        Returns:
	            ``(template_id, confidence)`` for the best candidate, or None.
	        """
	        normalized_trans = self.normalize_title(transcription_title)
	        if not normalized_trans:
	            # Two empty strings have ratio 1.0; do not treat that as a match.
	            return None

	        best_match = None
	        best_score = 0.0

	        for template_id in self.template_titles:
	            similarity = self.calculate_similarity(
	                normalized_trans, self.normalize_title(template_id)
	            )
	            if similarity > best_score and similarity >= threshold:
	                best_score = similarity
	                best_match = template_id

	        if best_match:
	            logger.info(f"✅ Match FUZZY trouvé: {best_match} (score: {best_score:.3f})")
	            return best_match, best_score

	        return None

	    def _build_result(self, transcription_title: str, normalized_trans: str,
	                      template_id: str, match_type: str,
	                      confidence: float) -> "TitleMatchResult":
	        """Assemble a TitleMatchResult, normalizing the template id when there is one."""
	        return TitleMatchResult(
	            transcription_title=transcription_title,
	            template_id=template_id,
	            match_type=match_type,
	            confidence=confidence,
	            normalized_transcription=normalized_trans,
	            normalized_template=self.normalize_title(template_id) if template_id else '',
	        )

	    def match_by_title(self, transcription_title: str,
	                       fuzzy_threshold: float = 0.8) -> "TitleMatchResult":
	        """Match a transcription title with a cascade of strategies.

	        Tries an exact match, then a key-identifier match, then a fuzzy
	        match, and returns the first hit.

	        Args:
	            transcription_title: Name of the transcription file.
	            fuzzy_threshold: Similarity threshold for the fuzzy strategy.

	        Returns:
	            The match result; ``match_type`` is 'none' (with an empty
	            ``template_id`` and confidence 0.0) when no strategy succeeded.
	        """
	        logger.info(f"\n{'='*80}")
	        logger.info(f"🔍 MATCHING PAR TITRE: {transcription_title}")
	        logger.info(f"{'='*80}")

	        normalized_trans = self.normalize_title(transcription_title)
	        logger.info(f"📝 Titre normalisé: {normalized_trans}")

	        # Strategy 1: exact match on normalized titles.
	        exact_match = self.find_exact_match(transcription_title)
	        if exact_match:
	            return self._build_result(
	                transcription_title, normalized_trans, exact_match, 'exact', 1.0
	            )

	        # Strategy 2: match on the key identifier (reported as 'normalized').
	        key_match = self.find_key_match(transcription_title)
	        if key_match:
	            template_id, confidence = key_match
	            return self._build_result(
	                transcription_title, normalized_trans, template_id, 'normalized', confidence
	            )

	        # Strategy 3: fuzzy match on the whole normalized title.
	        fuzzy_match = self.find_fuzzy_match(transcription_title, fuzzy_threshold)
	        if fuzzy_match:
	            template_id, confidence = fuzzy_match
	            return self._build_result(
	                transcription_title, normalized_trans, template_id, 'fuzzy', confidence
	            )

	        # No strategy succeeded.
	        logger.warning(f"⚠️ Aucun match par titre trouvé pour: {transcription_title}")
	        return self._build_result(transcription_title, normalized_trans, '', 'none', 0.0)

	    def display_match_result(self, result: "TitleMatchResult"):
	        """Print a formatted summary of a title-matching result to stdout.

	        Args:
	            result: Result to display.
	        """
	        print(f"\n{'='*80}")
	        print("📋 RÉSULTAT MATCHING PAR TITRE")
	        print(f"{'='*80}")
	        print(f"📄 Transcription: {result.transcription_title}")
	        print(f"📝 Normalisé: {result.normalized_transcription}")
	        print(f"\n{'─'*80}")

	        if result.match_type != 'none':
	            print(f"✅ Template trouvé: {result.template_id}")
	            print(f"📝 Normalisé: {result.normalized_template}")
	            print(f"🎯 Type de match: {result.match_type.upper()}")
	            print(f"📊 Confiance: {result.confidence:.2%}")
	        else:
	            print("❌ Aucun template correspondant trouvé")
	            print("💡 Le matching sémantique sera utilisé")

	        print(f"{'='*80}\n")


	class HybridMatcher:
	    """Hybrid matcher combining title-based and semantic template matching.

	    The title of the transcription file is tried first; the semantic matcher
	    is used only when no sufficiently confident title match exists.
	    """

	    def __init__(self, parser_instance, semantic_matcher_instance):
	        """Wire the title matcher and the semantic matcher together.

	        Args:
	            parser_instance: Template parser; it is also handed to the
	                TitleBasedMatcher and queried with ``get_template_info``.
	            semantic_matcher_instance: Object providing ``match_and_fill``,
	                ``fill_template_with_gpt`` and ``format_filled_template``
	                (presumably a TranscriptionMatcher -- confirm against the caller).
	        """
	        self.parser = parser_instance
	        self.semantic_matcher = semantic_matcher_instance
	        self.title_matcher = TitleBasedMatcher(parser_instance)

	        logger.info("🔄 Matcher hybride initialisé")

	    def _match_template_by_title(self, transcription_filename: str,
	                                 title_confidence_threshold: float) -> Optional[Tuple[str, str]]:
	        """Run the title matcher and return ``(template_id, match_method)`` when
	        the match is confident enough, otherwise None."""
	        logger.info("\n📋 Étape 1: Matching par TITRE")
	        logger.info(f"{'─'*80}")

	        title_result = self.title_matcher.match_by_title(transcription_filename)
	        self.title_matcher.display_match_result(title_result)

	        accepted = (
	            title_result.match_type != 'none'
	            and title_result.confidence >= title_confidence_threshold
	        )
	        if accepted:
	            logger.info(f"✅ Utilisation du match par titre: {title_result.template_id}")
	            return title_result.template_id, f"title ({title_result.match_type})"

	        logger.info(f"⚠️ Match par titre insuffisant (confiance: {title_result.confidence:.2%})")
	        logger.info("🔄 Passage au matching sémantique...")
	        return None

	    def match_and_fill(self, transcription: str, transcription_filename: Optional[str] = None,
	                       title_confidence_threshold: float = 0.8):
	        """Match a transcription to a template and fill it, title first.

	        Args:
	            transcription: Content of the transcription.
	            transcription_filename: File name used for title matching; when
	                omitted or empty, only semantic matching is performed.
	            title_confidence_threshold: Minimum title-match confidence
	                required to use the title match instead of semantic matching.

	        Returns:
	            A list of match results: the semantic matcher's results when the
	            semantic path is taken, a single-element list when the template
	            was selected by title, or an empty list on failure.
	        """
	        from transcription_matcher import MatchResult

	        logger.info("\n" + "="*80)
	        logger.info("🚀 MATCHING HYBRIDE (Titre + Sémantique)")
	        logger.info("="*80)

	        # Step 1: try title matching when a file name is provided.
	        template_id = None
	        match_method = "semantic"

	        if transcription_filename:
	            title_choice = self._match_template_by_title(
	                transcription_filename, title_confidence_threshold
	            )
	            if title_choice is not None:
	                template_id, match_method = title_choice

	        # Step 2: no usable title match, delegate entirely to the semantic matcher.
	        if not template_id:
	            logger.info("\n🧠 Étape 2: Matching SÉMANTIQUE")
	            logger.info(f"{'─'*80}")

	            results = self.semantic_matcher.match_and_fill(transcription, return_top_k=1)
	            if not results:
	                logger.error("❌ Aucun résultat du matching sémantique")
	                return []

	            # Record which method produced each result.
	            for result in results:
	                result.match_method = match_method
	            return results

	        # Step 3: fill the template that was selected by title.
	        logger.info("\n📝 Étape 3: REMPLISSAGE du template")
	        logger.info(f"{'─'*80}")

	        template_info = self.parser.get_template_info(template_id)
	        if not template_info:
	            logger.error(f"❌ Template {template_id} introuvable")
	            return []

	        filled_data = self.semantic_matcher.fill_template_with_gpt(
	            template_info.content,
	            transcription
	        )
	        sections_filled = filled_data.get("sections", {})
	        confidence = filled_data.get("confidence", 0.0)

	        filled_template = self.semantic_matcher.format_filled_template(
	            template_info.content,
	            sections_filled
	        )

	        result = MatchResult(
	            template_id=template_id,
	            template_content=template_info.content,
	            similarity_score=1.0,  # a title match is reported with a perfect similarity score
	            filled_template=filled_template,
	            sections_filled=sections_filled,
	            confidence_score=confidence,
	            match_method=match_method
	        )

	        logger.info(f"✅ Template rempli via {match_method}")
	        logger.info("="*80 + "\n")

	        return [result]