# Source: Hugging Face Space "Nourhenem/pipeline2" (status: sleeping), commit f92da22, file size 14,546 bytes.


"""
title_matcher.py
Système de matching par titre pour les templates médicaux
"""

import re
import logging
from typing import Optional, Tuple, List
from dataclasses import dataclass
from difflib import SequenceMatcher

logger = logging.getLogger(__name__)

@dataclass
class TitleMatchResult:
    """Outcome of one title-based template lookup.

    Instances are built by ``TitleBasedMatcher.match_by_title``, which always
    returns one, including when nothing matched (``match_type == 'none'``).
    """
    # Raw transcription title exactly as supplied by the caller.
    transcription_title: str
    # ID of the selected template; match_by_title uses '' when nothing matched.
    template_id: str
    # Strategy that produced the match; match_by_title reports the
    # key-identifier strategy as 'normalized'.
    match_type: str  # 'exact', 'normalized', 'fuzzy', 'none'
    # Match score; match_by_title uses 1.0 for exact matches, the similarity
    # ratio for key/fuzzy matches and 0.0 when nothing matched.
    confidence: float
    # Transcription title after TitleBasedMatcher.normalize_title.
    normalized_transcription: str
    # Template ID after normalize_title; '' when nothing matched.
    normalized_template: str


class TitleBasedMatcher:
    """
    Classe pour matcher les transcriptions aux templates par titre
    Priorité au matching exact avant le matching sémantique
    """
    
    def __init__(self, parser_instance):
        """
        Initialise le matcher par titre
        
        Args:
            parser_instance: Instance de MedicalTemplateParser avec DB chargée
        """
        self.parser = parser_instance
        self.template_titles = self._extract_all_template_titles()
        logger.info(f"📋 {len(self.template_titles)} titres de templates chargés")
    
    def _extract_all_template_titles(self) -> List[str]:
        """Extrait tous les titres de templates disponibles"""
        titles = []
        for template_id in self.parser.templates.keys():
            titles.append(template_id)
        return titles
    
    def normalize_title(self, title: str) -> str:
        """
        Normalise un titre pour le matching
        
        Args:
            title: Titre brut (fichier transcription ou template)
            
        Returns:
            str: Titre normalisé
        """
        # Retirer les extensions
        title = re.sub(r'\.(txt|rtf|docx|doc)$', '', title, flags=re.IGNORECASE)
        
        # Retirer les préfixes courants
        title = re.sub(r'^(default\.|mod\.)', '', title, flags=re.IGNORECASE)
        
        # Retirer les suffixes de transcription
        title = re.sub(r'_\d+_radiologie$', '', title, flags=re.IGNORECASE)
        title = re.sub(r'_radiologie$', '', title, flags=re.IGNORECASE)
        
        # Normaliser les espaces et la casse
        title = re.sub(r'[_\-\s]+', '.', title)
        title = title.lower().strip('.')
        
        return title
    
    def extract_key_identifier(self, title: str) -> str:
        """
        Extrait l'identifiant clé du titre (ex: 6260.cherry.EXPERTISE)
        
        Args:
            title: Titre à analyser
            
        Returns:
            str: Identifiant clé
        """
        normalized = self.normalize_title(title)
        
        # Chercher un pattern numérique suivi de mots
        # Ex: 6260.cherry.EXPERTISE
        match = re.search(r'(\d+\.[a-z]+(?:\.[A-Z]+)?)', normalized, flags=re.IGNORECASE)
        if match:
            return match.group(1).lower()
        
        # Sinon retourner le titre normalisé complet
        return normalized
    
    def calculate_similarity(self, str1: str, str2: str) -> float:
        """
        Calcule la similarité entre deux chaînes
        
        Args:
            str1: Première chaîne
            str2: Deuxième chaîne
            
        Returns:
            float: Score de similarité [0-1]
        """
        return SequenceMatcher(None, str1, str2).ratio()
    
    def find_exact_match(self, transcription_title: str) -> Optional[str]:
        """
        Cherche un match exact avec un template
        
        Args:
            transcription_title: Titre de la transcription
            
        Returns:
            Optional[str]: ID du template correspondant ou None
        """
        normalized_trans = self.normalize_title(transcription_title)
        
        for template_id in self.template_titles:
            normalized_template = self.normalize_title(template_id)
            
            if normalized_trans == normalized_template:
                logger.info(f"✅ Match EXACT trouvé: {template_id}")
                return template_id
        
        return None
    
    def find_key_match(self, transcription_title: str) -> Optional[Tuple[str, float]]:
        """
        Cherche un match basé sur l'identifiant clé
        
        Args:
            transcription_title: Titre de la transcription
            
        Returns:
            Optional[Tuple[str, float]]: (template_id, confidence) ou None
        """
        trans_key = self.extract_key_identifier(transcription_title)
        
        best_match = None
        best_score = 0.0
        
        for template_id in self.template_titles:
            template_key = self.extract_key_identifier(template_id)
            
            # Vérifier si les clés correspondent
            if trans_key in template_key or template_key in trans_key:
                similarity = self.calculate_similarity(trans_key, template_key)
                
                if similarity > best_score:
                    best_score = similarity
                    best_match = template_id
        
        if best_match and best_score >= 0.7:
            logger.info(f"✅ Match par CLÉ trouvé: {best_match} (score: {best_score:.3f})")
            return best_match, best_score
        
        return None
    
    def find_fuzzy_match(self, transcription_title: str, threshold: float = 0.8) -> Optional[Tuple[str, float]]:
        """
        Cherche un match fuzzy (approximatif)
        
        Args:
            transcription_title: Titre de la transcription
            threshold: Seuil minimum de similarité
            
        Returns:
            Optional[Tuple[str, float]]: (template_id, confidence) ou None
        """
        normalized_trans = self.normalize_title(transcription_title)
        
        best_match = None
        best_score = 0.0
        
        for template_id in self.template_titles:
            normalized_template = self.normalize_title(template_id)
            
            similarity = self.calculate_similarity(normalized_trans, normalized_template)
            
            if similarity > best_score and similarity >= threshold:
                best_score = similarity
                best_match = template_id
        
        if best_match:
            logger.info(f"✅ Match FUZZY trouvé: {best_match} (score: {best_score:.3f})")
            return best_match, best_score
        
        return None
    
    def match_by_title(self, transcription_title: str, 
                       fuzzy_threshold: float = 0.8) -> TitleMatchResult:
        """
        Effectue le matching par titre avec stratégie en cascade
        
        Args:
            transcription_title: Titre du fichier de transcription
            fuzzy_threshold: Seuil pour le matching fuzzy
            
        Returns:
            TitleMatchResult: Résultat du matching
        """
        logger.info(f"\n{'='*80}")
        logger.info(f"🔍 MATCHING PAR TITRE: {transcription_title}")
        logger.info(f"{'='*80}")
        
        normalized_trans = self.normalize_title(transcription_title)
        logger.info(f"📝 Titre normalisé: {normalized_trans}")
        
        # Stratégie 1: Match exact
        exact_match = self.find_exact_match(transcription_title)
        if exact_match:
            return TitleMatchResult(
                transcription_title=transcription_title,
                template_id=exact_match,
                match_type='exact',
                confidence=1.0,
                normalized_transcription=normalized_trans,
                normalized_template=self.normalize_title(exact_match)
            )
        
        # Stratégie 2: Match par identifiant clé
        key_match = self.find_key_match(transcription_title)
        if key_match:
            template_id, confidence = key_match
            return TitleMatchResult(
                transcription_title=transcription_title,
                template_id=template_id,
                match_type='normalized',
                confidence=confidence,
                normalized_transcription=normalized_trans,
                normalized_template=self.normalize_title(template_id)
            )
        
        # Stratégie 3: Match fuzzy
        fuzzy_match = self.find_fuzzy_match(transcription_title, fuzzy_threshold)
        if fuzzy_match:
            template_id, confidence = fuzzy_match
            return TitleMatchResult(
                transcription_title=transcription_title,
                template_id=template_id,
                match_type='fuzzy',
                confidence=confidence,
                normalized_transcription=normalized_trans,
                normalized_template=self.normalize_title(template_id)
            )
        
        # Aucun match trouvé
        logger.warning(f"⚠️  Aucun match par titre trouvé pour: {transcription_title}")
        return TitleMatchResult(
            transcription_title=transcription_title,
            template_id='',
            match_type='none',
            confidence=0.0,
            normalized_transcription=normalized_trans,
            normalized_template=''
        )
    
    def display_match_result(self, result: TitleMatchResult):
        """
        Affiche un résultat de matching de manière formatée
        
        Args:
            result: Résultat à afficher
        """
        print(f"\n{'='*80}")
        print(f"📋 RÉSULTAT MATCHING PAR TITRE")
        print(f"{'='*80}")
        print(f"📄 Transcription: {result.transcription_title}")
        print(f"📝 Normalisé: {result.normalized_transcription}")
        print(f"\n{'─'*80}")
        
        if result.match_type != 'none':
            print(f"✅ Template trouvé: {result.template_id}")
            print(f"📝 Normalisé: {result.normalized_template}")
            print(f"🎯 Type de match: {result.match_type.upper()}")
            print(f"📊 Confiance: {result.confidence:.2%}")
        else:
            print(f"❌ Aucun template correspondant trouvé")
            print(f"💡 Le matching sémantique sera utilisé")
        
        print(f"{'='*80}\n")


class HybridMatcher:
    """Hybrid matcher combining title-based and semantic matching.

    The title of the transcription file is tried first; semantic matching is
    used when no file name is given or the title match is not confident
    enough.
    """

    def __init__(self, parser_instance, semantic_matcher_instance):
        """Wire the parser, the semantic matcher and a title matcher together.

        Args:
            parser_instance: Template parser; must expose ``templates`` (read
                by TitleBasedMatcher) and ``get_template_info``.
            semantic_matcher_instance: Object providing ``match_and_fill``,
                ``fill_template_with_gpt`` and ``format_filled_template``
                (presumably a TranscriptionMatcher -- confirm against caller).
        """
        self.parser = parser_instance
        self.semantic_matcher = semantic_matcher_instance
        self.title_matcher = TitleBasedMatcher(parser_instance)

        logger.info("🔄 Matcher hybride initialisé")

    def match_and_fill(self, transcription: str, transcription_filename: Optional[str] = None,
                       title_confidence_threshold: float = 0.8):
        """Select a template (by title, else semantically) and fill it.

        Args:
            transcription: Content of the transcription.
            transcription_filename: File name used for title matching; when
                omitted or empty, only semantic matching is performed.
            title_confidence_threshold: Minimum title-match confidence needed
                to use the title match instead of semantic matching.

        Returns:
            list: On the semantic path, whatever the semantic matcher returns
            (each item tagged with ``match_method``), or ``[]`` when it
            returns nothing. On the title path, a one-element list holding a
            ``MatchResult``, or ``[]`` when the template cannot be loaded.
        """
        logger.info("\n" + "="*80)
        logger.info("🚀 MATCHING HYBRIDE (Titre + Sémantique)")
        logger.info("="*80)

        # Step 1: try title matching when a file name is available.
        template_id = None
        match_method = "semantic"

        if transcription_filename:
            logger.info("\n📋 Étape 1: Matching par TITRE")
            logger.info(f"{'─'*80}")

            title_result = self.title_matcher.match_by_title(transcription_filename)
            self.title_matcher.display_match_result(title_result)

            if title_result.match_type != 'none' and title_result.confidence >= title_confidence_threshold:
                template_id = title_result.template_id
                match_method = f"title ({title_result.match_type})"
                logger.info(f"✅ Utilisation du match par titre: {template_id}")
            else:
                logger.info(f"⚠️  Match par titre insuffisant (confiance: {title_result.confidence:.2%})")
                logger.info("🔄 Passage au matching sémantique...")

        # Step 2: no usable title match, delegate to the semantic matcher.
        if not template_id:
            logger.info("\n🧠 Étape 2: Matching SÉMANTIQUE")
            logger.info(f"{'─'*80}")

            results = self.semantic_matcher.match_and_fill(transcription, return_top_k=1)
            if not results:
                logger.error("❌ Aucun résultat du matching sémantique")
                return []

            # Record which method produced the results.
            for result in results:
                result.match_method = match_method
            return results

        # Step 3: fill the template selected by title.
        # MatchResult is only needed on this path, so the project-local import
        # is deferred until here; it is done before the GPT call so an import
        # failure does not waste that call.
        from transcription_matcher import MatchResult

        logger.info("\n📝 Étape 3: REMPLISSAGE du template")
        logger.info(f"{'─'*80}")

        template_info = self.parser.get_template_info(template_id)
        if not template_info:
            logger.error(f"❌ Template {template_id} introuvable")
            return []

        # Fill with GPT. Treat a None result, or a None "sections" value, as
        # "nothing filled" instead of crashing on it.
        filled_data = self.semantic_matcher.fill_template_with_gpt(
            template_info.content,
            transcription
        ) or {}

        sections_filled = filled_data.get("sections") or {}
        confidence = filled_data.get("confidence", 0.0)

        # Render the filled template.
        filled_template = self.semantic_matcher.format_filled_template(
            template_info.content,
            sections_filled
        )

        result = MatchResult(
            template_id=template_id,
            template_content=template_info.content,
            similarity_score=1.0,  # a title match is reported as a perfect score
            filled_template=filled_template,
            sections_filled=sections_filled,
            confidence_score=confidence,
            match_method=match_method
        )

        logger.info(f"✅ Template rempli via {match_method}")
        logger.info("="*80 + "\n")

        return [result]