|
|
import json
import logging
import os
import re
from dataclasses import asdict, dataclass, field
from typing import Any, Dict, List, Optional, Tuple

from openai import AzureOpenAI
|
|
|
|
|
|
|
|
# Configure root logging once at import time and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
|
|
|
|
|
@dataclass
class MedicalEntity:
    """A single extracted medical entity.

    Only ``entity_type`` and ``value`` are required; every other field has a
    default so partially known entities can still be represented.
    """

    entity_type: str
    value: str
    unit: Optional[str] = None
    confidence: float = 0.0
    context: str = ""
    # NOTE(review): presumably character offsets into the source text, with
    # -1 meaning "unknown" -- nothing in this file sets them; confirm.
    start_pos: int = -1
    end_pos: int = -1
|
|
|
|
|
@dataclass
class ExtractedData:
    """Structured medical data extracted from a transcription.

    Every clinical field defaults to ``None`` (not found).
    ``extraction_confidence`` defaults to ``0.0`` and ``missing_fields`` to a
    fresh empty list per instance.
    """

    # Uterus
    uterus_position: Optional[str] = None
    uterus_size: Optional[str] = None
    hysterometry: Optional[str] = None

    # Endometrium
    endometrium_thickness: Optional[str] = None

    # Myometrium / junctional zone
    myomas_present: Optional[bool] = None
    zone_jonctionnelle_status: Optional[str] = None
    adenomyosis_type: Optional[str] = None

    # Right ovary
    right_ovary_dimensions: Optional[str] = None
    right_ovary_cfa: Optional[str] = None
    right_ovary_accessibility: Optional[str] = None

    # Left ovary
    left_ovary_dimensions: Optional[str] = None
    left_ovary_cfa: Optional[str] = None
    left_ovary_accessibility: Optional[str] = None

    # Doppler
    doppler_ip: Optional[str] = None
    doppler_ir: Optional[str] = None

    # Extraction metadata
    extraction_confidence: float = 0.0
    missing_fields: List[str] = field(default_factory=list)

    def __post_init__(self):
        """Normalize an explicitly passed ``missing_fields=None`` to ``[]``."""
        # Kept for backward compatibility with callers that pass None.
        if self.missing_fields is None:
            self.missing_fields = []
|
|
|
|
|
class MedicalNERAgent:
    """Medical NER agent combining an Azure OpenAI chat model with regex rules.

    The agent runs three extractors on a transcription (a regex-based ovary
    extractor, a chat-completion extractor and a regex "expert" extractor)
    and merges their results into a single ``ExtractedData``.
    """

    # Text shown in the report for a field that was not extracted.
    _NOT_FOUND = '❌ Non trouvé'

    def __init__(self):
        """Create the Azure OpenAI client and the regex pattern table.

        Configuration is read from the environment:
        ``AZURE_OPENAI_KEY`` (required), ``AZURE_OPENAI_API_VERSION``,
        ``AZURE_OPENAI_ENDPOINT``, ``AZURE_OPENAI_MODEL`` and
        ``AZURE_OPENAI_DEPLOYMENT``.

        Raises:
            RuntimeError: if ``AZURE_OPENAI_KEY`` is not set.
        """
        # The API key must never be embedded in source code; it is required
        # from the environment instead of falling back to a literal.
        api_key = os.getenv("AZURE_OPENAI_KEY")
        if not api_key:
            raise RuntimeError(
                "AZURE_OPENAI_KEY is not set; export the Azure OpenAI API key "
                "before creating a MedicalNERAgent."
            )

        self.client = AzureOpenAI(
            api_key=api_key,
            api_version=os.getenv("AZURE_OPENAI_API_VERSION", "2024-12-01-preview"),
            azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT", "https://voxist-gpt-eastus2.openai.azure.com/"),
        )
        self.model = os.getenv("AZURE_OPENAI_MODEL", "gpt-5")
        self.deployment = os.getenv("AZURE_OPENAI_DEPLOYMENT", "gpt-5-eastus2")

        # Regex patterns per entity; _extract_pattern returns group(1) of the
        # first pattern that matches.
        self.medical_patterns = {
            'uterus_position': [
                r'utérus est (\w+)',
                r'L\'utérus .*?est (\w+)',
            ],
            'hysterometry': [
                r'(\d+(?:[.,]\d+)?)\s*(?:d\'|de\s+)?hystérométrie',
                r'hystérométrie\s*:?\s*(\d+(?:[.,]\d+)?)',
            ],
            'endometrium': [
                r'(\d+(?:[.,]\d+)?)\s*(?:d\'|de\s+)?endomètre',
                r'endomètre.*?(\d+(?:[.,]\d+)?)',
            ],
            'zone_jonctionnelle': [
                r'zone jonctionnelle\s+(\w+)',
                r'(\w+)\s+zone jonctionnelle',
            ],
            'myomas_fibrome': [
                r'pas de (fibrome|myome)s?',
                r'(fibrome|myome)s?\s+myomètre\s+pas de (fibrome|myome)s?',
                r'sans (fibrome|myome)s?',
                r'absence.*?(fibrome|myome)s?',
            ],
            'adenomyosis': [
                r'adénomyose\s+(\w+)',
                r'(\w+)\s+d\'adénomyose',
            ],
        }

    def extract_medical_entities(self, transcription: str) -> "ExtractedData":
        """Run the full extraction pipeline on a transcription.

        Args:
            transcription: Raw transcription text.

        Returns:
            The merged ``ExtractedData`` with ``extraction_confidence`` and
            ``missing_fields`` filled in.
        """
        logger.info("🚀 Début de l'extraction d'entités médicales")

        cleaned_text = self._preprocess_text(transcription)
        # NOTE(review): this logs the first 100 characters of the transcript
        # at INFO level; confirm that is acceptable for patient data.
        logger.info("📝 Texte nettoyé: %s...", cleaned_text[:100])

        ovary_data = self._extract_ovaries_structured(cleaned_text)
        gpt_entities = self._extract_with_gpt5(cleaned_text)
        expert_entities = self._extract_with_expert_ner(cleaned_text)

        final_data = self._merge_extraction_results_improved(
            gpt_entities, expert_entities, ovary_data, cleaned_text
        )
        final_data.extraction_confidence = self._calculate_confidence(final_data)
        final_data.missing_fields = self._identify_missing_fields(final_data)

        logger.info(
            "✅ Extraction terminée - Confiance: %.2f",
            final_data.extraction_confidence,
        )
        return final_data

    def _extract_ovaries_structured(self, text: str) -> Dict[str, Dict[str, Optional[str]]]:
        """Extract per-ovary dimensions, follicle count (CFA) and accessibility.

        Args:
            text: Preprocessed transcription.

        Returns:
            ``{'right': {...}, 'left': {...}}`` where each inner dict has the
            keys ``dimensions``, ``cfa`` and ``accessibility`` (``None`` when
            not found).
        """
        logger.info("🔍 Extraction structurée des ovaires")

        ovary_data = {
            'right': {'dimensions': None, 'cfa': None, 'accessibility': None},
            'left': {'dimensions': None, 'cfa': None, 'accessibility': None},
        }

        # Right ovary: explicit "ovaire droit mesure A x B mm ... N follicules".
        right_match = re.search(
            r'ovaire droit\s+mesure\s+(\d+(?:[.,]\d+)?)\s*(?:fois|x)\s*(\d+(?:[.,]\d+)?)\s*mm.*?(\d+)\s*follicules',
            text, re.IGNORECASE | re.DOTALL,
        )
        if right_match:
            ovary_data['right']['dimensions'] = f"{right_match.group(1)} x {right_match.group(2)} mm"
            ovary_data['right']['cfa'] = right_match.group(3)
            logger.info("✅ Ovaire droit trouvé: %s", ovary_data['right'])

        # Left ovary dimensions: explicit mention first, then the pronoun form.
        left_dim_patterns = [
            r'ovaire gauche.*?mesure\s+(\d+(?:[.,]\d+)?)\s*(?:fois|x)\s*(\d+(?:[.,]\d+)?)',
            r'il mesure\s+(\d+(?:[.,]\d+)?)\s*(?:fois|x)\s*(\d+(?:[.,]\d+)?)',
        ]
        for pattern in left_dim_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                ovary_data['left']['dimensions'] = f"{match.group(1)} x {match.group(2)} mm"
                logger.info(
                    "✅ Dimensions ovaire gauche trouvées: %s",
                    ovary_data['left']['dimensions'],
                )
                break

        # Collect follicle counts keyed by the position of the number, so a
        # "siège de ... N follicules" mention and the plain "N follicules"
        # match for the same number count as ONE entry (tagged 'siège')
        # instead of being attributed to both ovaries.
        cfa_by_pos = {}
        for match in re.finditer(r'(\d+)\s*follicules', text, re.IGNORECASE):
            cfa_by_pos[match.start(1)] = (match.group(1), 'follicules')
        for match in re.finditer(r'siège de.*?(\d+)\s*follicules', text, re.IGNORECASE):
            cfa_by_pos[match.start(1)] = (match.group(1), 'siège')
        all_cfa = [
            (value, pos, kind) for pos, (value, kind) in sorted(cfa_by_pos.items())
        ]

        if len(all_cfa) >= 2:
            # First mention goes to the right ovary unless already set above.
            if not ovary_data['right']['cfa']:
                ovary_data['right']['cfa'] = all_cfa[0][0]

            # Last 'siège' mention, or any mention after the first, is left.
            first_pos = all_cfa[0][1]
            for cfa_value, pos, cfa_type in reversed(all_cfa):
                if cfa_type == 'siège' or pos > first_pos:
                    ovary_data['left']['cfa'] = cfa_value
                    break

            logger.info(
                "✅ CFA attribués - Droit: %s, Gauche: %s",
                ovary_data['right']['cfa'], ovary_data['left']['cfa'],
            )
        elif len(all_cfa) == 1:
            # Single mention: decide the side from the surrounding 100 chars.
            cfa_value, pos, cfa_type = all_cfa[0]
            cfa_context = text[max(0, pos - 50):pos + 50].lower()
            if cfa_type == 'siège' or 'gauche' in cfa_context or 'siège' in cfa_context:
                ovary_data['left']['cfa'] = cfa_value
            else:
                ovary_data['right']['cfa'] = cfa_value

        # Accessibility of the left ovary.
        if re.search(r'ovaire gauche.*?accès difficile', text, re.IGNORECASE) or \
                re.search(r'd\'accès difficile à rétro-utérin', text, re.IGNORECASE):
            ovary_data['left']['accessibility'] = 'difficile rétro-utérine'

        # "par contre l'autre il est normal": the *other* ovary is normal.
        if re.search(r'par contre l\'autre il est normal', text, re.IGNORECASE):
            if ovary_data['left']['accessibility'] == 'difficile rétro-utérine':
                ovary_data['right']['accessibility'] = 'normale'
            else:
                # Search in the lowered text so the index matches the
                # case-insensitive test (a case-sensitive find could return
                # -1 and slice the wrong context).
                lowered = text.lower()
                cut = lowered.find('par contre')
                context_before = lowered[:cut] if cut != -1 else lowered
                if 'gauche' in context_before[-100:]:
                    ovary_data['right']['accessibility'] = 'normale'
                else:
                    ovary_data['left']['accessibility'] = 'normale'

        logger.info("🎯 Données ovaires structurées: %s", ovary_data)
        return ovary_data

    def _preprocess_text(self, text: str) -> str:
        """Normalize whitespace, split decimals and the word "fois".

        Args:
            text: Raw transcription.

        Returns:
            The text with runs of whitespace collapsed to one space,
            ``"3. 7"`` and ``"3 7"`` (single trailing digit) rewritten to
            ``"3.7"``, and the standalone word ``fois`` replaced by ``x``.
        """
        text = re.sub(r'\s+', ' ', text.strip())

        # Re-join decimals that dictation split in two.
        text = re.sub(r'(\d+)\.\s+(\d+)', r'\1.\2', text)
        text = re.sub(
            r'(\d+)\s+(\d+)',
            lambda m: f"{m.group(1)}.{m.group(2)}" if len(m.group(2)) == 1 else f"{m.group(1)} {m.group(2)}",
            text,
        )

        # Whole word only: a plain str.replace would also rewrite "parfois".
        text = re.sub(r'\bfois\b', 'x', text)

        return text

    def _extract_with_gpt5(self, text: str) -> "ExtractedData":
        """Ask the chat model for a JSON extraction of the transcription.

        Args:
            text: Preprocessed transcription.

        Returns:
            The parsed ``ExtractedData``; an empty ``ExtractedData`` when the
            request or the JSON parsing fails (the error is logged).
        """
        prompt = self._build_improved_ner_prompt(text)

        try:
            response = self.client.chat.completions.create(
                model=self.deployment,
                messages=[
                    {
                        "role": "system",
                        "content": self._get_improved_medical_system_prompt(),
                    },
                    {
                        "role": "user",
                        "content": prompt,
                    },
                ],
                response_format={"type": "json_object"},
            )

            json_response = response.choices[0].message.content.strip()
            data_dict = json.loads(json_response)
            return self._dict_to_extracted_data(data_dict)

        except Exception as e:
            # Deliberate best effort: the regex extractors still run, so a
            # failed model call degrades to an empty result.
            logger.error(f"❌ Erreur extraction GPT-5: {e}")
            return ExtractedData()

    def _build_improved_ner_prompt(self, text: str) -> str:
        """Build the user prompt embedding the transcription.

        Args:
            text: Preprocessed transcription inserted verbatim in the prompt.

        Returns:
            The prompt string (in French) describing the expected JSON.
        """
        return f"""
Tu es un expert en analyse d'échographies gynécologiques. Extrais précisément les entités médicales de cette transcription en analysant TRÈS ATTENTIVEMENT le contexte pour les ovaires.

TRANSCRIPTION: "{text}"

RÈGLES CRITIQUES POUR LES OVAIRES:
1. "L'ovaire droit mesure X x Y mm, Z follicules" -> ovaire droit
2. "il mesure X x Y mm siège de Z follicules" après mention de l'ovaire gauche -> ovaire gauche
3. "par contre l'autre il est normal" = généralement l'ovaire droit si le gauche est difficile d'accès
4. "siège de X follicules" = généralement ovaire gauche
5. Premier CFA mentionné = généralement ovaire droit, dernier = ovaire gauche

Extrais au format JSON strict:

{{
"uterus_position": "antéversé/rétroversé/intermédiaire ou null",
"uterus_size": "dimension en cm ou null",
"hysterometry": "valeur en mm ou null",
"endometrium_thickness": "valeur en mm ou null",
"myomas_present": true/false/null,
"zone_jonctionnelle_status": "normale/épaissie ou null",
"adenomyosis_type": "diffuse/focale/absente ou null",
"right_ovary_dimensions": "longueur x largeur avec unité ou null",
"right_ovary_cfa": "nombre follicules ou null",
"right_ovary_accessibility": "normale/aisée/difficile/rétro-utérine ou null",
"left_ovary_dimensions": "longueur x largeur avec unité ou null",
"left_ovary_cfa": "nombre follicules ou null",
"left_ovary_accessibility": "normale/aisée/difficile/rétro-utérine ou null",
"doppler_ip": "valeur IP ou null",
"doppler_ir": "valeur IR ou null"
}}

ANALYSE LE CONTEXTE COMPLET pour différencier ovaire droit/gauche.
Réponds uniquement avec le JSON, sans explication.
"""

    def _get_improved_medical_system_prompt(self) -> str:
        """Return the fixed system prompt (in French) sent with every request."""
        return """Tu es un système expert en NER (Named Entity Recognition) médical spécialisé dans les échographies gynécologiques.

MISSION CRITIQUE: Extraire avec une précision maximale les entités médicales, avec une attention particulière à la DISAMBIGUATION DES OVAIRES.

EXPERTISE SPÉCIFIQUE OVAIRES:
- Analyse contextuelle: "L'ovaire droit mesure..." vs "il mesure..." (référence à l'ovaire précédemment mentionné)
- Références croisées: "l'autre", "par contre" nécessitent une analyse du contexte complet
- Attribution CFA: Premier mentionné = généralement droit, "siège de" = généralement gauche
- Accessibilité: "difficile rétro-utérin" est souvent l'ovaire gauche, "normal" l'autre

RÈGLES D'OR:
1. Lis TOUT le texte avant d'attribuer les mesures aux ovaires
2. Utilise les indices contextuels (ordre, proximité, références)
3. Ne mélange JAMAIS les données entre ovaire droit et gauche
4. Si ambiguïté, privilégie l'ordre d'apparition dans le texte médical standard

Tu dois être extrêmement précis dans la différenciation ovaire droit/gauche."""

    def _extract_with_expert_ner(self, text: str) -> "ExtractedData":
        """Extract the non-ovary fields with regex rules only.

        Args:
            text: Preprocessed transcription.

        Returns:
            An ``ExtractedData`` holding whatever the rules found.
        """
        extracted = ExtractedData()

        extracted.uterus_position = self._extract_pattern(text, 'uterus_position')
        extracted.hysterometry = self._extract_pattern(text, 'hysterometry')
        extracted.endometrium_thickness = self._extract_pattern(text, 'endometrium')

        zone_jonc = self._extract_zone_jonctionnelle(text)
        if zone_jonc:
            extracted.zone_jonctionnelle_status = zone_jonc

        myomas_status = self._extract_myomas_status(text)
        if myomas_status is not None:
            extracted.myomas_present = myomas_status

        doppler_values = self._extract_doppler(text)
        extracted.doppler_ip = doppler_values.get('ip')
        extracted.doppler_ir = doppler_values.get('ir')

        extracted.adenomyosis_type = self._extract_pattern(text, 'adenomyosis')

        return extracted

    def _extract_zone_jonctionnelle(self, text: str) -> Optional[str]:
        """Return the junctional-zone status found in ``text``.

        Returns:
            ``"épaissie"``, ``"normale"`` or ``None`` when nothing matches.
        """
        # Explicit "zone jonctionnelle <status>" statements are checked before
        # the loose "épaissie ... zone jonctionnelle" form; otherwise an
        # unrelated earlier "épaissie" would override an explicit "normale".
        if re.search(r'zone jonctionnelle\s+épaissie', text, re.IGNORECASE):
            return "épaissie"
        if re.search(r'zone jonctionnelle\s+normale', text, re.IGNORECASE):
            return "normale"
        if re.search(r'épaissie.*?zone jonctionnelle', text, re.IGNORECASE):
            return "épaissie"
        return None

    def _extract_myomas_status(self, text: str) -> Optional[bool]:
        """Return whether myomas/fibroids are reported.

        Negative statements are checked before positive ones.

        Returns:
            ``False`` when absence is stated, ``True`` when presence is
            stated, ``None`` when neither is found.
        """
        negative_patterns = [
            r'pas de (fibrome|myome)s?',
            r'sans (fibrome|myome)s?',
            r'absence.*?(fibrome|myome)s?',
            r'(fibrome|myome)s?\s+myomètre\s+pas de (fibrome|myome)s?',
        ]
        if any(re.search(p, text, re.IGNORECASE) for p in negative_patterns):
            return False

        positive_patterns = [
            r'présence.*?(fibrome|myome)s?',
            r'(fibrome|myome)s?\s+présents?',
            r'multiples (fibrome|myome)s?',
        ]
        if any(re.search(p, text, re.IGNORECASE) for p in positive_patterns):
            return True

        return None

    def _extract_pattern(self, text: str, pattern_key: str) -> Optional[str]:
        """Return group 1 of the first matching pattern for ``pattern_key``.

        Args:
            text: Text to search (case-insensitively).
            pattern_key: Key into ``self.medical_patterns``.

        Returns:
            The captured string, or ``None`` when the key is unknown or no
            pattern matches.
        """
        for pattern in self.medical_patterns.get(pattern_key, []):
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group(1)
        return None

    def _extract_doppler(self, text: str) -> Dict[str, str]:
        """Extract the Doppler IP and IR values.

        Args:
            text: Text to search.

        Returns:
            A dict with the keys ``'ip'`` and/or ``'ir'`` for the values that
            were found (possibly empty).
        """
        doppler = {}

        # Combined form: "Doppler : IP 3,24 - IR 0,91".
        doppler_match = re.search(
            r'Doppler\s*:?\s*IP\s*(\d+(?:[.,]\d+)?)\s*-?\s*IR\s*(\d+(?:[.,]\d+)?)',
            text, re.IGNORECASE,
        )
        if doppler_match:
            doppler['ip'] = doppler_match.group(1)
            doppler['ir'] = doppler_match.group(2)
            return doppler

        # Separate forms. The leading \b is required: with IGNORECASE an
        # unanchored "IR" also matches the end of words ("revoir 3").
        ip_match = re.search(r'\bIP\s*:?\s*(\d+(?:[.,]\d+)?)', text, re.IGNORECASE)
        if ip_match:
            doppler['ip'] = ip_match.group(1)

        ir_match = re.search(r'\bIR\s*:?\s*(\d+(?:[.,]\d+)?)', text, re.IGNORECASE)
        if ir_match:
            doppler['ir'] = ir_match.group(1)

        return doppler

    def _merge_extraction_results_improved(self, gpt_data: "ExtractedData", expert_data: "ExtractedData", ovary_data: Dict, text: str) -> "ExtractedData":
        """Merge the three extraction results into one ``ExtractedData``.

        For non-ovary fields the model value wins over the regex value; for
        ovary fields the structured regex value wins over the model value.

        Args:
            gpt_data: Result of the chat-model extraction.
            expert_data: Result of the regex extraction.
            ovary_data: Result of ``_extract_ovaries_structured``.
            text: Preprocessed transcription, forwarded to post-processing.

        Returns:
            The merged and post-processed ``ExtractedData``.
        """
        merged = ExtractedData()

        non_ovary_fields = [
            'uterus_position', 'uterus_size', 'hysterometry',
            'endometrium_thickness', 'adenomyosis_type', 'zone_jonctionnelle_status',
            'myomas_present', 'doppler_ip', 'doppler_ir',
        ]
        for field_name in non_ovary_fields:
            gpt_value = getattr(gpt_data, field_name, None)
            expert_value = getattr(expert_data, field_name, None)

            if gpt_value is not None and str(gpt_value).strip() and str(gpt_value) != 'null':
                setattr(merged, field_name, gpt_value)
            elif expert_value is not None and str(expert_value).strip():
                setattr(merged, field_name, expert_value)

        for side in ('right', 'left'):
            for key in ('dimensions', 'cfa', 'accessibility'):
                attr_name = f"{side}_ovary_{key}"
                structured_value = ovary_data[side][key]
                gpt_value = getattr(gpt_data, attr_name)
                if structured_value:
                    setattr(merged, attr_name, structured_value)
                elif gpt_value:
                    setattr(merged, attr_name, gpt_value)

        return self._post_process_contextual(merged, text)

    def _post_process_contextual(self, data: "ExtractedData", text: str) -> "ExtractedData":
        """Clean up duplicated units and add a missing unit to dimensions.

        Args:
            data: Extraction result, modified in place.
            text: Unused; kept for interface compatibility.

        Returns:
            The same ``data`` object.
        """
        # "60 mm mm" -> "60 mm"
        for name in ('hysterometry', 'endometrium_thickness'):
            value = getattr(data, name)
            if value and 'mm mm' in str(value):
                setattr(data, name, str(value).replace(' mm mm', ' mm'))

        # "5 follicules follicules" -> "5"
        for name in ('right_ovary_cfa', 'left_ovary_cfa'):
            value = getattr(data, name)
            if value and 'follicules follicules' in str(value):
                setattr(data, name, str(value).replace(' follicules follicules', ''))

        # Bare "A x B" (integers or decimals) gets the default unit "mm".
        bare_dimensions = r'\d+(?:[.,]\d+)?\s*x\s*\d+(?:[.,]\d+)?$'
        for name in ('left_ovary_dimensions', 'right_ovary_dimensions'):
            value = getattr(data, name)
            if not value:
                continue
            as_text = str(value)
            if 'mm' in as_text or 'cm' in as_text:
                continue
            if re.match(bare_dimensions, as_text.strip()):
                setattr(data, name, as_text.strip() + ' mm')

        return data

    def _calculate_confidence(self, data: "ExtractedData") -> float:
        """Return the fraction of the nine key fields that are filled.

        A field counts as filled when it is not ``None`` and its string form
        is not blank.
        """
        important_fields = [
            'uterus_position', 'hysterometry', 'endometrium_thickness',
            'right_ovary_dimensions', 'left_ovary_dimensions',
            'right_ovary_cfa', 'left_ovary_cfa',
            'doppler_ip', 'doppler_ir',
        ]

        filled_fields = 0
        for field_name in important_fields:
            value = getattr(data, field_name, None)
            if value is not None and str(value).strip():
                filled_fields += 1

        return filled_fields / len(important_fields)

    def _identify_missing_fields(self, data: "ExtractedData") -> List[str]:
        """Return the French labels of the fields that are ``None`` or blank."""
        field_mapping = {
            'uterus_position': 'Position utérus',
            'hysterometry': 'Hystérométrie',
            'endometrium_thickness': 'Épaisseur endomètre',
            'zone_jonctionnelle_status': 'Zone jonctionnelle',
            'myomas_present': 'Présence myomes',
            'right_ovary_dimensions': 'Taille ovaire droit',
            'left_ovary_dimensions': 'Taille ovaire gauche',
            'right_ovary_cfa': 'CFA ovaire droit',
            'left_ovary_cfa': 'CFA ovaire gauche',
            'right_ovary_accessibility': 'Accessibilité ovaire droit',
            'left_ovary_accessibility': 'Accessibilité ovaire gauche',
            'doppler_ip': 'IP Doppler',
            'doppler_ir': 'IR Doppler',
        }

        missing = []
        for field_name, description in field_mapping.items():
            value = getattr(data, field_name, None)
            if value is None or (isinstance(value, str) and not value.strip()):
                missing.append(description)
        return missing

    def _dict_to_extracted_data(self, data_dict: Dict[str, Any]) -> "ExtractedData":
        """Copy the known keys of a parsed JSON object into ``ExtractedData``.

        Values that are ``None``, blank, or the literal string ``'null'`` are
        skipped, leaving the field at its default.
        """
        extracted = ExtractedData()

        # JSON keys and attribute names are identical.
        field_names = (
            'uterus_position', 'uterus_size', 'hysterometry',
            'endometrium_thickness', 'myomas_present',
            'zone_jonctionnelle_status', 'adenomyosis_type',
            'right_ovary_dimensions', 'right_ovary_cfa', 'right_ovary_accessibility',
            'left_ovary_dimensions', 'left_ovary_cfa', 'left_ovary_accessibility',
            'doppler_ip', 'doppler_ir',
        )
        for name in field_names:
            value = data_dict.get(name)
            if value is not None and str(value).strip() and str(value) != 'null':
                setattr(extracted, name, value)

        return extracted

    @staticmethod
    def _format_report_value(value: Any, suffix: str = "") -> str:
        """Format one report value, or the "not found" marker when missing.

        ``suffix`` is appended only to real values, so a missing follicle
        count does not render as "not found follicules".
        """
        if value is None or not str(value).strip():
            return MedicalNERAgent._NOT_FOUND
        return f"{value}{suffix}"

    def print_extraction_report(self, data: "ExtractedData") -> str:
        """Build the formatted extraction report.

        Despite its name this method prints nothing; it returns the text.

        Args:
            data: Extraction result to render.

        Returns:
            The multi-line report (in French).
        """
        fmt = self._format_report_value
        lines = [
            " RAPPORT D'EXTRACTION MÉDICALE\n",
            "=" * 50 + "\n\n",
            " UTÉRUS:\n",
            f" Position: {fmt(data.uterus_position)}\n",
            f" Taille: {fmt(data.uterus_size)}\n",
            f" Hystérométrie: {fmt(data.hysterometry)}\n",
            "\n ENDOMÈTRE:\n",
            f" Épaisseur: {fmt(data.endometrium_thickness)}\n",
            "\n ZONE JONCTIONNELLE:\n",
            f" Status: {fmt(data.zone_jonctionnelle_status)}\n",
            f" Myomes présents: {fmt(data.myomas_present)}\n",
            f" Adénomyose: {fmt(data.adenomyosis_type)}\n",
            "\n OVAIRE DROIT:\n",
            f" Dimensions: {fmt(data.right_ovary_dimensions)}\n",
            f" CFA: {fmt(data.right_ovary_cfa, ' follicules')}\n",
            f" Accessibilité: {fmt(data.right_ovary_accessibility)}\n",
            "\n OVAIRE GAUCHE:\n",
            f" Dimensions: {fmt(data.left_ovary_dimensions)}\n",
            f" CFA: {fmt(data.left_ovary_cfa, ' follicules')}\n",
            f" Accessibilité: {fmt(data.left_ovary_accessibility)}\n",
            "\n DOPPLER:\n",
            f" IP: {fmt(data.doppler_ip)}\n",
            f" IR: {fmt(data.doppler_ir)}\n",
            "\n STATISTIQUES:\n",
            f" Score de confiance: {data.extraction_confidence:.1%}\n",
            f" Champs manquants: {len(data.missing_fields)}\n",
        ]
        if data.missing_fields:
            lines.append(f" Détails manquants: {', '.join(data.missing_fields)}\n")

        return "".join(lines)
|
|
|
|
|
def test_extraction():
    """Run the extraction on a known problematic transcription and print it.

    This is a manual smoke test: it creates a ``MedicalNERAgent`` (which
    needs the Azure OpenAI configuration), runs the full pipeline and prints
    the report followed by the per-ovary fields for visual checking.
    """
    transcription = """Compte rendu classique. L'utérus est antéversé de taille 7,8 cm 60 d'hystérométrie
3,7 d'endomètre triangulaire zone jonctionnelle épaissie focale d'adénomyose diffuse fibromes
myomètre pas de fibromes. Le col voulut le laisser comme il est la morphologie triangulaire.
L'ovaire droit mesure 26 x 20 mm, 5 follicules. L'ovaire gauche accessibilité au maître rétro
thérape par contre l'autre il est normal il mesure 25 x 19 mm siège de CFA : 22 follicules.
Le Doppler : IP 3,24 - IR 0,91 et le reste tout en fait qui est l'ovaire gauche d'accès
difficile à rétro-utérin."""

    agent = MedicalNERAgent()
    result = agent.extract_medical_entities(transcription)

    print(agent.print_extraction_report(result))

    # Ovary attribution is the part this case is known to get wrong.
    print("\n🔍 VÉRIFICATION SPÉCIFIQUE DES OVAIRES:")
    print(f"Ovaire droit - Dimensions: {result.right_ovary_dimensions}")
    print(f"Ovaire droit - CFA: {result.right_ovary_cfa}")
    print(f"Ovaire droit - Accessibilité: {result.right_ovary_accessibility}")
    print(f"Ovaire gauche - Dimensions: {result.left_ovary_dimensions}")
    print(f"Ovaire gauche - CFA: {result.left_ovary_cfa}")
    print(f"Ovaire gauche - Accessibilité: {result.left_ovary_accessibility}")
|
|
|
|
|
|
|
|
# Run the manual smoke test when the file is executed as a script.
if __name__ == "__main__":
    test_extraction()