|
|
|
|
|
""" |
|
|
Document Validator |
|
|
Validates generated medical documents against original transcriptions |
|
|
""" |
|
|
|
|
|
import re |
|
|
from typing import Dict, Any, List |
|
|
from docx import Document |
|
|
from langchain.prompts import ChatPromptTemplate |
|
|
|
|
|
|
|
|
# Vocabulary tracked by the entity-coverage check: measurements with a unit,
# anatomical terms, findings, and examination types (French, lower-case).
# Compiled once at import time instead of on every call.
_MEDICAL_ENTITY_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r'\d+(?:\.\d+)?\s*(?:mm|cm|kg|cc|ml|g|mg)',
        r'\b(?:rein|vessie|foie|rate|poumon|coeur|cerveau|muscle|tendon|os|articulation)\b',
        r'\b(?:lithiase|calcification|tendinopathie|inflammation|dilatation|normal|anormal)\b',
        r'\b(?:échographie|radiographie|scanner|irm|examen)\b',
    )
)


def _extract_medical_entities(text: str) -> List[str]:
    """Return the sorted, de-duplicated medical entities matched in *text*."""
    lowered = text.lower()  # patterns are lower-case, so match case-insensitively
    entities = set()
    for pattern in _MEDICAL_ENTITY_PATTERNS:
        entities.update(pattern.findall(lowered))
    # Sorted so the reported lists are stable from one run to the next.
    return sorted(entities)


def _normalize_heading(text: str) -> str:
    """Lower-case *text*, turn non-breaking spaces into spaces and drop colons."""
    return text.lower().replace('\xa0', ' ').replace(':', '').strip()


def validate_generated_document(template_path: str, transcription_path: str, generated_doc_path: str) -> Dict[str, Any]:
    """Validate that the generated document contains all important content from the transcription.

    Two checks are combined:

    * Entity coverage: medical entities (see ``_MEDICAL_ENTITY_PATTERNS``) are
      extracted from the transcription and from the generated document; the
      coverage is the percentage of transcription entities that also appear
      in the generated document (100.0 when the transcription has none).
    * Structure: every template section heading must match some paragraph of
      the generated document. A match is a substring relation in either
      direction between the normalized heading and the normalized paragraph.

    Args:
        template_path: Path handed to ``analyze_word_template``.
        transcription_path: Path handed to ``load_transcription``.
        generated_doc_path: Path of the generated ``.docx`` file to validate.

    Returns:
        A dict with the keys ``overall_score`` (mean of the structure score,
        1.0 or 0.5, and the coverage fraction), ``structure_valid``,
        ``entities_coverage`` (percentage), ``missing_sections``,
        ``missing_entities``, ``transcription_entities_count``,
        ``generated_entities_count``, ``found_sections`` and
        ``template_sections``.

    Raises:
        KeyError: If a section returned by the template analysis has no
            ``'text'`` entry.
        Errors raised while opening the document or by the two project
        helpers propagate unchanged.
    """
    # Kept function-local as in the original layout (presumably to avoid an
    # import cycle between the project modules -- TODO confirm).
    from template_analyzer import analyze_word_template
    from transcription_processor import load_transcription

    doc = Document(generated_doc_path)
    paragraph_texts = [paragraph.text.strip() for paragraph in doc.paragraphs]

    # Date/time stamp lines are left out of the generated text before the
    # entity comparison.
    generated_text = "\n".join(
        text for text in paragraph_texts
        if text and not text.startswith(("Date:", "Heure:"))
    )

    # NOTE(review): assumes load_transcription returns the transcription as a
    # single str -- confirm against transcription_processor.
    transcription_text = load_transcription(transcription_path)

    # --- Entity coverage ---------------------------------------------------
    transcription_entities = _extract_medical_entities(transcription_text)
    generated_entities = _extract_medical_entities(generated_text)

    generated_lookup = set(generated_entities)
    missing_entities = [
        entity for entity in transcription_entities
        if entity not in generated_lookup
    ]

    if transcription_entities:
        covered = len(transcription_entities) - len(missing_entities)
        coverage_percentage = covered / len(transcription_entities) * 100
    else:
        # Nothing to cover counts as full coverage.
        coverage_percentage = 100.0

    # --- Structure ---------------------------------------------------------
    template_analysis = analyze_word_template(template_path)
    template_sections = [
        section['text'] for section in template_analysis.get('sections', [])
    ]
    # Normalize each heading once rather than once per paragraph.
    normalized_sections = [
        (section, _normalize_heading(section)) for section in template_sections
    ]

    found_sections: List[str] = []
    for text in paragraph_texts:
        text_clean = _normalize_heading(text)
        if not text_clean:
            # "" is a substring of every heading, so a blank paragraph would
            # otherwise mark the first template section as found.
            continue
        for section, section_clean in normalized_sections:
            if not section_clean:
                continue  # same reasoning: an empty heading matches anything
            if section_clean in text_clean or text_clean in section_clean:
                if section not in found_sections:
                    found_sections.append(section)
                # A paragraph is attributed to the first heading it matches.
                break

    # Headings that normalize to nothing give no text to look for, so they
    # are never reported as missing.
    missing_sections = [
        section for section, section_clean in normalized_sections
        if section_clean and section not in found_sections
    ]
    structure_valid = not missing_sections

    # --- Scoring -----------------------------------------------------------
    structure_score = 1.0 if structure_valid else 0.5
    entities_score = coverage_percentage / 100
    overall_score = (structure_score + entities_score) / 2

    return {
        "overall_score": overall_score,
        "structure_valid": structure_valid,
        "entities_coverage": coverage_percentage,
        "missing_sections": missing_sections,
        "missing_entities": missing_entities,
        "transcription_entities_count": len(transcription_entities),
        "generated_entities_count": len(generated_entities),
        "found_sections": found_sections,
        "template_sections": template_sections,
    }
|
|
|
|
|
|
|
|
def create_validation_chain(llm):
    """Create the validation chain.

    Builds a two-message chat prompt (system + human) and pipes it into
    ``llm``.

    The prompt expects these input variables when the chain is invoked:
    ``transcription``, ``generated_content``, ``structure_valid``,
    ``entities_coverage`` (rendered with ``:.1f``, so it must be a number),
    ``missing_sections`` and ``missing_entities``.

    Args:
        llm: The object placed on the right-hand side of ``prompt | llm``.
            NOTE(review): presumably a LangChain chat model or other
            runnable -- confirm with the caller.

    Returns:
        The result of ``validation_prompt | llm``.
    """
    # The prompt text is assembled line by line with explicit newlines so
    # that its whitespace does not depend on how this file is indented.
    system_message = (
        "You are a medical document validation expert.\n"
        "Analyze if the generated medical document contains all important "
        "medical information from the original transcription.\n"
        "\n"
        "Provide a brief validation summary with:\n"
        "- Overall quality assessment\n"
        "- Missing important information (if any)\n"
        "- Key recommendations"
    )
    human_message = (
        "Validate the content coverage between the original transcription "
        "and the generated document.\n"
        "\n"
        "ORIGINAL TRANSCRIPTION:\n"
        "{transcription}\n"
        "\n"
        "GENERATED DOCUMENT CONTENT:\n"
        "{generated_content}\n"
        "\n"
        "VALIDATION METRICS:\n"
        "- Structure Valid: {structure_valid}\n"
        "- Entities Coverage: {entities_coverage:.1f}%\n"
        "- Missing Sections: {missing_sections}\n"
        "- Missing Entities: {missing_entities}\n"
        "\n"
        "Provide a concise validation summary."
    )

    validation_prompt = ChatPromptTemplate.from_messages([
        ("system", system_message),
        ("human", human_message),
    ])

    return validation_prompt | llm
|
|
|