File size: 4,608 Bytes
f92da22 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 |
#!/usr/bin/env python3
"""
Document Validator
Validates generated medical documents against original transcriptions
"""
import re
from typing import Dict, Any, List
from docx import Document
from langchain.prompts import ChatPromptTemplate
# Patterns are applied to lower-cased text, in this order.
# NOTE(review): the measurement pattern only accepts "." as the decimal
# separator, so "3,5 cm" is captured as "5 cm". Left unchanged here so the
# reported entities stay comparable with earlier runs.
_MEDICAL_ENTITY_PATTERNS = [
    re.compile(r'\d+(?:\.\d+)?\s*(?:mm|cm|kg|cc|ml|g|mg)'),  # Measurements
    re.compile(r'\b(?:rein|vessie|foie|rate|poumon|coeur|cerveau|muscle|tendon|os|articulation)\b'),
    re.compile(r'\b(?:lithiase|calcification|tendinopathie|inflammation|dilatation|normal|anormal)\b'),
    re.compile(r'\b(?:échographie|radiographie|scanner|irm|examen)\b'),
]


def _extract_medical_entities(text: str) -> List[str]:
    """Return the distinct entity strings matched in *text*.

    The text is lower-cased before matching. Results are de-duplicated and
    ordered by pattern, then by position in the text, so the output is
    stable from one run to the next.
    """
    lowered = text.lower()
    matches = [
        match
        for pattern in _MEDICAL_ENTITY_PATTERNS
        for match in pattern.findall(lowered)
    ]
    # dict.fromkeys de-duplicates while keeping first-seen order; a plain
    # set() would make the order depend on string hash randomization.
    return list(dict.fromkeys(matches))


def _normalize_heading(text: str) -> str:
    """Lower-case *text*, turn non-breaking spaces into spaces, drop colons, strip."""
    return text.lower().replace('\xa0', ' ').replace(':', '').strip()


def validate_generated_document(
    template_path: str,
    transcription_path: str,
    generated_doc_path: str,
) -> Dict[str, Any]:
    """Validate a generated document against its transcription and template.

    Two checks are performed:

    * Entity coverage: entities matched in the transcription are looked up
      among the entities matched in the generated document (paragraphs
      starting with "Date:" or "Heure:" are ignored).
    * Structure: every template section heading must appear in some
      paragraph of the generated document. A heading and a paragraph match
      when, after normalization, one contains the other.

    Args:
        template_path: Path handed to ``analyze_word_template``; its result
            is read as a mapping with an optional ``'sections'`` list whose
            items expose a ``'text'`` entry.
        transcription_path: Path handed to ``load_transcription``, whose
            result is treated as the transcription text.
        generated_doc_path: Path of the .docx file opened with ``Document``.

    Returns:
        A dict with the keys ``overall_score`` (mean of the structure score,
        1.0 or 0.5, and the entity coverage ratio), ``structure_valid``,
        ``entities_coverage`` (percentage), ``missing_sections``,
        ``missing_entities``, ``transcription_entities_count``,
        ``generated_entities_count``, ``found_sections`` and
        ``template_sections``.
    """
    from template_analyzer import analyze_word_template
    from transcription_processor import load_transcription

    # Read the generated document once; both checks work on the same texts.
    doc = Document(generated_doc_path)
    paragraph_texts = [paragraph.text.strip() for paragraph in doc.paragraphs]

    generated_text = "\n".join(
        text
        for text in paragraph_texts
        if text and not text.startswith(("Date:", "Heure:"))
    )

    # Load transcription
    transcription_text = load_transcription(transcription_path)

    # Extract medical entities from both texts
    transcription_entities = _extract_medical_entities(transcription_text)
    generated_entities = _extract_medical_entities(generated_text)

    # Calculate coverage
    generated_lookup = set(generated_entities)
    missing_entities = [
        entity for entity in transcription_entities
        if entity not in generated_lookup
    ]
    if transcription_entities:
        covered = len(transcription_entities) - len(missing_entities)
        coverage_percentage = covered / len(transcription_entities) * 100
    else:
        # Nothing to cover counts as full coverage.
        coverage_percentage = 100

    # Validate structure
    template_analysis = analyze_word_template(template_path)
    template_sections = [
        section['text'] for section in template_analysis.get('sections', [])
    ]
    normalized_sections = [
        (section, _normalize_heading(section)) for section in template_sections
    ]
    # Blank headings carry nothing to look for. Because "" is a substring of
    # every string, letting them through would match every paragraph, so they
    # are neither matched nor reported as missing.
    matchable_sections = [
        (section, cleaned) for section, cleaned in normalized_sections if cleaned
    ]

    found_sections: List[str] = []
    for text in paragraph_texts:
        text_clean = _normalize_heading(text)
        if not text_clean:
            # An empty paragraph would otherwise "match" the first section,
            # since "" in <anything> is True.
            continue
        for section, section_clean in matchable_sections:
            if section_clean in text_clean or text_clean in section_clean:
                if section not in found_sections:
                    found_sections.append(section)
                break

    missing_sections = [
        section
        for section, cleaned in normalized_sections
        if cleaned and section not in found_sections
    ]
    structure_valid = not missing_sections

    # Overall score
    structure_score = 1.0 if structure_valid else 0.5
    entities_score = coverage_percentage / 100
    overall_score = (structure_score + entities_score) / 2

    return {
        "overall_score": overall_score,
        "structure_valid": structure_valid,
        "entities_coverage": coverage_percentage,
        "missing_sections": missing_sections,
        "missing_entities": missing_entities,
        "transcription_entities_count": len(transcription_entities),
        "generated_entities_count": len(generated_entities),
        "found_sections": found_sections,
        "template_sections": template_sections,
    }
def create_validation_chain(llm):
    """Build the validation chain: the validation chat prompt piped into *llm*.

    The prompt has a system message describing the reviewer role and a human
    message with the placeholders ``transcription``, ``generated_content``,
    ``structure_valid``, ``entities_coverage`` (formatted with ``.1f``),
    ``missing_sections`` and ``missing_entities``.
    """
    system_text = (
        "You are a medical document validation expert.\n"
        "Analyze if the generated medical document contains all important medical information from the original transcription.\n"
        "Provide a brief validation summary with:\n"
        "- Overall quality assessment\n"
        "- Missing important information (if any)\n"
        "- Key recommendations"
    )
    human_text = (
        "Validate the content coverage between the original transcription and the generated document.\n"
        "ORIGINAL TRANSCRIPTION:\n"
        "{transcription}\n"
        "GENERATED DOCUMENT CONTENT:\n"
        "{generated_content}\n"
        "VALIDATION METRICS:\n"
        "- Structure Valid: {structure_valid}\n"
        "- Entities Coverage: {entities_coverage:.1f}%\n"
        "- Missing Sections: {missing_sections}\n"
        "- Missing Entities: {missing_entities}\n"
        "Provide a concise validation summary."
    )
    messages = [("system", system_text), ("human", human_text)]
    prompt = ChatPromptTemplate.from_messages(messages)
    return prompt | llm
|