#!/usr/bin/env python3
"""
Document Validator
Validates generated medical documents against original transcriptions
"""

import re
from typing import Any, Dict, List

from docx import Document
from langchain.prompts import ChatPromptTemplate

# Paragraphs starting with these prefixes are excluded from the text used for
# entity extraction.
_METADATA_PREFIXES = ("Date:", "Heure:")

# A number followed by a unit. Accepts both "." and "," as decimal separator
# (French text commonly uses the comma) and requires a word boundary after the
# unit so that e.g. "2 gros" is not read as "2 g" and "5 mmol" as "5 mm".
_MEASUREMENT_PATTERN = re.compile(
    r'(\d+(?:[.,]\d+)?)\s*(mm|cm|kg|cc|ml|mg|g)\b'
)

# Whole-word vocabularies matched against lower-cased text.
_TERM_PATTERNS = (
    re.compile(
        r'\b(?:rein|vessie|foie|rate|poumon|coeur|cerveau|muscle|tendon|os|articulation)\b'
    ),
    re.compile(
        r'\b(?:lithiase|calcification|tendinopathie|inflammation|dilatation|normal|anormal)\b'
    ),
    re.compile(r'\b(?:échographie|radiographie|scanner|irm|examen)\b'),
)

_VALIDATION_SYSTEM_PROMPT = """You are a medical document validation expert.
Analyze if the generated medical document contains all important medical information from the original transcription.

Provide a brief validation summary with:
- Overall quality assessment
- Missing important information (if any)
- Key recommendations"""

# NOTE: "{entities_coverage:.1f}" carries a float format spec, so the value
# supplied for that variable must be numeric.
_VALIDATION_HUMAN_PROMPT = """Validate the content coverage between the original transcription and the generated document.

ORIGINAL TRANSCRIPTION:
{transcription}

GENERATED DOCUMENT CONTENT:
{generated_content}

VALIDATION METRICS:
- Structure Valid: {structure_valid}
- Entities Coverage: {entities_coverage:.1f}%
- Missing Sections: {missing_sections}
- Missing Entities: {missing_entities}

Provide a concise validation summary."""


def _extract_medical_entities(text: str) -> List[str]:
    """Return the sorted, de-duplicated entities found in *text*.

    Entities are measurements (number + unit) and the fixed vocabulary terms
    in ``_TERM_PATTERNS``. Matching is case-insensitive (the text is
    lower-cased first). Measurements are returned in a canonical
    ``"<number> <unit>"`` form with "." as decimal separator, so "3,5cm" and
    "3.5 cm" compare equal.
    """
    lowered = text.lower()
    entities = {
        f"{number.replace(',', '.')} {unit}"
        for number, unit in _MEASUREMENT_PATTERN.findall(lowered)
    }
    for pattern in _TERM_PATTERNS:
        entities.update(pattern.findall(lowered))
    # Sorted so that downstream lists (e.g. missing entities) are stable
    # from run to run instead of following set iteration order.
    return sorted(entities)


def _normalize_heading(text: str) -> str:
    """Lower-case *text*, turn NBSP into spaces, drop colons and trim."""
    return text.lower().replace('\xa0', ' ').replace(':', '').strip()


def _find_sections(paragraph_texts: List[str],
                   template_sections: List[str]) -> List[str]:
    """Return the template sections that appear in the paragraphs.

    A paragraph matches a section when, after normalization, either string
    contains the other. Each paragraph is credited to the first template
    section it matches. The result keeps first-seen order without duplicates.
    """
    normalized_templates = [
        (section, _normalize_heading(section)) for section in template_sections
    ]
    found: List[str] = []
    for raw_text in paragraph_texts:
        text_clean = _normalize_heading(raw_text)
        if not text_clean:
            # An empty string is a substring of everything; without this
            # guard every blank paragraph would "match" the first section.
            continue
        for section, template_clean in normalized_templates:
            if not template_clean:
                # Same reasoning: a blank heading would match any paragraph.
                continue
            if template_clean in text_clean or text_clean in template_clean:
                if section not in found:
                    found.append(section)
                break
    return found


def validate_generated_document(template_path: str, transcription_path: str,
                                generated_doc_path: str) -> Dict[str, Any]:
    """Validate that the generated document contains all important content
    from the transcription.

    Args:
        template_path: Path passed to ``analyze_word_template`` to obtain the
            expected section headings.
        transcription_path: Path passed to ``load_transcription``.
        generated_doc_path: Path of the generated .docx file to check.

    Returns:
        A dict with the keys:
        ``overall_score`` (mean of the structure and entity scores, 0..1),
        ``structure_valid`` (True when no template section is missing),
        ``entities_coverage`` (percentage of transcription entities present
        in the generated document; 100.0 when the transcription has none),
        ``missing_sections``, ``missing_entities``,
        ``transcription_entities_count``, ``generated_entities_count``,
        ``found_sections`` and ``template_sections``.
    """
    # Imported here as in the original layout (project-local modules).
    from template_analyzer import analyze_word_template
    from transcription_processor import load_transcription

    # Extract content from generated document.
    # NOTE(review): only body paragraphs are read; text inside tables is not
    # part of `doc.paragraphs` -- confirm whether templates use tables.
    doc = Document(generated_doc_path)
    paragraph_texts = [paragraph.text.strip() for paragraph in doc.paragraphs]
    generated_text = "\n".join(
        text for text in paragraph_texts
        if text and not text.startswith(_METADATA_PREFIXES)
    )

    # Load transcription
    transcription_text = load_transcription(transcription_path)

    # Extract medical entities from both texts
    transcription_entities = _extract_medical_entities(transcription_text)
    generated_entities = _extract_medical_entities(generated_text)

    # Calculate coverage
    generated_lookup = set(generated_entities)
    missing_entities = [
        entity for entity in transcription_entities
        if entity not in generated_lookup
    ]
    if transcription_entities:
        covered = len(transcription_entities) - len(missing_entities)
        coverage_percentage = covered / len(transcription_entities) * 100
    else:
        # Nothing to cover counts as full coverage.
        coverage_percentage = 100.0

    # Validate structure
    template_analysis = analyze_word_template(template_path)
    template_sections = [
        section['text'] for section in template_analysis.get('sections', [])
    ]
    found_sections = _find_sections(paragraph_texts, template_sections)
    # Headings that are blank after normalization cannot be matched against
    # anything meaningful, so they are not reported as missing.
    missing_sections = [
        section for section in template_sections
        if _normalize_heading(section) and section not in found_sections
    ]
    structure_valid = not missing_sections

    # Overall score: a broken structure halves the structure component.
    structure_score = 1.0 if structure_valid else 0.5
    entities_score = coverage_percentage / 100
    overall_score = (structure_score + entities_score) / 2

    return {
        "overall_score": overall_score,
        "structure_valid": structure_valid,
        "entities_coverage": coverage_percentage,
        "missing_sections": missing_sections,
        "missing_entities": missing_entities,
        "transcription_entities_count": len(transcription_entities),
        "generated_entities_count": len(generated_entities),
        "found_sections": found_sections,
        "template_sections": template_sections,
    }


def create_validation_chain(llm):
    """Create the validation chain.

    Builds a chat prompt (system + human message) and pipes it into *llm*.
    The prompt expects the variables ``transcription``, ``generated_content``,
    ``structure_valid``, ``entities_coverage`` (numeric), ``missing_sections``
    and ``missing_entities``.
    """
    validation_prompt = ChatPromptTemplate.from_messages([
        ("system", _VALIDATION_SYSTEM_PROMPT),
        ("human", _VALIDATION_HUMAN_PROMPT),
    ])
    return validation_prompt | llm