Spaces:

Nourhenem
/

pipeline2

Sleeping

File size: 4,608 Bytes

f92da22

#!/usr/bin/env python3
"""
Document Validator
Validates generated medical documents against original transcriptions
"""

import re
from typing import Dict, Any, List
from docx import Document
from langchain.prompts import ChatPromptTemplate


# Measurements such as "12 mm", "3.5cm" or "3,5 cm". The decimal comma is
# accepted because the vocabulary below is French, where "3,5" is the usual
# spelling; without it "3,5 cm" would be recorded as the wrong value "5 cm".
_MEASUREMENT_PATTERN = re.compile(
    r'(\d+(?:[.,]\d+)?)\s*(mm|cm|kg|cc|ml|g|mg)')

# Whole-word vocabulary: anatomy, findings, and examination types.
_TERM_PATTERNS = tuple(re.compile(pattern) for pattern in (
    r'\b(?:rein|vessie|foie|rate|poumon|coeur|cerveau|muscle|tendon|os|articulation)\b',
    r'\b(?:lithiase|calcification|tendinopathie|inflammation|dilatation|normal|anormal)\b',
    r'\b(?:échographie|radiographie|scanner|irm|examen)\b',
))


def _extract_medical_entities(text: str) -> List[str]:
    """Return the sorted, de-duplicated medical entities found in *text*."""
    lowered = text.lower()
    # Canonicalise measurements to "<number> <unit>" with a dot as decimal
    # separator, so "3,5cm" and "3.5 cm" compare as the same entity.
    entities = {
        f"{number.replace(',', '.')} {unit}"
        for number, unit in _MEASUREMENT_PATTERN.findall(lowered)
    }
    for pattern in _TERM_PATTERNS:
        entities.update(pattern.findall(lowered))
    # Sorted so the reported lists are deterministic from run to run.
    return sorted(entities)


def _normalize_heading(text: str) -> str:
    """Lower-case *text*, turn non-breaking spaces into spaces, drop colons."""
    return text.lower().replace('\xa0', ' ').replace(':', '').strip()


def _find_template_sections(paragraph_texts: List[str],
                            template_sections: List[str]) -> List[str]:
    """Return the template sections matched by some paragraph, without repeats.

    A paragraph matches a section when one normalized text contains the
    other. Each paragraph is credited to the first section it matches.
    """
    normalized_sections = [
        (section, _normalize_heading(section)) for section in template_sections]
    found: List[str] = []
    for text in paragraph_texts:
        text_clean = _normalize_heading(text)
        if not text_clean:
            # "" is a substring of every string, so a blank paragraph would
            # otherwise "match" the first template section.
            continue
        for section, section_clean in normalized_sections:
            if not section_clean:
                # Same reasoning: a blank section title would match anything
                # and shadow every section listed after it.
                continue
            if section_clean in text_clean or text_clean in section_clean:
                if section not in found:
                    found.append(section)
                break
    return found


def validate_generated_document(template_path: str, transcription_path: str, generated_doc_path: str) -> Dict[str, Any]:
    """Validate a generated document against its transcription and template.

    Two checks are combined:

    * Entity coverage: measurements and a fixed French medical vocabulary are
      extracted from the transcription and from the generated document; the
      coverage is the share of transcription entities also present in the
      document.
    * Structure: every non-blank section title reported by the template
      analysis must match at least one paragraph of the generated document.

    Args:
        template_path: Path handed to ``analyze_word_template``.
        transcription_path: Path handed to ``load_transcription``.
        generated_doc_path: Path of the ``.docx`` file opened with ``Document``.

    Returns:
        A dict with the keys ``overall_score`` (0..1), ``structure_valid``,
        ``entities_coverage`` (percentage, 0..100), ``missing_sections``,
        ``missing_entities``, ``transcription_entities_count``,
        ``generated_entities_count``, ``found_sections`` and
        ``template_sections``.
    """
    # Imported here as in the original code; presumably to avoid import cycles
    # or import-time cost -- confirm before moving to module level.
    from template_analyzer import analyze_word_template
    from transcription_processor import load_transcription

    # Extract content from the generated document. Only body paragraphs are
    # read; "Date:" / "Heure:" lines are left out of the entity comparison.
    doc = Document(generated_doc_path)
    paragraph_texts = [paragraph.text.strip() for paragraph in doc.paragraphs]
    generated_text = "\n".join(
        text for text in paragraph_texts
        if text and not text.startswith(("Date:", "Heure:")))

    # Load transcription (assumed to come back as a str -- TODO confirm).
    transcription_text = load_transcription(transcription_path)

    # Extract medical entities from both texts
    transcription_entities = _extract_medical_entities(transcription_text)
    generated_entities = _extract_medical_entities(generated_text)

    # Calculate coverage; a set keeps the membership test O(1) per entity.
    generated_lookup = set(generated_entities)
    missing_entities = [
        entity for entity in transcription_entities
        if entity not in generated_lookup]
    if transcription_entities:
        covered = len(transcription_entities) - len(missing_entities)
        coverage_percentage = covered / len(transcription_entities) * 100
    else:
        # Nothing to cover counts as fully covered (float for a stable type).
        coverage_percentage = 100.0

    # Validate structure. The analysis is assumed to be a dict whose
    # 'sections' entries each carry a 'text' string -- TODO confirm.
    template_analysis = analyze_word_template(template_path)
    template_sections = [section['text']
                         for section in template_analysis.get('sections', [])]

    found_sections = _find_template_sections(paragraph_texts, template_sections)
    found_lookup = set(found_sections)
    # Blank section titles give nothing to search for, so they are neither
    # matched nor reported as missing.
    missing_sections = [
        section for section in template_sections
        if _normalize_heading(section) and section not in found_lookup]
    structure_valid = not missing_sections

    # Overall score: an invalid structure still earns half credit, and the
    # result is the mean of the structure score and the coverage fraction.
    structure_score = 1.0 if structure_valid else 0.5
    entities_score = coverage_percentage / 100
    overall_score = (structure_score + entities_score) / 2

    return {
        "overall_score": overall_score,
        "structure_valid": structure_valid,
        "entities_coverage": coverage_percentage,
        "missing_sections": missing_sections,
        "missing_entities": missing_entities,
        "transcription_entities_count": len(transcription_entities),
        "generated_entities_count": len(generated_entities),
        "found_sections": found_sections,
        "template_sections": template_sections,
    }


def create_validation_chain(llm):
    """Build the chain that asks ``llm`` for a validation summary.

    The chain is a two-message chat prompt (system + human) piped into
    ``llm``. When invoked, the prompt expects these variables:
    ``transcription``, ``generated_content``, ``structure_valid``,
    ``entities_coverage`` (rendered with a ``.1f`` format, so it must be
    numeric), ``missing_sections`` and ``missing_entities``.
    """
    # Role and expected shape of the answer.
    system_message = """You are a medical document validation expert.
Analyze if the generated medical document contains all important medical information from the original transcription.

Provide a brief validation summary with:
- Overall quality assessment
- Missing important information (if any)
- Key recommendations"""

    # Per-run payload: both texts plus the metrics computed beforehand.
    human_message = """Validate the content coverage between the original transcription and the generated document.

ORIGINAL TRANSCRIPTION:
{transcription}

GENERATED DOCUMENT CONTENT:
{generated_content}

VALIDATION METRICS:
- Structure Valid: {structure_valid}
- Entities Coverage: {entities_coverage:.1f}%
- Missing Sections: {missing_sections}
- Missing Entities: {missing_entities}

Provide a concise validation summary."""

    messages = [
        ("system", system_message),
        ("human", human_message),
    ]
    prompt = ChatPromptTemplate.from_messages(messages)
    return prompt | llm