Spaces:

Nourhenem
/

pipeline2

Sleeping

App Files Files Community

pipeline2 / document_validator.py

Nourhenem

initial commit

f92da22 verified about 1 month ago

raw

history blame

4.61 kB

	#!/usr/bin/env python3
	"""
	Document Validator
	Validates generated medical documents against original transcriptions
	"""

	import re
	from typing import Dict, Any, List
	from docx import Document
	from langchain.prompts import ChatPromptTemplate


	# Patterns are matched against lower-cased text. All groups are non-capturing
	# so that ``findall`` returns the whole match rather than a sub-group.
	_MEDICAL_ENTITY_PATTERNS = tuple(re.compile(pattern) for pattern in (
	    r'\d+(?:\.\d+)?\s*(?:mm|cm|kg|cc|ml|g|mg)',  # Measurements
	    r'\b(?:rein|vessie|foie|rate|poumon|coeur|cerveau|muscle|tendon|os|articulation)\b',
	    r'\b(?:lithiase|calcification|tendinopathie|inflammation|dilatation|normal|anormal)\b',
	    r'\b(?:échographie|radiographie|scanner|irm|examen)\b',
	))


	def _extract_medical_entities(text: str) -> List[str]:
	    """Return the sorted, de-duplicated entity strings matched in *text*."""
	    lowered = text.lower()
	    found = set()
	    for pattern in _MEDICAL_ENTITY_PATTERNS:
	        found.update(pattern.findall(lowered))
	    # Sorted so that the reported entity lists are stable between runs.
	    return sorted(found)


	def _normalize_heading(text: str) -> str:
	    """Lower-case *text*, map non-breaking spaces to spaces and drop colons."""
	    return text.lower().replace('\xa0', ' ').replace(':', '').strip()


	def validate_generated_document(template_path: str, transcription_path: str, generated_doc_path: str) -> Dict[str, Any]:
	    """Validate that the generated document contains all important content from the transcription.

	    Two checks are combined:

	    * Entity coverage: regex-detected entities (measurements and a fixed
	      vocabulary of terms) are extracted from the transcription and from the
	      generated document; coverage is the share of transcription entities
	      that also occur in the generated document.
	    * Structure: every section heading reported by ``analyze_word_template``
	      must appear in some paragraph of the generated document. The match is
	      case-insensitive, ignores colons and non-breaking spaces, and accepts
	      containment in either direction.

	    Args:
	        template_path: Path passed to ``analyze_word_template``.
	        transcription_path: Path passed to ``load_transcription``.
	        generated_doc_path: Path of the generated document, opened with
	            ``docx.Document``.

	    Returns:
	        A dict with the keys ``overall_score`` (mean of the structure score,
	        1.0 or 0.5, and the coverage ratio), ``structure_valid``,
	        ``entities_coverage`` (percentage, 0-100), ``missing_sections``,
	        ``missing_entities``, ``transcription_entities_count``,
	        ``generated_entities_count``, ``found_sections`` and
	        ``template_sections``.
	    """
	    # Project-local helpers are imported at call time.
	    from template_analyzer import analyze_word_template
	    from transcription_processor import load_transcription

	    # Extract content from generated document
	    doc = Document(generated_doc_path)
	    paragraph_texts = [paragraph.text.strip() for paragraph in doc.paragraphs]
	    generated_text = "\n".join(
	        text for text in paragraph_texts
	        if text and not text.startswith(("Date:", "Heure:"))
	    )

	    # Load transcription
	    transcription_text = load_transcription(transcription_path)

	    # Extract medical entities from both texts
	    transcription_entities = _extract_medical_entities(transcription_text)
	    generated_entities = _extract_medical_entities(generated_text)

	    # Calculate coverage
	    generated_lookup = set(generated_entities)
	    missing_entities = [
	        entity for entity in transcription_entities
	        if entity not in generated_lookup
	    ]
	    if transcription_entities:
	        covered = len(transcription_entities) - len(missing_entities)
	        coverage_percentage = covered / len(transcription_entities) * 100
	    else:
	        # Nothing to cover counts as full coverage.
	        coverage_percentage = 100.0

	    # Validate structure
	    template_analysis = analyze_word_template(template_path)
	    template_sections = [
	        section['text'] for section in template_analysis.get('sections', [])
	    ]
	    # Normalise each heading once instead of once per paragraph.
	    normalized_sections = [
	        (section, _normalize_heading(section)) for section in template_sections
	    ]

	    found_sections = []
	    for text in paragraph_texts:
	        text_clean = _normalize_heading(text)
	        if not text_clean:
	            # '' is a substring of every heading; a blank paragraph must not
	            # count as a match.
	            continue
	        for section, section_clean in normalized_sections:
	            if not section_clean:
	                # A blank heading would match every paragraph and hide the
	                # headings that follow it.
	                continue
	            if section_clean in text_clean or text_clean in section_clean:
	                if section not in found_sections:
	                    found_sections.append(section)
	                # A paragraph is attributed to the first heading it matches.
	                break

	    # Blank template headings cannot be matched, so they are not reported
	    # as missing either.
	    missing_sections = [
	        section for section, section_clean in normalized_sections
	        if section_clean and section not in found_sections
	    ]
	    structure_valid = not missing_sections

	    # Overall score
	    structure_score = 1.0 if structure_valid else 0.5
	    entities_score = coverage_percentage / 100
	    overall_score = (structure_score + entities_score) / 2

	    return {
	        "overall_score": overall_score,
	        "structure_valid": structure_valid,
	        "entities_coverage": coverage_percentage,
	        "missing_sections": missing_sections,
	        "missing_entities": missing_entities,
	        "transcription_entities_count": len(transcription_entities),
	        "generated_entities_count": len(generated_entities),
	        "found_sections": found_sections,
	        "template_sections": template_sections,
	    }


	def create_validation_chain(llm):
	    """Create the validation chain.

	    Builds a two-message chat prompt (system instructions plus a human
	    message) and pipes it into *llm*.

	    The prompt expects these input variables when the chain is invoked:
	    ``transcription``, ``generated_content``, ``structure_valid``,
	    ``entities_coverage``, ``missing_sections`` and ``missing_entities``.
	    ``entities_coverage`` is rendered with the ``.1f`` format spec, so it
	    must be a number.

	    Args:
	        llm: The object the prompt is piped into with ``|``.

	    Returns:
	        The result of ``validation_prompt | llm``.
	    """
	    validation_prompt = ChatPromptTemplate.from_messages([
	        ("system", """You are a medical document validation expert.
	Analyze if the generated medical document contains all important medical information from the original transcription.

	Provide a brief validation summary with:
	- Overall quality assessment
	- Missing important information (if any)
	- Key recommendations"""),
	        ("human", """Validate the content coverage between the original transcription and the generated document.

	ORIGINAL TRANSCRIPTION:
	{transcription}

	GENERATED DOCUMENT CONTENT:
	{generated_content}

	VALIDATION METRICS:
	- Structure Valid: {structure_valid}
	- Entities Coverage: {entities_coverage:.1f}%
	- Missing Sections: {missing_sections}
	- Missing Entities: {missing_entities}

	Provide a concise validation summary.""")
	    ])

	    # Compose prompt and model with the pipe operator; a backslash before the
	    # pipe would be read as a line continuation and fail to parse.
	    return validation_prompt | llm