Spaces:

Nourhenem
/

pipeline2

Sleeping

App Files Files Community

pipeline2 / template_generator.py

Nourhenem

initial commit

f92da22 verified about 1 month ago

raw

history blame

15.8 kB

	import logging
	import os
	from datetime import datetime
	from typing import List

	from docx import Document
	from docx.enum.style import WD_STYLE_TYPE
	from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
	from docx.oxml.shared import OxmlElement, qn
	from docx.shared import Inches
	from docx.shared import Pt
	from docx.shared import RGBColor
	from dotenv import load_dotenv

	# Import the matching classes from the companion module
	from template_matcher import TemplateMatcher, TemplateMatch

	# Load environment variables (python-dotenv) before reading the settings below
	load_dotenv()

	# Path of the template database; overridable through TEMPLATE_DB_PATH
	DB_PATH = os.getenv("TEMPLATE_DB_PATH", "templates/medical_templates.pkl")
	# Directory that receives the generated documents; overridable through OUTPUT_DIR
	OUTPUT_DIR = os.getenv("OUTPUT_DIR", "templates_remplis")

	class TemplateGenerator:
	    """Generate filled-in medical report templates as Word (.docx) files.

	    The generator takes a ``TemplateMatch`` (the result of matching a
	    transcription against the template database) and writes a document
	    containing a header, the sections that could be filled, the sections
	    that could not, and the original transcription as an appendix.
	    """

	    def __init__(self):
	        """Configure logging and make sure the output directory exists."""
	        # Logging must be configured BEFORE the first log call: a
	        # logging.info() on an unconfigured root logger installs a default
	        # WARNING-level handler, after which basicConfig() is a no-op and
	        # every INFO message of this module would be dropped.
	        logging.basicConfig(
	            level=logging.INFO,
	            format='%(asctime)s - %(levelname)s - [GENERATOR] %(message)s'
	        )

	        self.output_dir = OUTPUT_DIR
	        self._create_output_directory()

	    def _create_output_directory(self):
	        """Create ``self.output_dir`` if it does not exist yet."""
	        if not os.path.exists(self.output_dir):
	            # exist_ok guards against another process creating the
	            # directory between the check above and this call.
	            os.makedirs(self.output_dir, exist_ok=True)
	            logging.info(f"📁 Répertoire de sortie créé: {self.output_dir}")

	    @staticmethod
	    def _new_paragraph_style(styles, name: str):
	        """Add paragraph style ``name``; return it, or None if it already exists."""
	        try:
	            return styles.add_style(name, WD_STYLE_TYPE.PARAGRAPH)
	        except ValueError:
	            # python-docx raises ValueError when the style name is taken.
	            logging.warning(f"Style '{name}' déjà existant")
	            return None

	    @staticmethod
	    def _add_labeled_line(doc, label: str, value):
	        """Append a paragraph made of a bold ``label`` followed by ``value``."""
	        paragraph = doc.add_paragraph()
	        paragraph.add_run(label).bold = True
	        paragraph.add_run(value)
	        return paragraph

	    @staticmethod
	    def _missing_section_names(template_match: TemplateMatch) -> List[str]:
	        """Return the names of the template sections flagged as not fillable."""
	        return [
	            match.section_name
	            for match in template_match.section_matches.values()
	            if not match.can_fill
	        ]

	    def _add_custom_styles(self, doc: Document):
	        """Register the custom paragraph styles used by the generated report.

	        A style that already exists in ``doc`` is left untouched (a warning
	        is logged). Font sizes are expressed in points; spacing and indents
	        in inches.
	        """
	        styles = doc.styles

	        # Section headings
	        section_style = self._new_paragraph_style(styles, 'Section Title')
	        if section_style is not None:
	            section_style.font.size = Pt(12)
	            section_style.font.bold = True
	            section_style.font.color.rgb = RGBColor(0, 51, 102)  # dark blue
	            section_style.paragraph_format.space_after = Inches(0.1)
	            section_style.paragraph_format.keep_with_next = True

	        # Section body text
	        content_style = self._new_paragraph_style(styles, 'Section Content')
	        if content_style is not None:
	            content_style.font.size = Pt(11)
	            content_style.paragraph_format.left_indent = Inches(0.25)
	            content_style.paragraph_format.space_after = Inches(0.15)

	        # Document title
	        header_style = self._new_paragraph_style(styles, 'Document Header')
	        if header_style is not None:
	            header_style.font.size = Pt(14)
	            header_style.font.bold = True
	            header_style.font.color.rgb = RGBColor(0, 0, 0)
	            header_style.paragraph_format.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
	            header_style.paragraph_format.space_after = Inches(0.2)

	    def _add_document_header(self, doc: Document, template_match: TemplateMatch, transcription_filename: str):
	        """Write the report title and the metadata lines describing the match.

	        Args:
	            doc: Document being built.
	            template_match: Match whose template info, score and filling
	                percentage are reported.
	            transcription_filename: Currently unused; kept so the signature
	                stays compatible with existing callers.
	        """
	        # Main title
	        header = doc.add_paragraph()
	        header.style = 'Document Header'
	        header.add_run("COMPTE-RENDU MÉDICAL GÉNÉRÉ AUTOMATIQUEMENT")

	        template_info = template_match.template_info

	        # Template information
	        self._add_labeled_line(doc, "Template utilisé: ", os.path.basename(template_info.filepath))

	        # Medical information, only when it carries a real value
	        if template_info.medecin and template_info.medecin != "Non identifié":
	            self._add_labeled_line(doc, "Médecin: ", template_info.medecin)

	        centre = getattr(template_info, 'centre_medical', 'Non spécifié')
	        if centre and centre != "Non spécifié":
	            self._add_labeled_line(doc, "Centre médical: ", centre)

	        # Document type
	        self._add_labeled_line(doc, "Type de document: ", template_info.type)

	        # Generation details
	        self._add_labeled_line(doc, "Date de génération: ", datetime.now().strftime("%d/%m/%Y à %H:%M"))
	        self._add_labeled_line(
	            doc,
	            "Score de correspondance: ",
	            f"{template_match.overall_score:.3f} ({template_match.confidence_level})",
	        )
	        self._add_labeled_line(
	            doc,
	            "Pourcentage de remplissage: ",
	            f"{template_match.filling_percentage:.1f}%",
	        )

	        # Separator line
	        doc.add_paragraph("_" * 80)

	    def _add_filled_sections(self, doc: Document, template_match: TemplateMatch):
	        """Write every extracted section (title + content) into the document.

	        When ``template_match.extracted_data`` is empty, a single notice
	        paragraph is written instead.
	        """
	        if not template_match.extracted_data:
	            logging.warning("❌ Aucune section à remplir trouvée")
	            doc.add_paragraph("Aucune section n'a pu être remplie automatiquement.")
	            return

	        logging.info(f"📝 Génération de {len(template_match.extracted_data)} sections remplies")

	        # Heading for the filled sections. Formatting is set on the run that
	        # holds the text; a second add_run() would only format an empty run.
	        sections_title = doc.add_paragraph()
	        title_run = sections_title.add_run("CONTENU EXTRAIT ET STRUCTURÉ")
	        title_run.bold = True
	        title_run.font.size = Pt(14)

	        for section_name, content in template_match.extracted_data.items():
	            # A missing value would break len() below; treat it as empty text.
	            text = "" if content is None else str(content)

	            # Section title
	            section_title = doc.add_paragraph()
	            section_title.style = 'Section Title'
	            section_title.add_run(f"{section_name.upper()}")

	            # Section content
	            section_content = doc.add_paragraph()
	            section_content.style = 'Section Content'
	            section_content.add_run(text)

	            logging.info(f" ✅ Section ajoutée: {section_name} ({len(text)} caractères)")

	    def _add_missing_sections(self, doc: Document, template_match: TemplateMatch):
	        """List the sections that could not be filled, each with a manual placeholder."""
	        missing_sections = self._missing_section_names(template_match)
	        if not missing_sections:
	            return

	        logging.info(f"⚠️ {len(missing_sections)} sections manquantes identifiées")

	        orange = RGBColor(204, 102, 0)

	        # Heading for the missing sections
	        missing_title = doc.add_paragraph()
	        title_run = missing_title.add_run("SECTIONS NON REMPLIES")
	        title_run.bold = True
	        title_run.font.color.rgb = orange

	        missing_subtitle = doc.add_paragraph()
	        subtitle_run = missing_subtitle.add_run("(Informations non trouvées dans la transcription)")
	        subtitle_run.font.color.rgb = RGBColor(102, 102, 102)  # grey

	        for section in missing_sections:
	            missing_para = doc.add_paragraph()
	            bullet_run = missing_para.add_run(f"• {section}")
	            bullet_run.font.color.rgb = orange

	            # Placeholder to be completed by hand
	            placeholder = doc.add_paragraph()
	            placeholder.style = 'Section Content'
	            placeholder_run = placeholder.add_run("[À COMPLÉTER MANUELLEMENT]")
	            placeholder_run.font.color.rgb = RGBColor(153, 153, 153)  # light grey
	            placeholder_run.italic = True

	    def _add_original_transcription(self, doc: Document, transcription: str):
	        """Append the original transcription as an appendix on a new page."""
	        # Page break
	        doc.add_page_break()

	        # Appendix title
	        annexe_title = doc.add_paragraph()
	        title_run = annexe_title.add_run("ANNEXE - TRANSCRIPTION ORIGINALE")
	        title_run.bold = True
	        title_run.font.size = Pt(12)
	        title_run.font.color.rgb = RGBColor(102, 102, 102)

	        # Separator line
	        doc.add_paragraph("=" * 60)

	        # Original transcription, in smaller dark-grey text
	        transcription_para = doc.add_paragraph()
	        text_run = transcription_para.add_run(transcription)
	        text_run.font.size = Pt(9)
	        text_run.font.color.rgb = RGBColor(51, 51, 51)

	    def generate_filled_template(self, template_match: TemplateMatch, transcription: str, transcription_filename: str) -> str:
	        """Generate the filled template and save it as a .docx file.

	        Args:
	            template_match: The best-scoring template match.
	            transcription: The original transcription text.
	            transcription_filename: Name of the transcription file.

	        Returns:
	            str: Path of the generated file inside ``self.output_dir``.

	        Raises:
	            Exception: Any error raised while building or saving the
	                document is logged and re-raised unchanged.
	        """
	        logging.info("🚀 Début de la génération du template rempli")
	        logging.info(f"📋 Template sélectionné: {template_match.template_id}")
	        logging.info(f"📊 Score: {template_match.overall_score:.3f}")
	        logging.info(f"🔧 Remplissage: {template_match.filling_percentage:.1f}%")

	        try:
	            # Build the document top to bottom
	            doc = Document()
	            self._add_custom_styles(doc)
	            self._add_document_header(doc, template_match, transcription_filename)
	            self._add_filled_sections(doc, template_match)
	            self._add_missing_sections(doc, template_match)
	            self._add_original_transcription(doc, transcription)

	            # Output file name: path separators in the template id are
	            # replaced so the id cannot escape the output directory.
	            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
	            safe_template_id = template_match.template_id.replace('/', '_').replace('\\', '_')
	            output_filename = f"template_rempli_{safe_template_id}_{timestamp}.docx"
	            output_path = os.path.join(self.output_dir, output_filename)

	            # Save the document
	            doc.save(output_path)

	            logging.info("✅ Template rempli généré avec succès:")
	            logging.info(f" 📁 Fichier: {output_path}")
	            logging.info(f" 📏 Taille: {os.path.getsize(output_path)} bytes")
	            logging.info(f" 📋 Sections remplies: {len(template_match.extracted_data)}")
	            logging.info(f" ⚠️ Sections manquantes: {len(self._missing_section_names(template_match))}")

	            return output_path

	        except Exception as e:
	            logging.error(f"❌ Erreur lors de la génération du template: {e}")
	            raise

	    def display_generation_summary(self, template_match: TemplateMatch, output_path: str):
	        """Log a summary of the generation (template, scores, output file).

	        Args:
	            template_match: The match that was used to generate the file.
	            output_path: Path of the generated file; it is read with
	                ``os.path.getsize`` to report its size.
	        """
	        template_info = template_match.template_info
	        missing_count = len(self._missing_section_names(template_match))

	        logging.info("=" * 80)
	        logging.info("📊 RÉSUMÉ DE LA GÉNÉRATION")
	        logging.info("=" * 80)
	        logging.info(f"🎯 Template utilisé: {template_match.template_id}")
	        logging.info(f"📁 Template source: {os.path.basename(template_info.filepath)}")
	        logging.info(f"👨‍⚕️ Médecin: {template_info.medecin}")
	        logging.info(f"🏥 Centre: {getattr(template_info, 'centre_medical', 'Non spécifié')}")
	        logging.info(f"📝 Type: {template_info.type}")
	        logging.info(f"📊 Score de correspondance: {template_match.overall_score:.3f} ({template_match.confidence_level})")
	        logging.info(f"🔧 Pourcentage de remplissage: {template_match.filling_percentage:.1f}%")
	        logging.info(f"📋 Sections remplies: {len(template_match.extracted_data)}")
	        logging.info(f"⚠️ Sections manquantes: {missing_count}")
	        logging.info(f"💾 Fichier généré: {os.path.basename(output_path)}")
	        logging.info(f"📏 Taille du fichier: {os.path.getsize(output_path)} bytes")
	        logging.info("=" * 80)


	def main():
	    """Run the demo pipeline: match a sample transcription, then generate the report.

	    Steps: (1) match the built-in sample transcription against the template
	    database with ``TemplateMatcher``, (2) generate the filled document from
	    the first match with ``TemplateGenerator``, (3) log a summary.
	    Returns None; a missing database or an empty match list is logged and
	    ends the run early, and any exception is logged with its traceback.
	    """

	    # Logging configuration
	    logging.basicConfig(
	        level=logging.INFO,
	        format='%(asctime)s - %(levelname)s - %(message)s'
	    )

	    # Database path
	    db_path = DB_PATH

	    # Sample transcription
	    transcription_filename = "default.73.931915433.rtf_3650535_radiologie.doc"
	    transcription_content = """ la Technique :** 3 plans T2, diffusion axiale, T2 grand champ et T1 Dixon.
	Résultats
	L'utérus est antéversé, antéfléchi, latéralisé à droite, de taille normale pour l'âge.
	L'endomètre est fin, mesurant moins de 2 mm.
	Pas d'adénomyose franche.
	Aspect normal du col utérin et du vagin.
	L'ovaire droit, en position postérieure, mesure 18 x 11 mm avec présence de 4 follicules.
	L'ovaire gauche, en position latéro-utérine, présente un volumineux endométriome de 45 mm, typique en hypersignal T1 Dixon.
	Deuxième endométriome accolé à l'ovaire droit, périphérique, mesurant 13 mm.
	Pas d'épaississement marqué du torus ni des ligaments utéro-sacrés.
	Pas d'autre localisation pelvienne.
	Pas d'épanchement pelvien.
	Pas d'anomalie de la vessie.
	Pas d'adénomégalie pelvienne, pas de dilatation des uretères.
	en Conclusion
	Endométriome ovarien droit périphérique de 13 mm.
	Endométriome ovarien gauche centro-ovarien de 45 mm."""

	    if not os.path.exists(db_path):
	        logging.error(f"❌ Base de données non trouvée: {db_path}")
	        return

	    try:
	        logging.info("🚀 DÉMARRAGE DU PROCESSUS COMPLET")
	        logging.info("=" * 80)

	        # STEP 1: template matching
	        logging.info("📍 ÉTAPE 1: MATCHING DES TEMPLATES")
	        matcher = TemplateMatcher(db_path)
	        matches = matcher.match_templates(transcription_content, transcription_filename, k=3)

	        if not matches:
	            logging.error("❌ Aucun template trouvé")
	            return

	        # The first match is taken as the best one
	        best_match = matches[0]
	        logging.info(f"✅ Meilleur template sélectionné: {best_match.template_id}")

	        # STEP 2: document generation
	        logging.info("📍 ÉTAPE 2: GÉNÉRATION DU TEMPLATE REMPLI")
	        generator = TemplateGenerator()
	        output_path = generator.generate_filled_template(
	            best_match,
	            transcription_content,
	            transcription_filename
	        )

	        # STEP 3: summary
	        logging.info("📍 ÉTAPE 3: RÉSUMÉ FINAL")
	        generator.display_generation_summary(best_match, output_path)

	        logging.info("🎉 PROCESSUS TERMINÉ AVEC SUCCÈS")

	    except Exception as e:
	        # logging.exception keeps the traceback, which a plain error()
	        # call with only str(e) would discard.
	        logging.exception(f"❌ Erreur dans le processus principal: {e}")

	# Run the demo pipeline only when the module is executed as a script
	if __name__ == "__main__":
	    main()