Spaces:

Nourhenem
/

pipeline2

Sleeping

App Files Files Community

pipeline2 / document_assembler.py

Nourhenem

initial commit

f92da22 verified about 1 month ago

raw

history blame

5.91 kB

	#!/usr/bin/env python3
	"""
	Document Assembler
	Handles creating medical documents by inserting sections into Word templates
	"""

	import os
	import re
	from datetime import datetime
	from typing import Dict, Any, List
	from docx import Document
	from docx.enum.text import WD_ALIGN_PARAGRAPH
	from docx.shared import Pt
	from langchain.tools import tool
	from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
	from langchain.agents import AgentExecutor, create_openai_tools_agent


	# Substrings (already accent-folded) that mark a line of the generated text
	# as a section heading.
	_SECTION_KEYWORDS = ('technique', 'resultat', 'conclusion', 'indication')


	def _normalize_label(text: str) -> str:
	    """Fold case, a few French accents, colons and non-breaking spaces for loose matching."""
	    folded = text.lower()
	    for old, new in (('é', 'e'), ('è', 'e'), ('à', 'a'), (':', ''), ('\xa0', ' ')):
	        folded = folded.replace(old, new)
	    return folded.strip()


	def _parse_sections(sections_text: str) -> Dict[str, str]:
	    """Split generated text into ``{heading line: body text}`` using the section keywords."""
	    sections: Dict[str, str] = {}
	    heading = None
	    body_lines: List[str] = []
	    for raw_line in sections_text.split('\n'):
	        line = raw_line.strip()
	        if not line:
	            continue
	        # NOTE(review): any line that merely contains a keyword is treated as
	        # a heading, so a body sentence mentioning e.g. "technique" starts a
	        # new section. Kept as in the original heuristic.
	        if any(keyword in _normalize_label(line) for keyword in _SECTION_KEYWORDS):
	            if heading:
	                sections[heading] = '\n'.join(body_lines).strip()
	            heading = line
	            body_lines = []
	        elif heading:
	            body_lines.append(line)
	    # A trailing heading with no body is intentionally not recorded, so the
	    # template text under it is left untouched.
	    if heading and body_lines:
	        sections[heading] = '\n'.join(body_lines).strip()
	    return sections


	# create_medical_document
	#   template_path: path of the .docx template to load.
	#   sections_text: generated text; heading lines are detected by keyword and
	#                  the lines that follow each heading form its body.
	#   title:         text written into the template's "Titre" paragraph, or
	#                  into a new centered header when no such paragraph exists.
	#   output_path:   where the assembled document is saved.
	#   Returns a confirmation message containing ``output_path``.
	#   Raises FileNotFoundError when ``template_path`` does not exist.
	# The docstring below is left unchanged on purpose: the ``@tool`` decorator
	# exposes it as the tool description.
	@tool
	def create_medical_document(template_path: str, sections_text: str, title: str, output_path: str) -> str:
	    """Create a medical document by inserting sections into a Word template."""
	    if not os.path.exists(template_path):
	        raise FileNotFoundError(f"Template file not found: {template_path}")

	    doc = Document(template_path)
	    sections = _parse_sections(sections_text)

	    # First, look for a "Titre" paragraph in the template and put the title there.
	    title_index = None
	    for idx, paragraph in enumerate(doc.paragraphs):
	        para_text = paragraph.text.strip()
	        if 'titre' in _normalize_label(para_text):
	            print(
	                f"🎯 Found title section in template: '{para_text}' at index {idx}")
	            # Assigning .text replaces the whole paragraph content with one run.
	            paragraph.text = title
	            for run in paragraph.runs:
	                run.font.bold = True
	                run.font.size = Pt(14)
	            title_index = idx
	            break

	    # Otherwise prepend a header carrying the title and the current date/time.
	    if title_index is None:
	        print("📝 No title section found in template, adding header...")
	        existing = doc.paragraphs
	        # An empty template has no first paragraph to insert before.
	        header_para = existing[0].insert_paragraph_before() if existing else doc.add_paragraph()
	        now = datetime.now()  # single reading so date and time cannot disagree
	        header_para.text = f"{title}\nDate: {now.strftime('%d/%m/%Y')}\nHeure: {now.strftime('%H:%M')}\n{'='*40}"
	        for run in header_para.runs:
	            run.font.bold = True
	            run.font.size = Pt(14)
	        header_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
	        title_index = 0

	    # Snapshot the paragraphs once: the proxies stay valid when new paragraphs
	    # are inserted later, so indices computed here never go stale.
	    paragraphs = list(doc.paragraphs)

	    # Locate section titles in the template (the last matching paragraph wins).
	    normalized_names = {name: _normalize_label(name) for name in sections}
	    section_indices = {}
	    for idx, paragraph in enumerate(paragraphs):
	        if idx == title_index:
	            # The paragraph holding the document title is never a section heading,
	            # even if the title happens to contain a section name.
	            continue
	        para_norm = _normalize_label(paragraph.text)
	        if not para_norm:
	            continue
	        for name, name_norm in normalized_names.items():
	            if name_norm and name_norm in para_norm:
	                section_indices[name] = idx
	    print("DEBUG section_indices:", section_indices)
	    print("DEBUG sections.keys():", list(sections.keys()))

	    # For each located section: blank the template text between its heading and
	    # the next located heading (or end of document), then write the new content.
	    ordered = sorted(section_indices.items(), key=lambda item: item[1])
	    for pos, (name, heading_idx) in enumerate(ordered):
	        end = ordered[pos + 1][1] if pos + 1 < len(ordered) else len(paragraphs)
	        body = paragraphs[heading_idx + 1:end]
	        for stale in body:
	            if stale.text.strip():
	                stale.clear()

	        content = sections[name]
	        if not content:
	            continue
	        if body:
	            # Reuse the first paragraph under the heading.
	            body[0].text = content
	        elif heading_idx + 1 < len(paragraphs):
	            # No body paragraph exists (the next heading follows immediately):
	            # insert a new one instead of overwriting that heading.
	            paragraphs[heading_idx + 1].insert_paragraph_before(content)
	        else:
	            doc.add_paragraph(content)

	    doc.save(output_path)
	    return f"Document created successfully: {output_path}"


	def create_document_assembler_agent(llm):
	    """Build the agent executor that drives ``create_medical_document``.

	    ``llm`` is passed to ``create_openai_tools_agent``; the returned
	    ``AgentExecutor`` runs verbosely with ``create_medical_document`` as
	    its only tool.
	    """
	    # Prompt texts are kept verbatim; the human message declares the four
	    # template variables the caller must supply.
	    system_message = """You are a medical document assembler.
	Create medical documents by inserting sections into Word templates.
	Use the provided title for the document header and insert sections in the correct locations."""
	    human_message = "Create a medical document with template {template_path}, sections content: {sections_text}, title: {title}, and save to {output_path}"

	    prompt = ChatPromptTemplate.from_messages([
	        ("system", system_message),
	        ("human", human_message),
	        MessagesPlaceholder("agent_scratchpad"),
	    ])

	    # The same single-tool list is given to both the agent and its executor.
	    tools = [create_medical_document]
	    agent = create_openai_tools_agent(llm=llm, tools=tools, prompt=prompt)
	    return AgentExecutor(agent=agent, tools=tools, verbose=True)