pipeline2 / document_assembler.py
Nourhenem's picture
initial commit
f92da22 verified
raw
history blame
5.91 kB
#!/usr/bin/env python3
"""
Document Assembler
Handles creating medical documents by inserting sections into Word templates
"""
import os
import re
from datetime import datetime
from typing import Dict, Any, List
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Pt
from langchain.tools import tool
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.agents import AgentExecutor, create_openai_tools_agent
# Keywords (lowercase, accent-folded) that mark a line of the generated text
# as a section heading.
_SECTION_KEYWORDS = ('technique', 'resultat', 'conclusion', 'indication')

# Folds the accented letters that occur in the headings and turns
# non-breaking spaces into plain spaces so both sides of a comparison agree.
_FOLD_TABLE = str.maketrans({'é': 'e', 'è': 'e', 'à': 'a', '\xa0': ' '})


def _fold_text(text):
    """Lowercase *text*, fold é/è/à and replace non-breaking spaces."""
    return text.lower().translate(_FOLD_TABLE)


def _normalize_heading(text):
    """Return *text* folded, with colons and surrounding whitespace removed."""
    return _fold_text(text).replace(':', '').strip()


def _parse_sections(sections_text):
    """Split the generated text into a ``{heading line: content}`` dict.

    A non-empty line containing one of ``_SECTION_KEYWORDS`` starts a new
    section; the following non-empty lines (stripped, joined with newlines)
    form its content. Lines before the first heading are ignored.
    """
    sections = {}
    heading = None
    body_lines = []
    for raw_line in sections_text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        if any(keyword in _fold_text(line) for keyword in _SECTION_KEYWORDS):
            if heading:
                sections[heading] = '\n'.join(body_lines).strip()
            heading = line
            body_lines = []
        elif heading:
            body_lines.append(line)
    # Existing behaviour kept: a trailing heading with no content lines is
    # not recorded, whereas an empty section in the middle is stored as ''.
    if heading and body_lines:
        sections[heading] = '\n'.join(body_lines).strip()
    return sections


# create_medical_document
#   template_path: path of the .docx template; FileNotFoundError if missing.
#   sections_text: generated text; heading lines are detected by keyword
#                  (see _parse_sections) and matched against template
#                  paragraphs by normalised substring.
#   title:         text written into the template's "Titre" paragraph, or
#                  into a new centred header (with date/time) when the
#                  template has no such paragraph.
#   output_path:   where the assembled document is saved.
#   returns:       a confirmation message containing output_path.
# NOTE(review): langchain's @tool is documented to use the docstring as the
# tool description, so the docstring itself is left unchanged - confirm
# before expanding it.
@tool
def create_medical_document(template_path: str, sections_text: str, title: str, output_path: str) -> str:
    """Create a medical document by inserting sections into a Word template."""
    if not os.path.exists(template_path):
        raise FileNotFoundError(f"Template file not found: {template_path}")
    doc = Document(template_path)

    # Parse sections from text
    sections = _parse_sections(sections_text)

    # First, check if there's a "Titre" paragraph in the template and put the
    # title there.
    title_section_found = False
    for idx, paragraph in enumerate(doc.paragraphs):
        para_text = paragraph.text.strip()
        if 'titre' not in _normalize_heading(para_text):
            continue
        print(
            f"🎯 Found title section in template: '{para_text}' at index {idx}")
        # Assigning .text replaces the whole paragraph content with one run,
        # so no explicit clear() is needed beforehand.
        paragraph.text = title
        # Apply formatting to make it stand out
        for run in paragraph.runs:
            run.font.bold = True
            run.font.size = Pt(14)
        title_section_found = True
        break

    # If no title paragraph was found, add a header with the dynamic title.
    if not title_section_found:
        print("📝 No title section found in template, adding header...")
        existing = doc.paragraphs
        # A template without any body paragraph has nothing to insert before.
        if existing:
            header_para = existing[0].insert_paragraph_before()
        else:
            header_para = doc.add_paragraph()
        # Take one timestamp so the date and the time cannot disagree.
        now = datetime.now()
        header_para.text = (
            f"{title}\nDate: {now:%d/%m/%Y}\nHeure: {now:%H:%M}\n{'=' * 40}"
        )
        for run in header_para.runs:
            run.font.bold = True
            run.font.size = Pt(14)
        header_para.alignment = WD_ALIGN_PARAGRAPH.CENTER

    # Snapshot the paragraphs once: the Paragraph objects stay valid when new
    # paragraphs are inserted later, whereas positional indices would shift.
    paragraphs = doc.paragraphs

    # Locate section titles in the template (the last matching paragraph
    # wins, as before).
    normalized_names = {name: _normalize_heading(name) for name in sections}
    section_indices = {}
    for idx, paragraph in enumerate(paragraphs):
        para_norm = _normalize_heading(paragraph.text)
        if not para_norm:
            continue
        for name, name_norm in normalized_names.items():
            if name_norm and name_norm in para_norm:
                section_indices[name] = idx
    print("DEBUG section_indices:", section_indices)
    print("DEBUG sections.keys():", list(sections.keys()))

    # For each located section, blank the template text between its heading
    # and the next located heading, then write the generated content.
    sorted_sections = sorted(section_indices.items(), key=lambda item: item[1])
    for pos, (section_name, idx) in enumerate(sorted_sections):
        if pos + 1 < len(sorted_sections):
            end = sorted_sections[pos + 1][1]
        else:
            end = len(paragraphs)
        body = paragraphs[idx + 1:end]
        for body_para in body:
            if body_para.text.strip():
                body_para.clear()

        content = sections[section_name]
        if not content:
            continue
        if body:
            # Reuse the first paragraph under the heading.
            body[0].text = content
        elif idx + 1 < len(paragraphs):
            # Nothing lies between this heading and the following paragraph
            # (normally the next heading): insert a fresh paragraph instead
            # of overwriting that paragraph.
            inserted = paragraphs[idx + 1].insert_paragraph_before()
            inserted.text = content
        else:
            # The heading is the last paragraph of the document.
            appended = doc.add_paragraph()
            appended.text = content

    doc.save(output_path)
    return f"Document created successfully: {output_path}"
def create_document_assembler_agent(llm):
    """Build the executor that drives the document-assembly tool.

    Args:
        llm: language model handed to ``create_openai_tools_agent``.

    Returns:
        An ``AgentExecutor`` (``verbose=True``) whose only tool is
        ``create_medical_document``.
    """
    # Same text as a single multi-line literal, spelled as joined pieces.
    system_message = (
        "You are a medical document assembler.\n"
        "Create medical documents by inserting sections into Word templates.\n"
        "Use the provided title for the document header and insert sections in the correct locations."
    )
    human_message = (
        "Create a medical document with template {template_path}, "
        "sections content: {sections_text}, title: {title}, "
        "and save to {output_path}"
    )
    prompt = ChatPromptTemplate.from_messages([
        ("system", system_message),
        ("human", human_message),
        MessagesPlaceholder("agent_scratchpad"),
    ])

    # The agent and its executor are given the same single tool.
    assembler_tools = [create_medical_document]
    agent = create_openai_tools_agent(
        llm=llm,
        tools=assembler_tools,
        prompt=prompt,
    )
    return AgentExecutor(agent=agent, tools=assembler_tools, verbose=True)