pipeline2 / template_analyzer.py
Nourhenem's picture
initial commit
f92da22 verified
#!/usr/bin/env python3
"""
Template Analyzer Agent
Analyzes Word document templates to extract structure and sections
"""
import os
import re
from typing import Dict, Any
from docx import Document
from langchain.tools import tool
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.agents import AgentExecutor, create_openai_tools_agent
# French keywords that mark a paragraph as a section heading. Compiled once at
# import time rather than re-specified for every paragraph in the loop.
# Accented/unaccented spellings come in pairs (résultat/resultat,
# échographie/echographie); the original listed "matériel" twice, so the
# duplicate is replaced by the unaccented "materiel" to complete that pair.
_SECTION_KEYWORD_RE = re.compile(
    r'\b(examen|observation|conclusion|résultat|resultat|diagnostic|rapport'
    r'|échographie|echographie|analyse|commentaire|recommandation|technique'
    r'|matériel|materiel|méthode|indication)\b',
    re.IGNORECASE,
)


@tool
def analyze_word_template(template_path: str) -> Dict[str, Any]:
    """Analyze a Word document template to extract structure and sections."""
    # NOTE: the docstring above is deliberately left unchanged. The `@tool`
    # decorator publishes it as the tool description seen by the LLM, so
    # editing it would change the agent's prompt. Details live in comments.
    #
    # Parameters:
    #   template_path: filesystem path of the .docx template to inspect.
    # Returns a dict with three keys:
    #   'sections'      - list of {'text', 'index', 'style'} for each non-empty
    #                     paragraph containing a section keyword.
    #   'formatting'    - {paragraph index: {'bold', 'italic', 'font_name',
    #                     'font_size', 'alignment'}} taken from the FIRST run
    #                     of each non-empty paragraph that has runs.
    #   'document_info' - {'title', 'author', 'subject'} from the core
    #                     properties, or {} when the document has no title.
    # Raises:
    #   FileNotFoundError: if nothing exists at template_path.
    if not os.path.exists(template_path):
        raise FileNotFoundError(f"Template file not found: {template_path}")

    doc = Document(template_path)
    analysis: Dict[str, Any] = {
        'sections': [],
        'formatting': {},
        'document_info': {},
    }

    for i, paragraph in enumerate(doc.paragraphs):
        text = paragraph.text.strip()
        if not text:
            # Blank / whitespace-only paragraphs carry no structure to report.
            continue

        # Section detection: any keyword appearing as a whole word.
        if _SECTION_KEYWORD_RE.search(text):
            analysis['sections'].append({
                'text': text,
                'index': i,
                'style': paragraph.style.name if paragraph.style else 'Normal',
            })

        # Formatting snapshot: only the first run is sampled, so mixed
        # formatting later in the paragraph is not reflected.
        runs = paragraph.runs
        if runs:
            first_run = runs[0]
            font = first_run.font
            analysis['formatting'][i] = {
                'bold': first_run.bold,
                'italic': first_run.italic,
                'font_name': font.name,
                'font_size': font.size.pt if font.size else None,
                'alignment': paragraph.alignment,
            }

    # Document properties are reported only when a title is set.
    # NOTE(review): author/subject are dropped for untitled documents; kept
    # as-is because callers may rely on an empty dict meaning "no metadata".
    props = doc.core_properties
    if props.title:
        analysis['document_info'] = {
            'title': props.title,
            'author': props.author,
            'subject': props.subject,
        }

    return analysis
def create_template_analyzer_agent(llm):
    """Build and return an AgentExecutor that analyzes Word templates.

    The agent is an OpenAI-tools agent driven by ``llm`` whose only tool is
    ``analyze_word_template``. Its prompt expects a ``template_path`` input
    variable and an ``agent_scratchpad`` placeholder. The executor runs with
    ``verbose=True``.
    """
    # One tool list shared by the agent (for tool binding) and the executor
    # (for tool dispatch).
    analyzer_tools = [analyze_word_template]

    system_message = (
        "You are a medical document template analyzer.\n"
        "Analyze the provided Word template and extract its structure, sections, and formatting.\n"
        "Provide a detailed analysis that can be used by other agents."
    )
    human_message = (
        "Analyze the template at {template_path} and provide a comprehensive analysis."
    )

    prompt = ChatPromptTemplate.from_messages([
        ("system", system_message),
        ("human", human_message),
        MessagesPlaceholder("agent_scratchpad"),
    ])

    agent = create_openai_tools_agent(
        llm=llm,
        tools=analyzer_tools,
        prompt=prompt,
    )

    return AgentExecutor(
        agent=agent,
        tools=analyzer_tools,
        verbose=True,
    )