|
|
|
|
|
""" |
|
|
Improved Template Analyzer - Enhanced section detection |
|
|
Fixes issues with section detection and provides better analysis |
|
|
""" |
|
|
|
|
|
import json
import os
import re
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple

from docx import Document
from dotenv import load_dotenv
from langchain.agents import AgentExecutor, create_openai_tools_agent
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.tools import tool
from langchain_openai import ChatOpenAI
|
|
|
|
|
|
|
|
# Runs at import time: python-dotenv loads key/value pairs from a .env file
# into the process environment, so settings such as OPENAI_API_KEY (read via
# os.getenv in test_with_agent) can be supplied without exporting them.
load_dotenv()
|
|
|
|
|
|
|
|
@tool
def analyze_word_template_tool(template_path: str) -> Dict[str, Any]:
    """Analyze a Word document template to extract structure and sections."""
    # NOTE: the docstring above is intentionally left byte-identical; the
    # ``@tool`` decorator uses it as the tool description shown to the LLM.
    #
    # Parameters:
    #   template_path -- filesystem path of the .docx template to inspect.
    # Returns a dict with three keys:
    #   'sections'      -- list of {'text', 'index', 'style'} dicts, one per
    #                      non-empty paragraph containing a section keyword.
    #   'formatting'    -- paragraph index -> formatting of the paragraph's
    #                      first run (bold, italic, font name/size, alignment).
    #   'document_info' -- title / author / subject from the core properties.
    # Raises:
    #   FileNotFoundError -- when template_path does not exist.
    if not os.path.exists(template_path):
        raise FileNotFoundError(f"Template file not found: {template_path}")

    doc = Document(template_path)

    section_patterns = [
        r'\b(clinique|examen|observation)\b',
        r'\b(technique|matériel|méthode|procédure)\b',
        r'\b(résultat|resultat|resultats|résultats)\b',
        r'\b(conclusion|diagnostic|impression)\b',
        r'\b(échographie|echographie|imagerie)\b',
        r'\b(recommandation|traitement|suivi)\b',
        r'\b(analyse|commentaire|discussion)\b',
        r'\b(antécédents|histoire|anamnèse)\b',
        r'\b(indication|objectif)\b',
        r'\b(biologie|laboratoire)\b',
    ]
    # Compile the alternation once instead of handing the raw pattern string
    # to re.search() for every paragraph.
    section_regex = re.compile('|'.join(section_patterns), re.IGNORECASE)

    sections: List[Dict[str, Any]] = []
    formatting: Dict[int, Dict[str, Any]] = {}

    for i, paragraph in enumerate(doc.paragraphs):
        text = paragraph.text.strip()
        if not text:
            continue

        if section_regex.search(text):
            sections.append({
                'text': text,
                'index': i,
                'style': paragraph.style.name if paragraph.style else 'Normal',
            })

        # NOTE(review): indentation was lost in the source this was restored
        # from; formatting capture is assumed to apply to non-empty
        # paragraphs only -- confirm.
        if paragraph.runs:
            run = paragraph.runs[0]
            alignment = paragraph.alignment
            formatting[i] = {
                'bold': run.bold,
                'italic': run.italic,
                'font_name': run.font.name,
                'font_size': run.font.size.pt if run.font.size else None,
                # Stringify the python-docx enum so the result is plain data
                # (matches ImprovedTemplateAnalyzer). Compare against None
                # explicitly: the LEFT member has integer value 0 and would
                # be dropped by a truthiness test.
                'alignment': str(alignment) if alignment is not None else None,
            }

    analysis: Dict[str, Any] = {
        'sections': sections,
        'formatting': formatting,
        'document_info': {},
    }

    core = doc.core_properties
    if core:
        analysis['document_info'] = {
            'title': core.title or 'Word Document',
            'author': core.author or '',
            'subject': core.subject or '',
        }

    return analysis
|
|
|
|
|
|
|
|
class ImprovedTemplateAnalyzer:
    """Enhanced template analyzer with better section detection.

    Reads a Word (.docx) template with python-docx, classifies paragraphs
    into section types by keyword, records first-run formatting, and can
    print or save the result. It can also wrap the module-level
    ``analyze_word_template_tool`` in a LangChain tools agent.
    """

    def __init__(self):
        """Initialize the template analyzer."""
        print("🔍 Improved Template Analyzer initialized")

        # Section type -> regex searched in the lowercased paragraph text.
        # Insertion order matters: _detect_section_type returns the first
        # type whose pattern matches.
        self.section_patterns = {
            'clinique': r'\b(clinique|examen|observation|examen_clinique)\b',
            'technique': r'\b(technique|matériel|méthode|procédure|protocole)\b',
            'resultats': r'\b(résultat|resultat|resultats|résultats|findings)\b',
            'conclusion': r'\b(conclusion|diagnostic|impression|synthèse)\b',
            'imagerie': r'\b(échographie|echographie|imagerie|radiologie)\b',
            'recommandations': r'\b(recommandation|traitement|suivi|conduite)\b',
            'analyse': r'\b(analyse|commentaire|discussion|interprétation)\b',
            'antecedents': r'\b(antécédents|histoire|anamnèse|contexte)\b',
            'indication': r'\b(indication|objectif|but|demande)\b',
            'biologie': r'\b(biologie|laboratoire|bilan|analyses)\b',
        }

    def analyze_word_template(self, template_path: str) -> Dict[str, Any]:
        """Analyze a Word document template to extract structure and sections.

        Args:
            template_path: Path of the .docx file to analyze.

        Returns:
            A dict with the keys ``sections`` (classified paragraphs),
            ``formatting`` (paragraph index -> first-run formatting),
            ``document_info`` (core properties), ``all_text`` (every
            non-empty paragraph), ``structure`` (summary built by
            ``_extract_structure``) and ``detected_section_types``.

        Raises:
            FileNotFoundError: If ``template_path`` does not exist.
        """
        if not os.path.exists(template_path):
            raise FileNotFoundError(f"Template file not found: {template_path}")

        print(f"📄 Analyzing template: {template_path}")

        doc = Document(template_path)
        analysis: Dict[str, Any] = {
            'sections': [],
            'formatting': {},
            'document_info': {},
            'all_text': [],
            'structure': {},
            'detected_section_types': [],
        }

        for i, paragraph in enumerate(doc.paragraphs):
            text = paragraph.text.strip()
            if not text:
                continue

            analysis['all_text'].append({
                'index': i,
                'text': text,
                'length': len(text),
            })

            section_type = self._detect_section_type(text)
            if section_type:
                analysis['sections'].append({
                    'text': text,
                    'index': i,
                    'style': paragraph.style.name if paragraph.style else 'Normal',
                    'section_type': section_type,
                    'is_header': self._is_likely_header(text),
                })
                if section_type not in analysis['detected_section_types']:
                    analysis['detected_section_types'].append(section_type)

            # NOTE(review): indentation was lost in the source this was
            # restored from; formatting capture is assumed to apply to
            # non-empty paragraphs only -- confirm.
            if paragraph.runs:
                analysis['formatting'][i] = self._describe_first_run(paragraph)

        core = doc.core_properties
        if core:
            analysis['document_info'] = {
                'title': core.title or 'Word Document',
                'author': core.author or '',
                'subject': core.subject or '',
                'created': core.created.isoformat() if core.created else None,
                'modified': core.modified.isoformat() if core.modified else None,
            }

        analysis['structure'] = self._extract_structure(analysis['sections'])
        return analysis

    @staticmethod
    def _alignment_to_text(alignment: Any) -> Optional[str]:
        """Return ``str(alignment)``, or ``None`` when no alignment is set.

        The check is ``is not None`` rather than truthiness: python-docx
        alignment members are integer-valued and LEFT is 0, so a truthiness
        test silently reported left-aligned paragraphs as "no alignment".
        """
        return None if alignment is None else str(alignment)

    @staticmethod
    def _describe_first_run(paragraph: Any) -> Dict[str, Any]:
        """Collect formatting of ``paragraph``'s first run plus its alignment.

        The caller must ensure ``paragraph.runs`` is non-empty.
        """
        run = paragraph.runs[0]
        size = run.font.size
        return {
            'bold': run.bold,
            'italic': run.italic,
            'font_name': run.font.name,
            'font_size': size.pt if size else None,
            'alignment': ImprovedTemplateAnalyzer._alignment_to_text(paragraph.alignment),
        }

    def _detect_section_type(self, text: str) -> Optional[str]:
        """Detect the type of section based on improved pattern matching.

        Args:
            text: Paragraph text (any case).

        Returns:
            The first key of ``self.section_patterns`` whose regex matches the
            lowercased text, or ``None`` when nothing matches.
        """
        text_lower = text.lower()

        for section_type, pattern in self.section_patterns.items():
            if re.search(pattern, text_lower):
                return section_type

        # Fallback for short "LABEL:" lines. With the default patterns every
        # label below is already caught by the loop above, so this only
        # matters when ``section_patterns`` has been customised.
        if ':' in text and len(text.split()) <= 3:
            label = text.split(':')[0].strip().lower()
            if label in ('clinique', 'technique', 'resultats', 'résultats', 'conclusion'):
                return 'resultats' if label == 'résultats' else label

        return None

    def _is_likely_header(self, text: str) -> bool:
        """Determine if text is likely a section header.

        A header must be short (under 100 characters) AND show at least one
        header signal: a trailing colon, all-uppercase text, or at most three
        words. Previously the length test was OR-ed with the other signals,
        so every short paragraph was reported as a header.
        """
        if len(text) >= 100:
            return False
        return text.endswith(':') or text.isupper() or len(text.split()) <= 3

    def _extract_structure(self, sections: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Extract the document structure from sections.

        Args:
            sections: Section dicts as produced by ``analyze_word_template``
                (``text`` and ``index`` required, ``section_type`` optional).

        Returns:
            A dict with ``detected_sections`` (text/type/index per section),
            ``section_types`` (distinct types in first-seen order) and
            ``total_sections``.
        """
        detected = [
            {
                'text': section['text'],
                'type': section.get('section_type', 'unknown'),
                'index': section['index'],
            }
            for section in sections
        ]

        # Distinct types, preserving first-seen order.
        section_types: List[str] = []
        for entry in detected:
            if entry['type'] not in section_types:
                section_types.append(entry['type'])

        return {
            'detected_sections': detected,
            'section_types': section_types,
            'total_sections': len(sections),
        }

    def save_analysis(self, analysis: Dict[str, Any], output_path: Optional[str] = None) -> Optional[str]:
        """Save analysis results to JSON file.

        Args:
            analysis: JSON-serialisable analysis dict.
            output_path: Destination file; when omitted, a timestamped
                ``improved_template_analysis_<YYYYmmdd_HHMMSS>.json`` in the
                current directory is used.

        Returns:
            The path written, or ``None`` if saving failed (the error is
            printed, not raised).
        """
        if not output_path:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_path = f"improved_template_analysis_{timestamp}.json"

        # Deliberately best-effort: callers rely on a None return, not on an
        # exception, to learn that saving failed.
        try:
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(analysis, f, ensure_ascii=False, indent=2)

            print(f"💾 Analysis saved to: {output_path}")
            return output_path
        except Exception as e:
            print(f"❌ Error saving analysis: {e}")
            return None

    def display_analysis_summary(self, analysis: Dict[str, Any]):
        """Display a summary of the template analysis.

        Args:
            analysis: Dict returned by ``analyze_word_template``.
        """
        print("\n📊 IMPROVED TEMPLATE ANALYSIS SUMMARY")
        print("=" * 60)

        # 'all_text' only holds non-empty paragraphs.
        print(f"Total paragraphs: {len(analysis['all_text'])}")
        print(f"Detected sections: {len(analysis['sections'])}")

        if analysis['detected_section_types']:
            print(f"Section types found: {', '.join(analysis['detected_section_types'])}")

        print(f"Document title: {analysis['document_info'].get('title', 'N/A')}")
        print(f"Document author: {analysis['document_info'].get('author', 'N/A')}")

        print("\n🔍 DETECTED SECTIONS:")
        for number, section in enumerate(analysis['structure']['detected_sections'], start=1):
            print(f"  {number}. [{section['type']}] {section['text']}")

        print("\n📄 ALL PARAGRAPHS:")
        for number, text_item in enumerate(analysis['all_text'], start=1):
            print(f"  {number}. {text_item['text']}")

    def test_with_sample_template(self, template_path: str):
        """Test the analyzer with a sample template.

        Runs the direct analysis, prints the summary and saves the result.

        Args:
            template_path: Path of the .docx file to analyze.

        Returns:
            The analysis dict, or ``None`` if any step raised (the traceback
            is printed).
        """
        print(f"🚀 Testing Improved Template Analyzer with: {template_path}")
        print("=" * 60)

        try:
            analysis = self.analyze_word_template(template_path)
            self.display_analysis_summary(analysis)
            output_file = self.save_analysis(analysis)

            print("\n✅ Improved analysis completed successfully!")
            # save_analysis reports its own failure; avoid announcing
            # "Results saved to: None".
            if output_file:
                print(f"📁 Results saved to: {output_file}")

            return analysis
        except Exception as e:
            print(f"❌ Error during analysis: {e}")
            import traceback
            traceback.print_exc()
            return None

    def create_template_analyzer_agent(self, llm):
        """Create the improved template analyzer agent.

        Args:
            llm: Chat model passed to ``create_openai_tools_agent``.

        Returns:
            An ``AgentExecutor`` whose only tool is
            ``analyze_word_template_tool``; it expects a ``template_path``
            input variable.
        """
        # NOTE(review): the original used a triple-quoted string whose
        # continuation-line indentation was lost; lines are reproduced here
        # without leading whitespace -- confirm.
        system_message = (
            "You are an enhanced medical document template analyzer.\n"
            "Analyze the provided Word template and extract its structure, sections, and formatting.\n"
            "Pay special attention to detecting ALL sections including: CLINIQUE, TECHNIQUE, RESULTATS, and CONCLUSION.\n"
            "Provide a detailed analysis that can be used by other agents."
        )
        template_analyzer_prompt = ChatPromptTemplate.from_messages([
            ("system", system_message),
            ("human",
             "Analyze the template at {template_path} and provide a comprehensive analysis. Make sure to detect all sections including RESULTATS."),
            MessagesPlaceholder("agent_scratchpad"),
        ])

        tools = [analyze_word_template_tool]
        template_analyzer_agent = create_openai_tools_agent(
            llm=llm,
            tools=tools,
            prompt=template_analyzer_prompt,
        )

        return AgentExecutor(
            agent=template_analyzer_agent,
            tools=tools,
            verbose=True,
        )

    def test_with_agent(self, template_path: str):
        """Test the template analyzer using the enhanced LangChain agent.

        Requires ``OPENAI_API_KEY`` in the environment. Prints the agent
        output and writes the raw result to a timestamped
        ``improved_agent_analysis_<YYYYmmdd_HHMMSS>.json`` file.

        Args:
            template_path: Path of the .docx file handed to the agent.

        Returns:
            The dict returned by ``AgentExecutor.invoke``, or ``None`` when
            the API key is missing or any step raised (the traceback is
            printed).
        """
        print(f"🤖 Testing Improved Template Analyzer AGENT with: {template_path}")
        print("=" * 60)

        try:
            api_key = os.getenv('OPENAI_API_KEY')
            if not api_key:
                print("❌ OpenAI API key not found in environment variables")
                return None

            llm = ChatOpenAI(
                model="gpt-4o-mini",
                temperature=0,
                api_key=api_key,
            )

            print("🔧 Creating improved template analyzer agent...")
            agent_executor = self.create_template_analyzer_agent(llm)

            print("🚀 Running enhanced agent analysis...")
            result = agent_executor.invoke({"template_path": template_path})

            print("✅ Enhanced agent analysis completed!")
            print("\n📋 AGENT OUTPUT:")
            print("=" * 50)
            print(result['output'])

            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            agent_output_file = f"improved_agent_analysis_{timestamp}.json"
            with open(agent_output_file, 'w', encoding='utf-8') as f:
                json.dump(result, f, ensure_ascii=False, indent=2)

            print(f"\n💾 Enhanced agent result saved to: {agent_output_file}")
            return result
        except Exception as e:
            print(f"❌ Error during enhanced agent analysis: {e}")
            import traceback
            traceback.print_exc()
            return None
|
|
|
|
|
|
|
|
def main():
    """Run the analyzer from the command line.

    If ``sample.docx`` exists in the current directory, the LangChain agent
    analysis is run on it. Otherwise the user is prompted for a template path
    and the direct (non-agent) analysis is run on that file.
    """
    print("🏥 Improved Template Analyzer - Enhanced Section Detection")
    print("=" * 60)

    analyzer = ImprovedTemplateAnalyzer()
    sample_path = "sample.docx"

    # A large block of disabled code (running both the direct and the agent
    # analysis) used to sit here inside a bare string literal; it was never
    # executed and has been removed.

    if os.path.exists(sample_path):
        print(f"📄 Found sample file: {sample_path}")
        print("🤖 Running enhanced **agent** analysis with GPT...")

        agent_result = analyzer.test_with_agent(sample_path)

        if agent_result:
            print("\n🎉 Enhanced agent analysis completed successfully!")
            # NOTE(review): indentation was lost in the source this was
            # restored from; the raw dump is assumed to be printed only on
            # success -- confirm.
            print("\n=== AGENT RAW OUTPUT ===\n", agent_result)
        return

    print("❌ sample.docx not found. Please provide the correct path.")
    try:
        template_path = input("Enter the path to your Word template file: ").strip()
    except EOFError:
        # Non-interactive run (stdin closed or exhausted): treat as "no path"
        # instead of crashing with a traceback.
        template_path = ""

    if template_path and os.path.exists(template_path):
        analyzer.test_with_sample_template(template_path)
    else:
        print("❌ Invalid file path provided")


if __name__ == "__main__":
    main()