#!/usr/bin/env python3
"""
LangChain Medical Agents Architecture - Refactored
A multi-agent system for processing medical transcriptions and documents.
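
Pipeline steps (as implemented in MedicalDocumentOrchestrator.run_full_pipeline):
  0. Download the matching template model from SFTP
  1. Analyze the Word template structure
  2. Load and correct the raw transcription
  3. Extract structured medical data from the corrected transcription
  4. Generate a document title
  5. Generate content for each template section
  6. Assemble the final .docx document
  7. Validate the result against the template and transcription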
"""

import os
import re
from typing import Optional
from dotenv import load_dotenv
from docx import Document
from langchain_openai import AzureChatOpenAI
from langchain.agents import AgentExecutor, create_openai_tools_agent
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder

# Import modular components
from models import TemplateAnalysis, MedicalTranscription, SectionContent, InsertSectionsInput
from sftp_agent import create_sftp_downloader_agent, download_model_from_sftp
from template_analyzer import create_template_analyzer_agent, analyze_word_template
from transcription_processor import (
    create_transcription_corrector_chain,
    create_medical_analyzer_chain,
    create_title_generator_chain,
    load_transcription_with_user_id
)
from section_generator import create_dynamic_section_prompt, fix_section_names
from document_assembler import create_document_assembler_agent
from document_validator import validate_generated_document, create_validation_chain

# Load environment variables
load_dotenv()

# Initialize LLM with Azure OpenAI.
# Credentials are read from the environment (loaded from .env above) instead of
# being hardcoded; the variable names here are assumptions, adjust to your setup.
llm = AzureChatOpenAI(
    azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT", "gtp-4o-eastus2"),
    openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION", "2024-02-15-preview"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT", "https://voxist-gpt-eastus2.openai.azure.com/"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    temperature=0.1
)
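
# A minimal .env sketch for the variables read above (variable names are
# assumptions; the real key must come from your deployment):
#   AZURE_OPENAI_API_KEY=<your-key>
#   AZURE_OPENAI_ENDPOINT=https://voxist-gpt-eastus2.openai.azure.com/
#   AZURE_OPENAI_DEPLOYMENT=gtp-4o-eastus2
#   AZURE_OPENAI_API_VERSION=2024-02-15-preview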


class MedicalDocumentOrchestrator:
    """Main orchestrator that coordinates all agents."""

    def __init__(self, template_path: Optional[str] = None,
                 transcription_path: Optional[str] = None,
                 transcriptions_dir: str = "transcriptions"):
        self.template_path = template_path
        self.transcription_path = transcription_path
        self.transcriptions_dir = transcriptions_dir
        self.template_analysis = None
        self.corrected_transcription = None
        self.medical_data = None
        self.generated_sections = None
        self.generated_title = None
        self.downloaded_models = None

    def run_full_pipeline(self, output_path: Optional[str] = None) -> str:
        """Run the complete medical document processing pipeline."""
        print("πŸš€ Starting LangChain Medical Document Pipeline...")

        # Step 0: Download only the model corresponding to the transcription
        print("\nπŸ“₯ Step 0: Downloading model from SFTP for the selected transcription...")
        try:
            transcription_filename = os.path.basename(self.transcription_path)
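            # Transcription filenames are expected to look like
            # "transcriptions_<model_id>.rtf_<suffix>"; extract the model id.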
            match = re.search(r'transcriptions_(.+)\.rtf_',
                              transcription_filename)
            if match:
                model_id = match.group(1)
                model_filename = f"{model_id}.rtf"
                local_filename = f"{model_id}.doc"
                local_template_path = os.path.join("models", local_filename)
                print(f"πŸ”Ž Model identifier for this transcription: {model_id}")

                # Download only the required model via a simple agent
                simple_sftp_agent = create_openai_tools_agent(
                    llm=llm,
                    tools=[download_model_from_sftp],
                    prompt=ChatPromptTemplate.from_messages([
                        ("system", "You are an SFTP downloader. Download the specified model file."),
                        ("human", "Download the model file: {model_filename}"),
                        MessagesPlaceholder("agent_scratchpad")
                    ])
                )
                simple_sftp_executor = AgentExecutor(
                    agent=simple_sftp_agent,
                    tools=[download_model_from_sftp],
                    verbose=True
                )

                result = simple_sftp_executor.invoke({
                    "model_filename": model_filename
                })
                print(f"✅ Model downloaded and available as: {local_template_path}")
                self.template_path = local_template_path
                self.downloaded_models = [{
                    'model_id': model_id,
                    'model_filename': model_filename,
                    'local_filename': local_filename,
                    'local_path': local_template_path,
                    'status': 'success'
                }]
            else:
                raise ValueError(
                    "Unable to extract the model identifier from the transcription filename.")
        except Exception as e:
            print(f"❌ Error during SFTP download step: {e}")
            self.downloaded_models = []
            if self.template_path:
                print("⚠️ Continuing with pipeline using the provided template_path...")
            else:
                print("❌ No template path provided and SFTP download failed. Cannot continue.")
                raise RuntimeError(
                    "Cannot continue without a template. SFTP download failed "
                    "and no template path was provided.") from e

        # Step 1: Analyze template
        print("\nπŸ“‹ Step 1: Analyzing template...")
        if not self.template_path:
            raise ValueError("No template path available for analysis")
        self.template_analysis = analyze_word_template(self.template_path)
        print(f"✅ Template analyzed: {len(self.template_analysis.get('sections', []))} sections found")

        # Step 2: Load and correct transcription
        print("\n✏️ Step 2: Correcting transcription...")
        raw_transcription, user_id = load_transcription_with_user_id(
            self.transcription_path)
        transcription_corrector_chain = create_transcription_corrector_chain(
            llm)
        self.corrected_transcription = transcription_corrector_chain.invoke({
            "transcription": raw_transcription
        }).content
        # Print the corrected transcription for inspection
        print("\n===== Transcription after correction =====")
        print(self.corrected_transcription)

        print("✅ Transcription corrected")

        # Step 3: Analyze medical data
        print("\nπŸ”¬ Step 3: Analyzing medical data...")
        medical_analyzer_chain = create_medical_analyzer_chain(llm)
        self.medical_data = medical_analyzer_chain.invoke({
            "corrected_transcription": self.corrected_transcription
        }).content
        print("βœ… Medical data analyzed")

        # Step 4: Generate title
        print("\nπŸ“ Step 4: Generating title...")
        title_generator_chain = create_title_generator_chain(llm)
        self.generated_title = title_generator_chain.invoke({
            "medical_data": self.medical_data
        }).content
        print(f"βœ… Title generated: {self.generated_title}")

        # Step 5: Generate sections
        print("\nπŸ“ Step 5: Generating sections...")

        # Extract sections from template analysis
        template_sections = []

        # Debug: see exactly what template_analysis contains
        print("\n--- DEBUG: Type and content of template_analysis ---")
        print(f"Type: {type(self.template_analysis)}")
        print(f"Content: {self.template_analysis}")
        if hasattr(self.template_analysis, '__dict__'):
            print(f"Attributes: {self.template_analysis.__dict__}")
        print("--- END DEBUG ---\n")

        # template_analysis may be a plain dict, a dict-like object, or an
        # agent result exposing an 'output' dict; try each shape in turn.
        try:
            if isinstance(self.template_analysis, dict) and 'sections' in self.template_analysis:
                template_sections = [section['text']
                                     for section in self.template_analysis['sections']]
            elif hasattr(self.template_analysis, 'get') and 'sections' in self.template_analysis:
                template_sections = [section['text']
                                     for section in self.template_analysis['sections']]
            elif hasattr(self.template_analysis, 'output') and isinstance(self.template_analysis.output, dict) and 'sections' in self.template_analysis.output:
                template_sections = [section['text']
                                     for section in self.template_analysis.output['sections']]
        except Exception as e:
            print('Error extracting sections:', e)
        # Fallback: try to extract from the agent response text
        if not template_sections:
            response_text = str(self.template_analysis)
            if 'Technique' in response_text and 'Résultat' in response_text and 'Conclusion' in response_text:
                template_sections = ['Technique\xa0:',
                                     'Résultat\xa0:', 'Conclusion\xa0:']
            elif 'CONCLUSION' in response_text:
                template_sections = ['CONCLUSION\xa0:']

        # Create dynamic prompt based on template sections
        dynamic_section_prompt = create_dynamic_section_prompt(
            template_sections)
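        # LCEL composition: pipe the dynamic prompt into the LLM to build the chain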
        section_generator_chain = dynamic_section_prompt | llm

        generated_content = section_generator_chain.invoke({
            "template_sections": template_sections,
            "medical_data": self.medical_data,
            "corrected_transcription": self.corrected_transcription
        }).content

        # Post-process to ensure exact section names are used
        self.generated_sections = fix_section_names(
            generated_content, template_sections)
        print("\n--- DEBUG: Generated sections ---")
        print(self.generated_sections)
        print("--- END DEBUG ---\n")
        print("\n--- DEBUG: Template sections ---")
        print(template_sections)
        print("--- END DEBUG ---\n")
        print("\n--- DEBUG: Generated title ---")
        print(self.generated_title)
        print("--- END DEBUG ---\n")

        # Step 6: Assemble document
        print("\nπŸ“„ Step 6: Assembling document...")
        if output_path is None:
            # Generate output filename based on user_id
            # Replace the last extension with .docx
            if '.' in user_id:
                # Split by dots and replace the last part with docx
                parts = user_id.split('.')
                parts[-1] = 'docx'
                output_filename = '.'.join(parts)
            else:
                # If no extension, just add .docx
                output_filename = f"{user_id}.docx"

            output_path = output_filename

        # Use the agent for assembly
        document_assembler_executor = create_document_assembler_agent(llm)
        result = document_assembler_executor.invoke({
            "template_path": self.template_path,
            "sections_text": self.generated_sections,
            "title": self.generated_title,
            "output_path": output_path
        })

        print(f"πŸŽ‰ Pipeline completed! Document saved: {output_path}")

        # Step 7: Validate document
        print("\nπŸ“‹ Step 7: Validating document...")
        validation_result = validate_generated_document(
            self.template_path, self.transcription_path, output_path)

        # Display validation results
        print("\n" + "=" * 60)
        print("πŸ“Š VALIDATION RESULTS")
        print("=" * 60)

        # Overall score
        score = validation_result["overall_score"]
        score_emoji = "🟒" if score >= 0.8 else "🟑" if score >= 0.6 else "πŸ”΄"
        print(f"{score_emoji} Overall Score: {score:.1%}")

        # Structure validation
        structure_valid = validation_result["structure_valid"]
        structure_emoji = "βœ…" if structure_valid else "❌"
        print(f"{structure_emoji} Structure Valid: {structure_valid}")

        if not structure_valid:
            missing = validation_result["missing_sections"]
            print(f"   Missing sections: {', '.join(missing)}")

        # Entities validation
        entities_coverage = validation_result["entities_coverage"]
        entities_emoji = "βœ…" if entities_coverage >= 80 else "⚠️"
        print(f"{entities_emoji} Medical Entities Coverage: {entities_coverage:.1f}%")

        if entities_coverage < 80:
            missing_entities = validation_result["missing_entities"][:5]
            print(f"   Missing entities: {', '.join(missing_entities)}")

        # Generate AI validation report
        print("\nπŸ“ AI Validation Report:")
        print("-" * 40)

        # Extract content for AI validation
        doc = Document(output_path)
        generated_content = []
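        # Keep only non-empty paragraphs, skipping Date:/Heure: header lines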
        for paragraph in doc.paragraphs:
            text = paragraph.text.strip()
            if text and not text.startswith("Date:") and not text.startswith("Heure:"):
                generated_content.append(text)
        generated_text = "\n".join(generated_content)

        validation_chain = create_validation_chain(llm)
        ai_validation = validation_chain.invoke({
            "transcription": self.corrected_transcription,
            "generated_content": generated_text,
            "structure_valid": structure_valid,
            "entities_coverage": entities_coverage,
            "missing_sections": validation_result["missing_sections"],
            "missing_entities": validation_result["missing_entities"]
        })

        print(ai_validation.content)
        print("\n" + "=" * 60)

        print("βœ… Document validated")

        # Remove the local model after validation
        try:
            if self.template_path and os.path.exists(self.template_path):
                os.remove(self.template_path)
                print(f"πŸ—‘οΈ Deleted local model file: {self.template_path}")
        except Exception as e:
            print(f"⚠️ Could not delete local model file: {e}")

        return output_path


def main():
    """Main function to run the LangChain medical document pipeline."""
    print("πŸ₯ LangChain Medical Document Agents - Refactored")
    print("=" * 60)

    # Initialize orchestrator
    orchestrator = MedicalDocumentOrchestrator(
        template_path="default.528.251014072.doc",
        transcription_path="transciption.txt"
    )

    # Run the complete pipeline
    output_file = orchestrator.run_full_pipeline()

    print(f"\nβœ… Final document: {output_file}")
    print("πŸŽ‰ LangChain pipeline completed successfully!")


if __name__ == "__main__":
    main()