#!/usr/bin/env python3
"""
Section Generator

Handles dynamic section generation based on template sections.

Two public helpers live here:

* ``create_dynamic_section_prompt`` builds the chat prompt asking the model to
  organise a transcription under the template's section headers.
* ``fix_section_names`` post-processes the model output so that the headers
  are spelled exactly as in the template.
"""

import re
import unicodedata
from typing import Dict, List, Optional, Tuple

try:
    from langchain.prompts import ChatPromptTemplate
except ImportError:
    # LangChain is only needed to build the prompt object. Deferring the
    # failure to create_dynamic_section_prompt() keeps the pure-text helper
    # fix_section_names() importable without it.
    ChatPromptTemplate = None

# A line made only of letters/whitespace, optionally ending with a colon, is
# considered a *candidate* header for fuzzy matching.
_HEADER_LINE = re.compile(r'^([A-Za-zÀ-ÿ\s]+:?)\s*$')

# Template words shorter than this ("de", "la", "of", ...) are ignored by the
# word-overlap fallback; they occur in ordinary sentences far too often.
_MIN_KEYWORD_LENGTH = 4

# Outputs shorter than this are returned untouched by fix_section_names().
_MIN_CONTENT_LENGTH = 50

_HUMAN_PROMPT = """Organize the corrected medical transcription into the required sections:

Template sections: {template_sections}
Medical data: {medical_data}
Corrected transcription: {corrected_transcription}

Generate each section with the exact title from the template, using the corrected transcription content while maintaining maximum fidelity to the original text."""


def _display_name(section: str) -> str:
    """Return a section name without colon or non-breaking spaces, for prose."""
    return section.strip().replace('\xa0', ' ').replace(':', '').strip()


def _section_heading(section: str) -> str:
    """Return *section* as a heading that ends with a colon exactly once."""
    # Template names frequently already carry their colon ('Technique :');
    # appending another one would show the model a header that does not exist.
    if section.rstrip().endswith(':'):
        return section
    return f"{section}:"


def _normalize(name: str) -> str:
    """Return *name* lower-cased, unaccented, colon-free and space-collapsed."""
    decomposed = unicodedata.normalize('NFKD', name.replace('\xa0', ' '))
    unaccented = ''.join(
        char for char in decomposed if not unicodedata.combining(char)
    )
    return ' '.join(unaccented.lower().replace(':', '').split())


def _match_template_section(
    line: str, normalized_templates: List[Tuple[str, str]]
) -> Optional[str]:
    """Return the template section that *line* is a header for, or ``None``.

    ``normalized_templates`` holds ``(original_name, normalized_name)`` pairs
    in template order.
    """
    candidate = _normalize(line)
    if not candidate:
        return None

    # 1) An exact (case/accent/colon-insensitive) match always wins, whatever
    #    characters the name contains and wherever it sits in the template.
    for template, normalized in normalized_templates:
        if normalized == candidate:
            return template

    # 2) Fuzzy matching is restricted to lines that look like a bare header.
    if not _HEADER_LINE.match(line):
        return None
    for template, normalized in normalized_templates:
        if not normalized:
            # An empty name is a substring of everything; never match on it.
            continue
        if candidate in normalized or normalized in candidate:
            return template
        keywords = (
            word for word in normalized.split()
            if len(word) >= _MIN_KEYWORD_LENGTH
        )
        if any(word in candidate for word in keywords):
            return template
    return None


def create_dynamic_section_prompt(template_sections: List[str]) -> "ChatPromptTemplate":
    """Create a dynamic prompt based on the sections found in the template.

    Args:
        template_sections: Section headers exactly as they appear in the
            template (they may include a trailing colon or non-breaking
            spaces).

    Returns:
        A ``ChatPromptTemplate`` with a system and a human message. The human
        message expects the variables ``template_sections``, ``medical_data``
        and ``corrected_transcription``.

    Raises:
        ImportError: If LangChain could not be imported.
    """
    if ChatPromptTemplate is None:
        raise ImportError(
            "langchain is required to use create_dynamic_section_prompt()"
        )

    display_names = [_display_name(section) for section in template_sections]
    sections_list = ', '.join(display_names)

    # One "header + instruction" stanza per template section.
    sections_text = "\n\n".join(
        f"{_section_heading(section)}\n"
        f"[Extract and organize content for {name} section from the "
        f"transcription, maintaining maximum fidelity to the original text]"
        for section, name in zip(template_sections, display_names)
    )

    # Special handling for single section templates
    if len(template_sections) == 1:
        single_section_instruction = f"""CRITICAL: This template has only ONE section: {sections_list}.
You MUST generate content for this section using ALL the information from the transcription.
Do not leave the section empty - extract and organize ALL relevant content from the transcription."""
    else:
        single_section_instruction = ""

    system_prompt = f"""You are a medical document organizer. Your task is to organize the corrected medical transcription into the required sections while maintaining maximum fidelity to the original text.

You MUST fill ALL sections requested in the template: {sections_list}.

CRITICAL: Use the EXACT section names provided in template_sections (including any punctuation like ':') - DO NOT translate or change them.
DO NOT summarize, interpret, or add information not present in the transcription.
DO NOT use markdown formatting or add extra headers.

{single_section_instruction}

ORGANIZATION RULES:
- Extract relevant content from the transcription for each section
- Maintain the original wording and structure as much as possible
- Do not add medical interpretations or conclusions not present in the text
- Keep all measurements, observations, and findings exactly as stated
- Preserve the original medical terminology
- Use ONLY the exact section names from the template (e.g., 'Technique :', 'Résultat :', 'Conclusion :')
- If there is only one section, put ALL relevant content in that section

Format your response with clear section headers using the EXACT names from the template:

{sections_text}

IMPORTANT:
- Use the corrected transcription content to fill all sections
- Use the EXACT section names from the template - DO NOT translate or modify them
- Do not add markdown formatting or extra headers
- Maintain maximum fidelity to the original transcription content
- Do not summarize or interpret the medical information
- Keep all original medical terms and measurements exactly as they appear
- NEVER leave a section empty - always provide content based on the transcription"""

    # The system prompt is fully rendered at this point, so any brace left in
    # it comes from a section name. Escape them: LangChain parses the message
    # as an f-string template and would otherwise see an input variable.
    system_prompt = system_prompt.replace('{', '{{').replace('}', '}}')

    return ChatPromptTemplate.from_messages([
        ("system", system_prompt),
        ("human", _HUMAN_PROMPT),
    ])


def fix_section_names(content: str, template_sections: List[str]) -> str:
    """Post-process the generated content to ensure exact section names are used.

    Header lines in *content* are matched against the template names ignoring
    case, accents and colons, then rewritten with the template's exact
    spelling. Text repeated under the same header is merged.

    Args:
        content: Text produced by the model.
        template_sections: Section headers exactly as spelled in the template.

    Returns:
        The content rebuilt as ``header``, body, blank line for each section
        found. ``content`` is returned unchanged when it is shorter than 50
        significant characters or when no template section is given. With a
        single template section the whole text is placed under that header.
        When no section body could be identified, the whole text is placed
        under the first template section. Text that precedes the first
        recognised header is otherwise discarded.
    """
    stripped = content.strip()
    # Nothing meaningful to restructure, or nothing to restructure it into.
    if len(stripped) < _MIN_CONTENT_LENGTH or not template_sections:
        return content

    # If there's only one template section, put all content in that section
    if len(template_sections) == 1:
        header = template_sections[0]
        first_line, _, remainder = stripped.partition('\n')
        normalized_header = _normalize(header)
        # The model usually emits the header itself; do not print it twice.
        if normalized_header and _normalize(first_line) == normalized_header:
            stripped = remainder.strip()
        return f"{header}\n{stripped}"

    normalized_templates = [
        (section, _normalize(section)) for section in template_sections
    ]

    collected: Dict[str, List[str]] = {}
    current_section: Optional[str] = None
    for raw_line in content.split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        matched = _match_template_section(line, normalized_templates)
        if matched is not None:
            current_section = matched
            # setdefault keeps earlier text if the header shows up again.
            collected.setdefault(current_section, [])
        elif current_section is not None:
            collected[current_section].append(line)

    # If no section body was identified, keep everything under the first one
    if not any(collected.values()):
        collected = {template_sections[0]: [stripped]}

    # Reconstruct the content with exact section names
    rebuilt: List[str] = []
    for header, body_lines in collected.items():
        rebuilt.append(header)
        body = '\n'.join(body_lines).strip()
        if body:
            rebuilt.append(body)
        rebuilt.append("")

    return "\n".join(rebuilt)