pipeline2 / section_generator.py
Nourhenem's picture
initial commit
f92da22 verified
#!/usr/bin/env python3
"""
Section Generator
Handles dynamic section generation based on template sections
"""
from typing import List
from langchain.prompts import ChatPromptTemplate
def _escape_template_braces(text: str) -> str:
    """Double every curly brace so template formatting renders it literally."""
    return text.replace('{', '{{').replace('}', '}}')


def create_dynamic_section_prompt(template_sections: List[str]) -> ChatPromptTemplate:
    """Create a dynamic prompt based on the sections found in the template.

    Args:
        template_sections: Section titles exactly as they appear in the
            template (they may contain non-breaking spaces and a trailing
            colon).

    Returns:
        A ChatPromptTemplate made of a system message that embeds the section
        names and a human message with the placeholders
        ``template_sections``, ``medical_data`` and
        ``corrected_transcription``.
    """
    section_instructions = []
    clean_section_names = []
    for section in template_sections:
        # Display form of the name: no non-breaking spaces, no colon.
        section_clean = section.strip().replace('\xa0', ' ').replace(':', '').strip()
        clean_section_names.append(section_clean)
        # The header shown to the model must be the exact template title, so
        # only add a colon when the title does not already end with one
        # (otherwise the model is shown e.g. "Technique ::").
        header = section if section.rstrip().endswith(':') else f"{section}:"
        section_instructions.append(
            f"{header}\n[Extract and organize content for {section_clean} section "
            f"from the transcription, maintaining maximum fidelity to the original text]")

    # Section names are user data that end up inside a template string; a raw
    # '{' or '}' would be parsed as a template variable, so escape them.
    sections_text = _escape_template_braces("\n\n".join(section_instructions))
    sections_list = _escape_template_braces(', '.join(clean_section_names))

    # Special handling for single section templates
    if len(template_sections) == 1:
        single_section_instruction = f"""CRITICAL: This template has only ONE section: {sections_list}.
You MUST generate content for this section using ALL the information from the transcription.
Do not leave the section empty - extract and organize ALL relevant content from the transcription."""
    else:
        single_section_instruction = ""

    system_prompt = f"""You are a medical document organizer.
Your task is to organize the corrected medical transcription into the required sections while maintaining maximum fidelity to the original text.
You MUST fill ALL sections requested in the template: {sections_list}.
CRITICAL: Use the EXACT section names provided in template_sections (including any punctuation like ':') - DO NOT translate or change them.
DO NOT summarize, interpret, or add information not present in the transcription.
DO NOT use markdown formatting or add extra headers.
{single_section_instruction}
ORGANIZATION RULES:
- Extract relevant content from the transcription for each section
- Maintain the original wording and structure as much as possible
- Do not add medical interpretations or conclusions not present in the text
- Keep all measurements, observations, and findings exactly as stated
- Preserve the original medical terminology
- Use ONLY the exact section names from the template (e.g., 'Technique :', 'Résultat :', 'Conclusion :')
- If there is only one section, put ALL relevant content in that section
Format your response with clear section headers using the EXACT names from the template:
{sections_text}
IMPORTANT:
- Use the corrected transcription content to fill all sections
- Use the EXACT section names from the template - DO NOT translate or modify them
- Do not add markdown formatting or extra headers
- Maintain maximum fidelity to the original transcription content
- Do not summarize or interpret the medical information
- Keep all original medical terms and measurements exactly as they appear
- NEVER leave a section empty - always provide content based on the transcription"""

    # Not an f-string: these braces are the placeholders filled at invoke time.
    human_prompt = """Organize the corrected medical transcription into the required sections:
Template sections: {template_sections}
Medical data: {medical_data}
Corrected transcription: {corrected_transcription}
Generate each section with the exact title from the template, using the corrected transcription content while maintaining maximum fidelity to the original text."""

    return ChatPromptTemplate.from_messages([
        ("system", system_prompt),
        ("human", human_prompt)
    ])
def _normalize_section_name(name: str) -> str:
    """Return a case-, accent-, colon- and spacing-insensitive key for a name."""
    import unicodedata

    # NFKD splits accented letters into base letter + combining mark and turns
    # non-breaking spaces into plain spaces; the marks are then dropped.
    decomposed = unicodedata.normalize('NFKD', name)
    without_marks = ''.join(
        ch for ch in decomposed if not unicodedata.combining(ch))
    return ' '.join(without_marks.lower().replace(':', '').split())


def _match_template_section(line: str, template_keys: List[tuple], header_pattern):
    """Return the template section that *line* is a header for, or None.

    ``template_keys`` is a list of ``(normalized_key, exact_template_name)``
    pairs in template order.
    """
    line_key = _normalize_section_name(line)
    if not line_key:
        return None

    # A line equal to a template title (ignoring case, accents and the colon)
    # is a header even if it contains characters the header pattern rejects,
    # such as apostrophes, hyphens or digits.
    for key, name in template_keys:
        if key == line_key:
            return name

    # Otherwise only lines that look like a bare title are header candidates.
    if not header_pattern.match(line):
        return None

    # Loose similarity: containment either way, or any shared template word.
    for key, name in template_keys:
        if (line_key in key
                or key in line_key
                or any(word in line_key for word in key.split())):
            return name
    return None


def fix_section_names(content: str, template_sections: List[str]) -> str:
    """Post-process the generated content to ensure exact section names are used.

    Args:
        content: Generated text, expected to contain one header line per
            section followed by that section's body.
        template_sections: Exact section titles from the template.

    Returns:
        The content rebuilt as ``<exact title>\\n<body>\\n`` groups separated
        by a blank line. ``content`` is returned unchanged when it is shorter
        than 50 characters after stripping or when no template sections are
        given. With a single template section, or when no header is
        recognized, all content is placed under the first template section.
        Blank lines inside bodies, and text appearing before the first
        recognized header, are not kept.
    """
    import re

    stripped = content.strip()
    # If content is empty or very short, return the original content
    if len(stripped) < 50:
        return content

    # Without template sections there is nothing to rename; keep the text
    # instead of returning an empty string.
    if not template_sections:
        return content

    # If there's only one template section, put all content in that section
    if len(template_sections) == 1:
        return f"{template_sections[0]}\n{stripped}"

    header_pattern = re.compile(r'^([A-Za-zÀ-ÿ\s]+:?)\s*$', re.IGNORECASE)

    # Normalize template titles once. Blank titles are skipped because an
    # empty key would match every header-like line.
    template_keys = []
    for name in template_sections:
        key = _normalize_section_name(name)
        if key:
            template_keys.append((key, name))

    sections = {}
    current_section = None
    current_lines = []

    for raw_line in content.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        matched = _match_template_section(line, template_keys, header_pattern)
        if matched is not None:
            if current_section is not None:
                # Extend rather than assign so a header that appears twice
                # does not discard the body collected the first time.
                sections.setdefault(current_section, []).extend(current_lines)
            current_section = matched  # Use exact template section name
            current_lines = []
        elif current_section is not None:
            current_lines.append(line)

    # Add last section (a trailing header with no body is omitted)
    if current_section is not None and current_lines:
        sections.setdefault(current_section, []).extend(current_lines)

    # If no sections were found, put all content in the first template section
    if not sections:
        sections[template_sections[0]] = [stripped]

    # Reconstruct the content with exact section names
    rendered = []
    for section_name, body_lines in sections.items():
        rendered.append(section_name)
        if body_lines:
            rendered.append('\n'.join(body_lines))
        rendered.append("")
    return "\n".join(rendered)