|
|
|
|
|
""" |
|
|
Section Generator

Handles dynamic section generation based on template sections
|
|
""" |
|
|
|
|
|
from typing import List |
|
|
from langchain.prompts import ChatPromptTemplate |
|
|
|
|
|
|
|
|
def create_dynamic_section_prompt(template_sections: List[str]) -> ChatPromptTemplate:
    """Create a dynamic prompt based on the sections found in the template.

    Args:
        template_sections: Section titles exactly as they appear in the
            template. Titles may carry a trailing ':' and non-breaking
            spaces (e.g. 'Technique :').

    Returns:
        A ChatPromptTemplate built from two messages: a system message that
        holds the organization rules and the expected per-section layout,
        and a human message that exposes the ``template_sections``,
        ``medical_data`` and ``corrected_transcription`` input variables.
    """

    def _escape_braces(text: str) -> str:
        # The section names are baked into strings that are later parsed as
        # templates; an unescaped '{' or '}' coming from a section name would
        # be read as a template variable instead of literal text.
        return text.replace('{', '{{').replace('}', '}}')

    def _clean(name: str) -> str:
        # Human-readable form of a title: no NBSP, no colon, no outer spaces.
        return name.strip().replace('\xa0', ' ').replace(':', '').strip()

    clean_section_names = [_clean(section) for section in template_sections]

    # One "<header>\n[instruction]" entry per template section.
    section_instructions = []
    for section, section_clean in zip(template_sections, clean_section_names):
        # Only add the ':' when the title does not already end with one, so
        # the layout shows the exact template title rather than 'Title ::'.
        header = section if section.rstrip().endswith(':') else f"{section}:"
        section_instructions.append(
            f"{_escape_braces(header)}\n[Extract and organize content for {_escape_braces(section_clean)} section from the transcription, maintaining maximum fidelity to the original text]")

    sections_text = "\n\n".join(section_instructions)
    sections_list = _escape_braces(', '.join(clean_section_names))

    # Extra emphasis when the template has a single section.
    if len(template_sections) == 1:
        single_section_instruction = f"""CRITICAL: This template has only ONE section: {sections_list}.
You MUST generate content for this section using ALL the information from the transcription.
Do not leave the section empty - extract and organize ALL relevant content from the transcription."""
    else:
        single_section_instruction = ""

    system_prompt = f"""You are a medical document organizer.
Your task is to organize the corrected medical transcription into the required sections while maintaining maximum fidelity to the original text.

You MUST fill ALL sections requested in the template: {sections_list}.
CRITICAL: Use the EXACT section names provided in template_sections (including any punctuation like ':') - DO NOT translate or change them.
DO NOT summarize, interpret, or add information not present in the transcription.
DO NOT use markdown formatting or add extra headers.

{single_section_instruction}

ORGANIZATION RULES:
- Extract relevant content from the transcription for each section
- Maintain the original wording and structure as much as possible
- Do not add medical interpretations or conclusions not present in the text
- Keep all measurements, observations, and findings exactly as stated
- Preserve the original medical terminology
- Use ONLY the exact section names from the template (e.g., 'Technique :', 'Résultat :', 'Conclusion :')
- If there is only one section, put ALL relevant content in that section

Format your response with clear section headers using the EXACT names from the template:

{sections_text}

IMPORTANT:
- Use the corrected transcription content to fill all sections
- Use the EXACT section names from the template - DO NOT translate or modify them
- Do not add markdown formatting or extra headers
- Maintain maximum fidelity to the original transcription content
- Do not summarize or interpret the medical information
- Keep all original medical terms and measurements exactly as they appear
- NEVER leave a section empty - always provide content based on the transcription"""

    # Plain (non f-) string: the braces below are template variables that are
    # filled in when the prompt is invoked.
    human_prompt = """Organize the corrected medical transcription into the required sections:

Template sections: {template_sections}
Medical data: {medical_data}
Corrected transcription: {corrected_transcription}

Generate each section with the exact title from the template, using the corrected transcription content while maintaining maximum fidelity to the original text."""

    return ChatPromptTemplate.from_messages([
        ("system", system_prompt),
        ("human", human_prompt),
    ])
|
|
|
|
|
|
|
|
def fix_section_names(content: str, template_sections: List[str]) -> str:
    """Post-process the generated content to ensure exact section names are used.

    The content is scanned line by line. A line is treated as a section
    header when it matches one of the template section names (ignoring case,
    accents, colons and spacing); the following lines become that section's
    body. The result is rebuilt using the template's exact titles.

    Args:
        content: Generated text to normalize.
        template_sections: Section titles exactly as they must appear in the
            output.

    Returns:
        - ``content`` unchanged when it is shorter than 50 characters once
          stripped, or when ``template_sections`` is empty.
        - For a single template section: the title, a newline, then the
          stripped content (a leading copy of the title is not repeated).
        - Otherwise: each recognized section as "title\\nbody\\n", in order of
          first appearance, separated by a blank line. Blank lines inside the
          content and any text before the first recognized header are
          dropped. If nothing could be attributed to a section, the whole
          stripped content is placed under the first template section.
    """
    import re
    import unicodedata

    def _normalize(text: str) -> str:
        # Comparison key: lower-case, NBSP -> space, diacritics and colons
        # removed, whitespace collapsed.
        decomposed = unicodedata.normalize(
            'NFKD', text.replace('\xa0', ' ').lower())
        without_marks = ''.join(
            ch for ch in decomposed if not unicodedata.combining(ch))
        return ' '.join(without_marks.replace(':', '').split())

    stripped = content.strip()

    # Too short to contain meaningful sections: leave untouched.
    if len(stripped) < 50:
        return content

    # Nothing to map onto: returning "" here would discard the content.
    if not template_sections:
        return content

    if len(template_sections) == 1:
        only_section = template_sections[0]
        only_key = _normalize(only_section)
        first_line, _, remainder = stripped.partition('\n')
        # The generator is asked to emit the header itself; do not print it
        # twice when it is already the first line.
        if only_key and remainder.strip() and _normalize(first_line) == only_key:
            stripped = remainder.strip()
        return f"{only_section}\n{stripped}"

    # (exact title, comparison key); blank keys are skipped because an empty
    # string is a substring of everything and would match every header.
    candidates = []
    for template_section in template_sections:
        template_key = _normalize(template_section)
        if template_key:
            candidates.append((template_section, template_key))

    # Header-looking line: letters/spaces with an optional trailing colon.
    section_pattern = re.compile(r'^([A-Za-zÀ-ÿ\s]+:?)\s*$', re.IGNORECASE)

    def _match_template(line: str):
        line_key = _normalize(line)
        if not line_key:
            return None
        # An exact match wins, so 'Résultat complémentaire' is not captured
        # by an earlier, shorter 'Résultat' template.
        for template_section, template_key in candidates:
            if template_key == line_key:
                return template_section
        if not section_pattern.match(line):
            return None
        # Fuzzy fallback for header-looking lines (kept deliberately loose:
        # substring containment in either direction, or any template word).
        # NOTE(review): a short letters-only content line can still be taken
        # for a header by this fallback.
        for template_section, template_key in candidates:
            if (line_key in template_key or
                    template_key in line_key or
                    any(word in line_key for word in template_key.split())):
                return template_section
        return None

    sections = {}
    current_section = None

    for raw_line in content.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        matched_template_section = _match_template(line)
        if matched_template_section is not None:
            current_section = matched_template_section
            # setdefault keeps earlier lines when a header shows up twice.
            sections.setdefault(current_section, [])
        elif current_section is not None:
            sections[current_section].append(line)
        # Lines before the first recognized header are ignored.

    # Nothing captured (no header found, or only empty sections): keep the
    # whole text under the first template section rather than losing it.
    if not any(sections.values()):
        sections = {template_sections[0]: [stripped]}

    fixed_content = []
    for section_name, body_lines in sections.items():
        fixed_content.append(section_name)
        body = '\n'.join(body_lines).strip()
        if body:
            fixed_content.append(body)
        fixed_content.append("")

    return "\n".join(fixed_content)
|
|
|