File size: 6,772 Bytes
f92da22 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
#!/usr/bin/env python3
"""
Section Generator
Handles dynamic section generation based on template sections
"""
from typing import List
from langchain.prompts import ChatPromptTemplate
def create_dynamic_section_prompt(template_sections: List[str]) -> ChatPromptTemplate:
    """Create a dynamic prompt based on the sections found in the template.

    Args:
        template_sections: Section titles exactly as they appear in the
            template (they may contain non-breaking spaces and a trailing
            colon, e.g. ``'Technique :'``).

    Returns:
        A ``ChatPromptTemplate`` made of a system message (with the section
        names already baked in) and a human message that still expects the
        ``template_sections``, ``medical_data`` and
        ``corrected_transcription`` variables at format time.
    """

    def _display_name(name: str) -> str:
        # Human-readable form: no non-breaking spaces, no colons.
        return name.strip().replace('\xa0', ' ').replace(':', '').strip()

    def _escape_braces(text: str) -> str:
        # The system prompt is parsed as an f-string style template after it
        # is built here, so literal braces coming from section names must be
        # doubled or they would be read as prompt variables.
        return text.replace('{', '{{').replace('}', '}}')

    # Build section instructions dynamically
    section_instructions = []
    for section in template_sections:
        # Only append a colon when the template name does not already end
        # with one; otherwise the model would be shown "Name ::", which
        # contradicts the "use the EXACT section names" rule below.
        if section.rstrip().endswith(':'):
            header = section
        else:
            header = f"{section}:"
        section_instructions.append(
            f"{_escape_braces(header)}\n[Extract and organize content for {_escape_braces(_display_name(section))} section from the transcription, maintaining maximum fidelity to the original text]")
    sections_text = "\n\n".join(section_instructions)

    # Clean section names for display
    sections_list = ', '.join(
        _escape_braces(_display_name(section)) for section in template_sections)

    # Special handling for single section templates
    if len(template_sections) == 1:
        single_section_instruction = f"""CRITICAL: This template has only ONE section: {sections_list}.
You MUST generate content for this section using ALL the information from the transcription.
Do not leave the section empty - extract and organize ALL relevant content from the transcription."""
    else:
        single_section_instruction = ""

    system_prompt = f"""You are a medical document organizer.
Your task is to organize the corrected medical transcription into the required sections while maintaining maximum fidelity to the original text.
You MUST fill ALL sections requested in the template: {sections_list}.
CRITICAL: Use the EXACT section names provided in template_sections (including any punctuation like ':') - DO NOT translate or change them.
DO NOT summarize, interpret, or add information not present in the transcription.
DO NOT use markdown formatting or add extra headers.
{single_section_instruction}
ORGANIZATION RULES:
- Extract relevant content from the transcription for each section
- Maintain the original wording and structure as much as possible
- Do not add medical interpretations or conclusions not present in the text
- Keep all measurements, observations, and findings exactly as stated
- Preserve the original medical terminology
- Use ONLY the exact section names from the template (e.g., 'Technique :', 'Résultat :', 'Conclusion :')
- If there is only one section, put ALL relevant content in that section
Format your response with clear section headers using the EXACT names from the template:
{sections_text}
IMPORTANT:
- Use the corrected transcription content to fill all sections
- Use the EXACT section names from the template - DO NOT translate or modify them
- Do not add markdown formatting or extra headers
- Maintain maximum fidelity to the original transcription content
- Do not summarize or interpret the medical information
- Keep all original medical terms and measurements exactly as they appear
- NEVER leave a section empty - always provide content based on the transcription"""

    # Not an f-string: the braces below are prompt variables filled in later.
    human_prompt = """Organize the corrected medical transcription into the required sections:
Template sections: {template_sections}
Medical data: {medical_data}
Corrected transcription: {corrected_transcription}
Generate each section with the exact title from the template, using the corrected transcription content while maintaining maximum fidelity to the original text."""

    return ChatPromptTemplate.from_messages([
        ("system", system_prompt),
        ("human", human_prompt)
    ])
def fix_section_names(content: str, template_sections: List[str]) -> str:
    """Post-process the generated content to ensure exact section names are used.

    Header-looking lines in ``content`` are compared (case-, accent- and
    colon-insensitively) with the template section names; every recognised
    header is replaced by the exact template spelling and the text that
    follows it is grouped under that section.

    Args:
        content: Text produced by the section generator.
        template_sections: Section titles exactly as they appear in the
            template.

    Returns:
        The re-assembled text, one block per recognised section (title line,
        section text, blank line). ``content`` is returned unchanged when it
        is shorter than 50 characters once stripped, or when there are no
        template sections. With a single template section, the whole text is
        placed under that section. When no header is recognised, the whole
        text is placed under the first template section.

    Notes:
        Text that appears before the first recognised header is discarded,
        and a trailing header with no text after it is omitted. The header
        matching is deliberately loose (substring / shared word), so a short
        letters-only line that shares a word with a template name is treated
        as a header.
    """
    import re
    import unicodedata

    def _normalize(name: str) -> str:
        # Lower-case, strip diacritics (NFKD also maps NBSP to a plain
        # space), drop colons and collapse whitespace.
        decomposed = unicodedata.normalize('NFKD', name.lower())
        unaccented = ''.join(
            ch for ch in decomposed if not unicodedata.combining(ch))
        return ' '.join(unaccented.replace(':', '').split())

    stripped = content.strip()

    # If content is empty or very short, return the original content
    if len(stripped) < 50:
        return content

    # Without template sections there is nothing to map to; keep the text
    # rather than returning an empty string.
    if not template_sections:
        return content

    # If there's only one template section, put all content in that section
    if len(template_sections) == 1:
        only_section = template_sections[0]
        first_line, _, remainder = stripped.partition('\n')
        target = _normalize(only_section)
        # Drop a header the model already emitted so it is not duplicated.
        if target and remainder.strip() and _normalize(first_line) == target:
            stripped = remainder.strip()
        return f"{only_section}\n{stripped}"

    # Normalise the template names once; names that normalise to nothing
    # would match every header-like line, so they are skipped.
    templates = [
        (template, normalized)
        for template, normalized in (
            (template, _normalize(template)) for template in template_sections)
        if normalized
    ]

    def _match_template(header_text: str):
        # Return the template section this header resembles, or None.
        candidate = _normalize(header_text)
        if not candidate:
            return None
        for template, normalized in templates:
            if (candidate in normalized or
                    normalized in candidate or
                    any(word in candidate for word in normalized.split())):
                return template
        return None

    # Create a pattern to match any section header
    section_pattern = re.compile(r'^([A-Za-zÀ-ÿ\s]+:?)\s*$', re.IGNORECASE)

    sections = {}  # template section name -> list of text lines
    current_section = None
    current_lines = []

    for raw_line in content.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        header = section_pattern.match(line)
        matched = _match_template(header.group(1)) if header else None

        if matched is not None:
            if current_section is not None:
                # Merge instead of overwrite so a section whose header shows
                # up more than once keeps all of its text.
                sections.setdefault(current_section, []).extend(current_lines)
            current_section = matched  # Use exact template section name
            current_lines = []
        elif current_section is not None:
            current_lines.append(line)

    # Add last section
    if current_section is not None and current_lines:
        sections.setdefault(current_section, []).extend(current_lines)

    # If no sections were found, put all content in the first template section
    if not sections:
        sections[template_sections[0]] = [stripped]

    # Reconstruct the content with exact section names
    fixed_content = []
    for section_name, lines in sections.items():
        fixed_content.append(section_name)
        section_text = '\n'.join(lines).strip()
        if section_text:
            fixed_content.append(section_text)
        fixed_content.append("")
    return "\n".join(fixed_content)
|