Spaces:

Nourhenem
/

pipeline2

Sleeping

File size: 6,772 Bytes

f92da22

#!/usr/bin/env python3
"""
Section Generator
Handles dynamic section generation based on template sections
"""

from typing import List
from langchain.prompts import ChatPromptTemplate


def create_dynamic_section_prompt(template_sections: List[str]) -> ChatPromptTemplate:
    """Create a dynamic prompt based on the sections found in the template.

    Args:
        template_sections: Section titles as they appear in the template,
            possibly containing non-breaking spaces and a trailing ':'.

    Returns:
        A ChatPromptTemplate made of a system message (instructions plus the
        expected output layout, one header per template section) and a human
        message exposing the ``template_sections``, ``medical_data`` and
        ``corrected_transcription`` input variables.
    """

    def _escape_braces(text: str) -> str:
        # The system message is parsed as an f-string style template after it
        # is built here, so literal braces coming from section names must be
        # doubled or they would be read as template variables.
        return text.replace('{', '{{').replace('}', '}}')

    # Build, in a single pass, the per-section layout instructions and the
    # cleaned names used for display in the prompt.
    section_instructions = []
    clean_section_names = []
    for section in template_sections:
        section_clean = _escape_braces(
            section.strip().replace('\xa0', ' ').replace(':', '').strip())
        clean_section_names.append(section_clean)

        # Only append ':' when the template name does not already end with
        # one; otherwise the layout example would show 'Technique ::', which
        # contradicts the "EXACT section names" instruction below.
        header = section if section.rstrip().endswith(':') else f"{section}:"
        section_instructions.append(
            f"{_escape_braces(header)}\n[Extract and organize content for {section_clean} section from the transcription, maintaining maximum fidelity to the original text]")

    sections_text = "\n\n".join(section_instructions)
    sections_list = ', '.join(clean_section_names)

    # Special handling for single section templates
    if len(template_sections) == 1:
        single_section_instruction = f"""CRITICAL: This template has only ONE section: {sections_list}.
You MUST generate content for this section using ALL the information from the transcription.
Do not leave the section empty - extract and organize ALL relevant content from the transcription."""
    else:
        single_section_instruction = ""

    system_prompt = f"""You are a medical document organizer.
Your task is to organize the corrected medical transcription into the required sections while maintaining maximum fidelity to the original text.

You MUST fill ALL sections requested in the template: {sections_list}.
CRITICAL: Use the EXACT section names provided in template_sections (including any punctuation like ':') - DO NOT translate or change them.
DO NOT summarize, interpret, or add information not present in the transcription.
DO NOT use markdown formatting or add extra headers.

{single_section_instruction}

ORGANIZATION RULES:
- Extract relevant content from the transcription for each section
- Maintain the original wording and structure as much as possible
- Do not add medical interpretations or conclusions not present in the text
- Keep all measurements, observations, and findings exactly as stated
- Preserve the original medical terminology
- Use ONLY the exact section names from the template (e.g., 'Technique :', 'Résultat :', 'Conclusion :')
- If there is only one section, put ALL relevant content in that section

Format your response with clear section headers using the EXACT names from the template:

{sections_text}

IMPORTANT: 
- Use the corrected transcription content to fill all sections
- Use the EXACT section names from the template - DO NOT translate or modify them
- Do not add markdown formatting or extra headers
- Maintain maximum fidelity to the original transcription content
- Do not summarize or interpret the medical information
- Keep all original medical terms and measurements exactly as they appear
- NEVER leave a section empty - always provide content based on the transcription"""

    # Not an f-string: the braces below are the template's input variables.
    human_prompt = """Organize the corrected medical transcription into the required sections:

Template sections: {template_sections}
Medical data: {medical_data}
Corrected transcription: {corrected_transcription}

Generate each section with the exact title from the template, using the corrected transcription content while maintaining maximum fidelity to the original text."""

    return ChatPromptTemplate.from_messages([
        ("system", system_prompt),
        ("human", human_prompt)
    ])


def _normalize_section_name(name: str) -> str:
    """Return *name* lower-cased, without accents, colons or outer whitespace."""
    import unicodedata

    # NFKD splits accented letters into base letter + combining mark (and maps
    # non-breaking spaces to plain spaces); dropping the marks makes the
    # comparison accent-insensitive for every accent, not just a few.
    decomposed = unicodedata.normalize('NFKD', name.lower())
    unaccented = ''.join(
        ch for ch in decomposed if not unicodedata.combining(ch))
    return unaccented.replace(':', '').replace('\xa0', ' ').strip()


def fix_section_names(content: str, template_sections: List[str]) -> str:
    """Post-process the generated content to ensure exact section names are used.

    Lines that look like a header (letters and spaces, optionally followed by
    ':') and resemble a template section name are replaced by the exact
    template name; the lines that follow are attached to that section.

    Args:
        content: Generated text to post-process.
        template_sections: Exact section names from the template.

    Returns:
        - ``content`` unchanged when it is shorter than 50 characters once
          stripped, or when ``template_sections`` is empty.
        - The single template name followed by the stripped content when the
          template has exactly one section.
        - Otherwise the rebuilt text: each recognised section (in order of
          first appearance) as its exact template name, its lines, then an
          empty line. Content of a section whose header appears several times
          is merged. If no section is recognised, everything is placed under
          the first template section.
    """
    import re

    stripped = content.strip()

    # If content is empty or very short, return the original content
    if len(stripped) < 50:
        return content

    # Without any template section there is nothing to rename; returning the
    # content as-is avoids silently discarding it.
    if not template_sections:
        return content

    # If there's only one template section, put all content in that section
    if len(template_sections) == 1:
        return f"{template_sections[0]}\n{stripped}"

    # Pattern matching a line that could be a section header
    header_pattern = re.compile(r'^([A-Za-zÀ-ÿ\s]+:?)\s*$', re.IGNORECASE)

    # Normalize every template name once instead of once per content line.
    # Names that normalize to an empty string are skipped: an empty string is
    # a substring of everything and would match every header-like line.
    normalized_templates = []
    for template_section in template_sections:
        normalized = _normalize_section_name(template_section)
        if normalized:
            normalized_templates.append(
                (template_section, normalized, normalized.split()))

    def _match_template(header_text):
        """Return the first template section resembling *header_text*, or None."""
        candidate = _normalize_section_name(header_text)
        if not candidate:
            return None
        for template_section, normalized, words in normalized_templates:
            # NOTE(review): these are substring tests, so a short word in a
            # template name (e.g. 'de') can match an unrelated header-like
            # line; kept as-is to preserve the existing matching behaviour.
            if (candidate in normalized or
                    normalized in candidate or
                    any(word in candidate for word in words)):
                return template_section
        return None

    sections = {}  # exact template name -> list of content lines
    current_section = None
    current_lines = []

    for raw_line in content.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        header = header_pattern.match(line)
        matched = _match_template(header.group(1).strip()) if header else None

        if matched is not None:
            if current_section is not None:
                # Extend rather than assign so that a section whose header
                # appears more than once keeps its earlier content.
                sections.setdefault(current_section, []).extend(current_lines)
            current_section = matched  # Use exact template section name
            current_lines = []
        elif current_section is not None:
            # Header-like lines that match no template are plain content;
            # lines before the first recognised header are ignored.
            current_lines.append(line)

    # Add last section (only when it actually received content)
    if current_section is not None and current_lines:
        sections.setdefault(current_section, []).extend(current_lines)

    # If no sections were found, put all content in the first template section
    if not sections:
        sections[template_sections[0]] = [stripped]

    # Reconstruct the content with exact section names
    output_lines = []
    for section_name, section_lines in sections.items():
        output_lines.append(section_name)
        output_lines.extend(section_lines)
        output_lines.append("")

    return "\n".join(output_lines)