Spaces:

Nourhenem
/

pipeline2

Sleeping

App Files Files Community

pipeline2 / section_generator.py

Nourhenem

initial commit

f92da22 verified about 1 month ago

raw

history blame contribute delete

6.77 kB

	#!/usr/bin/env python3
	"""
	Section Generator
	Handles dynamic section generation based on template sections
	"""

	from typing import List
	from langchain.prompts import ChatPromptTemplate


	def create_dynamic_section_prompt(template_sections: List[str]) -> ChatPromptTemplate:
	    """Create a dynamic prompt based on the sections found in the template.

	    The section titles are interpolated into the system message, and that
	    message is afterwards handed to ``ChatPromptTemplate`` where ``{name}``
	    marks a template variable (the human message below relies on exactly
	    that).  Literal braces inside a title are therefore doubled so that they
	    are rendered verbatim instead of being mistaken for variables.

	    Args:
	        template_sections: Section titles as found in the template.  They may
	            carry a trailing ':' and non-breaking spaces; the titles are shown
	            to the model unchanged, while a cleaned variant (no colon, no
	            non-breaking space) is used where the sections are listed.

	    Returns:
	        A two-message (system + human) ``ChatPromptTemplate``.  The human
	        message expects the variables ``template_sections``,
	        ``medical_data`` and ``corrected_transcription``.
	    """

	    def _escape_braces(text: str) -> str:
	        # '{{' / '}}' are the literal-brace escapes of format-style templates.
	        return text.replace('{', '{{').replace('}', '}}')

	    def _display_name(title: str) -> str:
	        # Human-readable title: no colon, no non-breaking space.
	        return title.strip().replace('\xa0', ' ').replace(':', '').strip()

	    # Build, in one pass, both the per-section instructions and the cleaned
	    # names used in the "sections requested" sentence.
	    section_instructions = []
	    clean_section_names = []
	    for section in template_sections:
	        section_clean = _escape_braces(_display_name(section))
	        clean_section_names.append(section_clean)

	        # Show the title exactly as in the template; only add a colon when the
	        # title does not already end with one (avoids "Technique ::").
	        header = _escape_braces(section)
	        if not header.rstrip().endswith(':'):
	            header = f"{header}:"

	        section_instructions.append(
	            f"{header}\n[Extract and organize content for {section_clean} section from the transcription, maintaining maximum fidelity to the original text]")

	    sections_text = "\n\n".join(section_instructions)
	    sections_list = ', '.join(clean_section_names)

	    # Special handling for single section templates
	    if len(template_sections) == 1:
	        single_section_instruction = f"""CRITICAL: This template has only ONE section: {sections_list}.
	You MUST generate content for this section using ALL the information from the transcription.
	Do not leave the section empty - extract and organize ALL relevant content from the transcription."""
	    else:
	        single_section_instruction = ""

	    system_prompt = f"""You are a medical document organizer.
	Your task is to organize the corrected medical transcription into the required sections while maintaining maximum fidelity to the original text.

	You MUST fill ALL sections requested in the template: {sections_list}.
	CRITICAL: Use the EXACT section names provided in template_sections (including any punctuation like ':') - DO NOT translate or change them.
	DO NOT summarize, interpret, or add information not present in the transcription.
	DO NOT use markdown formatting or add extra headers.

	{single_section_instruction}

	ORGANIZATION RULES:
	- Extract relevant content from the transcription for each section
	- Maintain the original wording and structure as much as possible
	- Do not add medical interpretations or conclusions not present in the text
	- Keep all measurements, observations, and findings exactly as stated
	- Preserve the original medical terminology
	- Use ONLY the exact section names from the template (e.g., 'Technique :', 'Résultat :', 'Conclusion :')
	- If there is only one section, put ALL relevant content in that section

	Format your response with clear section headers using the EXACT names from the template:

	{sections_text}

	IMPORTANT:
	- Use the corrected transcription content to fill all sections
	- Use the EXACT section names from the template - DO NOT translate or modify them
	- Do not add markdown formatting or extra headers
	- Maintain maximum fidelity to the original transcription content
	- Do not summarize or interpret the medical information
	- Keep all original medical terms and measurements exactly as they appear
	- NEVER leave a section empty - always provide content based on the transcription"""

	    # Not an f-string: the braces below are template variables filled in
	    # when the prompt is formatted.
	    human_prompt = """Organize the corrected medical transcription into the required sections:

	Template sections: {template_sections}
	Medical data: {medical_data}
	Corrected transcription: {corrected_transcription}

	Generate each section with the exact title from the template, using the corrected transcription content while maintaining maximum fidelity to the original text."""

	    return ChatPromptTemplate.from_messages([
	        ("system", system_prompt),
	        ("human", human_prompt),
	    ])


	def _normalize_section_name(name: str) -> str:
	    """Return a case-, accent-, colon- and spacing-insensitive key for *name*."""
	    import unicodedata

	    # NFKD splits accented letters into base letter + combining mark and maps
	    # the non-breaking space to a plain space.
	    decomposed = unicodedata.normalize('NFKD', name.replace(':', ''))
	    base_chars = ''.join(
	        ch for ch in decomposed if not unicodedata.combining(ch))
	    return ' '.join(base_chars.casefold().split())


	def _match_template_section(line_key: str, template_keys: List[tuple], allow_fuzzy: bool):
	    """Return the template title that *line_key* refers to, or ``None``.

	    ``template_keys`` holds ``(title, normalized_title)`` pairs in template
	    order.  An exact key match always wins; the looser containment / shared
	    word checks are only tried when ``allow_fuzzy`` is true.
	    """
	    if not line_key:
	        return None

	    # Exact matches first, so "Résultat complémentaire" is never swallowed by
	    # an earlier, shorter "Résultat" template.
	    for title, key in template_keys:
	        if key == line_key:
	            return title

	    if not allow_fuzzy:
	        return None

	    for title, key in template_keys:
	        if not key:
	            # An empty key is contained in every string; never match on it.
	            continue
	        if line_key in key or key in line_key:
	            return title
	        # Shared-word check; very short words ("et", "de", ...) are skipped
	        # because they occur inside unrelated words.
	        if any(len(word) >= 4 and word in line_key for word in key.split()):
	            return title
	    return None


	def fix_section_names(content: str, template_sections: List[str]) -> str:
	    """Post-process the generated content to ensure exact section names are used.

	    Header lines of *content* are mapped onto the titles in
	    *template_sections* (ignoring case, accents, colons and spacing) and the
	    text is rebuilt with the template's exact titles.

	    Args:
	        content: Text produced by the model.
	        template_sections: Exact section titles taken from the template.

	    Returns:
	        * *content* unchanged when it is blank, shorter than 50 characters
	          after stripping, or when no template sections are given;
	        * for a single template section: that title, a newline, and the
	          stripped content (a leading line that already is the title is
	          removed so it is not duplicated);
	        * otherwise every recognised section, in order of first appearance,
	          as its exact title followed by its text and an empty line.  Text
	          belonging to a title that occurs several times is concatenated.
	          Blank lines and any text preceding the first recognised header are
	          not kept.  If no section is recognised, the whole stripped content
	          is placed under the first template title.
	    """
	    import re

	    stripped_content = content.strip()

	    # If content is empty or very short, return the original content
	    if len(stripped_content) < 50:
	        return content

	    # Without template titles there is nothing to map onto; keep the text
	    # rather than returning an empty string.
	    if not template_sections:
	        return content

	    # If there's only one template section, put all content in that section
	    if len(template_sections) == 1:
	        only_section = template_sections[0]
	        only_key = _normalize_section_name(only_section)
	        first_line, _, remainder = stripped_content.partition('\n')
	        remainder = remainder.strip()
	        # Drop a header the model already wrote, otherwise it appears twice.
	        if only_key and remainder and _normalize_section_name(first_line) == only_key:
	            stripped_content = remainder
	        return f"{only_section}\n{stripped_content}"

	    # A line made only of letters/spaces with an optional trailing colon is a
	    # header candidate for the fuzzy comparison.
	    section_pattern = re.compile(r'^([A-Za-zÀ-ÿ\s]+:?)\s*$', re.IGNORECASE)
	    template_keys = [
	        (title, _normalize_section_name(title)) for title in template_sections
	    ]

	    chunks_by_section = {}  # exact title -> list of non-empty text chunks

	    def _store(title, lines):
	        text = '\n'.join(lines).strip()
	        chunks = chunks_by_section.setdefault(title, [])
	        if text:
	            chunks.append(text)

	    current_section = None
	    current_lines = []

	    for raw_line in content.split('\n'):
	        line = raw_line.strip()
	        if not line:
	            continue

	        matched_section = _match_template_section(
	            _normalize_section_name(line),
	            template_keys,
	            allow_fuzzy=section_pattern.match(line) is not None,
	        )

	        if matched_section is not None:
	            if current_section is not None:
	                _store(current_section, current_lines)
	            current_section = matched_section  # Use exact template section name
	            current_lines = []
	        elif current_section is not None:
	            current_lines.append(line)

	    # Add last section (only when it actually collected some text)
	    if current_section is not None and current_lines:
	        _store(current_section, current_lines)

	    # If no sections were found, put all content in the first template section
	    if not chunks_by_section:
	        chunks_by_section[template_sections[0]] = [stripped_content]

	    # Reconstruct the content with exact section names
	    fixed_content = []
	    for section_name, chunks in chunks_by_section.items():
	        fixed_content.append(section_name)
	        section_text = '\n'.join(chunks)
	        if section_text:
	            fixed_content.append(section_text)
	        fixed_content.append("")

	    return "\n".join(fixed_content)