# pipeline2 / transcription_processor.py
# Nourhenem's picture
# initial commit
# f92da22 verified
#!/usr/bin/env python3
"""
Transcription Processor
Handles transcription loading, correction, and medical analysis
"""
import os
import json
from typing import Dict, Any, Tuple
from langchain.tools import tool
from langchain.prompts import ChatPromptTemplate
@tool
def load_transcription(transcription_path: str) -> str:
    """Load and return the raw transcription text from a file."""
    # NOTE(review): LangChain's @tool is expected to use the docstring above as
    # the tool description, so it is kept verbatim; extra documentation lives
    # in comments instead. Confirm before rewording the docstring.
    #
    # transcription_path: path of the text file to read.
    # Returns the file content decoded as UTF-8, stripped of leading and
    # trailing whitespace.
    # Raises FileNotFoundError when the path does not exist.
    path_is_missing = not os.path.exists(transcription_path)
    if path_is_missing:
        message = f"Transcription file not found: {transcription_path}"
        raise FileNotFoundError(message)

    with open(transcription_path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return raw_text.strip()
def load_transcription_with_user_id(transcription_path: str) -> Tuple[str, str]:
    """Load transcription text and user_id from a JSON file.

    The file must contain a JSON object. Its ``transcription`` and
    ``user_id`` keys are both optional.

    Args:
        transcription_path: Path to the UTF-8 encoded JSON file.

    Returns:
        A ``(transcription_text, user_id)`` tuple. A missing or ``null``
        ``transcription`` yields ``''``; a missing or ``null`` ``user_id``
        yields ``'unknown'``. Other values are returned as stored.

    Raises:
        FileNotFoundError: If ``transcription_path`` does not exist.
        ValueError: If the file is not valid JSON (``json.JSONDecodeError``
            is a ``ValueError`` subclass) or its top-level value is not a
            JSON object.
    """
    if not os.path.exists(transcription_path):
        raise FileNotFoundError(
            f"Transcription file not found: {transcription_path}")

    with open(transcription_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # A list/string/number root has no .get(); fail with a clear message
    # instead of an opaque AttributeError.
    if not isinstance(data, dict):
        raise ValueError(
            f"Transcription file must contain a JSON object, "
            f"got {type(data).__name__}: {transcription_path}")

    # dict.get(key, default) only applies the default when the key is absent,
    # so an explicit JSON null would leak None to the caller. Treat null the
    # same as a missing key.
    transcription_text = data.get('transcription')
    if transcription_text is None:
        transcription_text = ''

    user_id = data.get('user_id')
    if user_id is None:
        user_id = 'unknown'

    return transcription_text, user_id
def create_transcription_corrector_chain(llm):
    """Build the chain that corrects raw ASR output of medical dictation.

    Args:
        llm: Object the prompt template is piped into with ``|``.

    Returns:
        The chat prompt template piped into ``llm``. The human message
        contains a ``{transcription}`` placeholder.
    """
    # Prompt text is runtime data sent to the model: keep it byte-for-byte.
    system_instructions = """You are an experienced medical secretary. You will receive a document that is the output of a speech recognition engine (ASR) applied to medical data.
Your task is to correct the document while maintaining maximum fidelity to the original text:
CORRECTION RULES:
- Correct spelling and grammar errors
- Correct incorrect words, especially medication names or disease names
- Correct incorrect dates or addresses
- Do NOT add synonyms, stay as faithful as possible to the original text
- Add line breaks and punctuation when necessary
- List results and analysis descriptions as bullet points
- Replace "la ligne" and "à la ligne" with line breaks in the text
- Replace "point" with a period followed by a line break in the text
NUMBER FORMATTING:
Write numbers in their correct numerical form:
- "1°" for "un degré"
- "3D" for "trois d"
- "200 kg" for "deux cent kilogrammes" or "deux cent kilo"
- "5.423" for "cinq point quatre cent vingt trois"
- "0.5" for "zéro virgule cinq"
- "3 3/4" for "trois et trois quarts"
- "142,015" for "cent quarante deux mille quinze"
- "06 32 16 15 19" for "zéro six trente deux seize quinze dix-neuf"
- "99.50 €" for "quatre vingt dix neuf euros et cinquante centimes"
- "Friday May 15, 2015" for "quinze mai deux mille quinze"
- "20:30" for "vingt heures trente"
- "12:15" for "midi quinze"
- "5:15" for "cinq heures et quart"
- "2:45" for "trois heures moins le quart"
Also support Swiss and Belgian forms:
- "70" for "septante"
- "80" for "huitante"
- "90" for "nonante"
- "77" for "septante sept"
- "81" for "huitante un"
- "95" for "nonante cinq"
Return the corrected text as simple text without explanations or comments. Maintain the original structure and content as much as possible."""
    human_request = (
        "Correct the following medical transcription while maintaining "
        "maximum fidelity to the original text:\n\n{transcription}"
    )

    corrector_prompt = ChatPromptTemplate.from_messages([
        ("system", system_instructions),
        ("human", human_request),
    ])
    corrector_chain = corrector_prompt | llm
    return corrector_chain
def create_medical_analyzer_chain(llm):
    """Build the chain that extracts explicitly stated medical information.

    Args:
        llm: Object the prompt template is piped into with ``|``.

    Returns:
        The chat prompt template piped into ``llm``. The human message
        contains a ``{corrected_transcription}`` placeholder.
    """
    # Prompt text is runtime data sent to the model: keep it byte-for-byte.
    extractor_instructions = """You are a medical information extractor.
Extract and categorize ONLY the medical information that is explicitly mentioned in the transcription.
DO NOT add interpretations, conclusions, or information not present in the text.
DO NOT make assumptions or add medical knowledge.
Simply organize the information that is already there into structured categories.
Focus on measurements, anatomical structures, and findings that are explicitly stated."""
    extraction_request = (
        "Extract and organize ONLY the medical information explicitly "
        "mentioned in this transcription:\n\n{corrected_transcription}"
    )

    messages = [
        ("system", extractor_instructions),
        ("human", extraction_request),
    ]
    analyzer_prompt = ChatPromptTemplate.from_messages(messages)
    return analyzer_prompt | llm
def create_title_generator_chain(llm):
    """Build the chain that proposes a French title for a medical report.

    Args:
        llm: Object the prompt template is piped into with ``|``.

    Returns:
        The chat prompt template piped into ``llm``. The human message
        contains a ``{medical_data}`` placeholder.
    """
    # Prompt text is runtime data sent to the model: keep it byte-for-byte.
    title_instructions = """You are a medical title generator.
Generate a professional medical report title in FRENCH based on the medical data and findings.
The title should be specific to the type of examination and findings.
Return ONLY the title in French, nothing else."""
    title_request = """Generate a medical report title in FRENCH based on this medical data:
{medical_data}
Generate a professional title in French that reflects the type of examination and key findings."""

    messages = [
        ("system", title_instructions),
        ("human", title_request),
    ]
    title_prompt = ChatPromptTemplate.from_messages(messages)
    return title_prompt | llm