pipeline2 / annotation.py
Nourhenem's picture
initial commit
f92da22 verified
raw
history blame
14.2 kB
import json
import logging
import os

from dotenv import load_dotenv
from openai import AzureOpenAI
# load_dotenv() runs before the lookups below so that settings kept in a local
# .env file (python-dotenv) are present in the process environment.
load_dotenv()

# Azure OpenAI connection settings. Each one is None when the variable is
# unset, except the API version, which falls back to a pinned default.
AZURE_OPENAI_KEY = os.environ.get("AZURE_OPENAI_KEY")
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
# Deployment name; it is what gets passed as `model` on chat completion calls.
AZURE_OPENAI_DEPLOYMENT = os.environ.get("AZURE_OPENAI_DEPLOYMENT")
AZURE_OPENAI_API_VERSION = os.environ.get(
    "AZURE_OPENAI_API_VERSION", "2024-12-01-preview"
)

# Single module-level client shared by every request made from this file.
client = AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_version=AZURE_OPENAI_API_VERSION,
    api_key=AZURE_OPENAI_KEY,
)
# Fields every annotation is expected to carry, in the order the prompt lists them.
_ENTITY_FIELDS = (
    "exam_types",
    "specialties",
    "anatomical_regions",
    "pathologies",
    "procedures",
    "measurements",
    "medications",
    "symptoms",
)


def _empty_entities() -> dict:
    """Return a fresh result in which every expected field is an empty list."""
    return {field: [] for field in _ENTITY_FIELDS}


def _parse_entities(content) -> dict:
    """Convert the raw model reply into an entity dict without ever raising.

    Args:
        content: The ``message.content`` of the chat completion; may be
            ``None`` or any text.

    Returns:
        The parsed JSON object with every field of ``_ENTITY_FIELDS`` present
        (missing or null fields become ``[]``), or an all-empty result when no
        JSON object can be recovered from the reply.
    """
    candidates = []
    if isinstance(content, str):
        payload = content.strip()
        candidates.append(payload)
        # Replies are sometimes wrapped in a Markdown code fence or surrounded
        # by prose; as a second attempt, parse only the outermost {...} span.
        start, end = payload.find("{"), payload.rfind("}")
        if 0 <= start < end:
            candidates.append(payload[start:end + 1])

    parsed = None
    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        break

    if not isinstance(parsed, dict):
        # The reply text is deliberately not logged: it may echo report content.
        logging.getLogger(__name__).warning(
            "Model reply was not a JSON object; returning empty entity lists."
        )
        return _empty_entities()

    # Guarantee the full schema while keeping the key order the model produced
    # and any additional keys it returned.
    for field in _ENTITY_FIELDS:
        if parsed.get(field) is None:
            parsed[field] = []
    return parsed


def extract_medical_entities(text: str) -> dict:
    """Ask the Azure OpenAI deployment to extract medical entities from *text*.

    The report text is embedded in a fixed NER prompt and sent as a single
    user message through the module-level ``client``.

    Args:
        text: Medical report text to annotate.

    Returns:
        A dict containing at least the keys ``exam_types``, ``specialties``,
        ``anatomical_regions``, ``pathologies``, ``procedures``,
        ``measurements``, ``medications`` and ``symptoms``. When the reply is
        empty, is not valid JSON, or is not a JSON object, every field is an
        empty list.

    Raises:
        Errors raised by the client call itself are not caught here.
    """
    # The prompt body is flush-left on purpose: any indentation added here
    # would become part of the text sent to the model.
    prompt = f""" You are a medical NER expert. Your task is to extract relevant entities from the given medical report text and return them in a JSON object.
Analyze the text carefully and identify the following fields:
- "exam_types": any type of medical test, examination, or diagnostic method performed on the patient.
- "specialties": the branch of medicine or medical discipline relevant to the report.
- "anatomical_regions": specific parts or regions of the body mentioned in the report.
- "pathologies": diagnosed diseases, disorders, or abnormal medical conditions noted in the report.
- "procedures": medical interventions, treatments, or actions performed on the patient.
- "measurements": numerical values or quantities recorded in the report, such as vital signs, lab results, sizes, or pressures.
- "medications": drugs, therapies, or prescribed substances mentioned in the report.
- "symptoms": patient-experienced signs or observable indications of a health issue.
Text to analyze:
\"\"\"
{text}
\"\"\"
Return ONLY a valid JSON object with all fields. If a field has no values, return an empty list.
"""
    response = client.chat.completions.create(
        model=AZURE_OPENAI_DEPLOYMENT,
        messages=[{"role": "user", "content": prompt}],
        # NOTE: temperature=0 and max_tokens=1024 were present but disabled in
        # the original call, so the request uses the service defaults.
    )
    return _parse_entities(response.choices[0].message.content)
import json
def save_annotation(text: str, labels: dict, output_file="dataset.jsonl"):
    """Append one annotated sample to a JSON Lines file.

    The record ``{"text": text, "labels": labels}`` is serialised onto a single
    line, with non-ASCII characters written as-is rather than escaped, and
    added to the end of *output_file*.

    Args:
        text: The source text of the sample.
        labels: The annotation to store alongside the text.
        output_file: Path of the file to append to; opened in append mode.
    """
    with open(output_file, mode="a", encoding="utf-8") as sink:
        payload = json.dumps({"text": text, "labels": labels}, ensure_ascii=False)
        sink.write(f"{payload}\n")
def main(input_folder: str = "data_txt", output_file: str = "dataset.json") -> None:
    """Annotate every ``.txt`` file in *input_folder* and store the results.

    Each file is read, stripped, sent to ``extract_medical_entities`` and the
    resulting record is appended to *output_file* via ``save_annotation``.

    Args:
        input_folder: Directory containing the ``.txt`` reports to annotate.
        output_file: Destination file; it is emptied before the run starts.
            NOTE(review): records are written one JSON object per line, so
            the ``.json`` default is kept only for compatibility with the
            original script.

    Raises:
        FileNotFoundError: If *input_folder* does not exist. This is raised
            before *output_file* is emptied, so earlier results survive.
    """
    # List the inputs first so that a missing folder cannot wipe a previous
    # dataset. Sort them because os.listdir returns entries in arbitrary order
    # and the record order should be reproducible.
    filenames = sorted(
        name for name in os.listdir(input_folder) if name.endswith(".txt")
    )

    # save_annotation appends, so start from an empty output file.
    with open(output_file, "w", encoding="utf-8"):
        pass

    for filename in filenames:
        file_path = os.path.join(input_folder, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            transcription = f.read().strip()
        print(f"\n=== Processing {filename} ===")
        entities = extract_medical_entities(transcription)
        save_annotation(transcription, entities, output_file=output_file)
        # The original message ran the two names together with no separator.
        print(f"✅ Saved {filename} → {output_file}")


if __name__ == "__main__":
    main()
"""
if __name__ == "__main__":
input_folder = "data_txt" # 📂 folder containing your .txt files
output_file = "dataset.json"
# Liste des fichiers à exclure
excluded_files = {
"template7.txt",
"template1167.txt",
"template429.txt",
"template401.txt",
"template367.txt",
"template415.txt",
"template398.txt",
"template1198.txt",
"template159.txt",
"template165.txt",
"template1107.txt",
"template449.txt",
"template1113.txt",
"template313.txt",
"template475.txt",
"template461.txt",
"template307.txt",
"template893.txt",
"template139.txt",
"template887.txt",
"template677.txt",
"template111.txt",
"template105.txt",
"template663.txt",
"template688.txt",
"template850.txt",
"template844.txt",
"template878.txt",
"template16.txt",
"template703.txt",
"template717.txt",
"template924.txt",
"template930.txt",
"template918.txt",
"template1073.txt",
"template529.txt",
"template1067.txt",
"template267.txt",
"template501.txt",
"template515.txt",
"template273.txt",
"template298.txt",
"template1098.txt",
"template1099.txt",
"template299.txt",
"template514.txt",
"template272.txt",
"template266.txt",
"template500.txt",
"template528.txt",
"template1066.txt",
"template1072.txt",
"template919.txt",
"template931.txt",
"template925.txt",
"template716.txt",
"template702.txt",
"template879.txt",
"template845.txt",
"template851.txt",
"template689.txt",
"template104.txt",
"template662.txt",
"template676.txt",
"template110.txt",
"template138.txt",
"template886.txt",
"template892.txt",
"template460.txt",
"template306.txt",
"template312.txt",
"template474.txt",
"template1112.txt",
"template1106.txt",
"template448.txt",
"template338.txt",
"template1110.txt",
"template1104.txt",
"template304.txt",
"template462.txt",
"template476.txt",
"template310.txt",
"template1138.txt",
"template489.txt",
"template884.txt",
"template890.txt",
"template648.txt",
"template660.txt",
"template106.txt",
"template112.txt",
"template674.txt",
"template847.txt",
"template853.txt",
"template728.txt",
"template15.txt",
"template714.txt",
"template29.txt",
"template700.txt",
"template933.txt",
"template927.txt",
"template1064.txt",
"template1070.txt",
"template258.txt",
"template1058.txt",
"template270.txt",
"template516.txt",
"template502.txt",
"template264.txt",
"template503.txt",
"template265.txt",
"template271.txt",
"template1059.txt",
"template517.txt",
"template259.txt",
"template1071.txt",
"template1065.txt",
"template926.txt",
"template932.txt",
"template701.txt",
"template715.txt",
"template28.txt",
"template729.txt",
"template14.txt",
"template852.txt",
"template846.txt",
"template113.txt",
"template675.txt",
"template661.txt",
"template107.txt",
"template649.txt",
"template891.txt",
"template885.txt",
"template488.txt",
"template477.txt",
"template1139.txt",
"template311.txt",
"template305.txt",
"template463.txt",
"template1105.txt",
"template1111.txt",
"template339.txt",
"template467.txt",
"template1129.txt",
"template301.txt",
"template315.txt",
"template473.txt",
"template1115.txt",
"template1101.txt",
"template329.txt",
"template498.txt",
"template103.txt",
"template665.txt",
"template671.txt",
"template117.txt",
"template881.txt",
"template659.txt",
"template895.txt",
"template842.txt",
"template856.txt",
"template711.txt",
"template705.txt",
"template38.txt",
"template10.txt",
"template739.txt",
"template936.txt",
"template922.txt",
"template513.txt",
"template275.txt",
"template261.txt",
"template1049.txt",
"template507.txt",
"template249.txt",
"template1061.txt",
"template1075.txt",
"template1074.txt",
"template1060.txt",
"template248.txt",
"template1048.txt",
"template260.txt",
"template506.txt",
"template512.txt",
"template274.txt",
"template923.txt",
"template937.txt",
"template738.txt",
"template11.txt",
"template704.txt",
"template710.txt",
"template857.txt",
"template843.txt",
"template894.txt",
"template658.txt",
"template880.txt",
"template670.txt",
"template116.txt",
"template102.txt",
"template664.txt",
"template499.txt",
"template328.txt",
"template1100.txt",
"template1114.txt",
"template314.txt",
"template472.txt",
"template466.txt",
"template300.txt",
"template1128.txt",
"template470.txt",
"template316.txt",
"template302.txt",
"template464.txt",
"template1102.txt",
"template1116.txt",
"template458.txt",
"template114.txt",
"template672.txt",
"template666.txt",
"template100.txt",
"template128.txt",
"template896.txt",
"template882.txt",
"template869.txt",
"template855.txt",
"template699.txt",
"template841.txt",
"template706.txt",
"template712.txt",
"template13.txt",
"template909.txt",
"template921.txt",
"template935.txt",
"template504.txt",
"template262.txt",
"template276.txt",
"template510.txt",
"template538.txt",
"template1076.txt",
"template1062.txt",
"template1089.txt",
"template289.txt",
"template288.txt",
"template1088.txt",
"template1063.txt",
"template539.txt",
"template1077.txt",
"template277.txt",
"template511.txt",
"template505.txt",
"template263.txt",
"template934.txt",
"template920.txt",
"template908.txt",
"template12.txt",
"template713.txt",
"template707.txt",
"template840.txt",
"template698.txt",
"template854.txt",
"template868.txt",
"template883.txt",
"template129.txt",
"template897.txt",
"template667.txt",
"template101.txt",
"template115.txt",
"template673.txt",
"template1117.txt",
"template459.txt",
"template1103.txt",
"template303.txt",
"template465.txt",
"template471.txt",
"template317.txt",
"template4.txt",
"template1164.txt",
"template1170.txt",
"template358.txt",
"template416.txt",
"template1158.txt",
"template370.txt",
"template364.txt",
"template402.txt",
"template628.txt",
"template172.txt",
"template614.txt",
"template600.txt",
"template166.txt",
"template833.txt",
"template827.txt",
"template199.txt",
"template61.txt",
"template1212.txt",
"template984.txt",
"template748.txt",
"template990.txt",
"template75.txt",
"template1206.txt",
"template760.txt",
"template774.txt",
"template49.txt",
"template947.txt",
"template953.txt",
"template238.txt",
"template1010.txt",
"template1004.txt",
"template562.txt",
"template204.txt",
"template210.txt",
"template1038.txt",
"template576.txt",
"template589.txt",
"template588.txt",
"template1039.txt",
"template211.txt",
"template577.txt",
"template563.txt",
"template205.txt",
"template1005.txt",
"template1011.txt",
"template239.txt",
"template952.txt",
"template946.txt",
"template775.txt",
"template48.txt",
"template761.txt",
"template991.txt",
"template749.txt",
"template1207.txt",
"template74.txt",
"template1213.txt",
"template60.txt",
"template985.txt",
"template826.txt",
"template198.txt",
"template832.txt",
"template601.txt",
"template167.txt",
"template173.txt",
"template615.txt",
"template629.txt",
"template365.txt",
"template403.txt",
"template417.txt",
"template371.txt",
"template1159.txt",
"template359.txt",
"template1171.txt",
"template1165.txt",
"template5.txt",
"template1173.txt",
"template373.txt"
}
# Ensure output file is empty before starting
open(output_file, "w", encoding="utf-8").close()
processed_count = 0
excluded_count = 0
for filename in os.listdir(input_folder):
if filename.endswith(".txt"):
# Vérifier si le fichier est dans la liste d'exclusion
if filename in excluded_files:
print(f"⏭️ Fichier exclu : {filename}")
excluded_count += 1
continue
file_path = os.path.join(input_folder, filename)
with open(file_path, "r", encoding="utf-8") as f:
transcription = f.read().strip()
print(f"\n=== Processing {filename} ===")
entities = extract_medical_entities(transcription)
# Save results
save_annotation(transcription, entities, output_file=output_file)
print(f"✅ Saved {filename} → {output_file}")
processed_count += 1
print(f"\n📊 Résumé : {processed_count} fichiers traités, {excluded_count} fichiers exclus")
"""