|
|
import json
import logging
import os

from dotenv import load_dotenv
from openai import AzureOpenAI
|
|
|
|
|
# Pull settings from a local .env file (python-dotenv) before reading them.
load_dotenv()

# Azure OpenAI connection settings, read from the process environment.
# Each value is None when its variable is not set.
AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT")
# The API version is the only setting with a built-in fallback.
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "2024-12-01-preview")

# Module-level client shared by every call to extract_medical_entities().
client = AzureOpenAI(
    api_key=AZURE_OPENAI_KEY,
    api_version=AZURE_OPENAI_API_VERSION,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
)
|
|
|
|
|
_logger = logging.getLogger(__name__)

# Fields every annotation must contain, in the order the prompt lists them.
_ENTITY_FIELDS = (
    "exam_types",
    "specialties",
    "anatomical_regions",
    "pathologies",
    "procedures",
    "measurements",
    "medications",
    "symptoms",
)


def _empty_entities() -> dict:
    """Return a fresh annotation with every expected field set to an empty list."""
    return {field: [] for field in _ENTITY_FIELDS}


def _strip_code_fence(raw: str) -> str:
    """Remove a surrounding Markdown code fence (e.g. ```json ... ```) if present."""
    cleaned = raw.strip()
    if not cleaned.startswith("```"):
        return cleaned
    # Drop the opening fence line, which may carry a language tag such as "json".
    _, _, cleaned = cleaned.partition("\n")
    cleaned = cleaned.rstrip()
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    return cleaned.strip()


def _parse_entities(content) -> dict:
    """Decode the model reply into an annotation dict with all expected fields."""
    if not content:
        # The reply text can be missing (None) or empty; json.loads would
        # raise TypeError on None, which the old code did not catch.
        _logger.warning("Model returned no content; using empty annotation.")
        return _empty_entities()

    try:
        parsed = json.loads(_strip_code_fence(content))
    except json.JSONDecodeError:
        _logger.warning("Model reply is not valid JSON; using empty annotation.")
        return _empty_entities()

    if not isinstance(parsed, dict):
        # Valid JSON, but not the object the prompt asked for (e.g. a list).
        _logger.warning("Model reply is not a JSON object; using empty annotation.")
        return _empty_entities()

    # Guarantee the documented schema: missing or null fields become [].
    # Any extra keys the model returned are kept untouched.
    for field in _ENTITY_FIELDS:
        if parsed.get(field) is None:
            parsed[field] = []
    return parsed


def extract_medical_entities(text: str) -> dict:
    """Ask the Azure OpenAI deployment to extract medical entities from a report.

    The report is embedded in a fixed NER prompt and sent as a single user
    message to ``AZURE_OPENAI_DEPLOYMENT`` through the module-level ``client``.
    The text of the first choice is parsed as JSON.

    Args:
        text: Medical report text to analyse.

    Returns:
        A dict containing at least the keys ``exam_types``, ``specialties``,
        ``anatomical_regions``, ``pathologies``, ``procedures``,
        ``measurements``, ``medications`` and ``symptoms``. When the reply is
        missing, is not valid JSON, or is not a JSON object, every field is an
        empty list and a warning is logged.

    Raises:
        Whatever ``client.chat.completions.create`` raises; request errors are
        not caught here.
    """
    prompt = f""" You are a medical NER expert. Your task is to extract relevant entities from the given medical report text and return them in a JSON object.

Analyze the text carefully and identify the following fields:

- "exam_types": any type of medical test, examination, or diagnostic method performed on the patient.
- "specialties": the branch of medicine or medical discipline relevant to the report.
- "anatomical_regions": specific parts or regions of the body mentioned in the report.
- "pathologies": diagnosed diseases, disorders, or abnormal medical conditions noted in the report.
- "procedures": medical interventions, treatments, or actions performed on the patient.
- "measurements": numerical values or quantities recorded in the report, such as vital signs, lab results, sizes, or pressures.
- "medications": drugs, therapies, or prescribed substances mentioned in the report.
- "symptoms": patient-experienced signs or observable indications of a health issue.

Text to analyze:
\"\"\"
{text}
\"\"\"

Return ONLY a valid JSON object with all fields. If a field has no values, return an empty list.
"""

    response = client.chat.completions.create(
        model=AZURE_OPENAI_DEPLOYMENT,
        messages=[{"role": "user", "content": prompt}],
    )

    # Guard against an empty choices list instead of failing with IndexError.
    choices = response.choices
    content = choices[0].message.content if choices else None
    return _parse_entities(content)
|
|
import json |
|
|
|
|
|
def save_annotation(text: str, labels: dict, output_file: str = "dataset.jsonl") -> None:
    """Append one annotated example to a JSON Lines file.

    The record ``{"text": text, "labels": labels}`` is serialised as a single
    JSON object on its own line. The file is opened in append mode, so earlier
    records are kept and the file is created if it does not exist.

    Args:
        text: The source text that was annotated.
        labels: The annotation to store alongside the text.
        output_file: Path of the file to append to.
    """
    record = {
        "text": text,
        "labels": labels,
    }

    with open(output_file, "a", encoding="utf-8") as f:
        # ensure_ascii=False keeps accented characters readable in the file.
        f.write(json.dumps(record, ensure_ascii=False) + "\n")
|
|
|
|
|
|
|
|
def main(input_folder: str = "data_txt", output_file: str = "dataset.json") -> None:
    """Annotate every ``.txt`` report in a folder and write the results to a file.

    Each report is read, passed to ``extract_medical_entities`` and appended to
    ``output_file`` with ``save_annotation``. The output file is emptied at the
    start of the run.

    Args:
        input_folder: Directory containing the ``.txt`` reports.
        output_file: Destination file, rewritten from scratch on every run.

    Raises:
        OSError: If the input folder or a report cannot be read, or the output
            file cannot be written.
    """
    # NOTE(review): save_annotation writes one JSON object per line (JSON
    # Lines) even though the default name ends in ".json" — confirm that
    # downstream readers expect that layout.

    # List the inputs BEFORE truncating the output, so a missing or unreadable
    # folder does not wipe the results of a previous run. Sorting makes the
    # record order reproducible (os.listdir order is arbitrary).
    report_names = sorted(
        name for name in os.listdir(input_folder) if name.endswith(".txt")
    )

    # Start from an empty output file; save_annotation appends to it.
    with open(output_file, "w", encoding="utf-8"):
        pass

    for filename in report_names:
        file_path = os.path.join(input_folder, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            transcription = f.read().strip()

        if not transcription:
            # Nothing to annotate: avoid a pointless API call and an empty record.
            print(f"⏭️ Skipped {filename}: file is empty")
            continue

        print(f"\n=== Processing {filename} ===")
        entities = extract_medical_entities(transcription)

        save_annotation(transcription, entities, output_file=output_file)

        print(f"✅ Saved {filename} → {output_file}")


if __name__ == "__main__":
    main()
|
|
""" |
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
input_folder = "data_txt" # 📂 folder containing your .txt files |
|
|
output_file = "dataset.json" |
|
|
|
|
|
# Liste des fichiers à exclure |
|
|
excluded_files = { |
|
|
"template7.txt", |
|
|
"template1167.txt", |
|
|
"template429.txt", |
|
|
"template401.txt", |
|
|
"template367.txt", |
|
|
"template415.txt", |
|
|
"template398.txt", |
|
|
"template1198.txt", |
|
|
"template159.txt", |
|
|
"template165.txt", |
|
|
"template1107.txt", |
|
|
"template449.txt", |
|
|
"template1113.txt", |
|
|
"template313.txt", |
|
|
"template475.txt", |
|
|
"template461.txt", |
|
|
"template307.txt", |
|
|
"template893.txt", |
|
|
"template139.txt", |
|
|
"template887.txt", |
|
|
"template677.txt", |
|
|
"template111.txt", |
|
|
"template105.txt", |
|
|
"template663.txt", |
|
|
"template688.txt", |
|
|
"template850.txt", |
|
|
"template844.txt", |
|
|
"template878.txt", |
|
|
"template16.txt", |
|
|
"template703.txt", |
|
|
"template717.txt", |
|
|
"template924.txt", |
|
|
"template930.txt", |
|
|
"template918.txt", |
|
|
"template1073.txt", |
|
|
"template529.txt", |
|
|
"template1067.txt", |
|
|
"template267.txt", |
|
|
"template501.txt", |
|
|
"template515.txt", |
|
|
"template273.txt", |
|
|
"template298.txt", |
|
|
"template1098.txt", |
|
|
"template1099.txt", |
|
|
"template299.txt", |
|
|
"template514.txt", |
|
|
"template272.txt", |
|
|
"template266.txt", |
|
|
"template500.txt", |
|
|
"template528.txt", |
|
|
"template1066.txt", |
|
|
"template1072.txt", |
|
|
"template919.txt", |
|
|
"template931.txt", |
|
|
"template925.txt", |
|
|
"template716.txt", |
|
|
"template702.txt", |
|
|
"template879.txt", |
|
|
"template845.txt", |
|
|
"template851.txt", |
|
|
"template689.txt", |
|
|
"template104.txt", |
|
|
"template662.txt", |
|
|
"template676.txt", |
|
|
"template110.txt", |
|
|
"template138.txt", |
|
|
"template886.txt", |
|
|
"template892.txt", |
|
|
"template460.txt", |
|
|
"template306.txt", |
|
|
"template312.txt", |
|
|
"template474.txt", |
|
|
"template1112.txt", |
|
|
"template1106.txt", |
|
|
"template448.txt", |
|
|
"template338.txt", |
|
|
"template1110.txt", |
|
|
"template1104.txt", |
|
|
"template304.txt", |
|
|
"template462.txt", |
|
|
"template476.txt", |
|
|
"template310.txt", |
|
|
"template1138.txt", |
|
|
"template489.txt", |
|
|
"template884.txt", |
|
|
"template890.txt", |
|
|
"template648.txt", |
|
|
"template660.txt", |
|
|
"template106.txt", |
|
|
"template112.txt", |
|
|
"template674.txt", |
|
|
"template847.txt", |
|
|
"template853.txt", |
|
|
"template728.txt", |
|
|
"template15.txt", |
|
|
"template714.txt", |
|
|
"template29.txt", |
|
|
"template700.txt", |
|
|
"template933.txt", |
|
|
"template927.txt", |
|
|
"template1064.txt", |
|
|
"template1070.txt", |
|
|
"template258.txt", |
|
|
"template1058.txt", |
|
|
"template270.txt", |
|
|
"template516.txt", |
|
|
"template502.txt", |
|
|
"template264.txt", |
|
|
"template503.txt", |
|
|
"template265.txt", |
|
|
"template271.txt", |
|
|
"template1059.txt", |
|
|
"template517.txt", |
|
|
"template259.txt", |
|
|
"template1071.txt", |
|
|
"template1065.txt", |
|
|
"template926.txt", |
|
|
"template932.txt", |
|
|
"template701.txt", |
|
|
"template715.txt", |
|
|
"template28.txt", |
|
|
"template729.txt", |
|
|
"template14.txt", |
|
|
"template852.txt", |
|
|
"template846.txt", |
|
|
"template113.txt", |
|
|
"template675.txt", |
|
|
"template661.txt", |
|
|
"template107.txt", |
|
|
"template649.txt", |
|
|
"template891.txt", |
|
|
"template885.txt", |
|
|
"template488.txt", |
|
|
"template477.txt", |
|
|
"template1139.txt", |
|
|
"template311.txt", |
|
|
"template305.txt", |
|
|
"template463.txt", |
|
|
"template1105.txt", |
|
|
"template1111.txt", |
|
|
"template339.txt", |
|
|
"template467.txt", |
|
|
"template1129.txt", |
|
|
"template301.txt", |
|
|
"template315.txt", |
|
|
"template473.txt", |
|
|
"template1115.txt", |
|
|
"template1101.txt", |
|
|
"template329.txt", |
|
|
"template498.txt", |
|
|
"template103.txt", |
|
|
"template665.txt", |
|
|
"template671.txt", |
|
|
"template117.txt", |
|
|
"template881.txt", |
|
|
"template659.txt", |
|
|
"template895.txt", |
|
|
"template842.txt", |
|
|
"template856.txt", |
|
|
"template711.txt", |
|
|
"template705.txt", |
|
|
"template38.txt", |
|
|
"template10.txt", |
|
|
"template739.txt", |
|
|
"template936.txt", |
|
|
"template922.txt", |
|
|
"template513.txt", |
|
|
"template275.txt", |
|
|
"template261.txt", |
|
|
"template1049.txt", |
|
|
"template507.txt", |
|
|
"template249.txt", |
|
|
"template1061.txt", |
|
|
"template1075.txt", |
|
|
"template1074.txt", |
|
|
"template1060.txt", |
|
|
"template248.txt", |
|
|
"template1048.txt", |
|
|
"template260.txt", |
|
|
"template506.txt", |
|
|
"template512.txt", |
|
|
"template274.txt", |
|
|
"template923.txt", |
|
|
"template937.txt", |
|
|
"template738.txt", |
|
|
"template11.txt", |
|
|
"template704.txt", |
|
|
"template710.txt", |
|
|
"template857.txt", |
|
|
"template843.txt", |
|
|
"template894.txt", |
|
|
"template658.txt", |
|
|
"template880.txt", |
|
|
"template670.txt", |
|
|
"template116.txt", |
|
|
"template102.txt", |
|
|
"template664.txt", |
|
|
"template499.txt", |
|
|
"template328.txt", |
|
|
"template1100.txt", |
|
|
"template1114.txt", |
|
|
"template314.txt", |
|
|
"template472.txt", |
|
|
"template466.txt", |
|
|
"template300.txt", |
|
|
"template1128.txt", |
|
|
"template470.txt", |
|
|
"template316.txt", |
|
|
"template302.txt", |
|
|
"template464.txt", |
|
|
"template1102.txt", |
|
|
"template1116.txt", |
|
|
"template458.txt", |
|
|
"template114.txt", |
|
|
"template672.txt", |
|
|
"template666.txt", |
|
|
"template100.txt", |
|
|
"template128.txt", |
|
|
"template896.txt", |
|
|
"template882.txt", |
|
|
"template869.txt", |
|
|
"template855.txt", |
|
|
"template699.txt", |
|
|
"template841.txt", |
|
|
"template706.txt", |
|
|
"template712.txt", |
|
|
"template13.txt", |
|
|
"template909.txt", |
|
|
"template921.txt", |
|
|
"template935.txt", |
|
|
"template504.txt", |
|
|
"template262.txt", |
|
|
"template276.txt", |
|
|
"template510.txt", |
|
|
"template538.txt", |
|
|
"template1076.txt", |
|
|
"template1062.txt", |
|
|
"template1089.txt", |
|
|
"template289.txt", |
|
|
"template288.txt", |
|
|
"template1088.txt", |
|
|
"template1063.txt", |
|
|
"template539.txt", |
|
|
"template1077.txt", |
|
|
"template277.txt", |
|
|
"template511.txt", |
|
|
"template505.txt", |
|
|
"template263.txt", |
|
|
"template934.txt", |
|
|
"template920.txt", |
|
|
"template908.txt", |
|
|
"template12.txt", |
|
|
"template713.txt", |
|
|
"template707.txt", |
|
|
"template840.txt", |
|
|
"template698.txt", |
|
|
"template854.txt", |
|
|
"template868.txt", |
|
|
"template883.txt", |
|
|
"template129.txt", |
|
|
"template897.txt", |
|
|
"template667.txt", |
|
|
"template101.txt", |
|
|
"template115.txt", |
|
|
"template673.txt", |
|
|
"template1117.txt", |
|
|
"template459.txt", |
|
|
"template1103.txt", |
|
|
"template303.txt", |
|
|
"template465.txt", |
|
|
"template471.txt", |
|
|
"template317.txt", |
|
|
"template4.txt", |
|
|
"template1164.txt", |
|
|
"template1170.txt", |
|
|
"template358.txt", |
|
|
"template416.txt", |
|
|
"template1158.txt", |
|
|
"template370.txt", |
|
|
"template364.txt", |
|
|
"template402.txt", |
|
|
"template628.txt", |
|
|
"template172.txt", |
|
|
"template614.txt", |
|
|
"template600.txt", |
|
|
"template166.txt", |
|
|
"template833.txt", |
|
|
"template827.txt", |
|
|
"template199.txt", |
|
|
"template61.txt", |
|
|
"template1212.txt", |
|
|
"template984.txt", |
|
|
"template748.txt", |
|
|
"template990.txt", |
|
|
"template75.txt", |
|
|
"template1206.txt", |
|
|
"template760.txt", |
|
|
"template774.txt", |
|
|
"template49.txt", |
|
|
"template947.txt", |
|
|
"template953.txt", |
|
|
"template238.txt", |
|
|
"template1010.txt", |
|
|
"template1004.txt", |
|
|
"template562.txt", |
|
|
"template204.txt", |
|
|
"template210.txt", |
|
|
"template1038.txt", |
|
|
"template576.txt", |
|
|
"template589.txt", |
|
|
"template588.txt", |
|
|
"template1039.txt", |
|
|
"template211.txt", |
|
|
"template577.txt", |
|
|
"template563.txt", |
|
|
"template205.txt", |
|
|
"template1005.txt", |
|
|
"template1011.txt", |
|
|
"template239.txt", |
|
|
"template952.txt", |
|
|
"template946.txt", |
|
|
"template775.txt", |
|
|
"template48.txt", |
|
|
"template761.txt", |
|
|
"template991.txt", |
|
|
"template749.txt", |
|
|
"template1207.txt", |
|
|
"template74.txt", |
|
|
"template1213.txt", |
|
|
"template60.txt", |
|
|
"template985.txt", |
|
|
"template826.txt", |
|
|
"template198.txt", |
|
|
"template832.txt", |
|
|
"template601.txt", |
|
|
"template167.txt", |
|
|
"template173.txt", |
|
|
"template615.txt", |
|
|
"template629.txt", |
|
|
"template365.txt", |
|
|
"template403.txt", |
|
|
"template417.txt", |
|
|
"template371.txt", |
|
|
"template1159.txt", |
|
|
"template359.txt", |
|
|
"template1171.txt", |
|
|
"template1165.txt", |
|
|
"template5.txt", |
|
|
"template1173.txt", |
|
|
"template373.txt" |
|
|
} |
|
|
|
|
|
# Ensure output file is empty before starting |
|
|
open(output_file, "w", encoding="utf-8").close() |
|
|
|
|
|
processed_count = 0 |
|
|
excluded_count = 0 |
|
|
|
|
|
for filename in os.listdir(input_folder): |
|
|
if filename.endswith(".txt"): |
|
|
# Vérifier si le fichier est dans la liste d'exclusion |
|
|
if filename in excluded_files: |
|
|
print(f"⏭️ Fichier exclu : {filename}") |
|
|
excluded_count += 1 |
|
|
continue |
|
|
|
|
|
file_path = os.path.join(input_folder, filename) |
|
|
with open(file_path, "r", encoding="utf-8") as f: |
|
|
transcription = f.read().strip() |
|
|
|
|
|
print(f"\n=== Processing {filename} ===") |
|
|
entities = extract_medical_entities(transcription) |
|
|
|
|
|
# Save results |
|
|
save_annotation(transcription, entities, output_file=output_file) |
|
|
|
|
|
print(f"✅ Saved {filename} → {output_file}") |
|
|
processed_count += 1 |
|
|
|
|
|
print(f"\n📊 Résumé : {processed_count} fichiers traités, {excluded_count} fichiers exclus") |
|
|
""" |