note-ner-demo

Sleeping

App Files Files Community

andrewgleave committed on May 25, 2023

Commit

38788ba

1 Parent(s): 557942e

Add simple process script

Browse files

Files changed (1) hide show

process.py +61 -0

process.py ADDED Viewed

	@@ -0,0 +1,61 @@

+import argparse
+import csv
+import json
+from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
+# Checkpoint identifier passed to both `from_pretrained` calls below.
+MODEL = "d4data/biomedical-ner-all"
+tokenizer = AutoTokenizer.from_pretrained(MODEL)
+model = AutoModelForTokenClassification.from_pretrained(MODEL)
+# Shared NER pipeline, built once at import time and reused for every note.
+# NOTE(review): aggregation_strategy="simple" presumably yields grouped
+# entities (the consumer reads an "entity_group" key) - confirm against the
+# transformers pipeline documentation.
+pipe = pipeline(
+    task="ner",
+    model=model,
+    tokenizer=tokenizer,
+    aggregation_strategy="simple",
+)
+def process(*args):
+    """Run the NER pipeline over a CSV of notes and write the results as JSON.
+
+    Each CSV row must provide a ``text`` column (the note that is passed to
+    ``pipe``) and a ``score`` column (copied through unchanged). The output
+    file holds a JSON list with one object per row, containing the text, the
+    row's score and the entities returned by the pipeline.
+
+    Args:
+        *args: Command-line style arguments, e.g.
+            ``"--notes", "notes.csv", "--out", "out.json"``. When no
+            arguments are passed, they are read from ``sys.argv`` instead.
+
+    Raises:
+        ValueError: If ``--notes`` does not end in ``.csv`` or ``--out`` does
+            not end in ``.json``.
+        KeyError: If a row has no ``text`` or ``score`` column.
+        SystemExit: Raised by ``argparse`` when required options are missing.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--notes', help='Notes CSV', required=True)
+    parser.add_argument('--out', help='Output', required=True)
+    # Parse the arguments the caller actually supplied. An empty tuple becomes
+    # None so argparse falls back to sys.argv[1:], which keeps a bare
+    # `process()` call working exactly as before.
+    options = parser.parse_args(list(args) or None)
+    filepath = options.notes
+    outpath = options.out
+    if not filepath.endswith(".csv"):
+        raise ValueError("Filepath must be a .csv file.")
+    if not outpath.endswith(".json"):
+        raise ValueError("Output path must be a .json file.")
+    processed = []
+    # newline="" lets the csv module handle line endings itself, so quoted
+    # fields containing line breaks are parsed correctly. "utf-8-sig" reads
+    # plain UTF-8 and also strips a leading BOM, which would otherwise end up
+    # in the first header name and break the row["text"] lookup.
+    with open(filepath, "r", newline="", encoding="utf-8-sig") as f:
+        reader = csv.DictReader(f)
+        for row in reader:
+            text = row["text"]
+            raw = pipe(text)
+            # Keep only the fields needed downstream; float() converts the
+            # pipeline's score into a plain float before JSON serialisation.
+            ner_content = {
+                "text": text,
+                "score": row["score"],
+                "entities": [
+                    {
+                        "entity": x["entity_group"],
+                        "word": x["word"],
+                        "score": float(x["score"]),
+                        "start": x["start"],
+                        "end": x["end"],
+                    }
+                    for x in raw
+                ],
+            }
+            processed.append(ner_content)
+    # Written only after every row has been processed, so a failure part-way
+    # through the CSV does not leave a truncated output file behind.
+    with open(outpath, "w", encoding="utf-8") as f:
+        json.dump(processed, f)
+if __name__ == "__main__":
+    import sys
+    # Hand every command-line token after the script name to process().
+    cli_args = sys.argv[1:]
+    process(*cli_args)