Spaces · Runtime error
Commit f2ca0de · add efficiency metrics
Parent: 22a06c1

Files changed:
- app.py (+72 -33)
- requirements.txt (+2 -1)
app.py
CHANGED
@@ -4,6 +4,7 @@ from datasets import load_dataset
 from transformers import AutoTokenizer, AutoModel
 import chromadb
 import gradio as gr
+from sklearn.metrics import precision_score, recall_score, f1_score
 
 # Mean Pooling - Take attention mask into account for correct averaging
 def meanpooling(output, mask):
@@ -11,7 +12,7 @@ def meanpooling(output, mask):
     mask = mask.unsqueeze(-1).expand(embeddings.size()).float()
     return torch.sum(embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)
 
-# Load the
+# Load the dataset
 dataset = load_dataset("thankrandomness/mimic-iii-sample")
 
 # Load the model and tokenizer
@@ -30,36 +31,37 @@ def embed_text(text):
 client = chromadb.Client()
 collection = client.create_collection(name="pubmedbert_matryoshka_embeddings")
 
-# [30 lines removed here; only fragments ("#", "for", ")") survived the page extraction]
+# Function to upsert data into ChromaDB
+def upsert_data(dataset_split):
+    for i, row in enumerate(dataset_split):
+        for note in row['notes']:
+            text = note.get('text', '')
+            annotations_list = []
+
+            for annotation in note.get('annotations', []):
+                try:
+                    code = annotation['code']
+                    code_system = annotation['code_system']
+                    description = annotation['description']
+                    annotations_list.append({"code": code, "code_system": code_system, "description": description})
+                except KeyError as e:
+                    print(f"Skipping annotation due to missing key: {e}")
+
+            if text and annotations_list:
+                embeddings = embed_text([text])[0]
+
+                # Upsert data, embeddings, and annotations into ChromaDB
+                for j, annotation in enumerate(annotations_list):
+                    collection.upsert(
+                        ids=[f"note_{note['note_id']}_{j}"],
+                        embeddings=[embeddings],
+                        metadatas=[annotation]
+                    )
+            else:
+                print(f"Skipping note {note['note_id']} due to missing 'text' or 'annotations'")
+
+# Upsert training data
+upsert_data(dataset['train'])
 
 # Define retrieval function
 def retrieve_relevant_text(input_text):
@@ -81,6 +83,33 @@ def retrieve_relevant_text(input_text):
         })
     return output
 
+# Evaluate retrieval efficiency on the validation/test set
+def evaluate_efficiency(dataset_split):
+    y_true = []
+    y_pred = []
+    for i, row in enumerate(dataset_split):
+        for note in row['notes']:
+            text = note.get('text', '')
+            annotations_list = [annotation['code'] for annotation in note.get('annotations', []) if 'code' in annotation]
+
+            if text and annotations_list:
+                retrieved_results = retrieve_relevant_text(text)
+                retrieved_codes = [result['code'] for result in retrieved_results]
+
+                # Ground truth
+                y_true.extend(annotations_list)
+                # Predictions
+                y_pred.extend(retrieved_codes[:len(annotations_list)])  # Assuming we compare the top-k results
+
+    precision = precision_score(y_true, y_pred, average='macro')
+    recall = recall_score(y_true, y_pred, average='macro')
+    f1 = f1_score(y_true, y_pred, average='macro')
+
+    return precision, recall, f1
+
+# Calculate retrieval efficiency metrics
+precision, recall, f1 = evaluate_efficiency(dataset['validation'])
+
 # Gradio interface
 def gradio_interface(input_text):
     results = retrieve_relevant_text(input_text)
@@ -88,7 +117,17 @@ def gradio_interface(input_text):
         f"Similarity Score: {result['similarity_score']:.2f}, Code: {result['code']}, Description: {result['description']}"
         for result in results
     ]
-    [removed line; content not recoverable from the page extraction]
+    metrics = f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}"
+    return formatted_results, metrics
+
+interface = gr.Interface(
+    fn=gradio_interface,
+    inputs="text",
+    outputs=["text", "text"],
+    live=True
+)
+
+# Display retrieval efficiency metrics
+print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")
 
-interface = gr.Interface(fn=gradio_interface, inputs="text", outputs="text")
 interface.launch()
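Note: retrieve_relevant_text is not touched by this commit, so the diff elides its body. For orientation, here is a minimal hypothetical sketch of what a retrieval function over this collection could look like; it is an assumption, not the file's actual code, and n_results=5 plus the distance-to-similarity conversion are placeholders.

# Hypothetical sketch only; the real retrieve_relevant_text is elided from the diff.
# Assumes embed_text() and collection exist as defined in app.py above.
def retrieve_relevant_text_sketch(input_text):
    query_embedding = embed_text([input_text])[0]
    # chromadb query: returns per-query lists of ids, distances, and metadatas
    results = collection.query(query_embeddings=[query_embedding], n_results=5)  # n_results is a guess
    output = []
    for distance, metadata in zip(results["distances"][0], results["metadatas"][0]):
        output.append({
            "similarity_score": 1 - distance,  # placeholder conversion; actual scoring may differ
            "code": metadata["code"],
            "description": metadata["description"],
        })
    return output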
requirements.txt
CHANGED
@@ -2,4 +2,5 @@ torch
 transformers
 chromadb
 gradio
-numpy
+numpy
+scikit-learn