Update evaluate.py

evaluate.py  +28 -3  CHANGED
@@ -197,8 +197,9 @@ def load_test_fixtures():
     #candidates = [env_path] if env_path else [str(default_fixture_file)]

     # --- END: DEFINITIVE FIX ---
-    candidates = [env_path] if env_path else ["conversation_test_fixtures_v10.jsonl"]
+    # candidates = [env_path] if env_path else ["conversation_test_fixtures_v10.jsonl"]
     # candidates = [env_path] if env_path else ["small_test_cases_v10.jsonl"]
+    candidates = [env_path] if env_path else ["Test_syn_caregiving_patient.jsonl"]

     path = next((p for p in candidates if p and os.path.exists(p)), None)
     if not path:
@@ -206,8 +207,9 @@ def load_test_fixtures():
         return

     # Use the corrected v10 file if available
-    if "conversation_test_fixtures_v10.jsonl" in path:
+    # if "conversation_test_fixtures_v10.jsonl" in path:
     # if "small_test_cases_v10.jsonl" in path:
+    if "Test_syn_caregiving_patient.jsonl" in path:
         print(f"Using corrected test fixtures: {path}")

     with open(path, "r", encoding="utf-8") as f:
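For reference, the fixture selection above is a "first existing file wins" pattern: a path supplied from the environment takes priority, otherwise the hard-coded default is tried. A minimal, self-contained sketch of the same logic (the FIXTURES_PATH variable name is an assumption for illustration; this diff does not show where env_path is actually read):

    import os

    # Assumed env var name for illustration; the real script sets env_path upstream.
    env_path = os.getenv("FIXTURES_PATH")
    candidates = [env_path] if env_path else ["Test_syn_caregiving_patient.jsonl"]

    # Pick the first candidate that is non-empty and exists on disk, otherwise None.
    path = next((p for p in candidates if p and os.path.exists(p)), None)
    print(path or "no fixture file found")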
@@ -233,6 +235,7 @@ def evaluate_nlu_tags(expected: Dict[str, Any], actual: Dict[str, Any], tag_key:
     f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
     return {"precision": precision, "recall": recall, "f1_score": f1_score}

+
 def _parse_judge_json(raw_str: str) -> dict | None:
     try:
         start_brace = raw_str.find('{')
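The f1_score line above is the usual harmonic mean of precision and recall, guarded so that 0/0 yields 0.0 instead of raising. A quick worked check with arbitrary values:

    precision, recall = 0.8, 0.5
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
    print(round(f1_score, 3))  # 0.615, the harmonic mean sits closer to the lower of the two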
@@ -272,6 +275,21 @@ def _classify_error(gt: str, gen: str) -> str:
         return "omission"
     return "contradiction"

+# New Test Metric
+def calculate_recall_at_k(retrieved_docs: List[str], expected_sources: set, k: int) -> float:
+    """Calculates the fraction of relevant docs found in the top K results."""
+    top_k_docs = set(retrieved_docs[:k])
+    expected_set = set(expected_sources)
+
+    if not expected_set:
+        return 1.0  # If there are no expected docs, recall is trivially perfect.
+
+    found_count = len(top_k_docs.intersection(expected_set))
+    total_relevant = len(expected_set)
+
+    return found_count / total_relevant if total_relevant > 0 else 0.0
+
+
 ## NEW
 # In evaluate.py
 def run_comprehensive_evaluation(
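A short usage sketch for the new metric, assuming the calculate_recall_at_k definition above is in scope and using made-up document IDs, covering both the normal case and the empty-expected-set edge case:

    retrieved = ["doc_a", "doc_b", "doc_c", "doc_d", "doc_e", "doc_f"]
    expected = {"doc_b", "doc_f"}

    # Only doc_b lands in the top 5, so Recall@5 is 1/2.
    print(calculate_recall_at_k(retrieved, expected, 5))  # 0.5

    # With no expected sources, the metric returns 1.0 rather than dividing by zero.
    print(calculate_recall_at_k(retrieved, set(), 5))     # 1.0

Returning 1.0 for fixtures with no gold sources keeps them from dragging the averaged Recall@5 down; the alternative would be to skip such fixtures when aggregating.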
@@ -449,6 +467,11 @@ def run_comprehensive_evaluation(
         except Exception as e:
             print(f"ERROR during faithfulness judging: {e}")

+
+        # --- ADD THIS LINE TO CALCULATE RECALL@5 ---
+        recall_at_5 = calculate_recall_at_k(raw_sources, expected_sources_set, 5)
+        # --- END OF ADDITION ---
+
         sources_pretty = ", ".join(sorted(s)) if (s:=actual_sources_set) else ""
         results.append({
             "test_id": fx.get("test_id", "N/A"), "title": fx.get("title", "N/A"),
@@ -460,6 +483,7 @@ def run_comprehensive_evaluation(
             "faithfulness": faithfulness, "hallucination_rate": hallucination_rate,
             "answer_correctness": answer_correctness_score,
             "category": category, "error_class": error_class,
+            "recall_at_5": recall_at_5,  # <-- ADD THIS LINE
             "latency_ms": latency_ms
         })

@@ -475,7 +499,7 @@ def run_comprehensive_evaluation(
         "context_precision", "context_recall",
         "faithfulness", "hallucination_rate",
         "answer_correctness",
-        "category", "error_class", "latency_ms",
+        "category", "error_class", "latency_ms", "recall_at_5",  # <-- ADD recall_at_5 HERE
     ]
     df = df[[c for c in cols if c in df.columns]]

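The df = df[[c for c in cols if c in df.columns]] line keeps the report from breaking when a metric column was never produced: it selects, in the preferred order, only those listed columns that actually exist, and anything not listed is left out of the table. A toy illustration (data invented for the example):

    import pandas as pd

    df = pd.DataFrame({"latency_ms": [120], "recall_at_5": [0.5], "extra_debug": ["x"]})
    cols = ["test_id", "recall_at_5", "latency_ms"]  # "test_id" happens to be missing here

    df = df[[c for c in cols if c in df.columns]]
    print(list(df.columns))  # ['recall_at_5', 'latency_ms']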
@@ -506,6 +530,7 @@ def run_comprehensive_evaluation(
 - **Context F1 (avg)**: {cf1_mean:.2f}%
 - **RAG: Context Precision**: {(to_f(df["context_precision"]).mean() * 100):.1f}%
 - **RAG: Context Recall**: {(to_f(df["context_recall"]).mean() * 100):.1f}%
+- **RAG: Recall@5**: {(to_f(df["recall_at_5"]).mean() * 100):.1f}%
 - **RAG Answers w/ Sources**: {rag_with_sources_pct:.1f}%
 - **RAG: Hallucination Rate**: {halluc_mean:.1f}% (Lower is better)
 - **RAG: Answer Correctness (LLM-judge)**: {(to_f(df["answer_correctness"]).mean() * 100):.1f}%
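The lines above sit inside the markdown summary template: each metric column is coerced to float, averaged, and scaled to a percentage. to_f itself is not part of this diff; a plausible minimal stand-in, assuming it is a pandas numeric-coercion helper, looks like this:

    import pandas as pd

    # Hypothetical stand-in for to_f (not shown in this diff): coerce a column to float,
    # turning non-numeric entries into NaN so that .mean() skips them.
    def to_f(series: pd.Series) -> pd.Series:
        return pd.to_numeric(series, errors="coerce")

    df = pd.DataFrame({"recall_at_5": [0.5, 1.0, "N/A"]})
    print(f"- **RAG: Recall@5**: {(to_f(df['recall_at_5']).mean() * 100):.1f}%")  # 75.0%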