KeenWoo committed (verified)
Commit eb91eac · Parent(s): ffd1c88

Update evaluate.py

Files changed (1):
  1. evaluate.py (+28 −3)
evaluate.py CHANGED
```diff
@@ -197,8 +197,9 @@ def load_test_fixtures():
     #candidates = [env_path] if env_path else [str(default_fixture_file)]
 
     # --- END: DEFINITIVE FIX ---
-    candidates = [env_path] if env_path else ["conversation_test_fixtures_v10.jsonl"]
+    # candidates = [env_path] if env_path else ["conversation_test_fixtures_v10.jsonl"]
     # candidates = [env_path] if env_path else ["small_test_cases_v10.jsonl"]
+    candidates = [env_path] if env_path else ["Test_syn_caregiving_patient.jsonl"]
 
     path = next((p for p in candidates if p and os.path.exists(p)), None)
     if not path:
```
```diff
@@ -206,8 +207,9 @@ def load_test_fixtures():
         return
 
     # Use the corrected v10 file if available
-    if "conversation_test_fixtures_v10.jsonl" in path:
+    # if "conversation_test_fixtures_v10.jsonl" in path:
     # if "small_test_cases_v10.jsonl" in path:
+    if "Test_syn_caregiving_patient.jsonl" in path:
         print(f"Using corrected test fixtures: {path}")
 
     with open(path, "r", encoding="utf-8") as f:
```
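For reference, the two hunks above switch the default fixture file to `Test_syn_caregiving_patient.jsonl` while keeping the first-existing-path selection logic. A minimal standalone sketch of that pattern follows; the environment variable name is a placeholder (the diff does not show where `env_path` comes from), and only the fixture filename is taken from the commit.

```python
import os

# Hypothetical env var name; evaluate.py derives env_path elsewhere (not shown in this diff).
env_path = os.environ.get("TEST_FIXTURES_PATH")

# Same pattern as the hunks above: prefer the env override, else the bundled fixture file.
candidates = [env_path] if env_path else ["Test_syn_caregiving_patient.jsonl"]
path = next((p for p in candidates if p and os.path.exists(p)), None)

if path is None:
    print("No test fixture file found; skipping evaluation.")
else:
    print(f"Using corrected test fixtures: {path}")
```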
```diff
@@ -233,6 +235,7 @@ def evaluate_nlu_tags(expected: Dict[str, Any], actual: Dict[str, Any], tag_key:
     f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
     return {"precision": precision, "recall": recall, "f1_score": f1_score}
 
+
 def _parse_judge_json(raw_str: str) -> dict | None:
     try:
         start_brace = raw_str.find('{')
```
```diff
@@ -272,6 +275,21 @@ def _classify_error(gt: str, gen: str) -> str:
         return "omission"
     return "contradiction"
 
+# New Test Metric
+def calculate_recall_at_k(retrieved_docs: List[str], expected_sources: set, k: int) -> float:
+    """Calculates the fraction of relevant docs found in the top K results."""
+    top_k_docs = set(retrieved_docs[:k])
+    expected_set = set(expected_sources)
+
+    if not expected_set:
+        return 1.0  # If there are no expected docs, recall is trivially perfect.
+
+    found_count = len(top_k_docs.intersection(expected_set))
+    total_relevant = len(expected_set)
+
+    return found_count / total_relevant if total_relevant > 0 else 0.0
+
+
 ## NEW
 # In evaluate.py
 def run_comprehensive_evaluation(
```
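The new `calculate_recall_at_k` helper treats retrieval as a set problem: it looks at the top `k` retrieved document IDs and reports what fraction of the expected sources appear among them. A quick sanity check is sketched below; it assumes `evaluate.py` is importable as a module (module name inferred from the filename), and the document IDs are made up for illustration.

```python
# Hypothetical usage; document IDs are illustrative only.
from evaluate import calculate_recall_at_k

retrieved = ["doc_a", "doc_b", "doc_c", "doc_d", "doc_e", "doc_f"]
expected = {"doc_b", "doc_f"}

# Only doc_b appears in the top 5, so recall@5 = 1/2.
print(calculate_recall_at_k(retrieved, expected, k=5))  # 0.5

# By the function's convention, an empty expected set yields perfect recall.
print(calculate_recall_at_k(retrieved, set(), k=5))     # 1.0
```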
```diff
@@ -449,6 +467,11 @@ def run_comprehensive_evaluation(
         except Exception as e:
             print(f"ERROR during faithfulness judging: {e}")
 
+
+        # --- ADD THIS LINE TO CALCULATE RECALL@5 ---
+        recall_at_5 = calculate_recall_at_k(raw_sources, expected_sources_set, 5)
+        # --- END OF ADDITION ---
+
         sources_pretty = ", ".join(sorted(s)) if (s:=actual_sources_set) else ""
         results.append({
             "test_id": fx.get("test_id", "N/A"), "title": fx.get("title", "N/A"),
```
```diff
@@ -460,6 +483,7 @@ def run_comprehensive_evaluation(
             "faithfulness": faithfulness, "hallucination_rate": hallucination_rate,
             "answer_correctness": answer_correctness_score,
             "category": category, "error_class": error_class,
+            "recall_at_5": recall_at_5,  # <-- ADD THIS LINE
             "latency_ms": latency_ms
         })
 
```
 
```diff
@@ -475,7 +499,7 @@ def run_comprehensive_evaluation(
         "context_precision", "context_recall",
         "faithfulness", "hallucination_rate",
         "answer_correctness",
-        "category", "error_class", "latency_ms",
+        "category", "error_class", "latency_ms", "recall_at_5"  # <-- ADD recall_at_5 HERE
     ]
     df = df[[c for c in cols if c in df.columns]]
 
```
 
```diff
@@ -506,6 +530,7 @@ def run_comprehensive_evaluation(
 - **Context F1 (avg)**: {cf1_mean:.2f}%
 - **RAG: Context Precision**: {(to_f(df["context_precision"]).mean() * 100):.1f}%
 - **RAG: Context Recall**: {(to_f(df["context_recall"]).mean() * 100):.1f}%
+- **RAG: Recall@5**: {(to_f(df["recall_at_5"]).mean() * 100):.1f}%
 - **RAG Answers w/ Sources**: {rag_with_sources_pct:.1f}%
 - **RAG: Hallucination Rate**: {halluc_mean:.1f}% (Lower is better)
 - **RAG: Answer Correctness (LLM-judge)**: {(to_f(df["answer_correctness"]).mean() * 100):.1f}%
```
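In the summary report, the new Recall@5 line is the per-test-case average scaled to a percentage. The sketch below reproduces that aggregation; `to_f` is not part of this diff, so it is assumed here to be a pandas numeric-coercion helper, and the scores are invented for illustration.

```python
import pandas as pd

def to_f(series: pd.Series) -> pd.Series:
    # Assumption about to_f: coerce values to floats, turning bad entries into NaN.
    return pd.to_numeric(series, errors="coerce")

df = pd.DataFrame({"recall_at_5": [1.0, 0.5, 0.0, 1.0]})
print(f'- **RAG: Recall@5**: {(to_f(df["recall_at_5"]).mean() * 100):.1f}%')  # 62.5%
```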
 