Update evaluate.py

evaluate.py  +28 -3  CHANGED
@@ -197,8 +197,9 @@ def load_test_fixtures():
     #candidates = [env_path] if env_path else [str(default_fixture_file)]

     # --- END: DEFINITIVE FIX ---
-    candidates = [env_path] if env_path else ["conversation_test_fixtures_v10.jsonl"]
+    # candidates = [env_path] if env_path else ["conversation_test_fixtures_v10.jsonl"]
     # candidates = [env_path] if env_path else ["small_test_cases_v10.jsonl"]
+    candidates = [env_path] if env_path else ["Test_syn_caregiving_patient.jsonl"]

     path = next((p for p in candidates if p and os.path.exists(p)), None)
     if not path:
@@ -206,8 +207,9 @@ def load_test_fixtures():
         return

     # Use the corrected v10 file if available
-    if "conversation_test_fixtures_v10.jsonl" in path:
+    # if "conversation_test_fixtures_v10.jsonl" in path:
     # if "small_test_cases_v10.jsonl" in path:
+    if "Test_syn_caregiving_patient.jsonl" in path:
         print(f"Using corrected test fixtures: {path}")

     with open(path, "r", encoding="utf-8") as f:
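For reference, the fixture selection above is a "first existing file wins" pattern: a path supplied from the environment takes priority, otherwise the hard-coded default is tried. A minimal, self-contained sketch of the same logic (the FIXTURES_PATH variable name is an assumption for illustration; this diff does not show where env_path is actually read):

    import os

    # Assumed env var name for illustration; the real script sets env_path upstream.
    env_path = os.getenv("FIXTURES_PATH")
    candidates = [env_path] if env_path else ["Test_syn_caregiving_patient.jsonl"]

    # Pick the first candidate that is non-empty and exists on disk, otherwise None.
    path = next((p for p in candidates if p and os.path.exists(p)), None)
    print(path or "no fixture file found")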
@@ -233,6 +235,7 @@ def evaluate_nlu_tags(expected: Dict[str, Any], actual: Dict[str, Any], tag_key:
     f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
     return {"precision": precision, "recall": recall, "f1_score": f1_score}

+
 def _parse_judge_json(raw_str: str) -> dict | None:
     try:
         start_brace = raw_str.find('{')
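The f1_score line above is the usual harmonic mean of precision and recall, guarded so that 0/0 yields 0.0 instead of raising. A quick worked check with arbitrary values:

    precision, recall = 0.8, 0.5
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
    print(round(f1_score, 3))  # 0.615, the harmonic mean sits closer to the lower of the two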
@@ -272,6 +275,21 @@ def _classify_error(gt: str, gen: str) -> str:
         return "omission"
     return "contradiction"

+# New Test Metric
+def calculate_recall_at_k(retrieved_docs: List[str], expected_sources: set, k: int) -> float:
+    """Calculates the fraction of relevant docs found in the top K results."""
+    top_k_docs = set(retrieved_docs[:k])
+    expected_set = set(expected_sources)
+
+    if not expected_set:
+        return 1.0  # If there are no expected docs, recall is trivially perfect.
+
+    found_count = len(top_k_docs.intersection(expected_set))
+    total_relevant = len(expected_set)
+
+    return found_count / total_relevant if total_relevant > 0 else 0.0
+
+
 ## NEW
 # In evaluate.py
 def run_comprehensive_evaluation(
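A short usage sketch for the new metric, assuming the calculate_recall_at_k definition above is in scope and using made-up document IDs, covering both the normal case and the empty-expected-set edge case:

    retrieved = ["doc_a", "doc_b", "doc_c", "doc_d", "doc_e", "doc_f"]
    expected = {"doc_b", "doc_f"}

    # Only doc_b lands in the top 5, so Recall@5 is 1/2.
    print(calculate_recall_at_k(retrieved, expected, 5))  # 0.5

    # With no expected sources, the metric returns 1.0 rather than dividing by zero.
    print(calculate_recall_at_k(retrieved, set(), 5))     # 1.0

Returning 1.0 for fixtures with no gold sources keeps them from dragging the averaged Recall@5 down; the alternative would be to skip such fixtures when aggregating.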
@@ -449,6 +467,11 @@ def run_comprehensive_evaluation(
         except Exception as e:
             print(f"ERROR during faithfulness judging: {e}")

+
+        # --- ADD THIS LINE TO CALCULATE RECALL@5 ---
+        recall_at_5 = calculate_recall_at_k(raw_sources, expected_sources_set, 5)
+        # --- END OF ADDITION ---
+
         sources_pretty = ", ".join(sorted(s)) if (s:=actual_sources_set) else ""
         results.append({
             "test_id": fx.get("test_id", "N/A"), "title": fx.get("title", "N/A"),
@@ -460,6 +483,7 @@ def run_comprehensive_evaluation(
             "faithfulness": faithfulness, "hallucination_rate": hallucination_rate,
             "answer_correctness": answer_correctness_score,
             "category": category, "error_class": error_class,
+            "recall_at_5": recall_at_5,  # <-- ADD THIS LINE
             "latency_ms": latency_ms
         })

@@ -475,7 +499,7 @@ def run_comprehensive_evaluation(
         "context_precision", "context_recall",
         "faithfulness", "hallucination_rate",
         "answer_correctness",
-        "category", "error_class", "latency_ms",
+        "category", "error_class", "latency_ms", "recall_at_5",  # <-- ADD recall_at_5 HERE
     ]
     df = df[[c for c in cols if c in df.columns]]

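The df = df[[c for c in cols if c in df.columns]] line keeps the report from breaking when a metric column was never produced: it selects, in the preferred order, only those listed columns that actually exist, and anything not listed is left out of the table. A toy illustration (data invented for the example):

    import pandas as pd

    df = pd.DataFrame({"latency_ms": [120], "recall_at_5": [0.5], "extra_debug": ["x"]})
    cols = ["test_id", "recall_at_5", "latency_ms"]  # "test_id" happens to be missing here

    df = df[[c for c in cols if c in df.columns]]
    print(list(df.columns))  # ['recall_at_5', 'latency_ms']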
@@ -506,6 +530,7 @@ def run_comprehensive_evaluation(
 - **Context F1 (avg)**: {cf1_mean:.2f}%
 - **RAG: Context Precision**: {(to_f(df["context_precision"]).mean() * 100):.1f}%
 - **RAG: Context Recall**: {(to_f(df["context_recall"]).mean() * 100):.1f}%
+- **RAG: Recall@5**: {(to_f(df["recall_at_5"]).mean() * 100):.1f}%
 - **RAG Answers w/ Sources**: {rag_with_sources_pct:.1f}%
 - **RAG: Hallucination Rate**: {halluc_mean:.1f}% (Lower is better)
 - **RAG: Answer Correctness (LLM-judge)**: {(to_f(df["answer_correctness"]).mean() * 100):.1f}%
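The lines above sit inside the markdown summary template: each metric column is coerced to float, averaged, and scaled to a percentage. to_f itself is not part of this diff; a plausible minimal stand-in, assuming it is a pandas numeric-coercion helper, looks like this:

    import pandas as pd

    # Hypothetical stand-in for to_f (not shown in this diff): coerce a column to float,
    # turning non-numeric entries into NaN so that .mean() skips them.
    def to_f(series: pd.Series) -> pd.Series:
        return pd.to_numeric(series, errors="coerce")

    df = pd.DataFrame({"recall_at_5": [0.5, 1.0, "N/A"]})
    print(f"- **RAG: Recall@5**: {(to_f(df['recall_at_5']).mean() * 100):.1f}%")  # 75.0%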