Update evaluate.py

evaluate.py CHANGED (+6 -3)
@@ -515,11 +515,13 @@ def run_comprehensive_evaluation(
         # --- ADD THIS LINE TO CALCULATE RECALL@5 ---
         recall_at_5 = calculate_recall_at_k(raw_sources, expected_sources_set, 5)
         # --- END OF ADDITION ---
-
+
+        # "route_correct": "✅" if route_correct else "❌", "expected_route": expected_route, "actual_route": actual_route,
+
         sources_pretty = ", ".join(sorted(s)) if (s := actual_sources_set) else ""
         results.append({
             "test_id": fx.get("test_id", "N/A"), "title": fx.get("title", "N/A"),
-            "route_correct": "✅" if route_correct else "❌", "expected_route": expected_route, "actual_route": actual_route,
+            "route_correct": 1 if route_correct else 0,
             "behavior_f1": f"{behavior_metrics['f1_score']:.2f}", "emotion_f1": f"{emotion_metrics['f1_score']:.2f}",
             "topic_f1": f"{topic_metrics['f1_score']:.2f}", "context_f1": f"{context_metrics['f1_score']:.2f}",
             "generated_answer": answer_text, "sources": sources_pretty, "source_count": len(actual_sources_set),
@@ -552,7 +554,8 @@ def run_comprehensive_evaluation(
     df = df[[c for c in cols if c in df.columns]]
 
     # --- START OF MODIFICATION ---
-    pct = df["route_correct"].value_counts(normalize=True).get("✅", 0) * 100
+    # pct = df["route_correct"].value_counts(normalize=True).get("✅", 0) * 100
+    pct = df["route_correct"].mean() * 100
     to_f = lambda s: pd.to_numeric(s, errors="coerce")
 
     # Calculate the mean for the NLU F1 scores