KeenWoo committed
Commit f3001ae · verified · 1 Parent(s): e59b85f

Update evaluate.py

Files changed (1):
  evaluate.py  +6  -3
evaluate.py CHANGED
@@ -515,11 +515,13 @@ def run_comprehensive_evaluation(
         # --- ADD THIS LINE TO CALCULATE RECALL@5 ---
         recall_at_5 = calculate_recall_at_k(raw_sources, expected_sources_set, 5)
         # --- END OF ADDITION ---
-
+
+        # "route_correct": "✅" if route_correct else "❌", "expected_route": expected_route, "actual_route": actual_route,
+
         sources_pretty = ", ".join(sorted(s)) if (s:=actual_sources_set) else ""
         results.append({
             "test_id": fx.get("test_id", "N/A"), "title": fx.get("title", "N/A"),
-            "route_correct": "✅" if route_correct else "❌", "expected_route": expected_route, "actual_route": actual_route,
+            "route_correct": 1 if route_correct else 0,
             "behavior_f1": f"{behavior_metrics['f1_score']:.2f}", "emotion_f1": f"{emotion_metrics['f1_score']:.2f}",
             "topic_f1": f"{topic_metrics['f1_score']:.2f}", "context_f1": f"{context_metrics['f1_score']:.2f}",
             "generated_answer": answer_text, "sources": sources_pretty, "source_count": len(actual_sources_set),
@@ -552,7 +554,8 @@ def run_comprehensive_evaluation(
     df = df[[c for c in cols if c in df.columns]]
 
     # --- START OF MODIFICATION ---
-    pct = df["route_correct"].value_counts(normalize=True).get("✅", 0) * 100
+    # pct = df["route_correct"].value_counts(normalize=True).get("✅", 0) * 100
+    pct = df["route_correct"].mean() * 100
     to_f = lambda s: pd.to_numeric(s, errors="coerce")
 
     # Calculate the mean for the NLU F1 scores
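For context, the first hunk calls calculate_recall_at_k but the diff never shows its definition. Below is a minimal sketch of what such a helper typically looks like, assuming the standard definition of recall@k (the share of expected sources that appear among the top-k retrieved sources); the body is an illustration, not the repository's actual implementation:

    def calculate_recall_at_k(retrieved, expected, k):
        # Hypothetical sketch -- the real function is not shown in this diff.
        # recall@k = |expected sources found in top-k retrieved| / |expected|
        if not expected:
            return 0.0  # no ground-truth sources: define recall as 0 rather than divide by zero
        top_k = set(retrieved[:k])  # only the first k retrieved items count
        return len(top_k & set(expected)) / len(expected)

    # Example: 2 of the 3 expected sources appear in the top 5 retrieved -> recall 0.67
    calculate_recall_at_k(["a", "b", "c", "d", "e", "f"], {"a", "c", "x"}, 5)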
 
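The two hunks are linked: switching route_correct from the "✅"/"❌" strings to 0/1 integers is what lets the summary step replace the value_counts lookup with a plain column mean, since the mean of a 0/1 column is exactly the fraction of correct routes. A minimal pandas sketch with made-up values showing the two formulas agree:

    import pandas as pd

    df = pd.DataFrame({"route_correct": [1, 0, 1, 1]})  # hypothetical results

    # Old approach: map back to string flags and take the share of "✅" rows.
    flags = df["route_correct"].map({1: "✅", 0: "❌"})
    pct_old = flags.value_counts(normalize=True).get("✅", 0) * 100

    # New approach: the mean of a 0/1 column is the accuracy directly.
    pct_new = df["route_correct"].mean() * 100

    assert pct_old == pct_new == 75.0

Keeping the per-row flag numeric also means the emoji rendering, if wanted, can be applied at display time instead of being baked into the stored results.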