Update evaluate.py

evaluate.py CHANGED (+6 -3)
@@ -515,11 +515,13 @@ def run_comprehensive_evaluation(
         # --- ADD THIS LINE TO CALCULATE RECALL@5 ---
         recall_at_5 = calculate_recall_at_k(raw_sources, expected_sources_set, 5)
         # --- END OF ADDITION ---
-
+
+        # "route_correct": "✅" if route_correct else "❌", "expected_route": expected_route, "actual_route": actual_route,
+
         sources_pretty = ", ".join(sorted(s)) if (s := actual_sources_set) else ""
         results.append({
             "test_id": fx.get("test_id", "N/A"), "title": fx.get("title", "N/A"),
-            "route_correct": "✅" if route_correct else "❌", "expected_route": expected_route, "actual_route": actual_route,
+            "route_correct": 1 if route_correct else 0,
             "behavior_f1": f"{behavior_metrics['f1_score']:.2f}", "emotion_f1": f"{emotion_metrics['f1_score']:.2f}",
             "topic_f1": f"{topic_metrics['f1_score']:.2f}", "context_f1": f"{context_metrics['f1_score']:.2f}",
             "generated_answer": answer_text, "sources": sources_pretty, "source_count": len(actual_sources_set),
@@ -552,7 +554,8 @@ def run_comprehensive_evaluation(
     df = df[[c for c in cols if c in df.columns]]
 
     # --- START OF MODIFICATION ---
-    pct = df["route_correct"].value_counts(normalize=True).get("✅", 0) * 100
+    # pct = df["route_correct"].value_counts(normalize=True).get("✅", 0) * 100
+    pct = df["route_correct"].mean() * 100
     to_f = lambda s: pd.to_numeric(s, errors="coerce")
 
     # Calculate the mean for the NLU F1 scores