Update evaluate.py
evaluate.py  CHANGED  (+33 -7)
@@ -333,19 +333,27 @@ def run_comprehensive_evaluation(
             "behavior_f1": f"{behavior_metrics['f1_score']:.2f}", "emotion_f1": f"{emotion_metrics['f1_score']:.2f}",
             "topic_f1": f"{topic_metrics['f1_score']:.2f}", "context_f1": f"{context_metrics['f1_score']:.2f}",
             "generated_answer": answer_text, "sources": sources_pretty, "source_count": len(actual_sources_set),
-            "faithfulness": faithfulness, "hallucination_rate": hallucination_rate,
             "context_precision": context_precision, "context_recall": context_recall,
+            "faithfulness": faithfulness, "hallucination_rate": hallucination_rate,
             "answer_correctness": answer_correctness_score,
-            "
-            "
-            "error_class": error_class
+            "category": category, "error_class": error_class,
+            "latency_ms": latency_ms
         })
 
     df = pd.DataFrame(results)
     summary_text, table_rows, headers = "No valid test fixtures found to evaluate.", [], []
 
     if not df.empty:
-
+        # Add "hallucination_rate" to this list of columns to ensure it is not dropped.
+        cols = [
+            "test_id", "title", "route_correct", "expected_route", "actual_route",
+            "behavior_f1", "emotion_f1", "topic_f1", "context_f1",
+            "generated_answer", "sources", "source_count",
+            "context_precision", "context_recall",
+            "faithfulness", "hallucination_rate",
+            "answer_correctness",
+            "category", "error_class", "latency_ms",
+        ]
         df = df[[c for c in cols if c in df.columns]]
 
         # --- START OF MODIFICATION ---
@@ -382,12 +390,30 @@ def run_comprehensive_evaluation(
         """
         # --- END OF MODIFICATION ---
 
-
         df_display = df.rename(columns={"context_precision": "Ctx. Precision", "context_recall": "Ctx. Recall"})
         table_rows = df_display.values.tolist()
         headers = df_display.columns.tolist()
-
+    else:
+        summary_text = "No valid test fixtures found to evaluate."
+        table_rows, headers = [], []
 
+    # --- NEW: per-category averages ---
+    try:
+        cat_means = df.groupby("category")["answer_correctness"].mean().reset_index()
+        print("\n📊 Correctness by Category:")
+        print(cat_means.to_string(index=False))
+    except Exception as e:
+        print(f"WARNING: Could not compute category breakdown: {e}")
+
+    # --- NEW: confusion-style matrix ---
+    try:
+        confusion = pd.crosstab(df.get("category", []), df.get("error_class", []),
+                                rownames=["Category"], colnames=["Error Class"], dropna=False)
+        print("\n📊 Error Class Distribution by Category:")
+        print(confusion.to_string())
+    except Exception as e:
+        print(f"WARNING: Could not build confusion matrix: {e}")
+
     output_path = "evaluation_results.csv"
     df.to_csv(output_path, index=False, encoding="utf-8")
     print(f"Evaluation results saved to {output_path}")
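For context, a minimal standalone sketch of what the new reporting block computes, using a made-up results frame with the same column names as in the diff (the rows and scores below are illustrative only, not taken from the project's fixtures):

import pandas as pd

# Hypothetical stand-in for the `results` records assembled in the loop above.
df = pd.DataFrame({
    "category": ["factual", "factual", "behavioral", "behavioral"],
    "error_class": ["none", "hallucination", "none", "routing"],
    "answer_correctness": [0.9, 0.4, 0.8, 0.6],
})

# Per-category averages, as in the "per-category averages" block:
# behavioral -> 0.70, factual -> 0.65 for this toy data.
cat_means = df.groupby("category")["answer_correctness"].mean().reset_index()
print(cat_means.to_string(index=False))

# Category x error-class counts, as in the "confusion-style matrix" block.
confusion = pd.crosstab(df["category"], df["error_class"],
                        rownames=["Category"], colnames=["Error Class"], dropna=False)
print(confusion.to_string())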