KeenWoo committed · verified
Commit be0801e · 1 Parent(s): 01d4070

Update evaluate.py

Files changed (1):
  1. evaluate.py +33 -7
evaluate.py CHANGED
@@ -333,19 +333,27 @@ def run_comprehensive_evaluation(
                 "behavior_f1": f"{behavior_metrics['f1_score']:.2f}", "emotion_f1": f"{emotion_metrics['f1_score']:.2f}",
                 "topic_f1": f"{topic_metrics['f1_score']:.2f}", "context_f1": f"{context_metrics['f1_score']:.2f}",
                 "generated_answer": answer_text, "sources": sources_pretty, "source_count": len(actual_sources_set),
-                "faithfulness": faithfulness, "hallucination_rate": hallucination_rate,
                 "context_precision": context_precision, "context_recall": context_recall,
+                "faithfulness": faithfulness, "hallucination_rate": hallucination_rate,
                 "answer_correctness": answer_correctness_score,
-                "latency_ms": latency_ms,
-                "category": category,
-                "error_class": error_class
+                "category": category, "error_class": error_class,
+                "latency_ms": latency_ms
             })
 
     df = pd.DataFrame(results)
     summary_text, table_rows, headers = "No valid test fixtures found to evaluate.", [], []
 
     if not df.empty:
-        cols = ["test_id", "title", "route_correct", "expected_route", "actual_route", "context_precision", "context_recall", "faithfulness", "answer_correctness", "behavior_f1", "emotion_f1", "topic_f1", "context_f1", "source_count", "latency_ms", "sources", "generated_answer", "category", "error_class"]
+        # Add "hallucination_rate" to this list of columns to ensure it is not dropped.
+        cols = [
+            "test_id", "title", "route_correct", "expected_route", "actual_route",
+            "behavior_f1", "emotion_f1", "topic_f1", "context_f1",
+            "generated_answer", "sources", "source_count",
+            "context_precision", "context_recall",
+            "faithfulness", "hallucination_rate",
+            "answer_correctness",
+            "category", "error_class", "latency_ms",
+        ]
         df = df[[c for c in cols if c in df.columns]]
 
         # --- START OF MODIFICATION ---
@@ -382,12 +390,30 @@ def run_comprehensive_evaluation(
         """
         # --- END OF MODIFICATION ---
 
-
         df_display = df.rename(columns={"context_precision": "Ctx. Precision", "context_recall": "Ctx. Recall"})
         table_rows = df_display.values.tolist()
         headers = df_display.columns.tolist()
-
+    else:
+        summary_text = "No valid test fixtures found to evaluate."
+        table_rows, headers = [], []
 
+    # --- NEW: per-category averages ---
+    try:
+        cat_means = df.groupby("category")["answer_correctness"].mean().reset_index()
+        print("\n📊 Correctness by Category:")
+        print(cat_means.to_string(index=False))
+    except Exception as e:
+        print(f"WARNING: Could not compute category breakdown: {e}")
+
+    # --- NEW: confusion-style matrix ---
+    try:
+        confusion = pd.crosstab(df.get("category", []), df.get("error_class", []),
+                                rownames=["Category"], colnames=["Error Class"], dropna=False)
+        print("\n📊 Error Class Distribution by Category:")
+        print(confusion.to_string())
+    except Exception as e:
+        print(f"WARNING: Could not build confusion matrix: {e}")
+
     output_path = "evaluation_results.csv"
     df.to_csv(output_path, index=False, encoding="utf-8")
     print(f"Evaluation results saved to {output_path}")