Update evaluate.py
evaluate.py  CHANGED  (+33 -7)
@@ -333,19 +333,27 @@ def run_comprehensive_evaluation(
             "behavior_f1": f"{behavior_metrics['f1_score']:.2f}", "emotion_f1": f"{emotion_metrics['f1_score']:.2f}",
             "topic_f1": f"{topic_metrics['f1_score']:.2f}", "context_f1": f"{context_metrics['f1_score']:.2f}",
             "generated_answer": answer_text, "sources": sources_pretty, "source_count": len(actual_sources_set),
-            "faithfulness": faithfulness, "hallucination_rate": hallucination_rate,
             "context_precision": context_precision, "context_recall": context_recall,
+            "faithfulness": faithfulness, "hallucination_rate": hallucination_rate,
             "answer_correctness": answer_correctness_score,
-            "
-            "
-            "error_class": error_class
+            "category": category, "error_class": error_class,
+            "latency_ms": latency_ms
         })
 
     df = pd.DataFrame(results)
     summary_text, table_rows, headers = "No valid test fixtures found to evaluate.", [], []
 
     if not df.empty:
-
+        # Add "hallucination_rate" to this list of columns to ensure it is not dropped.
+        cols = [
+            "test_id", "title", "route_correct", "expected_route", "actual_route",
+            "behavior_f1", "emotion_f1", "topic_f1", "context_f1",
+            "generated_answer", "sources", "source_count",
+            "context_precision", "context_recall",
+            "faithfulness", "hallucination_rate",
+            "answer_correctness",
+            "category", "error_class", "latency_ms",
+        ]
         df = df[[c for c in cols if c in df.columns]]
 
         # --- START OF MODIFICATION ---
@@ -382,12 +390,30 @@ def run_comprehensive_evaluation(
         """
         # --- END OF MODIFICATION ---
 
-
         df_display = df.rename(columns={"context_precision": "Ctx. Precision", "context_recall": "Ctx. Recall"})
         table_rows = df_display.values.tolist()
         headers = df_display.columns.tolist()
-
+    else:
+        summary_text = "No valid test fixtures found to evaluate."
+        table_rows, headers = [], []
 
+    # --- NEW: per-category averages ---
+    try:
+        cat_means = df.groupby("category")["answer_correctness"].mean().reset_index()
+        print("\n📊 Correctness by Category:")
+        print(cat_means.to_string(index=False))
+    except Exception as e:
+        print(f"WARNING: Could not compute category breakdown: {e}")
+
+    # --- NEW: confusion-style matrix ---
+    try:
+        confusion = pd.crosstab(df.get("category", []), df.get("error_class", []),
+                                rownames=["Category"], colnames=["Error Class"], dropna=False)
+        print("\n📊 Error Class Distribution by Category:")
+        print(confusion.to_string())
+    except Exception as e:
+        print(f"WARNING: Could not build confusion matrix: {e}")
+
     output_path = "evaluation_results.csv"
     df.to_csv(output_path, index=False, encoding="utf-8")
     print(f"Evaluation results saved to {output_path}")
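For context, a minimal standalone sketch of what the new reporting block computes, using a made-up results frame with the same column names as in the diff (the rows and scores below are illustrative only, not taken from the project's fixtures):

import pandas as pd

# Hypothetical stand-in for the `results` records assembled in the loop above.
df = pd.DataFrame({
    "category": ["factual", "factual", "behavioral", "behavioral"],
    "error_class": ["none", "hallucination", "none", "routing"],
    "answer_correctness": [0.9, 0.4, 0.8, 0.6],
})

# Per-category averages, as in the "per-category averages" block:
# behavioral -> 0.70, factual -> 0.65 for this toy data.
cat_means = df.groupby("category")["answer_correctness"].mean().reset_index()
print(cat_means.to_string(index=False))

# Category x error-class counts, as in the "confusion-style matrix" block.
confusion = pd.crosstab(df["category"], df["error_class"],
                        rownames=["Category"], colnames=["Error Class"], dropna=False)
print(confusion.to_string())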