Update evaluate.py
evaluate.py  +30 -8
@@ -338,7 +338,36 @@ def run_comprehensive_evaluation(
 
     if not df.empty:
         cols = ["test_id", "title", "route_correct", "expected_route", "actual_route", "context_precision", "context_recall", "faithfulness", "answer_correctness", "behavior_f1", "emotion_f1", "topic_f1", "context_f1", "source_count", "latency_ms", "sources", "generated_answer", "category", "error_class"]
-        df = df[[c for c in cols if c in df.columns]]
+        df = df[[c for c in cols if c in df.columns]]
+
+        # --- START OF MODIFICATION ---
+        pct = df["route_correct"].value_counts(normalize=True).get("✅", 0) * 100
+        to_f = lambda s: pd.to_numeric(s, errors="coerce")
+
+        # Calculate the mean for the NLU F1 scores
+        bf1_mean = to_f(df["behavior_f1"]).mean() * 100
+        ef1_mean = to_f(df["emotion_f1"]).mean() * 100
+        tf1_mean = to_f(df["topic_f1"]).mean() * 100
+        cf1_mean = to_f(df["context_f1"]).mean() * 100
+
+        # Add the NLU metrics to the summary f-string
+        summary_text = f"""## Evaluation Summary
+- **Routing Accuracy**: {pct:.2f}%
+- **Behaviour F1 (avg)**: {bf1_mean:.2f}%
+- **Emotion F1 (avg)**: {ef1_mean:.2f}%
+- **Topic F1 (avg)**: {tf1_mean:.2f}%
+- **Context F1 (avg)**: {cf1_mean:.2f}%
+- **RAG: Context Precision**: {(to_f(df["context_precision"]).mean() * 100):.1f}%
+- **RAG: Context Recall**: {(to_f(df["context_recall"]).mean() * 100):.1f}%
+- **RAG: Answer Correctness (LLM-judge)**: {(to_f(df["answer_correctness"]).mean() * 100):.1f}%"""
+        # --- END OF MODIFICATION ---
+
+
+        df_display = df.rename(columns={"context_precision": "Ctx. Precision", "context_recall": "Ctx. Recall"})
+        table_rows = df_display.values.tolist()
+        headers = df_display.columns.tolist()
+
+
         output_path = "evaluation_results.csv"
         df.to_csv(output_path, index=False, encoding="utf-8")
         print(f"Evaluation results saved to {output_path}")

@@ -369,13 +398,6 @@ def run_comprehensive_evaluation(
             logf.write("\n")
         except Exception as e:
             print(f"WARNING: Could not build confusion matrix: {e}")
-
-    pct = df["route_correct"].value_counts(normalize=True).get("✅", 0) * 100
-    to_f = lambda s: pd.to_numeric(s, errors="coerce")
-    summary_text = f"""## Evaluation Summary\n- **Routing Accuracy**: {pct:.2f}%\n- **RAG: Context Precision**: {(to_f(df["context_precision"]).mean() * 100):.1f}%\n- **RAG: Context Recall**: {(to_f(df["context_recall"]).mean() * 100):.1f}%\n- **RAG: Answer Correctness (LLM-judge)**: {(to_f(df["answer_correctness"]).mean() * 100):.1f}%"""
-    df_display = df.rename(columns={"context_precision": "Ctx. Precision", "context_recall": "Ctx. Recall"})
-    table_rows = df_display.values.tolist()
-    headers = df_display.columns.tolist()
 
     return summary_text, table_rows, headers
 
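Two pandas idioms do the work in the new summary block: routing accuracy comes from value_counts(normalize=True).get("✅", 0), and every averaged metric column is first passed through pd.to_numeric(..., errors="coerce"). Below is a minimal, self-contained sketch of that aggregation; the three-row frame and its values are invented for illustration, and the idea that a failed test run can leave a non-numeric cell (e.g. "N/A") is an assumption, not something the diff states.

import pandas as pd

# Toy results frame (invented values); column names match evaluate.py.
df = pd.DataFrame({
    "route_correct": ["✅", "✅", "❌"],
    "behavior_f1": [0.82, "N/A", 0.64],           # assumed: a failed run leaves a non-numeric cell
    "context_precision": [0.90, 0.75, None],
})

# Share of rows routed correctly; .get("✅", 0) falls back to 0 when no row
# matched, avoiding the KeyError that plain indexing would raise.
pct = df["route_correct"].value_counts(normalize=True).get("✅", 0) * 100

# errors="coerce" maps anything non-numeric to NaN, and Series.mean() skips NaN,
# so one malformed cell shrinks the sample for that average instead of breaking it.
to_f = lambda s: pd.to_numeric(s, errors="coerce")
bf1_mean = to_f(df["behavior_f1"]).mean() * 100
cp_mean = to_f(df["context_precision"]).mean() * 100

print(f"Routing Accuracy: {pct:.2f}%")            # 66.67%
print(f"Behaviour F1 (avg): {bf1_mean:.2f}%")     # 73.00%
print(f"Ctx. Precision (avg): {cp_mean:.1f}%")    # 82.5%

Coercing before averaging means a malformed cell only reduces the sample for that mean, and the .get(..., 0) default keeps the routing figure at 0.00% for an all-wrong run rather than raising.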