Update evaluate.py
evaluate.py  CHANGED  (+17 -3)
@@ -353,8 +353,8 @@ def run_comprehensive_evaluation(
         expected_route = _norm(ground_truth.get("expected_route", "caregiving_scenario"))
         expected_tags = ground_truth.get("expected_tags", {})
         expected_sources = ground_truth.get("expected_sources", [])
-
-        # ---
+
+        # --- CORRECTED NLU-ONLY GUARD CLAUSE ---
         if NLU_ONLY_TEST:
             actual_route = _norm(route_query_type(query))
             actual_tags = {}
@@ -364,17 +364,31 @@ def run_comprehensive_evaluation(
                 behavior_options=config["behavior_tags"], emotion_options=config["emotion_tags"],
                 topic_options=config["topic_tags"], context_options=config["context_tags"],
             )
+
+            # --- FIX: Calculate NLU F1 scores before appending results ---
+            behavior_metrics = evaluate_nlu_tags(expected_tags, actual_tags, "detected_behaviors")
+            emotion_metrics = evaluate_nlu_tags(expected_tags, actual_tags, "detected_emotion")
+            topic_metrics = evaluate_nlu_tags(expected_tags, actual_tags, "detected_topics")
+            context_metrics = evaluate_nlu_tags(expected_tags, actual_tags, "detected_contexts")
+
             results.append({
                 "test_id": test_id, "title": fx.get("title", "N/A"), "user_query": query,
                 "actual_route": actual_route, "expected_route": expected_route,
                 "route_correct": 1 if actual_route == expected_route else 0,
                 "actual_tags": actual_tags, "expected_tags": expected_tags,
+                # Add the F1 scores to the results dictionary
+                "behavior_f1": f"{behavior_metrics['f1_score']:.2f}",
+                "emotion_f1": f"{emotion_metrics['f1_score']:.2f}",
+                "topic_f1": f"{topic_metrics['f1_score']:.2f}",
+                "context_f1": f"{context_metrics['f1_score']:.2f}",
+                # Set RAG metrics to default/None values
                 "raw_sources": [], "expected_sources": expected_sources, "answer": "(NLU_ONLY_TEST)",
                 "context_precision": None, "context_recall": None, "recall_at_5": None,
                 "answer_correctness": None, "faithfulness_score": None, "latency_ms": 0
             })
             continue  # Skip to the next test case
-        # END
+            # --- END OF CORRECTED BLOCK ---
+

         # --- 3. FULL RAG PIPELINE (only runs if NLU_ONLY_TEST is False) ---
         actual_route = _norm(route_query_type(query))
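The only helper the added lines rely on that is not shown in this diff is evaluate_nlu_tags. Judging from its call signature here (expected tag dict, actual tag dict, tag-category key) and the 'f1_score' field the new code reads from its return value, a minimal sketch could look like the following. This is an assumption about its behavior for illustration, not the actual implementation in evaluate.py:

# Sketch of evaluate_nlu_tags (assumed behavior): set-based precision/recall/F1
# over the tags stored under `key` in the expected vs. actual tag dicts.
def evaluate_nlu_tags(expected_tags: dict, actual_tags: dict, key: str) -> dict:
    def as_set(tags: dict) -> set:
        value = tags.get(key, [])
        if isinstance(value, str):  # e.g. "detected_emotion" may hold a single label
            value = [value]
        return {str(v).strip().lower() for v in (value or [])}

    expected, actual = as_set(expected_tags), as_set(actual_tags)
    tp = len(expected & actual)
    precision = tp / len(actual) if actual else 0.0
    recall = tp / len(expected) if expected else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return {"precision": precision, "recall": recall, "f1_score": f1}

With a return shape like this, behavior_metrics['f1_score'] and its siblings are floats, which the results row in the diff formats to two decimals via f"{...:.2f}".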