KeenWoo committed on
Commit e59b85f · verified · 1 Parent(s): ab9fd24

Update evaluate.py

Files changed (1)
  1. evaluate.py +17 -3
evaluate.py CHANGED
@@ -353,8 +353,8 @@ def run_comprehensive_evaluation(
         expected_route = _norm(ground_truth.get("expected_route", "caregiving_scenario"))
         expected_tags = ground_truth.get("expected_tags", {})
         expected_sources = ground_truth.get("expected_sources", [])
-
-        # --- 2. NLU-ONLY GUARD CLAUSE ---
+
+        # --- CORRECTED NLU-ONLY GUARD CLAUSE ---
         if NLU_ONLY_TEST:
             actual_route = _norm(route_query_type(query))
             actual_tags = {}
@@ -364,17 +364,31 @@ def run_comprehensive_evaluation(
                 behavior_options=config["behavior_tags"], emotion_options=config["emotion_tags"],
                 topic_options=config["topic_tags"], context_options=config["context_tags"],
             )
+
+            # --- FIX: Calculate NLU F1 scores before appending results ---
+            behavior_metrics = evaluate_nlu_tags(expected_tags, actual_tags, "detected_behaviors")
+            emotion_metrics = evaluate_nlu_tags(expected_tags, actual_tags, "detected_emotion")
+            topic_metrics = evaluate_nlu_tags(expected_tags, actual_tags, "detected_topics")
+            context_metrics = evaluate_nlu_tags(expected_tags, actual_tags, "detected_contexts")
+
             results.append({
                 "test_id": test_id, "title": fx.get("title", "N/A"), "user_query": query,
                 "actual_route": actual_route, "expected_route": expected_route,
                 "route_correct": 1 if actual_route == expected_route else 0,
                 "actual_tags": actual_tags, "expected_tags": expected_tags,
+                # Add the F1 scores to the results dictionary
+                "behavior_f1": f"{behavior_metrics['f1_score']:.2f}",
+                "emotion_f1": f"{emotion_metrics['f1_score']:.2f}",
+                "topic_f1": f"{topic_metrics['f1_score']:.2f}",
+                "context_f1": f"{context_metrics['f1_score']:.2f}",
+                # Set RAG metrics to default/None values
                 "raw_sources": [], "expected_sources": expected_sources, "answer": "(NLU_ONLY_TEST)",
                 "context_precision": None, "context_recall": None, "recall_at_5": None,
                 "answer_correctness": None, "faithfulness_score": None, "latency_ms": 0
             })
             continue  # Skip to the next test case
-        # END if NLU_ONLY_TEST:
+        # --- END OF CORRECTED BLOCK ---
+
 
         # --- 3. FULL RAG PIPELINE (only runs if NLU_ONLY_TEST is False) ---
         actual_route = _norm(route_query_type(query))
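
The corrected block calls an evaluate_nlu_tags(expected_tags, actual_tags, key) helper and reads an "f1_score" key from its result, but that helper's implementation is not part of this commit. A minimal sketch of the set-based precision/recall/F1 it would need to compute, assuming each tag field holds either a single label or a list of labels, could look like this:

def _as_label_set(tags: dict, key: str) -> set:
    """Normalize one tag field to a set of labels.

    Assumption: a field such as "detected_emotion" may hold a single
    string, while fields such as "detected_behaviors" hold lists.
    """
    value = tags.get(key) or []
    if isinstance(value, str):
        value = [value]
    return set(value)

def evaluate_nlu_tags(expected_tags: dict, actual_tags: dict, key: str) -> dict:
    """Set-based precision/recall/F1 for one NLU tag field.

    Only the call signature and the "f1_score" key are taken from the
    diff above; the metric logic here is an illustrative assumption.
    """
    expected = _as_label_set(expected_tags, key)
    actual = _as_label_set(actual_tags, key)

    true_positives = len(expected & actual)
    precision = true_positives / len(actual) if actual else 0.0
    recall = true_positives / len(expected) if expected else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return {"precision": precision, "recall": recall, "f1_score": f1}

With a helper of this shape, the guard clause above can format each score with f"{behavior_metrics['f1_score']:.2f}" exactly as the diff does.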