KeenWoo committed on
Commit ab9fd24 · verified · 1 Parent(s): 96ba81c

Update evaluate.py

Files changed (1)
  1. evaluate.py +29 -48
evaluate.py CHANGED
@@ -352,6 +352,31 @@ def run_comprehensive_evaluation(
         ground_truth = fx.get("ground_truth", {})
         expected_route = _norm(ground_truth.get("expected_route", "caregiving_scenario"))
         expected_tags = ground_truth.get("expected_tags", {})
+        expected_sources = ground_truth.get("expected_sources", [])
+
+        # --- 2. NLU-ONLY GUARD CLAUSE ---
+        if NLU_ONLY_TEST:
+            actual_route = _norm(route_query_type(query))
+            actual_tags = {}
+            if "caregiving_scenario" in actual_route:
+                actual_tags = detect_tags_from_query(
+                    query, nlu_vectorstore=nlu_vectorstore,
+                    behavior_options=config["behavior_tags"], emotion_options=config["emotion_tags"],
+                    topic_options=config["topic_tags"], context_options=config["context_tags"],
+                )
+            results.append({
+                "test_id": test_id, "title": fx.get("title", "N/A"), "user_query": query,
+                "actual_route": actual_route, "expected_route": expected_route,
+                "route_correct": 1 if actual_route == expected_route else 0,
+                "actual_tags": actual_tags, "expected_tags": expected_tags,
+                "raw_sources": [], "expected_sources": expected_sources, "answer": "(NLU_ONLY_TEST)",
+                "context_precision": None, "context_recall": None, "recall_at_5": None,
+                "answer_correctness": None, "faithfulness_score": None, "latency_ms": 0
+            })
+            continue  # Skip to the next test case
+        # END if NLU_ONLY_TEST:
+
+        # --- 3. FULL RAG PIPELINE (only runs if NLU_ONLY_TEST is False) ---
         actual_route = _norm(route_query_type(query))
         route_correct = (actual_route == expected_route)
 
@@ -384,46 +409,7 @@ def run_comprehensive_evaluation(
             role=current_test_role,
             for_evaluation=True
         )
-
-
-        # --- START MODIFICATION ---
-        if NLU_ONLY_TEST:
-            # 1. Run only the NLU parts
-            actual_route = route_query_type(user_query)
-            actual_tags = detect_tags_from_query(user_query, actual_route)
-
-            # 2. Add the NLU results to your list
-            results.append({
-                "test_id": test_id,
-                "title": title,
-                "user_query": user_query,
-                "actual_route": actual_route,
-                "expected_route": expected_route,
-                "route_correct": 1 if actual_route == expected_route else 0,
-                "actual_tags": actual_tags,
-                "expected_tags": expected_tags,
-                # Set RAG metrics to default/None values
-                "raw_sources": [],
-                "expected_sources": expected_sources,
-                "answer": "(NLU_ONLY_TEST)",
-                "context_precision": None,
-                "context_recall": None,
-                "recall_at_5": None,
-                "answer_correctness": None,
-                "faithfulness_score": None,
-                "latency_ms": 0
-            })
-
-            # 3. Use 'continue' to skip the rest of the loop and go to the next test case
-            continue
-        # --- END MODIFICATION ---
-
-        # ####################################################################
-        # ALL OF YOUR ORIGINAL RAG PIPELINE CODE STAYS HERE.
-        # IT IS NOT INDENTED AND ONLY RUNS IF NLU_ONLY_TEST IS FALSE.
-        # ####################################################################
-
-
+
         t0 = time.time()
         response = answer_query(rag_chain, query, query_type=actual_route, chat_history=api_chat_history, **final_tags)
         latency_ms = round((time.time() - t0) * 1000.0, 1)
@@ -531,10 +517,9 @@ def run_comprehensive_evaluation(
            "latency_ms": latency_ms
        })
 
-    # ####################################################################
-    # THIS IS YOUR ORIGINAL RESULTS PRINTOUT SECTION, NOW MODIFIED.
-    # IT IS OUTSIDE THE LOOP AND WILL ALWAYS RUN.
-    # ####################################################################
+    # --- 4. FINAL SUMMARY AND RETURN SECTION ---
+    if not results:
+        return "No valid test fixtures found to evaluate.", [], []
 
    df = pd.DataFrame(results)
    summary_text, table_rows, headers = "No valid test fixtures found to evaluate.", [], []
@@ -562,10 +547,6 @@ def run_comprehensive_evaluation(
        tf1_mean = to_f(df["topic_f1"]).mean() * 100
        cf1_mean = to_f(df["context_f1"]).mean() * 100
 
-
-
-
-
        # --- START: CORRECTED SUMMARY LOGIC ---
        # 1. Start building the summary_text string with the common parts
        summary_text = f"""## Evaluation Summary (Mode: {'NLU-Only' if NLU_ONLY_TEST else 'Full RAG'})
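
For readers tracing the change outside the diff, the sketch below mirrors the control flow this commit leaves in the evaluation loop: the NLU-only guard clause runs routing and tagging, records a stub result, and continues past the RAG pipeline, while the new empty-results check short-circuits the summary step. This is a minimal, hypothetical skeleton, not the real evaluate.py: the helpers (route_query_type, detect_tags_from_query, _norm) and the fixture shape shown here are placeholder stubs standing in for the module's actual implementations.

# Minimal sketch of the post-commit loop shape; all helpers below are stubs,
# not the real implementations in evaluate.py.
NLU_ONLY_TEST = True  # True: score routing/tagging only; False: run the full RAG pipeline

def _norm(s):
    # stand-in for the module's normalizer
    return (s or "").strip().lower()

def route_query_type(query):
    # stub router; the real one classifies the query
    return "caregiving_scenario"

def detect_tags_from_query(query):
    # stub tagger; the real one takes an NLU vectorstore and tag option lists
    return {"behavior": [], "emotion": [], "topic": [], "context": []}

def evaluate_fixtures(fixtures):
    results = []
    for fx in fixtures:
        query = fx.get("user_query", "")
        ground_truth = fx.get("ground_truth", {})
        expected_route = _norm(ground_truth.get("expected_route", "caregiving_scenario"))

        # 2. NLU-only guard clause: record NLU results and skip the RAG pipeline.
        if NLU_ONLY_TEST:
            actual_route = _norm(route_query_type(query))
            actual_tags = detect_tags_from_query(query) if "caregiving_scenario" in actual_route else {}
            results.append({
                "test_id": fx.get("test_id"),
                "actual_route": actual_route,
                "route_correct": 1 if actual_route == expected_route else 0,
                "actual_tags": actual_tags,
                "answer": "(NLU_ONLY_TEST)",
                "latency_ms": 0,
            })
            continue

        # 3. Full RAG pipeline (retrieval, answer_query, RAGAS-style metrics) would run here.
        pass

    # 4. Final summary and return: bail out early if nothing was evaluated.
    if not results:
        return "No valid test fixtures found to evaluate.", [], []
    return f"Evaluated {len(results)} fixture(s)", results, []

if __name__ == "__main__":
    demo_fixtures = [{"test_id": "t1", "user_query": "My mother keeps wandering at night.",
                      "ground_truth": {"expected_route": "caregiving_scenario"}}]
    print(evaluate_fixtures(demo_fixtures)[0])

Relative to the block removed in the second hunk, the guard now sits at the top of the iteration, before any chat-history or RAG setup, and it normalizes the detected route with _norm before comparing it to the expected one.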