Update evaluate.py
evaluate.py  CHANGED  (+29 -48)
@@ -352,6 +352,31 @@ def run_comprehensive_evaluation(
         ground_truth = fx.get("ground_truth", {})
         expected_route = _norm(ground_truth.get("expected_route", "caregiving_scenario"))
         expected_tags = ground_truth.get("expected_tags", {})
+        expected_sources = ground_truth.get("expected_sources", [])
+
+        # --- 2. NLU-ONLY GUARD CLAUSE ---
+        if NLU_ONLY_TEST:
+            actual_route = _norm(route_query_type(query))
+            actual_tags = {}
+            if "caregiving_scenario" in actual_route:
+                actual_tags = detect_tags_from_query(
+                    query, nlu_vectorstore=nlu_vectorstore,
+                    behavior_options=config["behavior_tags"], emotion_options=config["emotion_tags"],
+                    topic_options=config["topic_tags"], context_options=config["context_tags"],
+                )
+            results.append({
+                "test_id": test_id, "title": fx.get("title", "N/A"), "user_query": query,
+                "actual_route": actual_route, "expected_route": expected_route,
+                "route_correct": 1 if actual_route == expected_route else 0,
+                "actual_tags": actual_tags, "expected_tags": expected_tags,
+                "raw_sources": [], "expected_sources": expected_sources, "answer": "(NLU_ONLY_TEST)",
+                "context_precision": None, "context_recall": None, "recall_at_5": None,
+                "answer_correctness": None, "faithfulness_score": None, "latency_ms": 0
+            })
+            continue  # Skip to the next test case
+        # END if NLU_ONLY_TEST:
+
+        # --- 3. FULL RAG PIPELINE (only runs if NLU_ONLY_TEST is False) ---
         actual_route = _norm(route_query_type(query))
         route_correct = (actual_route == expected_route)
 
@@ -384,46 +409,7 @@ def run_comprehensive_evaluation(
             role=current_test_role,
             for_evaluation=True
         )
-
-
-        # --- START MODIFICATION ---
-        if NLU_ONLY_TEST:
-            # 1. Run only the NLU parts
-            actual_route = route_query_type(user_query)
-            actual_tags = detect_tags_from_query(user_query, actual_route)
-
-            # 2. Add the NLU results to your list
-            results.append({
-                "test_id": test_id,
-                "title": title,
-                "user_query": user_query,
-                "actual_route": actual_route,
-                "expected_route": expected_route,
-                "route_correct": 1 if actual_route == expected_route else 0,
-                "actual_tags": actual_tags,
-                "expected_tags": expected_tags,
-                # Set RAG metrics to default/None values
-                "raw_sources": [],
-                "expected_sources": expected_sources,
-                "answer": "(NLU_ONLY_TEST)",
-                "context_precision": None,
-                "context_recall": None,
-                "recall_at_5": None,
-                "answer_correctness": None,
-                "faithfulness_score": None,
-                "latency_ms": 0
-            })
-
-            # 3. Use 'continue' to skip the rest of the loop and go to the next test case
-            continue
-        # --- END MODIFICATION ---
-
-        # ####################################################################
-        # ALL OF YOUR ORIGINAL RAG PIPELINE CODE STAYS HERE.
-        # IT IS NOT INDENTED AND ONLY RUNS IF NLU_ONLY_TEST IS FALSE.
-        # ####################################################################
-
-
+
         t0 = time.time()
         response = answer_query(rag_chain, query, query_type=actual_route, chat_history=api_chat_history, **final_tags)
         latency_ms = round((time.time() - t0) * 1000.0, 1)
@@ -531,10 +517,9 @@ def run_comprehensive_evaluation(
             "latency_ms": latency_ms
         })
 
-        #
-
-
-        # ####################################################################
+    # --- 4. FINAL SUMMARY AND RETURN SECTION ---
+    if not results:
+        return "No valid test fixtures found to evaluate.", [], []
 
     df = pd.DataFrame(results)
     summary_text, table_rows, headers = "No valid test fixtures found to evaluate.", [], []
@@ -562,10 +547,6 @@ def run_comprehensive_evaluation(
         tf1_mean = to_f(df["topic_f1"]).mean() * 100
         cf1_mean = to_f(df["context_f1"]).mean() * 100
 
-
-
-
-
         # --- START: CORRECTED SUMMARY LOGIC ---
         # 1. Start building the summary_text string with the common parts
         summary_text = f"""## Evaluation Summary (Mode: {'NLU-Only' if NLU_ONLY_TEST else 'Full RAG'})