Update evaluate.py
evaluate.py CHANGED (+81 -124)

@@ -131,52 +131,63 @@ def _classify_error(gt: str, gen: str) -> str:
        return "omission"
    return "contradiction"

+## NEW
+# In evaluate.py

def run_comprehensive_evaluation(
-    vs_general:
-
-
-
+    vs_general: "Chroma",
+    nlu_vectorstore: "Chroma",
+    config: Dict[str, Any],
+    storage_path: Path
):
    global test_fixtures
    if not test_fixtures:
-        return
+        # The return signature is now back to 3 items.
+        return "No test fixtures loaded.", [], []
+
+    vs_personal_test = None
+    personal_context_docs = []
+    personal_context_file = "sample_data/1 Complaints of a Dutiful Daughter.txt"
+
+    if os.path.exists(personal_context_file):
+        print(f"Found personal context file for evaluation: '{personal_context_file}'")
+        with open(personal_context_file, "r", encoding="utf-8") as f:
+            content = f.read()
+        doc = Document(page_content=content, metadata={"source": os.path.basename(personal_context_file)})
+        personal_context_docs.append(doc)
+    else:
+        print(f"WARNING: Personal context file not found at '{personal_context_file}'. Factual tests will likely fail.")

+    vs_personal_test = build_or_load_vectorstore(
+        personal_context_docs,
+        index_path="tmp/eval_personal_index",
+        is_personal=True
+    )
+    print(f"Successfully created temporary personal vectorstore with {len(personal_context_docs)} document(s) for this evaluation run.")
+
    def _norm(label: str) -> str:
        label = (label or "").strip().lower()
        return "factual_question" if "factual" in label else label

    print("Starting comprehensive evaluation...")
    results: List[Dict[str, Any]] = []
-
-    # ADD THESE LINES:
    total_fixtures = len(test_fixtures)
    print(f"\n🚀 STARTING EVALUATION on {total_fixtures} test cases...")

-    # In evaluate.py, before the evaluation loop
-    print("--- DEBUG: Checking personal vector store before evaluation ---")
-    if vs_personal and hasattr(vs_personal.docstore, '_dict'):
-        print(f"Personal vector store contains {len(vs_personal.docstore._dict)} documents.")
-    else:
-        print("Personal vector store appears to be empty or invalid.")
-
-    # REPLACE the original for loop with this one to get the counter 'i'
    for i, fx in enumerate(test_fixtures):
-        # for fx in test_fixtures:
        test_id = fx.get("test_id", "N/A")
-        # This print statement now works because we have 'i'
        print(f"--- Processing Test Case {i+1}/{total_fixtures}: ID = {test_id} ---")

-
        turns = fx.get("turns") or []
        api_chat_history = [{"role": t.get("role"), "content": t.get("text")} for t in turns]
        query = next((t["content"] for t in reversed(api_chat_history) if (t.get("role") or "user").lower() == "user"), "")
        if not query: continue

+        print(f'Query: "{query}"')
+
        ground_truth = fx.get("ground_truth", {})
        expected_route = _norm(ground_truth.get("expected_route", "caregiving_scenario"))
        expected_tags = ground_truth.get("expected_tags", {})
-
        actual_route = _norm(route_query_type(query))
        route_correct = (actual_route == expected_route)

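
Note: build_or_load_vectorstore is defined elsewhere in the repo, not in this diff, so the sketch below is only a guess at its shape. It assumes the LangChain Chroma wrapper and a HuggingFace sentence-transformer embedding model, and mirrors the call above (documents, index_path, is_personal).

# Hypothetical sketch only; the real helper may load an existing index, use a
# different embedding model, or treat is_personal differently.
from typing import List
from langchain_core.documents import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

def build_or_load_vectorstore(docs: List[Document], index_path: str, is_personal: bool = False) -> Chroma:
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    # A dedicated persist_directory keeps this throwaway evaluation index separate
    # from the main store; a real implementation would also guard against an empty docs list.
    return Chroma.from_documents(documents=docs, embedding=embeddings, persist_directory=index_path)
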
@@ -203,25 +214,31 @@ def run_comprehensive_evaluation(
        }

        current_test_role = fx.get("test_role", "patient")
-        rag_chain = make_rag_chain(
-
+        rag_chain = make_rag_chain(
+            vs_general, vs_personal_test, nlu_vectorstore=nlu_vectorstore,
+            config=config, role=current_test_role, for_evaluation=True
+        )
+
        t0 = time.time()
        response = answer_query(rag_chain, query, query_type=actual_route, chat_history=api_chat_history, **final_tags)
        latency_ms = round((time.time() - t0) * 1000.0, 1)
        answer_text = response.get("answer", "ERROR")
+        ground_truth_answer = ground_truth.get("ground_truth_answer")
+
+        category = _categorize_test(test_id)
+        error_class = _classify_error(ground_truth_answer, answer_text)

        expected_sources_set = set(map(str, ground_truth.get("expected_sources", [])))
        raw_sources = response.get("sources", [])
        actual_sources_set = set(map(str, raw_sources if isinstance(raw_sources, (list, tuple)) else [raw_sources]))

-        # --- START: ADD THIS STRATEGIC PRINT BLOCK ---
        print("\n" + "-"*20 + " SOURCE EVALUATION " + "-"*20)
        print(f" - Expected: {sorted(list(expected_sources_set))}")
        print(f" - Actual: {sorted(list(actual_sources_set))}")

        true_positives = expected_sources_set.intersection(actual_sources_set)
        false_positives = actual_sources_set - expected_sources_set
-        false_negatives = expected_sources_set - actual_sources_set
+        false_negatives = expected_sources_set - actual_sources_set

        if not false_positives and not false_negatives:
            print(" - Result: ✅ Perfect Match!")
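
Note: the source bookkeeping above is plain set arithmetic, and the next hunk turns the same sets into context precision/recall. A toy run with invented file names:

# Toy illustration (file names are made up).
expected = {"care_guide.pdf", "medication_notes.txt"}
actual = {"care_guide.pdf", "forum_post.html"}

true_positives = expected & actual    # {'care_guide.pdf'}
false_positives = actual - expected   # {'forum_post.html'}      -> hurts precision
false_negatives = expected - actual   # {'medication_notes.txt'} -> hurts recall

# As computed in the next hunk:
context_precision = len(true_positives) / len(actual)    # 1 / 2 = 0.5
context_recall = len(true_positives) / len(expected)     # 1 / 2 = 0.5
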
@@ -231,35 +248,34 @@ def run_comprehensive_evaluation(
        if false_negatives:
            print(f" - 🔻 False Negatives (hurts recall): {sorted(list(false_negatives))}")
        print("-"*59 + "\n")
-
-
+
        context_precision, context_recall = 0.0, 0.0
        if expected_sources_set or actual_sources_set:
-
-            if len(actual_sources_set) > 0: context_precision =
-            if len(expected_sources_set) > 0: context_recall =
+            tp = len(expected_sources_set.intersection(actual_sources_set))
+            if len(actual_sources_set) > 0: context_precision = tp / len(actual_sources_set)
+            if len(expected_sources_set) > 0: context_recall = tp / len(expected_sources_set)
        elif not expected_sources_set and not actual_sources_set:
            context_precision, context_recall = 1.0, 1.0

-
-
-
+        print("\n" + "-"*20 + " ANSWER & CORRECTNESS EVALUATION " + "-"*20)
+        print(f" - Ground Truth Answer: {ground_truth_answer}")
+        print(f" - Generated Answer: {answer_text}")
+        print("-" * 59)

+        answer_correctness_score = None
        if ground_truth_answer and "ERROR" not in answer_text:
            try:
                judge_msg = ANSWER_CORRECTNESS_JUDGE_PROMPT.format(ground_truth_answer=ground_truth_answer, generated_answer=answer_text)
+                print(f" - Judge Prompt Sent:\n{judge_msg}")
                raw_correctness = call_llm([{"role": "user", "content": judge_msg}], temperature=0.0)
+                print(f" - Judge Raw Response: {raw_correctness}")
                correctness_data = _parse_judge_json(raw_correctness)
-
                if correctness_data and "correctness_score" in correctness_data:
                    answer_correctness_score = float(correctness_data["correctness_score"])
-
+                    print(f" - Final Score: {answer_correctness_score}")
            except Exception as e:
                print(f"ERROR during answer correctness judging: {e}")

-        # --- NEW: derive error class for diagnostics ---
-        error_class = _classify_error(ground_truth_answer, answer_text)
-
        faithfulness = None
        source_docs = response.get("source_documents", [])
        if source_docs and "ERROR" not in answer_text:
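
Note: _parse_judge_json is another helper that is not shown in this diff. Given how it is used above (it must turn the judge's raw reply into a dict containing "correctness_score"), a minimal sketch might look like the following; the regex-based fence stripping is an assumption, not the project's actual code.

import json
import re

def _parse_judge_json(raw: str):
    # Hypothetical: grab the first {...} block so markdown fences or preamble
    # text around the judge's JSON do not break json.loads.
    match = re.search(r"\{.*\}", raw or "", re.DOTALL)
    if not match:
        return None
    try:
        return json.loads(match.group(0))
    except json.JSONDecodeError:
        return None
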
@@ -279,9 +295,6 @@ def run_comprehensive_evaluation(
        sources_pretty = ", ".join(sorted(s)) if (s:=actual_sources_set) else ""
        results.append({
            "test_id": fx.get("test_id", "N/A"), "title": fx.get("title", "N/A"),
-            # NEW for debugging
-            "category": _categorize_test(test_id), "error_class": error_class,
-            # END
            "route_correct": "✅" if route_correct else "❌", "expected_route": expected_route, "actual_route": actual_route,
            "behavior_f1": f"{behavior_metrics['f1_score']:.2f}", "emotion_f1": f"{emotion_metrics['f1_score']:.2f}",
            "topic_f1": f"{topic_metrics['f1_score']:.2f}", "context_f1": f"{context_metrics['f1_score']:.2f}",
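
Note: each loop iteration appends one flat dict to results, and the next hunk collects those dicts into a pandas DataFrame, keeping only the columns that actually exist. A minimal illustration of that idiom with two invented rows:

import pandas as pd

# Invented rows, shaped like the dicts built above.
rows = [
    {"test_id": "T1", "route_correct": "✅", "answer_correctness": 1.0, "category": "factual"},
    {"test_id": "T2", "route_correct": "❌", "answer_correctness": 0.5, "category": "scenario"},
]
df = pd.DataFrame(rows)

cols = ["test_id", "route_correct", "answer_correctness", "category", "latency_ms"]
df = df[[c for c in cols if c in df.columns]]  # silently drops the missing 'latency_ms'
print(df.to_string(index=False))
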
@@ -289,110 +302,54 @@ def run_comprehensive_evaluation(
            "latency_ms": latency_ms, "faithfulness": faithfulness,
            "context_precision": context_precision, "context_recall": context_recall,
            "answer_correctness": answer_correctness_score,
+            "category": category,
+            "error_class": error_class
        })

    df = pd.DataFrame(results)
-
+    summary_text, table_rows, headers = "No valid test fixtures found to evaluate.", [], []
+
    if not df.empty:
-        cols = [
-            "test_id", "title", "route_correct", "expected_route", "actual_route",
-            "context_precision", "context_recall", "faithfulness", "answer_correctness",
-            "behavior_f1", "emotion_f1", "topic_f1", "context_f1",
-            "source_count", "latency_ms", "sources", "generated_answer"
-        ]
+        cols = ["test_id", "title", "route_correct", "expected_route", "actual_route", "context_precision", "context_recall", "faithfulness", "answer_correctness", "behavior_f1", "emotion_f1", "topic_f1", "context_f1", "source_count", "latency_ms", "sources", "generated_answer", "category", "error_class"]
        df = df[[c for c in cols if c in df.columns]]
+        output_path = "evaluation_results.csv"
        df.to_csv(output_path, index=False, encoding="utf-8")
        print(f"Evaluation results saved to {output_path}")

-
-
-
-
-        logf.write(
-        logf.write(df.to_string(index=False))
+        log_path = storage_path / "evaluation_log.txt"
+        with open(log_path, "w", encoding="utf-8") as logf:
+            logf.write("===== Detailed Evaluation Run =====\n")
+            df_string = df.to_string(index=False)
+            logf.write(df_string)
            logf.write("\n\n")

-
-
-
-
-        print(cat_means.to_string(index=False))
-        with open("evaluation_log.txt", "a", encoding="utf-8") as logf:
+            try:
+                cat_means = df.groupby("category")["answer_correctness"].mean().reset_index()
+                print("\n📊 Correctness by Category:")
+                print(cat_means.to_string(index=False))
                logf.write("\n📊 Correctness by Category:\n")
                logf.write(cat_means.to_string(index=False))
                logf.write("\n")
-
-
-
-
-
-
-
-        print("\n📊 Error Class Distribution by Category:")
-        print(confusion.to_string())
-        with open("evaluation_log.txt", "a", encoding="utf-8") as logf:
+            except Exception as e:
+                print(f"WARNING: Could not compute category breakdown: {e}")
+
+            try:
+                confusion = pd.crosstab(df["category"], df["error_class"], rownames=["Category"], colnames=["Error Class"], dropna=False)
+                print("\n📊 Error Class Distribution by Category:")
+                print(confusion.to_string())
                logf.write("\n📊 Error Class Distribution by Category:\n")
                logf.write(confusion.to_string())
                logf.write("\n")
-
-
-
-
-        # NEW: save detailed results
-        df.to_csv("evaluation_results_detailed.csv", index=False, encoding="utf-8")
-
-        # NEW: per-category averages
-        try:
-            cat_means = df.groupby("category")["answer_correctness"].mean().reset_index()
-            print("\n📊 Correctness by Category:")
-            print(cat_means.to_string(index=False))
-            cat_means.to_csv("evaluation_correctness_by_category.csv", index=False)
-        except Exception as e:
-            print(f"WARNING: Could not compute category breakdown: {e}")
-
-        # NEW: confusion-style matrix
-        try:
-            confusion = pd.crosstab(df.get("category", []), df.get("error_class", []),
-                rownames=["Category"], colnames=["Error Class"], dropna=False)
-            print("\n📊 Error Class Distribution by Category:")
-            print(confusion.to_string())
-            confusion.to_csv("evaluation_confusion_matrix.csv")
-        except Exception as e:
-            print(f"WARNING: Could not build confusion matrix: {e}")
-
+            except Exception as e:
+                print(f"WARNING: Could not build confusion matrix: {e}")

        pct = df["route_correct"].value_counts(normalize=True).get("✅", 0) * 100
        to_f = lambda s: pd.to_numeric(s, errors="coerce")
-
-
-        cr_mean = to_f(df["context_recall"]).mean()
-        faith_mean = to_f(df["faithfulness"]).mean()
-        correct_mean = to_f(df["answer_correctness"]).mean()
-        rag_with_sources_pct = (df["source_count"] > 0).mean() * 100 if "source_count" in df else 0
-
-        summary_text = f"""
-## Evaluation Summary
-- **Routing Accuracy**: {pct:.2f}%
-- **Behaviour F1 (avg)**: {(to_f(df["behavior_f1"]).mean() * 100):.2f}%
-- **Emotion F1 (avg)**: {(to_f(df["emotion_f1"]).mean() * 100):.2f}%
-- **Topic F1 (avg)**: {(to_f(df["topic_f1"]).mean() * 100):.2f}%
-- **Context F1 (avg)**: {(to_f(df["context_f1"]).mean() * 100):.2f}%
-- **RAG: Context Precision**: {"N/A" if pd.isna(cp_mean) else f'{(cp_mean * 100):.1f}%'}
-- **RAG: Context Recall**: {"N/A" if pd.isna(cr_mean) else f'{(cr_mean * 100):.1f}%'}
-- **RAG: Faithfulness (LLM-judge)**: {"N/A" if pd.isna(faith_mean) else f'{(faith_mean * 100):.1f}%'}
-- **RAG: Answer Correctness (LLM-judge)**: {"N/A" if pd.isna(correct_mean) else f'{(correct_mean * 100):.1f}%'}
-- **RAG Answers w/ Sources**: {rag_with_sources_pct:.1f}%
-- **RAG: Avg Latency (ms)**: {to_f(df["latency_ms"]).mean():.1f}
-"""
-        df_display = df.rename(columns={
-            "context_precision": "Ctx. Precision", "context_recall": "Ctx. Recall",
-            "answer_correctness": "Answer Correct.", "faithfulness": "Faithfulness",
-            "behavior_f1": "Behav. F1", "emotion_f1": "Emo. F1", "topic_f1": "Topic F1", "context_f1": "Ctx. F1"
-        })
+        summary_text = f"""## Evaluation Summary\n- **Routing Accuracy**: {pct:.2f}%\n- **RAG: Context Precision**: {(to_f(df["context_precision"]).mean() * 100):.1f}%\n- **RAG: Context Recall**: {(to_f(df["context_recall"]).mean() * 100):.1f}%\n- **RAG: Answer Correctness (LLM-judge)**: {(to_f(df["answer_correctness"]).mean() * 100):.1f}%"""
+        df_display = df.rename(columns={"context_precision": "Ctx. Precision", "context_recall": "Ctx. Recall"})
        table_rows = df_display.values.tolist()
        headers = df_display.columns.tolist()
-
-        summary_text = "No valid test fixtures found to evaluate."
-        table_rows, headers = [], []
-
+
    return summary_text, table_rows, headers
+
+## END
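
Note: the per-category breakdowns written to the log come straight from pandas: groupby gives mean correctness per category and pd.crosstab gives a category-by-error-class count matrix. A self-contained demo with invented data shaped like the results table:

import pandas as pd

df = pd.DataFrame({
    "category":           ["factual", "factual", "scenario", "scenario"],
    "error_class":        ["none", "omission", "none", "contradiction"],
    "answer_correctness": [1.0, 0.4, 0.9, 0.3],
})

cat_means = df.groupby("category")["answer_correctness"].mean().reset_index()
print(cat_means.to_string(index=False))   # factual -> 0.7, scenario -> 0.6

confusion = pd.crosstab(df["category"], df["error_class"],
                        rownames=["Category"], colnames=["Error Class"], dropna=False)
print(confusion.to_string())              # counts of each error class per category
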