KeenWoo committed
Commit 8ec6b4d · verified · 1 Parent(s): f208226

Update evaluate.py

Files changed (1):
  1. evaluate.py  +30 -8
evaluate.py CHANGED
@@ -338,7 +338,36 @@ def run_comprehensive_evaluation(
 
     if not df.empty:
         cols = ["test_id", "title", "route_correct", "expected_route", "actual_route", "context_precision", "context_recall", "faithfulness", "answer_correctness", "behavior_f1", "emotion_f1", "topic_f1", "context_f1", "source_count", "latency_ms", "sources", "generated_answer", "category", "error_class"]
-        df = df[[c for c in cols if c in df.columns]]
+        df = df[[c for c in cols if c in df.columns]]
+
+        # --- START OF MODIFICATION ---
+        pct = df["route_correct"].value_counts(normalize=True).get("✅", 0) * 100
+        to_f = lambda s: pd.to_numeric(s, errors="coerce")
+
+        # Calculate the mean for the NLU F1 scores
+        bf1_mean = to_f(df["behavior_f1"]).mean() * 100
+        ef1_mean = to_f(df["emotion_f1"]).mean() * 100
+        tf1_mean = to_f(df["topic_f1"]).mean() * 100
+        cf1_mean = to_f(df["context_f1"]).mean() * 100
+
+        # Add the NLU metrics to the summary f-string
+        summary_text = f"""## Evaluation Summary
+- **Routing Accuracy**: {pct:.2f}%
+- **Behaviour F1 (avg)**: {bf1_mean:.2f}%
+- **Emotion F1 (avg)**: {ef1_mean:.2f}%
+- **Topic F1 (avg)**: {tf1_mean:.2f}%
+- **Context F1 (avg)**: {cf1_mean:.2f}%
+- **RAG: Context Precision**: {(to_f(df["context_precision"]).mean() * 100):.1f}%
+- **RAG: Context Recall**: {(to_f(df["context_recall"]).mean() * 100):.1f}%
+- **RAG: Answer Correctness (LLM-judge)**: {(to_f(df["answer_correctness"]).mean() * 100):.1f}%"""
+        # --- END OF MODIFICATION ---
+
+
+        df_display = df.rename(columns={"context_precision": "Ctx. Precision", "context_recall": "Ctx. Recall"})
+        table_rows = df_display.values.tolist()
+        headers = df_display.columns.tolist()
+
+
         output_path = "evaluation_results.csv"
         df.to_csv(output_path, index=False, encoding="utf-8")
         print(f"Evaluation results saved to {output_path}")
@@ -369,13 +398,6 @@ def run_comprehensive_evaluation(
             logf.write("\n")
     except Exception as e:
         print(f"WARNING: Could not build confusion matrix: {e}")
-
-    pct = df["route_correct"].value_counts(normalize=True).get("✅", 0) * 100
-    to_f = lambda s: pd.to_numeric(s, errors="coerce")
-    summary_text = f"""## Evaluation Summary\n- **Routing Accuracy**: {pct:.2f}%\n- **RAG: Context Precision**: {(to_f(df["context_precision"]).mean() * 100):.1f}%\n- **RAG: Context Recall**: {(to_f(df["context_recall"]).mean() * 100):.1f}%\n- **RAG: Answer Correctness (LLM-judge)**: {(to_f(df["answer_correctness"]).mean() * 100):.1f}%"""
-    df_display = df.rename(columns={"context_precision": "Ctx. Precision", "context_recall": "Ctx. Recall"})
-    table_rows = df_display.values.tolist()
-    headers = df_display.columns.tolist()
 
     return summary_text, table_rows, headers
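For reference, a minimal standalone sketch (not part of the commit) of what the newly added summary block computes. The DataFrame, its rows, and the values below are made up for illustration; in evaluate.py the frame is built by run_comprehensive_evaluation.

import pandas as pd

# Toy results frame with the columns the new summary block reads.
# Values are hypothetical; real rows come from the evaluation run.
df = pd.DataFrame({
    "route_correct": ["✅", "✅", "❌"],
    "behavior_f1": [1.0, 0.5, 0.0],
    "emotion_f1": [0.8, 0.6, 0.4],
    "topic_f1": [1.0, 1.0, 0.5],
    "context_f1": [0.9, 0.7, "n/a"],  # non-numeric cell becomes NaN below
})

# Share of rows routed correctly, as a percentage.
pct = df["route_correct"].value_counts(normalize=True).get("✅", 0) * 100

# Coerce to numeric so stray strings don't break the averages.
to_f = lambda s: pd.to_numeric(s, errors="coerce")

bf1_mean = to_f(df["behavior_f1"]).mean() * 100
ef1_mean = to_f(df["emotion_f1"]).mean() * 100
tf1_mean = to_f(df["topic_f1"]).mean() * 100
cf1_mean = to_f(df["context_f1"]).mean() * 100

print(f"Routing Accuracy: {pct:.2f}%")
print(f"Behaviour F1 (avg): {bf1_mean:.2f}%  Emotion F1 (avg): {ef1_mean:.2f}%")
print(f"Topic F1 (avg): {tf1_mean:.2f}%  Context F1 (avg): {cf1_mean:.2f}%")

Because mean() skips NaN by default, cells that fail the numeric coercion reduce the sample size for that metric rather than poisoning the average.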