Spaces:
Running
Running
Update evaluate.py
Browse files- evaluate.py +44 -3
evaluate.py
CHANGED
|
@@ -8,9 +8,11 @@ import pandas as pd
|
|
| 8 |
from typing import List, Dict, Any
|
| 9 |
from pathlib import Path
|
| 10 |
|
| 11 |
-
# ---
|
| 12 |
-
|
|
|
|
| 13 |
|
|
|
|
| 14 |
try:
|
| 15 |
from alz_companion.agent import (
|
| 16 |
make_rag_chain, route_query_type, detect_tags_from_query,
|
|
@@ -379,6 +381,45 @@ def run_comprehensive_evaluation(
|
|
| 379 |
role=current_test_role,
|
| 380 |
for_evaluation=True
|
| 381 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 382 |
|
| 383 |
t0 = time.time()
|
| 384 |
response = answer_query(rag_chain, query, query_type=actual_route, chat_history=api_chat_history, **final_tags)
|
|
@@ -483,7 +524,7 @@ def run_comprehensive_evaluation(
|
|
| 483 |
"faithfulness": faithfulness, "hallucination_rate": hallucination_rate,
|
| 484 |
"answer_correctness": answer_correctness_score,
|
| 485 |
"category": category, "error_class": error_class,
|
| 486 |
-
"recall_at_5": recall_at_5 # <-- ADD THIS LINE
|
| 487 |
"latency_ms": latency_ms
|
| 488 |
})
|
| 489 |
|
|
|
|
| 8 |
from typing import List, Dict, Any
|
| 9 |
from pathlib import Path
|
| 10 |
|
| 11 |
+
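# --- Evaluation mode flag: when True, only the NLU steps (query routing and tag detection) are evaluated and the RAG pipeline is skipped ---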
|
| 12 |
+
NLU_ONLY_TEST = True
|
| 13 |
+
# ---------------------
|
| 14 |
|
| 15 |
+
# --- Imports from the main application ---
|
| 16 |
try:
|
| 17 |
from alz_companion.agent import (
|
| 18 |
make_rag_chain, route_query_type, detect_tags_from_query,
|
|
|
|
| 381 |
role=current_test_role,
|
| 382 |
for_evaluation=True
|
| 383 |
)
|
| 384 |
+
|
| 385 |
+
|
| 386 |
+
# --- START MODIFICATION ---
|
| 387 |
+
if NLU_ONLY_TEST:
|
| 388 |
+
# 1. Run only the NLU parts
|
| 389 |
+
actual_route = route_query_type(user_query)
|
| 390 |
+
actual_tags = detect_tags_from_query(user_query, actual_route)
|
| 391 |
+
|
| 392 |
+
# 2. Add the NLU results to your list
|
| 393 |
+
results.append({
|
| 394 |
+
"test_id": test_id,
|
| 395 |
+
"title": title,
|
| 396 |
+
"user_query": user_query,
|
| 397 |
+
"actual_route": actual_route,
|
| 398 |
+
"expected_route": expected_route,
|
| 399 |
+
"route_correct": 1 if actual_route == expected_route else 0,
|
| 400 |
+
"actual_tags": actual_tags,
|
| 401 |
+
"expected_tags": expected_tags,
|
| 402 |
+
# Set RAG metrics to default/None values
|
| 403 |
+
"raw_sources": [],
|
| 404 |
+
"expected_sources": expected_sources,
|
| 405 |
+
"answer": "(NLU_ONLY_TEST)",
|
| 406 |
+
"context_precision": None,
|
| 407 |
+
"context_recall": None,
|
| 408 |
+
"recall_at_5": None,
|
| 409 |
+
"answer_correctness": None,
|
| 410 |
+
"faithfulness_score": None,
|
| 411 |
+
"latency_ms": 0
|
| 412 |
+
})
|
| 413 |
+
|
| 414 |
+
# 3. Use 'continue' to skip the rest of the loop and go to the next test case
|
| 415 |
+
continue
|
| 416 |
+
# --- END MODIFICATION ---
|
| 417 |
+
|
| 418 |
+
# ####################################################################
|
| 419 |
+
# ALL OF YOUR ORIGINAL RAG PIPELINE CODE STAYS HERE.
|
| 420 |
+
# IT IS NOT NESTED UNDER THE `if NLU_ONLY_TEST:` BLOCK ABOVE, SO IT ONLY RUNS WHEN NLU_ONLY_TEST IS FALSE (THE `continue` SKIPS IT OTHERWISE).
|
| 421 |
+
# ####################################################################
|
| 422 |
+
|
| 423 |
|
| 424 |
t0 = time.time()
|
| 425 |
response = answer_query(rag_chain, query, query_type=actual_route, chat_history=api_chat_history, **final_tags)
|
|
|
|
| 524 |
"faithfulness": faithfulness, "hallucination_rate": hallucination_rate,
|
| 525 |
"answer_correctness": answer_correctness_score,
|
| 526 |
"category": category, "error_class": error_class,
|
| 527 |
+
"recall_at_5": recall_at_5, # <-- ADD THIS LINE
|
| 528 |
"latency_ms": latency_ms
|
| 529 |
})
|
| 530 |
|