Update evaluate.py
evaluate.py  CHANGED  (+74 -3)
@@ -52,8 +52,78 @@ except ImportError:
 
 
 # --- LLM-as-a-Judge Prompt for Answer Correctness ---
-# Aware of QUERY TYPE
-ANSWER_CORRECTNESS_JUDGE_PROMPT = """You are an expert evaluator. Your task is to assess a GENERATED_ANSWER against a GROUND_TRUTH_ANSWER based on the provided QUERY_TYPE and the scoring rubric below.
+# Aware of QUERY TYPE and ROLE
+# In prompts.py or evaluate.py
+ANSWER_CORRECTNESS_JUDGE_PROMPT = """You are an expert evaluator. Your task is to assess a GENERATED_ANSWER against a GROUND_TRUTH_ANSWER based on the provided context (QUERY_TYPE and USER_ROLE) and the scoring rubric below.
+
+--- CONTEXT FOR EVALUATION ---
+QUERY_TYPE: {query_type}
+USER_ROLE: {role}
+
+--- General Rules (Apply to ALL evaluations) ---
+- Ignore minor differences in phrasing, tone, or structure. Your evaluation should be based on the substance of the answer, not its style.
+
+--- Scoring Rubric ---
+- 1.0 (Fully Correct): The generated answer contains all the key factual points and advice from the ground truth.
+- 0.8 (Mostly Correct): The generated answer captures the main point and is factually correct, but it misses a secondary detail or a specific actionable step.
+- 0.5 (Partially Correct): The generated answer is factually correct in what it states but is too generic or vague. It misses the primary advice or the most critical information.
+- 0.0 (Incorrect): The generated answer is factually incorrect, contains hallucinations, or contradicts the core advice of the ground truth.
+
+--- Specific Judging Criteria by Context ---
+- If QUERY_TYPE is 'caregiving_scenario' AND USER_ROLE is 'patient':
+  - Apply the rubric with a focus on **emotional support and validation**. The answer does NOT need to be factually exhaustive to get a high score.
+- If QUERY_TYPE is 'caregiving_scenario' AND USER_ROLE is 'caregiver':
+  - Apply the rubric with a focus on a **blend of empathy and practical, actionable advice**. The answer should be factually aligned with the ground truth.
+- If QUERY_TYPE is 'factual_question':
+  - Your evaluation should be based on **factual accuracy**. Any empathetic or conversational language should be ignored.
+- For all other QUERY_TYPEs:
+  - Default to applying the rubric with a focus on factual accuracy.
+
+--- Examples ---
+# Example for a 1.0 Score (Patient Role - Emotional Support)
+GROUND_TRUTH: It's frustrating when something important goes missing. I understand why you're upset. Why don't we look for it together?
+GENERATED_ANSWER: I hear how frustrating this is for you. You're not alone, let's try and find it together.
+Score: 1.0
+
+# --- NEW CAREGIVER EXAMPLE ---
+# Example for a 1.0 Score (Caregiver Role - Empathy + Action)
+GROUND_TRUTH: This can be very trying. Repetitive questioning happens because the brain isn't retaining new information. Try to answer in a calm, reassuring tone each time.
+GENERATED_ANSWER: It can be very frustrating to answer the same question repeatedly. Remember that this is due to memory changes. The best approach is to stay patient and answer calmly.
+Score: 1.0
+# --- END NEW EXAMPLE ---
+
+# Example for a 0.8 Score (Mostly Correct but Incomplete)
+GROUND_TRUTH: A calm and reassuring approach is best. Instead of arguing, validate their feelings and suggest looking for the item together.
+GENERATED_ANSWER: It's important to stay calm and reassure them. You should tell them you understand they are upset.
+Score: 0.8
+
+# Example for a 0.5 Score (Partially Correct but Vague)
+GROUND_TRUTH: Repetitive questioning happens because the brain isn't retaining new info. Answer calmly, and consider writing the answer on a visible whiteboard.
+GENERATED_ANSWER: It's important to be patient when they ask the same question over and over.
+Score: 0.5
+
+# Example for a 0.0 Score (Contradicts Core Advice)
+GROUND_TRUTH: A calm and reassuring approach is best. Try not to argue about the facts.
+GENERATED_ANSWER: You need to firmly correct him and explain that the carer did not steal his watch. It is important to confront these delusions directly with facts.
+Score: 0.0
+---
+
+--- DATA TO EVALUATE ---
+GROUND_TRUTH_ANSWER:
+{ground_truth_answer}
+
+GENERATED_ANSWER:
+{generated_answer}
+---
+
+Return a single JSON object with your score based on the rubric and examples:
+{{
+"correctness_score": <float>
+}}
+"""
+
+
+ORIG_ANSWER_CORRECTNESS_JUDGE_PROMPT = """You are an expert evaluator. Your task is to assess a GENERATED_ANSWER against a GROUND_TRUTH_ANSWER based on the provided QUERY_TYPE and the scoring rubric below.
 
 QUERY_TYPE: {query_type}
 
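Note on the new template (not part of the commit): the JSON skeleton at the end of the prompt doubles its braces, {{ and }}, so that str.format() emits them as literal braces; only the four named placeholders take keyword arguments, and omitting any of them raises KeyError. A minimal sketch with illustrative values:

# Sketch only: fills the template above with sample values.
judge_msg = ANSWER_CORRECTNESS_JUDGE_PROMPT.format(
    ground_truth_answer="Answer calmly and avoid arguing about the facts.",
    generated_answer="Stay patient and respond in a reassuring tone.",
    query_type="caregiving_scenario",  # a route named in the rubric
    role="caregiver",                  # 'patient' or 'caregiver'
)
print(judge_msg)  # the filled prompt sent to the judge model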
@@ -343,7 +413,8 @@ def run_comprehensive_evaluation(
         judge_msg = ANSWER_CORRECTNESS_JUDGE_PROMPT.format(
             ground_truth_answer=ground_truth_answer,
             generated_answer=answer_text,
-            query_type=expected_route  # <-- Add this line
+            query_type=expected_route,  # <-- Add this line
+            role=current_test_role  # <-- ADD THIS LINE
         )
         # judge_msg = ANSWER_CORRECTNESS_JUDGE_PROMPT.format(ground_truth_answer=ground_truth_answer, generated_answer=answer_text)
         # print(f"  - Judge Prompt Sent:\n{judge_msg}")