import json
import pandas as pd
from typing import Dict, List, Any
from difflib import SequenceMatcher
import re


class AnswerComparator:
    def __init__(self, metadata_path: str = "data/metadata.jsonl"):
        """Initialize the comparator with ground truth data."""
        self.ground_truth = self._load_ground_truth(metadata_path)
        print(f"Loaded ground truth for {len(self.ground_truth)} questions")

    def _load_ground_truth(self, metadata_path: str) -> Dict[str, str]:
        """Load ground truth answers from metadata.jsonl file."""
        ground_truth = {}
        try:
            with open(metadata_path, 'r', encoding='utf-8') as f:
                for line in f:
                    if line.strip():
                        data = json.loads(line)
                        task_id = data.get("task_id")
                        final_answer = data.get("Final answer")
                        if task_id and final_answer is not None:
                            ground_truth[task_id] = str(final_answer)
        except FileNotFoundError:
            print(f"Warning: Ground truth file {metadata_path} not found")
        except Exception as e:
            print(f"Error loading ground truth: {e}")

        return ground_truth
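
    # Illustrative shape of one metadata.jsonl line, inferred from the loader
    # above (field names match what _load_ground_truth reads; the values are
    # hypothetical placeholders, not real task data):
    #   {"task_id": "task-001", "Final answer": "42"}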

    def normalize_answer(self, answer: str) -> str:
        """Normalize answer for comparison."""
        if answer is None:
            return ""

        # Convert to string and strip whitespace
        answer = str(answer).strip()

        # Convert to lowercase for case-insensitive comparison
        answer = answer.lower()

        # Strip common punctuation that should not affect correctness
        answer = re.sub(r'[.,;:!?"\']', '', answer)

        # Normalize whitespace
        answer = re.sub(r'\s+', ' ', answer)

        return answer
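
    # Example of the normalization pipeline above (illustrative values):
    #   normalize_answer('  "Paris, France!"  ')  ->  'paris france'
    #   normalize_answer('Three') == normalize_answer('three')  ->  True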

    def exact_match(self, predicted: str, actual: str) -> bool:
        """Check if answers match exactly after normalization."""
        return self.normalize_answer(predicted) == self.normalize_answer(actual)

    def similarity_score(self, predicted: str, actual: str) -> float:
        """Calculate similarity score between predicted and actual answers."""
        normalized_pred = self.normalize_answer(predicted)
        normalized_actual = self.normalize_answer(actual)

        if not normalized_pred and not normalized_actual:
            return 1.0
        if not normalized_pred or not normalized_actual:
            return 0.0

        return SequenceMatcher(None, normalized_pred, normalized_actual).ratio()

    def contains_answer(self, predicted: str, actual: str) -> bool:
        """Check if the actual answer is contained in the predicted answer."""
        normalized_pred = self.normalize_answer(predicted)
        normalized_actual = self.normalize_answer(actual)

        return normalized_actual in normalized_pred

    def evaluate_answer(self, task_id: str, predicted_answer: str) -> Dict[str, Any]:
        """Evaluate a single answer against ground truth."""
        actual_answer = self.ground_truth.get(task_id)

        if actual_answer is None:
            return {
                "task_id": task_id,
                "predicted_answer": predicted_answer,
                "actual_answer": None,
                "exact_match": False,
                "similarity_score": 0.0,
                "contains_answer": False,
                "error": "No ground truth available"
            }

        return {
            "task_id": task_id,
            "predicted_answer": predicted_answer,
            "actual_answer": actual_answer,
            "exact_match": self.exact_match(predicted_answer, actual_answer),
            "similarity_score": self.similarity_score(predicted_answer, actual_answer),
            "contains_answer": self.contains_answer(predicted_answer, actual_answer),
            "error": None
        }

    def evaluate_batch(self, results: List[Dict[str, Any]]) -> pd.DataFrame:
        """Evaluate a batch of results."""
        evaluations = []

        for result in results:
            # Accept either snake_case keys or the display-style keys used in the results log
            task_id = result.get("task_id") or result.get("Task ID")
            predicted_answer = result.get("submitted_answer") or result.get("Submitted Answer", "")

            if task_id is not None:
                evaluation = self.evaluate_answer(task_id, predicted_answer)
                evaluations.append(evaluation)

        return pd.DataFrame(evaluations)

    def get_summary_stats(self, evaluations_df: pd.DataFrame) -> Dict[str, Any]:
        """Get summary statistics from evaluations."""
        if evaluations_df.empty:
            return {"error": "No evaluations available"}

        # Filter out entries without ground truth
        valid_evaluations = evaluations_df[evaluations_df['error'].isna()]

        if valid_evaluations.empty:
            return {"error": "No valid ground truth available"}

        total_questions = len(valid_evaluations)
        # Cast numpy scalars to native Python types so the summary is easy to serialize
        exact_matches = int(valid_evaluations['exact_match'].sum())
        avg_similarity = float(valid_evaluations['similarity_score'].mean())
        contains_matches = int(valid_evaluations['contains_answer'].sum())

        return {
            "total_questions": total_questions,
            "exact_matches": exact_matches,
            "exact_match_rate": exact_matches / total_questions,
            "average_similarity": avg_similarity,
            "contains_matches": contains_matches,
            "contains_match_rate": contains_matches / total_questions,
            "questions_with_ground_truth": total_questions
        }

    def enhance_results_log(self, results_log: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Add comparison columns to results log."""
        enhanced_results = []

        for result in results_log:
            task_id = result.get("Task ID")
            predicted_answer = result.get("Submitted Answer", "")

            # Rows without a task id cannot be compared and are skipped
            if task_id is not None:
                evaluation = self.evaluate_answer(task_id, predicted_answer)

                # Add comparison info to result; use an explicit None check so a
                # falsy ground truth such as "0" is not reported as "N/A"
                enhanced_result = result.copy()
                actual = evaluation["actual_answer"]
                enhanced_result["Ground Truth"] = actual if actual is not None else "N/A"
                enhanced_result["Exact Match"] = evaluation["exact_match"]
                enhanced_result["Similarity"] = f"{evaluation['similarity_score']:.3f}"
                enhanced_result["Contains Answer"] = evaluation["contains_answer"]

                enhanced_results.append(enhanced_result)

        return enhanced_results
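

# Minimal usage sketch (not part of the original module): builds a comparator
# against the default metadata path and scores a small in-memory batch. The
# task ids and answers below are hypothetical; real runs would use the ids
# present in data/metadata.jsonl.
if __name__ == "__main__":
    comparator = AnswerComparator()

    sample_results = [
        {"task_id": "task-001", "submitted_answer": "Paris"},
        {"Task ID": "task-002", "Submitted Answer": "42"},
    ]

    evaluations = comparator.evaluate_batch(sample_results)
    print(evaluations)
    print(comparator.get_summary_stats(evaluations))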