import json
import re
from difflib import SequenceMatcher
from typing import Dict, List, Any

import pandas as pd


class AnswerComparator:
    def __init__(self, metadata_path: str = "data/metadata.jsonl"):
        """Initialize the comparator with ground truth data."""
        self.ground_truth = self._load_ground_truth(metadata_path)
        print(f"Loaded ground truth for {len(self.ground_truth)} questions")

    def _load_ground_truth(self, metadata_path: str) -> Dict[str, str]:
        """Load ground truth answers from metadata.jsonl file."""
        ground_truth = {}
        try:
            with open(metadata_path, 'r', encoding='utf-8') as f:
                for line in f:
                    if line.strip():
                        data = json.loads(line)
                        task_id = data.get("task_id")
                        final_answer = data.get("Final answer")
                        if task_id and final_answer is not None:
                            ground_truth[task_id] = str(final_answer)
        except FileNotFoundError:
            print(f"Warning: Ground truth file {metadata_path} not found")
        except Exception as e:
            print(f"Error loading ground truth: {e}")
        return ground_truth

    def normalize_answer(self, answer: str) -> str:
        """Normalize answer for comparison."""
        if answer is None:
            return ""
        # Convert to string and strip whitespace
        answer = str(answer).strip()
        # Convert to lowercase for case-insensitive comparison
        answer = answer.lower()
        # Remove common punctuation that might not affect correctness
        answer = re.sub(r'[.,;:!?"\']', '', answer)
        # Normalize whitespace
        answer = re.sub(r'\s+', ' ', answer)
        return answer

    def exact_match(self, predicted: str, actual: str) -> bool:
        """Check if answers match exactly after normalization."""
        return self.normalize_answer(predicted) == self.normalize_answer(actual)

    def similarity_score(self, predicted: str, actual: str) -> float:
        """Calculate similarity score between predicted and actual answers."""
        normalized_pred = self.normalize_answer(predicted)
        normalized_actual = self.normalize_answer(actual)
        if not normalized_pred and not normalized_actual:
            return 1.0
        if not normalized_pred or not normalized_actual:
            return 0.0
        return SequenceMatcher(None, normalized_pred, normalized_actual).ratio()

    def contains_answer(self, predicted: str, actual: str) -> bool:
        """Check if the actual answer is contained in the predicted answer."""
        normalized_pred = self.normalize_answer(predicted)
        normalized_actual = self.normalize_answer(actual)
        return normalized_actual in normalized_pred

    def evaluate_answer(self, task_id: str, predicted_answer: str) -> Dict[str, Any]:
        """Evaluate a single answer against ground truth."""
        actual_answer = self.ground_truth.get(task_id)
        if actual_answer is None:
            return {
                "task_id": task_id,
                "predicted_answer": predicted_answer,
                "actual_answer": None,
                "exact_match": False,
                "similarity_score": 0.0,
                "contains_answer": False,
                "error": "No ground truth available"
            }
        return {
            "task_id": task_id,
            "predicted_answer": predicted_answer,
            "actual_answer": actual_answer,
            "exact_match": self.exact_match(predicted_answer, actual_answer),
            "similarity_score": self.similarity_score(predicted_answer, actual_answer),
            "contains_answer": self.contains_answer(predicted_answer, actual_answer),
            "error": None
        }

    def evaluate_batch(self, results: List[Dict[str, Any]]) -> pd.DataFrame:
        """Evaluate a batch of results."""
        evaluations = []
        for result in results:
            task_id = result.get("task_id") or result.get("Task ID")
            predicted_answer = result.get("submitted_answer") or result.get("Submitted Answer", "")
            if task_id is not None:
                evaluation = self.evaluate_answer(task_id, predicted_answer)
                evaluations.append(evaluation)
        return pd.DataFrame(evaluations)

    def get_summary_stats(self, evaluations_df: pd.DataFrame) -> Dict[str, Any]:
        """Get summary statistics from evaluations."""
        if evaluations_df.empty:
            return {"error": "No evaluations available"}
        # Filter out entries without ground truth
        valid_evaluations = evaluations_df[evaluations_df['error'].isna()]
        if valid_evaluations.empty:
            return {"error": "No valid ground truth available"}
        total_questions = len(valid_evaluations)
        exact_matches = valid_evaluations['exact_match'].sum()
        avg_similarity = valid_evaluations['similarity_score'].mean()
        contains_matches = valid_evaluations['contains_answer'].sum()
        return {
            "total_questions": total_questions,
            "exact_matches": exact_matches,
            "exact_match_rate": exact_matches / total_questions,
            "average_similarity": avg_similarity,
            "contains_matches": contains_matches,
            "contains_match_rate": contains_matches / total_questions,
            "questions_with_ground_truth": total_questions
        }

    def enhance_results_log(self, results_log: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Add comparison columns to results log."""
        enhanced_results = []
        for result in results_log:
            task_id = result.get("Task ID")
            predicted_answer = result.get("Submitted Answer", "")
            if task_id is not None:
                evaluation = self.evaluate_answer(task_id, predicted_answer)
                # Add comparison info to result
                enhanced_result = result.copy()
                enhanced_result["Ground Truth"] = evaluation["actual_answer"] or "N/A"
                enhanced_result["Exact Match"] = evaluation["exact_match"]
                enhanced_result["Similarity"] = f"{evaluation['similarity_score']:.3f}"
                enhanced_result["Contains Answer"] = evaluation["contains_answer"]
                enhanced_results.append(enhanced_result)
        return enhanced_results
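

# Usage sketch: a minimal example of how the comparator might be driven end to
# end, assuming a metadata file exists at the default data/metadata.jsonl path.
# The task_id values and submitted answers below are hypothetical placeholders;
# real IDs must match entries in the metadata file for ground truth to be found.
if __name__ == "__main__":
    comparator = AnswerComparator()

    # Hypothetical agent output in the shape expected by evaluate_batch().
    sample_results = [
        {"task_id": "task-001", "submitted_answer": "Paris"},
        {"task_id": "task-002", "submitted_answer": "42"},
    ]

    evaluations_df = comparator.evaluate_batch(sample_results)
    stats = comparator.get_summary_stats(evaluations_df)

    # default=str keeps numpy scalars in the stats dict from breaking JSON serialization.
    print(json.dumps(stats, indent=2, default=str))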