import json
import re
from difflib import SequenceMatcher
from typing import Any, Dict, List

import pandas as pd


class AnswerComparator:
    def __init__(self, metadata_path: str = "data/metadata.jsonl"):
        """Initialize the comparator with ground truth data."""
        self.ground_truth = self._load_ground_truth(metadata_path)
        print(f"Loaded ground truth for {len(self.ground_truth)} questions")

    def _load_ground_truth(self, metadata_path: str) -> Dict[str, str]:
        """Load ground truth answers from the metadata.jsonl file."""
        ground_truth = {}
        try:
            with open(metadata_path, 'r', encoding='utf-8') as f:
                for line in f:
                    if line.strip():
                        data = json.loads(line)
                        task_id = data.get("task_id")
                        final_answer = data.get("Final answer")
                        if task_id and final_answer is not None:
                            ground_truth[task_id] = str(final_answer)
        except FileNotFoundError:
            print(f"Warning: Ground truth file {metadata_path} not found")
        except Exception as e:
            print(f"Error loading ground truth: {e}")

        return ground_truth

    def normalize_answer(self, answer: Any) -> str:
        """Normalize an answer for comparison."""
        if answer is None:
            return ""

        # Coerce to string and trim surrounding whitespace.
        answer = str(answer).strip()

        # Lowercase for case-insensitive comparison.
        answer = answer.lower()

        # Strip common punctuation.
        answer = re.sub(r'[.,;:!?"\']', '', answer)

        # Collapse runs of whitespace into single spaces.
        answer = re.sub(r'\s+', ' ', answer)

        return answer

    def exact_match(self, predicted: str, actual: str) -> bool:
        """Check if answers match exactly after normalization."""
        return self.normalize_answer(predicted) == self.normalize_answer(actual)

    def similarity_score(self, predicted: str, actual: str) -> float:
        """Calculate similarity score between predicted and actual answers."""
        normalized_pred = self.normalize_answer(predicted)
        normalized_actual = self.normalize_answer(actual)

        if not normalized_pred and not normalized_actual:
            return 1.0
        if not normalized_pred or not normalized_actual:
            return 0.0

        return SequenceMatcher(None, normalized_pred, normalized_actual).ratio()

    def contains_answer(self, predicted: str, actual: str) -> bool:
        """Check if the actual answer is contained in the predicted answer."""
        normalized_pred = self.normalize_answer(predicted)
        normalized_actual = self.normalize_answer(actual)

        return normalized_actual in normalized_pred

    def evaluate_answer(self, task_id: str, predicted_answer: str) -> Dict[str, Any]:
        """Evaluate a single answer against ground truth."""
        actual_answer = self.ground_truth.get(task_id)

        if actual_answer is None:
            return {
                "task_id": task_id,
                "predicted_answer": predicted_answer,
                "actual_answer": None,
                "exact_match": False,
                "similarity_score": 0.0,
                "contains_answer": False,
                "error": "No ground truth available"
            }

        return {
            "task_id": task_id,
            "predicted_answer": predicted_answer,
            "actual_answer": actual_answer,
            "exact_match": self.exact_match(predicted_answer, actual_answer),
            "similarity_score": self.similarity_score(predicted_answer, actual_answer),
            "contains_answer": self.contains_answer(predicted_answer, actual_answer),
            "error": None
        }

    def evaluate_batch(self, results: List[Dict[str, Any]]) -> pd.DataFrame:
        """Evaluate a batch of results."""
        evaluations = []

        for result in results:
            task_id = result.get("task_id") or result.get("Task ID")
            predicted_answer = result.get("submitted_answer") or result.get("Submitted Answer", "")

            if task_id is not None:
                evaluation = self.evaluate_answer(task_id, predicted_answer)
                evaluations.append(evaluation)

        return pd.DataFrame(evaluations)

    def get_summary_stats(self, evaluations_df: pd.DataFrame) -> Dict[str, Any]:
        """Get summary statistics from evaluations."""
        if evaluations_df.empty:
            return {"error": "No evaluations available"}

        # Only rows that have ground truth (no error) count toward the stats.
        valid_evaluations = evaluations_df[evaluations_df['error'].isna()]

        if valid_evaluations.empty:
            return {"error": "No valid ground truth available"}

        total_questions = len(valid_evaluations)
        exact_matches = valid_evaluations['exact_match'].sum()
        avg_similarity = valid_evaluations['similarity_score'].mean()
        contains_matches = valid_evaluations['contains_answer'].sum()

        return {
            "total_questions": total_questions,
            "exact_matches": exact_matches,
            "exact_match_rate": exact_matches / total_questions,
            "average_similarity": avg_similarity,
            "contains_matches": contains_matches,
            "contains_match_rate": contains_matches / total_questions,
            "questions_with_ground_truth": total_questions
        }

    def enhance_results_log(self, results_log: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Add comparison columns to results log."""
        enhanced_results = []

        for result in results_log:
            task_id = result.get("Task ID")
            predicted_answer = result.get("Submitted Answer", "")

            if task_id is not None:
                evaluation = self.evaluate_answer(task_id, predicted_answer)

                enhanced_result = result.copy()
                enhanced_result["Ground Truth"] = evaluation["actual_answer"] or "N/A"
                enhanced_result["Exact Match"] = evaluation["exact_match"]
                enhanced_result["Similarity"] = f"{evaluation['similarity_score']:.3f}"
                enhanced_result["Contains Answer"] = evaluation["contains_answer"]

                enhanced_results.append(enhanced_result)

        return enhanced_results
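

# Minimal usage sketch. It assumes data/metadata.jsonl exists alongside this
# module; the task IDs and answers below are hypothetical placeholders, not
# entries from the real dataset.
if __name__ == "__main__":
    comparator = AnswerComparator()

    # Results in the same shape that evaluate_batch() expects.
    sample_results = [
        {"task_id": "example-task-1", "submitted_answer": "Paris"},
        {"task_id": "example-task-2", "submitted_answer": "42"},
    ]

    evaluations_df = comparator.evaluate_batch(sample_results)
    print(evaluations_df)
    print(comparator.get_summary_stats(evaluations_df))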