import json
import re
from difflib import SequenceMatcher
from typing import Any, Dict, List

import pandas as pd


class AnswerComparator:
    def __init__(self, metadata_path: str = "data/metadata.jsonl"):
        """Initialize the comparator with ground truth data."""
        self.ground_truth = self._load_ground_truth(metadata_path)
        print(f"Loaded ground truth for {len(self.ground_truth)} questions")

    def _load_ground_truth(self, metadata_path: str) -> Dict[str, str]:
        """Load ground truth answers from the metadata.jsonl file."""
        ground_truth = {}
        try:
            with open(metadata_path, 'r', encoding='utf-8') as f:
                for line in f:
                    if line.strip():
                        data = json.loads(line)
                        task_id = data.get("task_id")
                        final_answer = data.get("Final answer")
                        if task_id and final_answer is not None:
                            ground_truth[task_id] = str(final_answer)
        except FileNotFoundError:
            print(f"Warning: Ground truth file {metadata_path} not found")
        except Exception as e:
            print(f"Error loading ground truth: {e}")
        return ground_truth

    def normalize_answer(self, answer: str) -> str:
        """Normalize an answer for comparison."""
        if answer is None:
            return ""
        # Convert to string and strip surrounding whitespace
        answer = str(answer).strip()
        # Lowercase for case-insensitive comparison
        answer = answer.lower()
        # Remove common punctuation that should not affect correctness
        answer = re.sub(r'[.,;:!?"\']', '', answer)
        # Collapse runs of whitespace
        answer = re.sub(r'\s+', ' ', answer)
        return answer

    def exact_match(self, predicted: str, actual: str) -> bool:
        """Check whether the answers match exactly after normalization."""
        return self.normalize_answer(predicted) == self.normalize_answer(actual)

    def similarity_score(self, predicted: str, actual: str) -> float:
        """Calculate a similarity score between the predicted and actual answers."""
        normalized_pred = self.normalize_answer(predicted)
        normalized_actual = self.normalize_answer(actual)
        if not normalized_pred and not normalized_actual:
            return 1.0
        if not normalized_pred or not normalized_actual:
            return 0.0
        return SequenceMatcher(None, normalized_pred, normalized_actual).ratio()

    def contains_answer(self, predicted: str, actual: str) -> bool:
        """Check whether the actual answer is contained in the predicted answer."""
        normalized_pred = self.normalize_answer(predicted)
        normalized_actual = self.normalize_answer(actual)
        return normalized_actual in normalized_pred

    def evaluate_answer(self, task_id: str, predicted_answer: str) -> Dict[str, Any]:
        """Evaluate a single answer against the ground truth."""
        actual_answer = self.ground_truth.get(task_id)
        if actual_answer is None:
            return {
                "task_id": task_id,
                "predicted_answer": predicted_answer,
                "actual_answer": None,
                "exact_match": False,
                "similarity_score": 0.0,
                "contains_answer": False,
                "error": "No ground truth available",
            }
        return {
            "task_id": task_id,
            "predicted_answer": predicted_answer,
            "actual_answer": actual_answer,
            "exact_match": self.exact_match(predicted_answer, actual_answer),
            "similarity_score": self.similarity_score(predicted_answer, actual_answer),
            "contains_answer": self.contains_answer(predicted_answer, actual_answer),
            "error": None,
        }

    def evaluate_batch(self, results: List[Dict[str, Any]]) -> pd.DataFrame:
        """Evaluate a batch of results."""
        evaluations = []
        for result in results:
            task_id = result.get("task_id") or result.get("Task ID")
            predicted_answer = result.get("submitted_answer") or result.get("Submitted Answer", "")
            if task_id is not None:
                evaluation = self.evaluate_answer(task_id, predicted_answer)
                evaluations.append(evaluation)
        return pd.DataFrame(evaluations)

    def get_summary_stats(self, evaluations_df: pd.DataFrame) -> Dict[str, Any]:
        """Get summary statistics from a DataFrame of evaluations."""
        if evaluations_df.empty:
            return {"error": "No evaluations available"}
        # Filter out entries without ground truth
        valid_evaluations = evaluations_df[evaluations_df['error'].isna()]
        if valid_evaluations.empty:
            return {"error": "No valid ground truth available"}
        total_questions = len(valid_evaluations)
        exact_matches = valid_evaluations['exact_match'].sum()
        avg_similarity = valid_evaluations['similarity_score'].mean()
        contains_matches = valid_evaluations['contains_answer'].sum()
        return {
            "total_questions": total_questions,
            "exact_matches": exact_matches,
            "exact_match_rate": exact_matches / total_questions,
            "average_similarity": avg_similarity,
            "contains_matches": contains_matches,
            "contains_match_rate": contains_matches / total_questions,
            "questions_with_ground_truth": total_questions,
        }

    def enhance_results_log(self, results_log: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Add comparison columns to a results log."""
        enhanced_results = []
        for result in results_log:
            task_id = result.get("Task ID")
            predicted_answer = result.get("Submitted Answer", "")
            if task_id is not None:
                evaluation = self.evaluate_answer(task_id, predicted_answer)
                # Add comparison info to the result
                enhanced_result = result.copy()
                enhanced_result["Ground Truth"] = evaluation["actual_answer"] or "N/A"
                enhanced_result["Exact Match"] = evaluation["exact_match"]
                enhanced_result["Similarity"] = f"{evaluation['similarity_score']:.3f}"
                enhanced_result["Contains Answer"] = evaluation["contains_answer"]
                enhanced_results.append(enhanced_result)
        return enhanced_results
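

# --- Example usage -----------------------------------------------------------
# Illustrative sketch only: the sample task_ids and answers below are made-up
# placeholders for demonstration, not values from the metadata file. It assumes
# ground truth lives at the module's default path, data/metadata.jsonl.
if __name__ == "__main__":
    comparator = AnswerComparator("data/metadata.jsonl")

    # Hypothetical agent output in the shape evaluate_batch() expects.
    sample_results = [
        {"task_id": "task-001", "submitted_answer": "Paris"},
        {"task_id": "task-002", "submitted_answer": "42"},
    ]

    evaluations_df = comparator.evaluate_batch(sample_results)
    print(evaluations_df)
    print(comparator.get_summary_stats(evaluations_df))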