# Final_Assignment_Template / comparison.py
import json
import re
from difflib import SequenceMatcher
from typing import Any, Dict, List

import pandas as pd


class AnswerComparator:
    def __init__(self, metadata_path: str = "data/metadata.jsonl"):
        """Initialize the comparator with ground truth data."""
        self.ground_truth = self._load_ground_truth(metadata_path)
        print(f"Loaded ground truth for {len(self.ground_truth)} questions")

    def _load_ground_truth(self, metadata_path: str) -> Dict[str, str]:
        """Load ground truth answers from the metadata.jsonl file."""
        ground_truth = {}
        try:
            with open(metadata_path, 'r', encoding='utf-8') as f:
                for line in f:
                    if line.strip():
                        data = json.loads(line)
                        task_id = data.get("task_id")
                        final_answer = data.get("Final answer")
                        if task_id and final_answer is not None:
                            ground_truth[task_id] = str(final_answer)
        except FileNotFoundError:
            print(f"Warning: Ground truth file {metadata_path} not found")
        except Exception as e:
            print(f"Error loading ground truth: {e}")
        return ground_truth
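
    # Each line of metadata.jsonl is expected to be a standalone JSON object with at
    # least a "task_id" and a "Final answer" field, for example (values are made up):
    #   {"task_id": "abc123", "Final answer": "Paris"}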

    def normalize_answer(self, answer: str) -> str:
        """Normalize an answer for comparison."""
        if answer is None:
            return ""
        # Convert to string and strip surrounding whitespace
        answer = str(answer).strip()
        # Lowercase for case-insensitive comparison
        answer = answer.lower()
        # Remove common punctuation that should not affect correctness
        answer = re.sub(r'[.,;:!?"\']', '', answer)
        # Collapse runs of whitespace into single spaces
        answer = re.sub(r'\s+', ' ', answer)
        return answer
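
    # Illustrative example (not from the original file):
    #   normalize_answer('  The Answer, please!  ') -> 'the answer please'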

    def exact_match(self, predicted: str, actual: str) -> bool:
        """Check if answers match exactly after normalization."""
        return self.normalize_answer(predicted) == self.normalize_answer(actual)

    def similarity_score(self, predicted: str, actual: str) -> float:
        """Calculate similarity score between predicted and actual answers."""
        normalized_pred = self.normalize_answer(predicted)
        normalized_actual = self.normalize_answer(actual)
        if not normalized_pred and not normalized_actual:
            return 1.0
        if not normalized_pred or not normalized_actual:
            return 0.0
        return SequenceMatcher(None, normalized_pred, normalized_actual).ratio()
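
    # SequenceMatcher.ratio() returns a value in [0.0, 1.0], where 1.0 means the
    # normalized strings are identical; e.g. comparing "paris" with "parris" gives
    # 2 * 5 matching characters / 11 total characters ~= 0.909.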

    def contains_answer(self, predicted: str, actual: str) -> bool:
        """Check if the actual answer is contained in the predicted answer."""
        normalized_pred = self.normalize_answer(predicted)
        normalized_actual = self.normalize_answer(actual)
        return normalized_actual in normalized_pred
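
    # Illustrative example (not from the original file):
    #   contains_answer("the capital of France is Paris", "Paris") -> True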

    def evaluate_answer(self, task_id: str, predicted_answer: str) -> Dict[str, Any]:
        """Evaluate a single answer against ground truth."""
        actual_answer = self.ground_truth.get(task_id)
        if actual_answer is None:
            return {
                "task_id": task_id,
                "predicted_answer": predicted_answer,
                "actual_answer": None,
                "exact_match": False,
                "similarity_score": 0.0,
                "contains_answer": False,
                "error": "No ground truth available"
            }
        return {
            "task_id": task_id,
            "predicted_answer": predicted_answer,
            "actual_answer": actual_answer,
            "exact_match": self.exact_match(predicted_answer, actual_answer),
            "similarity_score": self.similarity_score(predicted_answer, actual_answer),
            "contains_answer": self.contains_answer(predicted_answer, actual_answer),
            "error": None
        }

    def evaluate_batch(self, results: List[Dict[str, Any]]) -> pd.DataFrame:
        """Evaluate a batch of results."""
        evaluations = []
        for result in results:
            # Accept both snake_case and title-case keys; rows without a task id are skipped
            task_id = result.get("task_id") or result.get("Task ID")
            predicted_answer = result.get("submitted_answer") or result.get("Submitted Answer", "")
            if task_id is not None:
                evaluation = self.evaluate_answer(task_id, predicted_answer)
                evaluations.append(evaluation)
        return pd.DataFrame(evaluations)
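
    # The resulting DataFrame has one row per evaluated task with the columns:
    # task_id, predicted_answer, actual_answer, exact_match, similarity_score,
    # contains_answer, error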

    def get_summary_stats(self, evaluations_df: pd.DataFrame) -> Dict[str, Any]:
        """Get summary statistics from evaluations."""
        if evaluations_df.empty:
            return {"error": "No evaluations available"}
        # Filter out entries without ground truth
        valid_evaluations = evaluations_df[evaluations_df['error'].isna()]
        if valid_evaluations.empty:
            return {"error": "No valid ground truth available"}
        total_questions = len(valid_evaluations)
        exact_matches = valid_evaluations['exact_match'].sum()
        avg_similarity = valid_evaluations['similarity_score'].mean()
        contains_matches = valid_evaluations['contains_answer'].sum()
        return {
            "total_questions": total_questions,
            "exact_matches": exact_matches,
            "exact_match_rate": exact_matches / total_questions,
            "average_similarity": avg_similarity,
            "contains_matches": contains_matches,
            "contains_match_rate": contains_matches / total_questions,
            "questions_with_ground_truth": total_questions
        }

    def enhance_results_log(self, results_log: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Add comparison columns to the results log."""
        enhanced_results = []
        for result in results_log:
            task_id = result.get("Task ID")
            predicted_answer = result.get("Submitted Answer", "")
            if task_id is not None:
                evaluation = self.evaluate_answer(task_id, predicted_answer)
                # Add comparison info to the result row
                enhanced_result = result.copy()
                # Explicit None check so an empty-string ground truth is not replaced with "N/A"
                ground_truth = evaluation["actual_answer"]
                enhanced_result["Ground Truth"] = ground_truth if ground_truth is not None else "N/A"
                enhanced_result["Exact Match"] = evaluation["exact_match"]
                enhanced_result["Similarity"] = f"{evaluation['similarity_score']:.3f}"
                enhanced_result["Contains Answer"] = evaluation["contains_answer"]
                enhanced_results.append(enhanced_result)
        return enhanced_results
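

# Minimal usage sketch (illustrative, not part of the original module): the task_id
# and answer values below are made up and assume matching entries in data/metadata.jsonl.
if __name__ == "__main__":
    comparator = AnswerComparator()
    sample_results = [
        {"task_id": "abc123", "submitted_answer": "Paris"},
        {"task_id": "def456", "submitted_answer": "42"},
    ]
    evaluations = comparator.evaluate_batch(sample_results)
    print(evaluations)
    print(comparator.get_summary_stats(evaluations))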