import json
import re
from difflib import SequenceMatcher
from typing import Any, Dict, List

import pandas as pd


class AnswerComparator:
    def __init__(self, metadata_path: str = "data/metadata.jsonl"):
        """Initialize the comparator with ground truth data."""
        self.ground_truth = self._load_ground_truth(metadata_path)
        print(f"Loaded ground truth for {len(self.ground_truth)} questions")

    def _load_ground_truth(self, metadata_path: str) -> Dict[str, str]:
        """Load ground truth answers from the metadata.jsonl file."""
        ground_truth = {}
        try:
            with open(metadata_path, 'r', encoding='utf-8') as f:
                for line in f:
                    if line.strip():
                        data = json.loads(line)
                        task_id = data.get("task_id")
                        final_answer = data.get("Final answer")
                        if task_id and final_answer is not None:
                            ground_truth[task_id] = str(final_answer)
        except FileNotFoundError:
            print(f"Warning: Ground truth file {metadata_path} not found")
        except Exception as e:
            print(f"Error loading ground truth: {e}")

        return ground_truth

    def normalize_answer(self, answer: Any) -> str:
        """Normalize an answer for comparison."""
        if answer is None:
            return ""

        # Coerce to string and trim surrounding whitespace.
        answer = str(answer).strip()

        # Lowercase for case-insensitive comparison.
        answer = answer.lower()

        # Strip common punctuation.
        answer = re.sub(r'[.,;:!?"\']', '', answer)

        # Collapse runs of whitespace into single spaces.
        answer = re.sub(r'\s+', ' ', answer)

        return answer

    def exact_match(self, predicted: str, actual: str) -> bool:
        """Check if answers match exactly after normalization."""
        return self.normalize_answer(predicted) == self.normalize_answer(actual)

    def similarity_score(self, predicted: str, actual: str) -> float:
        """Calculate similarity score between predicted and actual answers."""
        normalized_pred = self.normalize_answer(predicted)
        normalized_actual = self.normalize_answer(actual)

        if not normalized_pred and not normalized_actual:
            return 1.0
        if not normalized_pred or not normalized_actual:
            return 0.0

        return SequenceMatcher(None, normalized_pred, normalized_actual).ratio()

    def contains_answer(self, predicted: str, actual: str) -> bool:
        """Check if the actual answer is contained in the predicted answer."""
        normalized_pred = self.normalize_answer(predicted)
        normalized_actual = self.normalize_answer(actual)

        return normalized_actual in normalized_pred

    def evaluate_answer(self, task_id: str, predicted_answer: str) -> Dict[str, Any]:
        """Evaluate a single answer against ground truth."""
        actual_answer = self.ground_truth.get(task_id)

        if actual_answer is None:
            return {
                "task_id": task_id,
                "predicted_answer": predicted_answer,
                "actual_answer": None,
                "exact_match": False,
                "similarity_score": 0.0,
                "contains_answer": False,
                "error": "No ground truth available"
            }

        return {
            "task_id": task_id,
            "predicted_answer": predicted_answer,
            "actual_answer": actual_answer,
            "exact_match": self.exact_match(predicted_answer, actual_answer),
            "similarity_score": self.similarity_score(predicted_answer, actual_answer),
            "contains_answer": self.contains_answer(predicted_answer, actual_answer),
            "error": None
        }

    def evaluate_batch(self, results: List[Dict[str, Any]]) -> pd.DataFrame:
        """Evaluate a batch of results."""
        evaluations = []

        for result in results:
            task_id = result.get("task_id") or result.get("Task ID")
            predicted_answer = result.get("submitted_answer") or result.get("Submitted Answer", "")

            if task_id is not None:
                evaluation = self.evaluate_answer(task_id, predicted_answer)
                evaluations.append(evaluation)

        return pd.DataFrame(evaluations)

    def get_summary_stats(self, evaluations_df: pd.DataFrame) -> Dict[str, Any]:
        """Get summary statistics from evaluations."""
        if evaluations_df.empty:
            return {"error": "No evaluations available"}

        # Only rows that have ground truth (no error) count toward the stats.
        valid_evaluations = evaluations_df[evaluations_df['error'].isna()]

        if valid_evaluations.empty:
            return {"error": "No valid ground truth available"}

        total_questions = len(valid_evaluations)
        exact_matches = valid_evaluations['exact_match'].sum()
        avg_similarity = valid_evaluations['similarity_score'].mean()
        contains_matches = valid_evaluations['contains_answer'].sum()

        return {
            "total_questions": total_questions,
            "exact_matches": exact_matches,
            "exact_match_rate": exact_matches / total_questions,
            "average_similarity": avg_similarity,
            "contains_matches": contains_matches,
            "contains_match_rate": contains_matches / total_questions,
            "questions_with_ground_truth": total_questions
        }

    def enhance_results_log(self, results_log: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Add comparison columns to results log."""
        enhanced_results = []

        for result in results_log:
            task_id = result.get("Task ID")
            predicted_answer = result.get("Submitted Answer", "")

            if task_id is not None:
                evaluation = self.evaluate_answer(task_id, predicted_answer)

                enhanced_result = result.copy()
                enhanced_result["Ground Truth"] = evaluation["actual_answer"] or "N/A"
                enhanced_result["Exact Match"] = evaluation["exact_match"]
                enhanced_result["Similarity"] = f"{evaluation['similarity_score']:.3f}"
                enhanced_result["Contains Answer"] = evaluation["contains_answer"]

                enhanced_results.append(enhanced_result)

        return enhanced_results
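

# Minimal usage sketch. It assumes data/metadata.jsonl exists alongside this
# module; the task IDs and answers below are hypothetical placeholders, not
# entries from the real dataset.
if __name__ == "__main__":
    comparator = AnswerComparator()

    # Results in the same shape that evaluate_batch() expects.
    sample_results = [
        {"task_id": "example-task-1", "submitted_answer": "Paris"},
        {"task_id": "example-task-2", "submitted_answer": "42"},
    ]

    evaluations_df = comparator.evaluate_batch(sample_results)
    print(evaluations_df)
    print(comparator.get_summary_stats(evaluations_df))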