import json
import pandas as pd
from typing import Dict, List, Any
from difflib import SequenceMatcher
import re


class AnswerComparator:
    def __init__(self, metadata_path: str = "data/metadata.jsonl"):
        """Initialize the comparator with ground truth data."""
        self.ground_truth = self._load_ground_truth(metadata_path)
        print(f"Loaded ground truth for {len(self.ground_truth)} questions")

    def _load_ground_truth(self, metadata_path: str) -> Dict[str, str]:
        """Load ground truth answers from metadata.jsonl file."""
        ground_truth = {}
        try:
            with open(metadata_path, 'r', encoding='utf-8') as f:
                for line in f:
                    if line.strip():
                        data = json.loads(line)
                        task_id = data.get("task_id")
                        final_answer = data.get("Final answer")
                        if task_id and final_answer is not None:
                            ground_truth[task_id] = str(final_answer)
        except FileNotFoundError:
            print(f"Warning: Ground truth file {metadata_path} not found")
        except Exception as e:
            print(f"Error loading ground truth: {e}")

        return ground_truth
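
    # Illustrative shape of one metadata.jsonl line, inferred from the loader
    # above (field names match what _load_ground_truth reads; the values are
    # hypothetical placeholders, not real task data):
    #   {"task_id": "task-001", "Final answer": "42"}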

    def normalize_answer(self, answer: str) -> str:
        """Normalize answer for comparison."""
        if answer is None:
            return ""

        # Convert to string and strip whitespace
        answer = str(answer).strip()

        # Convert to lowercase for case-insensitive comparison
        answer = answer.lower()

        # Strip common punctuation that should not affect correctness
        answer = re.sub(r'[.,;:!?"\']', '', answer)

        # Normalize whitespace
        answer = re.sub(r'\s+', ' ', answer)

        return answer
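
    # Example of the normalization pipeline above (illustrative values):
    #   normalize_answer('  "Paris, France!"  ')  ->  'paris france'
    #   normalize_answer('Three') == normalize_answer('three')  ->  True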

    def exact_match(self, predicted: str, actual: str) -> bool:
        """Check if answers match exactly after normalization."""
        return self.normalize_answer(predicted) == self.normalize_answer(actual)

    def similarity_score(self, predicted: str, actual: str) -> float:
        """Calculate similarity score between predicted and actual answers."""
        normalized_pred = self.normalize_answer(predicted)
        normalized_actual = self.normalize_answer(actual)

        if not normalized_pred and not normalized_actual:
            return 1.0
        if not normalized_pred or not normalized_actual:
            return 0.0

        return SequenceMatcher(None, normalized_pred, normalized_actual).ratio()

    def contains_answer(self, predicted: str, actual: str) -> bool:
        """Check if the actual answer is contained in the predicted answer."""
        normalized_pred = self.normalize_answer(predicted)
        normalized_actual = self.normalize_answer(actual)

        return normalized_actual in normalized_pred

    def evaluate_answer(self, task_id: str, predicted_answer: str) -> Dict[str, Any]:
        """Evaluate a single answer against ground truth."""
        actual_answer = self.ground_truth.get(task_id)

        if actual_answer is None:
            return {
                "task_id": task_id,
                "predicted_answer": predicted_answer,
                "actual_answer": None,
                "exact_match": False,
                "similarity_score": 0.0,
                "contains_answer": False,
                "error": "No ground truth available"
            }

        return {
            "task_id": task_id,
            "predicted_answer": predicted_answer,
            "actual_answer": actual_answer,
            "exact_match": self.exact_match(predicted_answer, actual_answer),
            "similarity_score": self.similarity_score(predicted_answer, actual_answer),
            "contains_answer": self.contains_answer(predicted_answer, actual_answer),
            "error": None
        }

    def evaluate_batch(self, results: List[Dict[str, Any]]) -> pd.DataFrame:
        """Evaluate a batch of results."""
        evaluations = []

        for result in results:
            # Accept either snake_case keys or the display-style keys used in the results log
            task_id = result.get("task_id") or result.get("Task ID")
            predicted_answer = result.get("submitted_answer") or result.get("Submitted Answer", "")

            if task_id is not None:
                evaluation = self.evaluate_answer(task_id, predicted_answer)
                evaluations.append(evaluation)

        return pd.DataFrame(evaluations)

    def get_summary_stats(self, evaluations_df: pd.DataFrame) -> Dict[str, Any]:
        """Get summary statistics from evaluations."""
        if evaluations_df.empty:
            return {"error": "No evaluations available"}

        # Filter out entries without ground truth
        valid_evaluations = evaluations_df[evaluations_df['error'].isna()]

        if valid_evaluations.empty:
            return {"error": "No valid ground truth available"}

        total_questions = len(valid_evaluations)
        # Cast numpy scalars to native Python types so the summary is easy to serialize
        exact_matches = int(valid_evaluations['exact_match'].sum())
        avg_similarity = float(valid_evaluations['similarity_score'].mean())
        contains_matches = int(valid_evaluations['contains_answer'].sum())

        return {
            "total_questions": total_questions,
            "exact_matches": exact_matches,
            "exact_match_rate": exact_matches / total_questions,
            "average_similarity": avg_similarity,
            "contains_matches": contains_matches,
            "contains_match_rate": contains_matches / total_questions,
            "questions_with_ground_truth": total_questions
        }

    def enhance_results_log(self, results_log: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Add comparison columns to results log."""
        enhanced_results = []

        for result in results_log:
            task_id = result.get("Task ID")
            predicted_answer = result.get("Submitted Answer", "")

            # Rows without a task id cannot be compared and are skipped
            if task_id is not None:
                evaluation = self.evaluate_answer(task_id, predicted_answer)

                # Add comparison info to result; use an explicit None check so a
                # falsy ground truth such as "0" is not reported as "N/A"
                enhanced_result = result.copy()
                actual = evaluation["actual_answer"]
                enhanced_result["Ground Truth"] = actual if actual is not None else "N/A"
                enhanced_result["Exact Match"] = evaluation["exact_match"]
                enhanced_result["Similarity"] = f"{evaluation['similarity_score']:.3f}"
                enhanced_result["Contains Answer"] = evaluation["contains_answer"]

                enhanced_results.append(enhanced_result)

        return enhanced_results
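

# Minimal usage sketch (not part of the original module): builds a comparator
# against the default metadata path and scores a small in-memory batch. The
# task ids and answers below are hypothetical; real runs would use the ids
# present in data/metadata.jsonl.
if __name__ == "__main__":
    comparator = AnswerComparator()

    sample_results = [
        {"task_id": "task-001", "submitted_answer": "Paris"},
        {"Task ID": "task-002", "Submitted Answer": "42"},
    ]

    evaluations = comparator.evaluate_batch(sample_results)
    print(evaluations)
    print(comparator.get_summary_stats(evaluations))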