#!/usr/bin/env python3
"""
Test script for GAIA comparison functionality.
"""
import os
import sys

# Make the script's own directory importable so the local modules resolve
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

import pandas as pd

from comparison import AnswerComparator
from phoenix_evaluator import log_evaluations_to_phoenix
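
# NOTE: `comparison` and `phoenix_evaluator` are local project modules.
# AnswerComparator is expected to hold the GAIA ground-truth answers, and
# log_evaluations_to_phoenix assumes a Phoenix server is reachable (see
# test_phoenix_integration below).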


def test_basic_comparison():
    """Test basic comparison functionality."""
    print("Testing basic comparison...")

    # Initialize comparator
    comparator = AnswerComparator()

    # Test with some sample data; the last entry is deliberately unknown so
    # the missing-ground-truth path gets exercised
    sample_results = [
        {"task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be", "submitted_answer": "3"},
        {"task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6", "submitted_answer": "3"},
        {"task_id": "nonexistent-task", "submitted_answer": "test"},
    ]
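
    # evaluate_batch returns a DataFrame; judging from the sample frame in
    # test_phoenix_integration below, each row should carry task_id,
    # predicted_answer, actual_answer, exact_match, similarity_score,
    # contains_answer, and error.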
    evaluations_df = comparator.evaluate_batch(sample_results)
    print(f"Evaluated {len(evaluations_df)} answers")

    # Get summary stats
    summary_stats = comparator.get_summary_stats(evaluations_df)
    print("Summary statistics:")
    for key, value in summary_stats.items():
        print(f" {key}: {value}")

    # Test single evaluation
    print("\nTesting single evaluation...")
    single_eval = comparator.evaluate_answer("8e867cd7-cff9-4e6c-867a-ff5ddc2550be", "3")
    print(f"Single evaluation result: {single_eval}")

    return evaluations_df


def test_results_enhancement():
    """Test results log enhancement."""
    print("\nTesting results log enhancement...")

    comparator = AnswerComparator()

    # Sample results log, shaped like the records the agent produces
    sample_results_log = [
        {
            "Task ID": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
            "Question": "How many studio albums were published by Mercedes Sosa between 2000 and 2009?",
            "Submitted Answer": "3",
        },
        {
            "Task ID": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
            "Question": "Test question",
            "Submitted Answer": "wrong answer",
        },
    ]
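
    # enhance_results_log should add "Ground Truth", "Exact Match", and
    # "Similarity" fields next to the agent's original columns, as printed below.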
    enhanced_results = comparator.enhance_results_log(sample_results_log)
    print("Enhanced results:")
    for result in enhanced_results:
        print(f" Task: {result['Task ID']}")
        print(f" Answer: {result['Submitted Answer']}")
        print(f" Ground Truth: {result['Ground Truth']}")
        print(f" Exact Match: {result['Exact Match']}")
        print(f" Similarity: {result['Similarity']}")
        print()


def test_phoenix_integration():
    """Test Phoenix integration (basic)."""
    print("\nTesting Phoenix integration...")

    # Create sample evaluations
    sample_evaluations = pd.DataFrame([
        {
            "task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
            "predicted_answer": "3",
            "actual_answer": "3",
            "exact_match": True,
            "similarity_score": 1.0,
            "contains_answer": True,
            "error": None,
        },
        {
            "task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
            "predicted_answer": "wrong",
            "actual_answer": "3",
            "exact_match": False,
            "similarity_score": 0.2,
            "contains_answer": False,
            "error": None,
        },
    ])
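
    # Try to log to Phoenix. This needs a reachable Phoenix server; if none is
    # running, one way to start a local instance is `import phoenix as px;
    # px.launch_app()` from the arize-phoenix package (an assumption about
    # which Phoenix distribution this repo uses).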
    try:
        result = log_evaluations_to_phoenix(sample_evaluations)
        if result is not None:
            print("✅ Phoenix integration successful")
        else:
            print("⚠️ Phoenix integration failed (likely Phoenix not running)")
    except Exception as e:
        print(f"⚠️ Phoenix integration error: {e}")


def main():
    """Run all tests."""
    print("=" * 50)
    print("GAIA Comparison Test Suite")
    print("=" * 50)

    try:
        # The return value of test_basic_comparison is not needed here
        test_basic_comparison()
        test_results_enhancement()
        test_phoenix_integration()

        print("\n" + "=" * 50)
        print("All tests completed!")
        print("=" * 50)
    except Exception as e:
        print(f"❌ Test failed with error: {e}")
        import traceback
        traceback.print_exc()
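

# Running this file directly executes the full suite. The sys.path tweak at
# the top lets the local `comparison` and `phoenix_evaluator` imports resolve
# even when the script is launched from another directory.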
if __name__ == "__main__":
    main()