#!/usr/bin/env python3
"""
Test script for GAIA comparison functionality.

Exercises AnswerComparator (batch evaluation, single evaluation, and
results-log enhancement) and basic Phoenix logging via
log_evaluations_to_phoenix.
"""
import os
import sys

import pandas as pd

# Make the local modules importable when the script is run directly.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from comparison import AnswerComparator
from phoenix_evaluator import log_evaluations_to_phoenix


def test_basic_comparison():
    """Test basic comparison functionality."""
    print("Testing basic comparison...")

    # Initialize comparator
    comparator = AnswerComparator()

    # Test with some sample data; the last task ID intentionally has no
    # ground-truth entry so the missing-task path is exercised.
    sample_results = [
        {"task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be", "submitted_answer": "3"},
        {"task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6", "submitted_answer": "3"},
        {"task_id": "nonexistent-task", "submitted_answer": "test"},
    ]

    # Evaluate batch
    evaluations_df = comparator.evaluate_batch(sample_results)
    print(f"Evaluated {len(evaluations_df)} answers")

    # Get summary stats
    summary_stats = comparator.get_summary_stats(evaluations_df)
    print("Summary statistics:")
    for key, value in summary_stats.items():
        print(f" {key}: {value}")

    # Test single evaluation
    print("\nTesting single evaluation...")
    single_eval = comparator.evaluate_answer("8e867cd7-cff9-4e6c-867a-ff5ddc2550be", "3")
    print(f"Single evaluation result: {single_eval}")

    return evaluations_df


def test_results_enhancement():
    """Test results log enhancement."""
    print("\nTesting results log enhancement...")

    comparator = AnswerComparator()

    # Sample results log (like what comes from your agent)
    sample_results_log = [
        {
            "Task ID": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
            "Question": "How many studio albums were published by Mercedes Sosa between 2000 and 2009?",
            "Submitted Answer": "3"
        },
        {
            "Task ID": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
            "Question": "Test question",
            "Submitted Answer": "wrong answer"
        }
    ]

    # Enhance results
    enhanced_results = comparator.enhance_results_log(sample_results_log)
    print("Enhanced results:")
    for result in enhanced_results:
        print(f" Task: {result['Task ID']}")
        print(f" Answer: {result['Submitted Answer']}")
        print(f" Ground Truth: {result['Ground Truth']}")
        print(f" Exact Match: {result['Exact Match']}")
        print(f" Similarity: {result['Similarity']}")
        print()


def test_phoenix_integration():
    """Test Phoenix integration (basic)."""
    print("\nTesting Phoenix integration...")

    # Create sample evaluations
    sample_evaluations = pd.DataFrame([
        {
            "task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
            "predicted_answer": "3",
            "actual_answer": "3",
            "exact_match": True,
            "similarity_score": 1.0,
            "contains_answer": True,
            "error": None
        },
        {
            "task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
            "predicted_answer": "wrong",
            "actual_answer": "3",
            "exact_match": False,
            "similarity_score": 0.2,
            "contains_answer": False,
            "error": None
        }
    ])

    # Try to log to Phoenix
    try:
        result = log_evaluations_to_phoenix(sample_evaluations)
        if result is not None:
            print("✅ Phoenix integration successful")
        else:
            print("⚠️ Phoenix integration failed (likely Phoenix not running)")
    except Exception as e:
        print(f"⚠️ Phoenix integration error: {e}")


def main():
    """Run all tests."""
    print("=" * 50)
    print("GAIA Comparison Test Suite")
    print("=" * 50)

    try:
        # Test basic comparison
        test_basic_comparison()

        # Test results enhancement
        test_results_enhancement()

        # Test Phoenix integration
        test_phoenix_integration()

        print("\n" + "=" * 50)
        print("All tests completed!")
        print("=" * 50)
    except Exception as e:
        print(f"❌ Test failed with error: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()