#!/usr/bin/env python3
"""
Test script for GAIA comparison functionality.
"""

import sys
import os

sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from comparison import AnswerComparator
from phoenix_evaluator import log_evaluations_to_phoenix
import pandas as pd


def test_basic_comparison():
    """Test basic comparison functionality."""
    print("Testing basic comparison...")

    # Initialize comparator
    comparator = AnswerComparator()

    # Test with some sample data
    sample_results = [
        {"task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be", "submitted_answer": "3"},
        {"task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6", "submitted_answer": "3"},
        {"task_id": "nonexistent-task", "submitted_answer": "test"},
    ]

    # Evaluate batch
    evaluations_df = comparator.evaluate_batch(sample_results)
    print(f"Evaluated {len(evaluations_df)} answers")

    # Get summary stats
    summary_stats = comparator.get_summary_stats(evaluations_df)
    print("Summary statistics:")
    for key, value in summary_stats.items():
        print(f" {key}: {value}")

    # Test single evaluation
    print("\nTesting single evaluation...")
    single_eval = comparator.evaluate_answer("8e867cd7-cff9-4e6c-867a-ff5ddc2550be", "3")
    print(f"Single evaluation result: {single_eval}")

    return evaluations_df


def test_results_enhancement():
    """Test results log enhancement."""
    print("\nTesting results log enhancement...")

    comparator = AnswerComparator()

    # Sample results log (like what comes from your agent)
    sample_results_log = [
        {
            "Task ID": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
            "Question": "How many studio albums were published by Mercedes Sosa between 2000 and 2009?",
            "Submitted Answer": "3",
        },
        {
            "Task ID": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
            "Question": "Test question",
            "Submitted Answer": "wrong answer",
        },
    ]

    # Enhance results
    enhanced_results = comparator.enhance_results_log(sample_results_log)

    print("Enhanced results:")
    for result in enhanced_results:
        print(f" Task: {result['Task ID']}")
        print(f" Answer: {result['Submitted Answer']}")
        print(f" Ground Truth: {result['Ground Truth']}")
        print(f" Exact Match: {result['Exact Match']}")
        print(f" Similarity: {result['Similarity']}")
        print()


def test_phoenix_integration():
    """Test Phoenix integration (basic)."""
    print("\nTesting Phoenix integration...")

    # Create sample evaluations
    sample_evaluations = pd.DataFrame([
        {
            "task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
            "predicted_answer": "3",
            "actual_answer": "3",
            "exact_match": True,
            "similarity_score": 1.0,
            "contains_answer": True,
            "error": None,
        },
        {
            "task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
            "predicted_answer": "wrong",
            "actual_answer": "3",
            "exact_match": False,
            "similarity_score": 0.2,
            "contains_answer": False,
            "error": None,
        },
    ])

    # Try to log to Phoenix
    try:
        result = log_evaluations_to_phoenix(sample_evaluations)
        if result is not None:
            print("✅ Phoenix integration successful")
        else:
            print("⚠️ Phoenix integration failed (likely Phoenix not running)")
    except Exception as e:
        print(f"⚠️ Phoenix integration error: {e}")


def main():
    """Run all tests."""
    print("=" * 50)
    print("GAIA Comparison Test Suite")
    print("=" * 50)

    try:
        # Test basic comparison
        evaluations_df = test_basic_comparison()

        # Test results enhancement
        test_results_enhancement()

        # Test Phoenix integration
        test_phoenix_integration()

        print("\n" + "=" * 50)
        print("All tests completed!")
        print("=" * 50)
    except Exception as e:
        print(f"❌ Test failed with error: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
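
# ---------------------------------------------------------------------------
# Note: the interface below is a sketch inferred from the calls made in this
# test, not the actual comparison/phoenix_evaluator implementations. It only
# documents what this script assumes those local modules provide.
#
#   class AnswerComparator:
#       def evaluate_answer(self, task_id, submitted_answer): ...   # -> dict per answer
#       def evaluate_batch(self, results): ...                      # -> pandas.DataFrame
#       def get_summary_stats(self, evaluations_df): ...            # -> dict of metrics
#       def enhance_results_log(self, results_log): ...             # -> list[dict] with
#                                                                    #    Ground Truth / Exact Match / Similarity
#
#   def log_evaluations_to_phoenix(evaluations_df): ...             # -> None if Phoenix is unreachable
# ---------------------------------------------------------------------------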