#!/usr/bin/env python3
"""
Test script for GAIA comparison functionality.
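
Exercises AnswerComparator and log_evaluations_to_phoenix against small
hand-written samples. Intended to be run directly as a script.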
"""

import sys
import os
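
# Make the script's own directory importable so the local comparison and
# phoenix_evaluator modules resolve regardless of the working directory.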
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from comparison import AnswerComparator
from phoenix_evaluator import log_evaluations_to_phoenix
import pandas as pd


def test_basic_comparison():
    """Test basic comparison functionality."""
    print("Testing basic comparison...")

    # Initialize comparator
    comparator = AnswerComparator()

    # Test with some sample data
    sample_results = [
        {"task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be", "submitted_answer": "3"},
        {"task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6", "submitted_answer": "3"},
        {"task_id": "nonexistent-task", "submitted_answer": "test"}
    ]

    # Evaluate batch
    evaluations_df = comparator.evaluate_batch(sample_results)
    print(f"Evaluated {len(evaluations_df)} answers")

    # Get summary stats
    summary_stats = comparator.get_summary_stats(evaluations_df)
    print("Summary statistics:")
    for key, value in summary_stats.items():
        print(f"  {key}: {value}")

    # Test single evaluation
    print("\nTesting single evaluation...")
    single_eval = comparator.evaluate_answer("8e867cd7-cff9-4e6c-867a-ff5ddc2550be", "3")
    print(f"Single evaluation result: {single_eval}")

    return evaluations_df


def test_results_enhancement():
    """Test results log enhancement."""
    print("\nTesting results log enhancement...")

    comparator = AnswerComparator()

    # Sample results log (like what comes from your agent)
    sample_results_log = [
        {
            "Task ID": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
            "Question": "How many studio albums were published by Mercedes Sosa between 2000 and 2009?",
            "Submitted Answer": "3"
        },
        {
            "Task ID": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
            "Question": "Test question",
            "Submitted Answer": "wrong answer"
        }
    ]

    # Enhance results
    enhanced_results = comparator.enhance_results_log(sample_results_log)

    print("Enhanced results:")
    for result in enhanced_results:
        print(f"  Task: {result['Task ID']}")
        print(f"  Answer: {result['Submitted Answer']}")
        print(f"  Ground Truth: {result['Ground Truth']}")
        print(f"  Exact Match: {result['Exact Match']}")
        print(f"  Similarity: {result['Similarity']}")
        print()


def test_phoenix_integration():
    """Test Phoenix integration (basic)."""
    print("\nTesting Phoenix integration...")

    # Create sample evaluations
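    # (columns are assumed to mirror the DataFrame that AnswerComparator.evaluate_batch returns)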
    sample_evaluations = pd.DataFrame([
        {
            "task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
            "predicted_answer": "3",
            "actual_answer": "3",
            "exact_match": True,
            "similarity_score": 1.0,
            "contains_answer": True,
            "error": None
        },
        {
            "task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
            "predicted_answer": "wrong",
            "actual_answer": "3",
            "exact_match": False,
            "similarity_score": 0.2,
            "contains_answer": False,
            "error": None
        }
    ])

    # Try to log to Phoenix
    try:
        result = log_evaluations_to_phoenix(sample_evaluations)
        if result is not None:
            print("✅ Phoenix integration successful")
        else:
            print("⚠️ Phoenix integration failed (likely Phoenix not running)")
    except Exception as e:
        print(f"⚠️ Phoenix integration error: {e}")


def main():
    """Run all tests."""
    print("="*50)
    print("GAIA Comparison Test Suite")
    print("="*50)

    try:
        # Test basic comparison
        test_basic_comparison()

        # Test results enhancement
        test_results_enhancement()

        # Test Phoenix integration
        test_phoenix_integration()

        print("\n" + "="*50)
        print("All tests completed!")
        print("="*50)

    except Exception as e:
        print(f"❌ Test failed with error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()