File size: 4,299 Bytes
f9cf36d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/usr/bin/env python3
"""
Simple test for Phoenix evaluations logging.
"""

import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

import phoenix as px
import pandas as pd
from comparison import AnswerComparator
from phoenix_evaluator import log_evaluations_to_phoenix


def _connect_to_phoenix():
    """Step 1: connect to the running Phoenix server.

    Returns:
        px.Client | None: a connected client, or None when the connection
        attempt raises (e.g. Phoenix is not running on its default port).
    """
    print("1. Checking Phoenix connection...")
    try:
        client = px.Client()
        print("βœ… Phoenix connected successfully")
        return client
    except Exception as e:
        # Broad catch is deliberate: any connection failure should fail the
        # test with a readable message, not a traceback.
        print(f"❌ Phoenix connection failed: {e}")
        return None


def _build_test_evaluations():
    """Step 2: build a tiny fixed DataFrame of fake evaluation rows.

    Returns:
        pd.DataFrame: two rows (one exact match, one mismatch) shaped like
        the output of AnswerComparator, keyed by GAIA task_id.
    """
    print("\n2. Creating test evaluations...")
    test_evaluations = pd.DataFrame([
        {
            "task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
            "predicted_answer": "3",
            "actual_answer": "3",
            "exact_match": True,
            "similarity_score": 1.0,
            "contains_answer": True,
            "error": None
        },
        {
            "task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
            "predicted_answer": "5",
            "actual_answer": "3",
            "exact_match": False,
            "similarity_score": 0.2,
            "contains_answer": False,
            "error": None
        }
    ])
    print(f"βœ… Created {len(test_evaluations)} test evaluations")
    return test_evaluations


def _has_existing_spans(client):
    """Step 3: check that Phoenix already holds at least one span.

    Evaluations attach to spans, so an empty span store means the agent has
    never run and logging would have nothing to attach to.

    Returns:
        bool: True when at least one span exists, False on empty or error.
    """
    print("\n3. Checking existing spans...")
    try:
        spans_df = client.get_spans_dataframe()
        print(f"πŸ“Š Found {len(spans_df)} existing spans")

        if len(spans_df) == 0:
            print("⚠️ No spans found - you need to run your agent first to create spans")
            return False
        return True

    except Exception as e:
        print(f"❌ Error getting spans: {e}")
        return False


def _verify_evaluations(client):
    """Step 5: query Phoenix back and look for 'gaia_ground_truth' rows.

    Returns:
        bool: True when GAIA evaluations are found, or when verification
        itself errors (logging already appeared successful by then);
        False when the query succeeds but finds no GAIA evaluations.
    """
    print("\n5. Verifying evaluations in Phoenix...")
    try:
        import time
        time.sleep(2)  # Give Phoenix time to process

        evals_df = client.get_evaluations_dataframe()
        gaia_evals = evals_df[evals_df['name'] == 'gaia_ground_truth']

        print(f"πŸ“Š Found {len(gaia_evals)} GAIA evaluations in Phoenix")

        if len(gaia_evals) > 0:
            print("βœ… Evaluations successfully verified in Phoenix!")
            return True
        print("⚠️ No GAIA evaluations found in Phoenix")
        return False

    except Exception as e:
        # Verification is best-effort: a read-back failure does not imply
        # the write failed, so treat it as a (soft) pass.
        print(f"⚠️ Could not verify evaluations: {e}")
        print("βœ… Logging appeared successful though")
        return True


def _log_and_verify(client, test_evaluations):
    """Step 4: log the test evaluations, then verify them (step 5).

    Returns:
        bool: True when logging succeeded (verification result decides the
        final value), False when logging returned None or raised.
    """
    print("\n4. Testing evaluation logging...")
    try:
        result = log_evaluations_to_phoenix(test_evaluations)

        if result is None:
            print("❌ Evaluation logging failed")
            return False

        print(f"βœ… Successfully logged {len(result)} evaluations to Phoenix")
        print("Sample evaluation:")
        print(f"  - Score: {result.iloc[0]['score']}")
        print(f"  - Label: {result.iloc[0]['label']}")
        print(f"  - Explanation: {result.iloc[0]['explanation'][:100]}...")

        return _verify_evaluations(client)

    except Exception as e:
        print(f"❌ Error during logging: {e}")
        import traceback
        traceback.print_exc()
        return False


def test_phoenix_logging():
    """Test Phoenix evaluations logging with simple data.

    Runs five sequential steps — connect, build fixtures, check spans,
    log evaluations, verify — and stops at the first failing step.

    Returns:
        bool: True when evaluations were logged (and verified when
        possible), False on any connection, span, or logging failure.
    """
    print("πŸ§ͺ Testing Phoenix Evaluations Logging")
    print("=" * 50)

    client = _connect_to_phoenix()
    if client is None:
        return False

    test_evaluations = _build_test_evaluations()

    if not _has_existing_spans(client):
        return False

    return _log_and_verify(client, test_evaluations)


def main():
    """Run the Phoenix logging test and print a pass/fail summary banner."""
    passed = test_phoenix_logging()

    print("\n" + "=" * 50)
    if not passed:
        # Failure path: point the user at the usual setup mistakes.
        print("❌ Phoenix evaluations logging test FAILED!")
        print("Make sure:")
        print("  1. Your agent app is running (it starts Phoenix)")
        print("  2. You've run your agent at least once to create spans")
        print("  3. Phoenix is accessible at http://localhost:6006")
        return

    print("πŸŽ‰ Phoenix evaluations logging test PASSED!")
    print("You should now see 'gaia_ground_truth' evaluations in Phoenix UI")
    print("🌐 Visit: http://localhost:6006")


# Script entry point: run the test only when executed directly, not on import.
if __name__ == "__main__":
    main()