#!/usr/bin/env python3
"""
Simple test for Phoenix evaluations logging.
"""
import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
import phoenix as px
import pandas as pd
from comparison import AnswerComparator
from phoenix_evaluator import log_evaluations_to_phoenix
def test_phoenix_logging() -> bool:
    """Smoke-test end-to-end evaluation logging into a running Phoenix server.

    Walks through five steps: connect to Phoenix, build two synthetic
    evaluation rows (one match, one mismatch), confirm spans exist (the
    agent must have run at least once), log the evaluations via
    ``log_evaluations_to_phoenix``, and finally verify that the
    ``gaia_ground_truth`` evaluations show up in Phoenix.

    Returns:
        True when logging succeeded (verification failure alone is treated
        as a soft pass), False on any hard failure.
    """
    print("🧪 Testing Phoenix Evaluations Logging")
    print("=" * 50)

    # Step 1: Check Phoenix connection
    print("1. Checking Phoenix connection...")
    try:
        client = px.Client()
        print("✅ Phoenix connected successfully")
    except Exception as e:
        print(f"❌ Phoenix connection failed: {e}")
        return False

    # Step 2: Create test evaluations
    # Two rows: one exact match, one miss — task_ids must correspond to
    # spans already recorded by the agent for the join to land.
    print("\n2. Creating test evaluations...")
    test_evaluations = pd.DataFrame([
        {
            "task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
            "predicted_answer": "3",
            "actual_answer": "3",
            "exact_match": True,
            "similarity_score": 1.0,
            "contains_answer": True,
            "error": None,
        },
        {
            "task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
            "predicted_answer": "5",
            "actual_answer": "3",
            "exact_match": False,
            "similarity_score": 0.2,
            "contains_answer": False,
            "error": None,
        },
    ])
    print(f"✅ Created {len(test_evaluations)} test evaluations")

    # Step 3: Check existing spans — evaluations attach to spans, so an
    # empty span table means there is nothing to annotate.
    print("\n3. Checking existing spans...")
    try:
        spans_df = client.get_spans_dataframe()
        print(f"📊 Found {len(spans_df)} existing spans")
        if len(spans_df) == 0:
            print("⚠️ No spans found - you need to run your agent first to create spans")
            return False
    except Exception as e:
        print(f"❌ Error getting spans: {e}")
        return False

    # Step 4: Test logging
    print("\n4. Testing evaluation logging...")
    try:
        result = log_evaluations_to_phoenix(test_evaluations)
        if result is not None:
            print(f"✅ Successfully logged {len(result)} evaluations to Phoenix")
            print("Sample evaluation:")
            print(f"   - Score: {result.iloc[0]['score']}")
            print(f"   - Label: {result.iloc[0]['label']}")
            print(f"   - Explanation: {result.iloc[0]['explanation'][:100]}...")

            # Step 5: Verify evaluations were logged. Verification is
            # best-effort: if Phoenix can't be queried we still report
            # success, since the logging call itself succeeded.
            print("\n5. Verifying evaluations in Phoenix...")
            try:
                import time
                time.sleep(2)  # Give Phoenix time to process
                evals_df = client.get_evaluations_dataframe()
                gaia_evals = evals_df[evals_df['name'] == 'gaia_ground_truth']
                print(f"📊 Found {len(gaia_evals)} GAIA evaluations in Phoenix")
                if len(gaia_evals) > 0:
                    print("✅ Evaluations successfully verified in Phoenix!")
                    return True
                else:
                    print("⚠️ No GAIA evaluations found in Phoenix")
                    return False
            except Exception as e:
                print(f"⚠️ Could not verify evaluations: {e}")
                print("✅ Logging appeared successful though")
                return True
        else:
            print("❌ Evaluation logging failed")
            return False
    except Exception as e:
        print(f"❌ Error during logging: {e}")
        import traceback
        traceback.print_exc()
        return False
def main() -> None:
    """Run the Phoenix logging smoke test and print a pass/fail summary."""
    success = test_phoenix_logging()
    print("\n" + "=" * 50)
    if success:
        print("🎉 Phoenix evaluations logging test PASSED!")
        print("You should now see 'gaia_ground_truth' evaluations in Phoenix UI")
        print("🌐 Visit: http://localhost:6006")
    else:
        print("❌ Phoenix evaluations logging test FAILED!")
        print("Make sure:")
        print("  1. Your agent app is running (it starts Phoenix)")
        print("  2. You've run your agent at least once to create spans")
        print("  3. Phoenix is accessible at http://localhost:6006")


if __name__ == "__main__":
    main()