#!/usr/bin/env python3
"""
Test script to verify Phoenix evaluations logging.
"""
import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
import phoenix as px
import pandas as pd
from comparison import AnswerComparator
from phoenix_evaluator import log_evaluations_to_phoenix
from datetime import datetime
import time
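# Prerequisites (assumed setup): a Phoenix server running and reachable at
# http://localhost:6006, and the local `comparison` and `phoenix_evaluator` modules
# importable from this directory (the sys.path.append above covers the latter when
# the script is run directly).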


def test_phoenix_connection():
    """Test Phoenix connection and basic functionality."""
    print("🔍 Testing Phoenix Connection...")
    try:
        client = px.Client()
        print("✅ Phoenix client connected successfully")
        # Check if Phoenix is actually running
        spans_df = client.get_spans_dataframe()
        print(f"📊 Found {len(spans_df)} existing spans in Phoenix")
        return client, spans_df
    except Exception as e:
        print(f"❌ Phoenix connection failed: {e}")
        print("Make sure Phoenix is running and accessible at http://localhost:6006")
        return None, None


def create_test_evaluations():
    """Create test evaluations for logging."""
    print("\n🧪 Creating test evaluations...")
    test_data = [
        {
            "task_id": "test-exact-match",
            "predicted_answer": "Paris",
            "actual_answer": "Paris",
            "exact_match": True,
            "similarity_score": 1.0,
            "contains_answer": True,
            "error": None
        },
        {
            "task_id": "test-partial-match",
            "predicted_answer": "The capital of France is Paris",
            "actual_answer": "Paris",
            "exact_match": False,
            "similarity_score": 0.75,
            "contains_answer": True,
            "error": None
        },
        {
            "task_id": "test-no-match",
            "predicted_answer": "London",
            "actual_answer": "Paris",
            "exact_match": False,
            "similarity_score": 0.2,
            "contains_answer": False,
            "error": None
        }
    ]
    evaluations_df = pd.DataFrame(test_data)
    print(f"Created {len(evaluations_df)} test evaluations")
    return evaluations_df


def create_mock_spans(client):
    """Create mock spans for testing (if no real spans exist)."""
    print("\n📝 Creating mock spans for testing...")
    # Note: This is a simplified mock - in real usage, spans are created by agent runs
    mock_spans = [
        {
            "context.span_id": "mock-span-1",
            "name": "test_agent_run",
            "input.value": "Question about test-exact-match",
            "output.value": "Paris",
            "start_time": datetime.now(),
            "end_time": datetime.now()
        },
        {
            "context.span_id": "mock-span-2",
            "name": "test_agent_run",
            "input.value": "Question about test-partial-match",
            "output.value": "The capital of France is Paris",
            "start_time": datetime.now(),
            "end_time": datetime.now()
        },
        {
            "context.span_id": "mock-span-3",
            "name": "test_agent_run",
            "input.value": "Question about test-no-match",
            "output.value": "London",
            "start_time": datetime.now(),
            "end_time": datetime.now()
        }
    ]
    print(f"Created {len(mock_spans)} mock spans")
    return pd.DataFrame(mock_spans)
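

# Note: create_mock_spans() above is illustrative only -- it builds a plain DataFrame,
# is not called from main() below, and nothing here ingests these rows into Phoenix.
# In real usage, spans come from instrumented agent runs, as the comment inside the
# function says.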


def test_evaluation_logging():
    """Test the actual evaluation logging to Phoenix."""
    print("\n📝 Testing evaluation logging...")
    # Create test evaluations
    evaluations_df = create_test_evaluations()
    # Try to log to Phoenix
    try:
        result = log_evaluations_to_phoenix(evaluations_df)
        if result is not None:
            print("✅ Evaluation logging test successful!")
            print(f"Logged {len(result)} evaluations")
            return True
        else:
            print("❌ Evaluation logging test failed - no result returned")
            return False
    except Exception as e:
        print(f"❌ Evaluation logging test failed with error: {e}")
        import traceback
        traceback.print_exc()
        return False


def verify_logged_evaluations(client):
    """Verify that evaluations were actually logged to Phoenix."""
    print("\n🔍 Verifying logged evaluations...")
    try:
        # Give Phoenix a moment to process
        time.sleep(2)
        # Try to retrieve evaluations
        evals_df = client.get_evaluations_dataframe()
        print(f"📊 Found {len(evals_df)} total evaluations in Phoenix")
        # Look for our specific evaluations
        if len(evals_df) > 0:
            gaia_evals = evals_df[evals_df['name'] == 'gaia_ground_truth']
            print(f"🎯 Found {len(gaia_evals)} GAIA ground truth evaluations")
            if len(gaia_evals) > 0:
                print("✅ Successfully verified evaluations in Phoenix!")
                # Show sample evaluation
                sample_eval = gaia_evals.iloc[0]
                print("Sample evaluation:")
                print(f"  - Score: {sample_eval.get('score', 'N/A')}")
                print(f"  - Label: {sample_eval.get('label', 'N/A')}")
                print(f"  - Explanation: {sample_eval.get('explanation', 'N/A')}")
                return True
            else:
                print("❌ No GAIA evaluations found after logging")
                return False
        else:
            print("❌ No evaluations found in Phoenix")
            return False
    except Exception as e:
        print(f"❌ Error verifying evaluations: {e}")
        return False
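

# The 'gaia_ground_truth' filter in verify_logged_evaluations() above assumes that
# log_evaluations_to_phoenix() registers its evaluations under that name; if the name
# used in phoenix_evaluator.py ever changes, this check must change with it.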


def test_with_real_gaia_data():
    """Test with actual GAIA data if available."""
    print("\n🌍 Testing with real GAIA data...")
    try:
        # Initialize comparator
        comparator = AnswerComparator()
        if len(comparator.ground_truth) == 0:
            print("⚠️ No GAIA ground truth data available")
            return False
        # Create a real evaluation with GAIA data
        real_task_id = list(comparator.ground_truth.keys())[0]
        real_ground_truth = comparator.ground_truth[real_task_id]
        real_evaluation = comparator.evaluate_answer(real_task_id, "test answer")
        real_eval_df = pd.DataFrame([real_evaluation])
        # Log to Phoenix
        result = log_evaluations_to_phoenix(real_eval_df)
        if result is not None:
            print("✅ Real GAIA data logging successful!")
            print(f"Task ID: {real_task_id}")
            print(f"Ground Truth: {real_ground_truth}")
            print(f"Similarity Score: {real_evaluation['similarity_score']:.3f}")
            return True
        else:
            print("❌ Real GAIA data logging failed")
            return False
    except Exception as e:
        print(f"❌ Error testing with real GAIA data: {e}")
        return False
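

# Note: "test answer" above is presumably a deliberate dummy prediction -- it exercises
# the logging path with a real GAIA task_id rather than aiming for a correct answer,
# so a low similarity_score is expected here.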


def main():
    """Main test function."""
    print("🚀 Phoenix Evaluations Logging Test")
    print("=" * 50)
    # Test Phoenix connection
    client, spans_df = test_phoenix_connection()
    if not client:
        print("❌ Cannot proceed without Phoenix connection")
        return
    # Run tests
    tests_passed = 0
    total_tests = 3
    print(f"\n🧪 Running {total_tests} tests...")
    # Test 1: Basic evaluation logging
    if test_evaluation_logging():
        tests_passed += 1
    # Test 2: Verify evaluations were logged
    if verify_logged_evaluations(client):
        tests_passed += 1
    # Test 3: Test with real GAIA data
    if test_with_real_gaia_data():
        tests_passed += 1
    # Summary
    print("\n" + "=" * 50)
    print(f"🎯 Test Results: {tests_passed}/{total_tests} tests passed")
    if tests_passed == total_tests:
        print("🎉 All tests passed! Phoenix evaluations logging is working correctly.")
        print("You should now see 'gaia_ground_truth' evaluations in the Phoenix UI.")
    else:
        print("⚠️ Some tests failed. Check the output above for details.")
    print("\n🌐 Phoenix UI: http://localhost:6006")
    print("Look for 'Evaluations' or 'Evals' tab to see the logged evaluations.")


if __name__ == "__main__":
    main()