#!/usr/bin/env python3
"""
Test script to verify Phoenix evaluations logging.
"""
import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
import phoenix as px
import pandas as pd
from comparison import AnswerComparator
from phoenix_evaluator import log_evaluations_to_phoenix
from datetime import datetime
import time
def test_phoenix_connection():
"""Test Phoenix connection and basic functionality."""
print("πŸ” Testing Phoenix Connection...")
try:
client = px.Client()
print("βœ… Phoenix client connected successfully")
# Check if Phoenix is actually running
spans_df = client.get_spans_dataframe()
print(f"πŸ“Š Found {len(spans_df)} existing spans in Phoenix")
return client, spans_df
except Exception as e:
print(f"❌ Phoenix connection failed: {e}")
print("Make sure Phoenix is running and accessible at http://localhost:6006")
return None, None
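# If Phoenix is not at the default localhost address, px.Client can typically be
# pointed at it explicitly (assumption: recent arize-phoenix releases), e.g.
#   client = px.Client(endpoint="http://my-phoenix-host:6006")
# or by setting the PHOENIX_COLLECTOR_ENDPOINT environment variable beforehand.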
def create_test_evaluations():
"""Create test evaluations for logging."""
print("\nπŸ§ͺ Creating test evaluations...")
test_data = [
{
"task_id": "test-exact-match",
"predicted_answer": "Paris",
"actual_answer": "Paris",
"exact_match": True,
"similarity_score": 1.0,
"contains_answer": True,
"error": None
},
{
"task_id": "test-partial-match",
"predicted_answer": "The capital of France is Paris",
"actual_answer": "Paris",
"exact_match": False,
"similarity_score": 0.75,
"contains_answer": True,
"error": None
},
{
"task_id": "test-no-match",
"predicted_answer": "London",
"actual_answer": "Paris",
"exact_match": False,
"similarity_score": 0.2,
"contains_answer": False,
"error": None
}
]
evaluations_df = pd.DataFrame(test_data)
print(f"Created {len(evaluations_df)} test evaluations")
return evaluations_df
def create_mock_spans(client):
"""Create mock spans for testing (if no real spans exist)."""
print("\n🎭 Creating mock spans for testing...")
# Note: This is a simplified mock - in real usage, spans are created by agent runs
mock_spans = [
{
"context.span_id": "mock-span-1",
"name": "test_agent_run",
"input.value": "Question about test-exact-match",
"output.value": "Paris",
"start_time": datetime.now(),
"end_time": datetime.now()
},
{
"context.span_id": "mock-span-2",
"name": "test_agent_run",
"input.value": "Question about test-partial-match",
"output.value": "The capital of France is Paris",
"start_time": datetime.now(),
"end_time": datetime.now()
},
{
"context.span_id": "mock-span-3",
"name": "test_agent_run",
"input.value": "Question about test-no-match",
"output.value": "London",
"start_time": datetime.now(),
"end_time": datetime.now()
}
]
print(f"Created {len(mock_spans)} mock spans")
return pd.DataFrame(mock_spans)
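# Note: create_mock_spans() is a standalone helper; main() does not call it, and its
# `client` argument is currently unused. It can be used for manual experimentation
# when no real agent spans are available in Phoenix.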
def test_evaluation_logging():
"""Test the actual evaluation logging to Phoenix."""
print("\nπŸ“ Testing evaluation logging...")
# Create test evaluations
evaluations_df = create_test_evaluations()
# Try to log to Phoenix
try:
result = log_evaluations_to_phoenix(evaluations_df)
if result is not None:
print("βœ… Evaluation logging test successful!")
print(f"Logged {len(result)} evaluations")
return True
else:
print("❌ Evaluation logging test failed - no result returned")
return False
except Exception as e:
print(f"❌ Evaluation logging test failed with error: {e}")
import traceback
traceback.print_exc()
return False
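# log_evaluations_to_phoenix() lives in the project-local phoenix_evaluator module.
# For reference, a minimal sketch of the usual Phoenix pattern it is expected to
# follow (assumptions: the score/label mapping and the span-ID alignment below are
# illustrative, not the project's actual implementation):
#
#   from phoenix.trace import SpanEvaluations
#
#   def log_evaluations_sketch(evaluations_df, spans_df):
#       # Phoenix expects a frame indexed by span ID with score/label/explanation.
#       evals = pd.DataFrame(
#           {
#               "score": evaluations_df["similarity_score"].values,
#               "label": evaluations_df["exact_match"]
#               .map({True: "correct", False: "incorrect"})
#               .values,
#               "explanation": (
#                   "expected: " + evaluations_df["actual_answer"].astype(str)
#               ).values,
#           },
#           index=pd.Index(
#               spans_df["context.span_id"].values[: len(evaluations_df)],
#               name="context.span_id",
#           ),
#       )
#       px.Client().log_evaluations(
#           SpanEvaluations(eval_name="gaia_ground_truth", dataframe=evals)
#       )
#       return evals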
def verify_logged_evaluations(client):
"""Verify that evaluations were actually logged to Phoenix."""
print("\nπŸ” Verifying logged evaluations...")
try:
# Give Phoenix a moment to process
time.sleep(2)
# Try to retrieve evaluations
evals_df = client.get_evaluations_dataframe()
print(f"πŸ“Š Found {len(evals_df)} total evaluations in Phoenix")
# Look for our specific evaluations
if len(evals_df) > 0:
gaia_evals = evals_df[evals_df['name'] == 'gaia_ground_truth']
print(f"🎯 Found {len(gaia_evals)} GAIA ground truth evaluations")
if len(gaia_evals) > 0:
print("βœ… Successfully verified evaluations in Phoenix!")
# Show sample evaluation
sample_eval = gaia_evals.iloc[0]
print(f"Sample evaluation:")
print(f" - Score: {sample_eval.get('score', 'N/A')}")
print(f" - Label: {sample_eval.get('label', 'N/A')}")
print(f" - Explanation: {sample_eval.get('explanation', 'N/A')}")
return True
else:
print("❌ No GAIA evaluations found after logging")
return False
else:
print("❌ No evaluations found in Phoenix")
return False
except Exception as e:
print(f"❌ Error verifying evaluations: {e}")
return False
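# Note (assumption): depending on the arize-phoenix version, evaluations may need to
# be read back with client.get_evaluations(), which returns a list of Evaluations
# objects rather than a single dataframe; adjust verify_logged_evaluations() if
# get_evaluations_dataframe() is not available in your installation.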
def test_with_real_gaia_data():
"""Test with actual GAIA data if available."""
print("\nπŸ“š Testing with real GAIA data...")
try:
# Initialize comparator
comparator = AnswerComparator()
if len(comparator.ground_truth) == 0:
print("⚠️ No GAIA ground truth data available")
return False
# Create a real evaluation with GAIA data
real_task_id = list(comparator.ground_truth.keys())[0]
real_ground_truth = comparator.ground_truth[real_task_id]
real_evaluation = comparator.evaluate_answer(real_task_id, "test answer")
real_eval_df = pd.DataFrame([real_evaluation])
# Log to Phoenix
result = log_evaluations_to_phoenix(real_eval_df)
if result is not None:
print("βœ… Real GAIA data logging successful!")
print(f"Task ID: {real_task_id}")
print(f"Ground Truth: {real_ground_truth}")
print(f"Similarity Score: {real_evaluation['similarity_score']:.3f}")
return True
else:
print("❌ Real GAIA data logging failed")
return False
except Exception as e:
print(f"❌ Error testing with real GAIA data: {e}")
return False
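# Note: the placeholder "test answer" string above is expected to score low against
# the real ground truth; this test only checks that logging succeeds, not that the
# answer is correct.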
def main():
"""Main test function."""
print("πŸš€ Phoenix Evaluations Logging Test")
print("=" * 50)
# Test Phoenix connection
client, spans_df = test_phoenix_connection()
if not client:
print("❌ Cannot proceed without Phoenix connection")
return
# Run tests
tests_passed = 0
total_tests = 3
print(f"\nπŸ§ͺ Running {total_tests} tests...")
# Test 1: Basic evaluation logging
if test_evaluation_logging():
tests_passed += 1
# Test 2: Verify evaluations were logged
if verify_logged_evaluations(client):
tests_passed += 1
# Test 3: Test with real GAIA data
if test_with_real_gaia_data():
tests_passed += 1
# Summary
print("\n" + "=" * 50)
print(f"🎯 Test Results: {tests_passed}/{total_tests} tests passed")
if tests_passed == total_tests:
print("πŸŽ‰ All tests passed! Phoenix evaluations logging is working correctly.")
print("You should now see 'gaia_ground_truth' evaluations in the Phoenix UI.")
else:
print("⚠️ Some tests failed. Check the output above for details.")
print(f"\n🌐 Phoenix UI: http://localhost:6006")
print("Look for 'Evaluations' or 'Evals' tab to see the logged evaluations.")
if __name__ == "__main__":
main()