Final_Assignment_Template / test_phoenix_simple.py
Romain Fayoux
Added ground evaluation and phoenix login
f9cf36d
raw
history blame
4.3 kB
#!/usr/bin/env python3
"""
Simple test for Phoenix evaluations logging.
"""
import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
import phoenix as px
import pandas as pd
from comparison import AnswerComparator
from phoenix_evaluator import log_evaluations_to_phoenix
def test_phoenix_logging():
    """Exercise Phoenix evaluation logging end to end with two hand-made rows.

    The check runs five steps in order: connect to Phoenix, build a small
    evaluations DataFrame, confirm that at least one span already exists,
    pass the DataFrame to ``log_evaluations_to_phoenix``, and finally look
    for ``gaia_ground_truth`` evaluations through the client.

    Returns:
        bool: ``True`` when logging returned a result and the follow-up
        verification either found GAIA evaluations or could not be carried
        out (verification errors are deliberately non-fatal). ``False`` when
        the connection, the span lookup or the logging call failed, when no
        spans exist, or when verification ran but found no GAIA evaluations.
    """
    import time
    import traceback

    print("🧪 Testing Phoenix Evaluations Logging")
    print("=" * 50)

    # Step 1: Check Phoenix connection
    print("1. Checking Phoenix connection...")
    try:
        client = px.Client()
        print("✅ Phoenix connected successfully")
    except Exception as e:
        print(f"❌ Phoenix connection failed: {e}")
        return False

    # Step 2: Create test evaluations (one exact match, one mismatch)
    print("\n2. Creating test evaluations...")
    test_evaluations = pd.DataFrame([
        {
            "task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
            "predicted_answer": "3",
            "actual_answer": "3",
            "exact_match": True,
            "similarity_score": 1.0,
            "contains_answer": True,
            "error": None,
        },
        {
            "task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
            "predicted_answer": "5",
            "actual_answer": "3",
            "exact_match": False,
            "similarity_score": 0.2,
            "contains_answer": False,
            "error": None,
        },
    ])
    print(f"✅ Created {len(test_evaluations)} test evaluations")

    # Step 3: Check existing spans
    print("\n3. Checking existing spans...")
    try:
        spans_df = client.get_spans_dataframe()
        # NOTE(review): the Phoenix client types this return value as
        # optional, so None is treated like an empty frame here; otherwise
        # len(None) would surface as a confusing "Error getting spans"
        # instead of the actionable hint below. Confirm for the installed
        # Phoenix version.
        span_count = 0 if spans_df is None else len(spans_df)
        print(f"📊 Found {span_count} existing spans")
        if span_count == 0:
            print("⚠️ No spans found - you need to run your agent first to create spans")
            return False
    except Exception as e:
        print(f"❌ Error getting spans: {e}")
        return False

    # Step 4: Test logging
    print("\n4. Testing evaluation logging...")
    try:
        result = log_evaluations_to_phoenix(test_evaluations)
        if result is None:
            print("❌ Evaluation logging failed")
            return False

        print(f"✅ Successfully logged {len(result)} evaluations to Phoenix")
        print("Sample evaluation:")
        first_row = result.iloc[0]
        print(f" - Score: {first_row['score']}")
        print(f" - Label: {first_row['label']}")
        print(f" - Explanation: {first_row['explanation'][:100]}...")

        # Step 5: Verify evaluations were logged
        print("\n5. Verifying evaluations in Phoenix...")
        try:
            time.sleep(2)  # Give Phoenix time to process
            evals_df = client.get_evaluations_dataframe()
            gaia_evals = evals_df[evals_df['name'] == 'gaia_ground_truth']
            print(f"📊 Found {len(gaia_evals)} GAIA evaluations in Phoenix")
            if len(gaia_evals) > 0:
                print("✅ Evaluations successfully verified in Phoenix!")
                return True
            print("⚠️ No GAIA evaluations found in Phoenix")
            return False
        except Exception as e:
            # Verification is best-effort: a failure to read evaluations back
            # does not overturn a logging call that returned a result.
            print(f"⚠️ Could not verify evaluations: {e}")
            print("✅ Logging appeared successful though")
            return True
    except Exception as e:
        print(f"❌ Error during logging: {e}")
        traceback.print_exc()
        return False
def main():
    """Run the Phoenix logging check and print a human-readable summary.

    Returns:
        bool: The value returned by ``test_phoenix_logging``; truthy when
        the check passed. Returned so the script entry point can turn it
        into a process exit status.
    """
    success = test_phoenix_logging()
    print("\n" + "=" * 50)
    if success:
        print("🎉 Phoenix evaluations logging test PASSED!")
        print("You should now see 'gaia_ground_truth' evaluations in Phoenix UI")
        print("🌐 Visit: http://localhost:6006")
    else:
        print("❌ Phoenix evaluations logging test FAILED!")
        print("Make sure:")
        print(" 1. Your agent app is running (it starts Phoenix)")
        print(" 2. You've run your agent at least once to create spans")
        print(" 3. Phoenix is accessible at http://localhost:6006")
    return success


if __name__ == "__main__":
    # Exit non-zero on failure so shells and CI can detect a failed check;
    # previously the script always exited with status 0.
    sys.exit(0 if main() else 1)