#!/usr/bin/env python3
"""
Test script to verify Phoenix evaluations logging.
"""
import sys
import os

# Make sibling modules (comparison, phoenix_evaluator) importable regardless
# of the current working directory.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

import time
from datetime import datetime

import pandas as pd
import phoenix as px

from comparison import AnswerComparator
from phoenix_evaluator import log_evaluations_to_phoenix


def test_phoenix_connection():
    """Connect to Phoenix and confirm the server is responsive.

    Returns:
        tuple: ``(client, spans_df)`` on success, ``(None, None)`` when the
        Phoenix server cannot be reached.
    """
    print("๐Ÿ” Testing Phoenix Connection...")
    try:
        client = px.Client()
        print("โœ… Phoenix client connected successfully")
        # Pulling spans proves the server is actually serving data, not just
        # that a client object could be constructed.
        spans_df = client.get_spans_dataframe()
        print(f"๐Ÿ“Š Found {len(spans_df)} existing spans in Phoenix")
        return client, spans_df
    except Exception as e:
        print(f"โŒ Phoenix connection failed: {e}")
        print("Make sure Phoenix is running and accessible at http://localhost:6006")
        return None, None


def create_test_evaluations():
    """Build a small synthetic evaluations DataFrame.

    Covers the three interesting cases: exact match, partial match, and
    no match.

    Returns:
        pd.DataFrame: one row per test evaluation.
    """
    print("\n๐Ÿงช Creating test evaluations...")
    test_data = [
        {
            "task_id": "test-exact-match",
            "predicted_answer": "Paris",
            "actual_answer": "Paris",
            "exact_match": True,
            "similarity_score": 1.0,
            "contains_answer": True,
            "error": None,
        },
        {
            "task_id": "test-partial-match",
            "predicted_answer": "The capital of France is Paris",
            "actual_answer": "Paris",
            "exact_match": False,
            "similarity_score": 0.75,
            "contains_answer": True,
            "error": None,
        },
        {
            "task_id": "test-no-match",
            "predicted_answer": "London",
            "actual_answer": "Paris",
            "exact_match": False,
            "similarity_score": 0.2,
            "contains_answer": False,
            "error": None,
        },
    ]
    evaluations_df = pd.DataFrame(test_data)
    print(f"Created {len(evaluations_df)} test evaluations")
    return evaluations_df


def create_mock_spans(client):
    """Build mock span rows mirroring what an agent run would produce.

    Args:
        client: Phoenix client (currently unused; kept for interface
            symmetry with the other helpers).

    Returns:
        pd.DataFrame: one row per mock span.
    """
    print("\n๐ŸŽญ Creating mock spans for testing...")
    # Note: this is a simplified mock - in real usage, spans are created by
    # agent runs.
    cases = [
        ("test-exact-match", "Paris"),
        ("test-partial-match", "The capital of France is Paris"),
        ("test-no-match", "London"),
    ]
    mock_spans = [
        {
            "context.span_id": f"mock-span-{i}",
            "name": "test_agent_run",
            "input.value": f"Question about {task_id}",
            "output.value": answer,
            "start_time": datetime.now(),
            "end_time": datetime.now(),
        }
        for i, (task_id, answer) in enumerate(cases, start=1)
    ]
    print(f"Created {len(mock_spans)} mock spans")
    return pd.DataFrame(mock_spans)


def test_evaluation_logging():
    """Log the synthetic evaluations to Phoenix and report the outcome.

    Returns:
        bool: True when ``log_evaluations_to_phoenix`` returned a result.
    """
    print("\n๐Ÿ“ Testing evaluation logging...")
    evaluations_df = create_test_evaluations()
    try:
        result = log_evaluations_to_phoenix(evaluations_df)
        if result is not None:
            print("โœ… Evaluation logging test successful!")
            print(f"Logged {len(result)} evaluations")
            return True
        print("โŒ Evaluation logging test failed - no result returned")
        return False
    except Exception as e:
        print(f"โŒ Evaluation logging test failed with error: {e}")
        import traceback

        traceback.print_exc()
        return False


def verify_logged_evaluations(client):
    """Read evaluations back from Phoenix and verify ours are present.

    Args:
        client: connected Phoenix client.

    Returns:
        bool: True when at least one ``gaia_ground_truth`` evaluation was
        found.
    """
    print("\n๐Ÿ” Verifying logged evaluations...")
    try:
        # Give Phoenix a moment to process the freshly-logged evaluations.
        time.sleep(2)
        evals_df = client.get_evaluations_dataframe()
        print(f"๐Ÿ“Š Found {len(evals_df)} total evaluations in Phoenix")
        if len(evals_df) > 0:
            gaia_evals = evals_df[evals_df["name"] == "gaia_ground_truth"]
            print(f"๐ŸŽฏ Found {len(gaia_evals)} GAIA ground truth evaluations")
            if len(gaia_evals) > 0:
                print("โœ… Successfully verified evaluations in Phoenix!")
                # Show one sample so the reviewer can eyeball the payload.
                sample_eval = gaia_evals.iloc[0]
                print("Sample evaluation:")
                print(f"  - Score: {sample_eval.get('score', 'N/A')}")
                print(f"  - Label: {sample_eval.get('label', 'N/A')}")
                print(f"  - Explanation: {sample_eval.get('explanation', 'N/A')}")
                return True
            print("โŒ No GAIA evaluations found after logging")
            return False
        print("โŒ No evaluations found in Phoenix")
        return False
    except Exception as e:
        print(f"โŒ Error verifying evaluations: {e}")
        return False


def test_with_real_gaia_data():
    """Log one evaluation built from real GAIA ground-truth data, if any.

    Returns:
        bool: True when real data was available and logging succeeded.
    """
    print("\n๐Ÿ“š Testing with real GAIA data...")
    try:
        comparator = AnswerComparator()
        if not comparator.ground_truth:
            print("โš ๏ธ No GAIA ground truth data available")
            return False

        # Use the first available task as a representative sample.
        real_task_id = next(iter(comparator.ground_truth))
        real_ground_truth = comparator.ground_truth[real_task_id]
        real_evaluation = comparator.evaluate_answer(real_task_id, "test answer")
        real_eval_df = pd.DataFrame([real_evaluation])

        result = log_evaluations_to_phoenix(real_eval_df)
        if result is not None:
            print("โœ… Real GAIA data logging successful!")
            print(f"Task ID: {real_task_id}")
            print(f"Ground Truth: {real_ground_truth}")
            print(f"Similarity Score: {real_evaluation['similarity_score']:.3f}")
            return True
        print("โŒ Real GAIA data logging failed")
        return False
    except Exception as e:
        print(f"โŒ Error testing with real GAIA data: {e}")
        return False


def main():
    """Run the full test sequence and print a pass/fail summary."""
    print("๐Ÿš€ Phoenix Evaluations Logging Test")
    print("=" * 50)

    client, spans_df = test_phoenix_connection()
    # test_phoenix_connection returns None on failure; compare identity
    # rather than truthiness so an "empty" but valid client still proceeds.
    if client is None:
        print("โŒ Cannot proceed without Phoenix connection")
        return

    tests_passed = 0
    total_tests = 3
    print(f"\n๐Ÿงช Running {total_tests} tests...")

    # Test 1: Basic evaluation logging
    if test_evaluation_logging():
        tests_passed += 1

    # Test 2: Verify evaluations were logged
    if verify_logged_evaluations(client):
        tests_passed += 1

    # Test 3: Test with real GAIA data
    if test_with_real_gaia_data():
        tests_passed += 1

    # Summary
    print("\n" + "=" * 50)
    print(f"๐ŸŽฏ Test Results: {tests_passed}/{total_tests} tests passed")
    if tests_passed == total_tests:
        print("๐ŸŽ‰ All tests passed! Phoenix evaluations logging is working correctly.")
        print("You should now see 'gaia_ground_truth' evaluations in the Phoenix UI.")
    else:
        print("โš ๏ธ Some tests failed. Check the output above for details.")
    print("\n๐ŸŒ Phoenix UI: http://localhost:6006")
    print("Look for 'Evaluations' or 'Evals' tab to see the logged evaluations.")


if __name__ == "__main__":
    main()