#!/usr/bin/env python3
"""
Simple test for Phoenix evaluations logging.

Connects to Phoenix, builds two sample evaluation rows, logs them through
``log_evaluations_to_phoenix`` and then tries to read them back.
"""

import os
import sys
import time
import traceback

# Make sibling modules importable when the script is run from another directory.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

import pandas as pd
import phoenix as px

# NOTE(review): AnswerComparator is not referenced in this script -- confirm
# whether the import is still needed before removing it.
from comparison import AnswerComparator
from phoenix_evaluator import log_evaluations_to_phoenix

# Evaluation name this script looks for when verifying the logged results.
_EVAL_NAME = "gaia_ground_truth"

# Seconds to wait before reading evaluations back from Phoenix.
_PROCESSING_DELAY_S = 2


def _verify_logged_evaluations(client) -> bool:
    """Check that evaluations named ``_EVAL_NAME`` can be read back.

    Verification is best-effort: if reading the evaluations raises, a warning
    is printed and the function still returns True, because the logging call
    itself already succeeded.

    Args:
        client: The Phoenix client created by the caller.

    Returns:
        False only when the evaluations were read back and none matched
        ``_EVAL_NAME``; True otherwise.
    """
    print("\n5. Verifying evaluations in Phoenix...")
    try:
        time.sleep(_PROCESSING_DELAY_S)  # Give Phoenix time to process
        # NOTE(review): confirm the client in use exposes
        # get_evaluations_dataframe(); failures here are tolerated below.
        evals_df = client.get_evaluations_dataframe()
        gaia_evals = evals_df[evals_df["name"] == _EVAL_NAME]
        print(f"📊 Found {len(gaia_evals)} GAIA evaluations in Phoenix")
    except Exception as e:
        print(f"⚠️ Could not verify evaluations: {e}")
        print("✅ Logging appeared successful though")
        return True

    if len(gaia_evals) > 0:
        print("✅ Evaluations successfully verified in Phoenix!")
        return True

    print("⚠️ No GAIA evaluations found in Phoenix")
    return False


def test_phoenix_logging() -> bool:
    """Test Phoenix evaluations logging with simple data.

    Runs five steps, printing progress for each: connect to Phoenix, build
    two sample evaluation rows, check that spans exist, log the evaluations,
    and verify them.

    Returns:
        True if the evaluations were logged (and, when verification could be
        performed, found again); False on any failure along the way.
    """
    print("🧪 Testing Phoenix Evaluations Logging")
    print("=" * 50)

    # Step 1: Check Phoenix connection
    print("1. Checking Phoenix connection...")
    try:
        client = px.Client()
        print("✅ Phoenix connected successfully")
    except Exception as e:
        print(f"❌ Phoenix connection failed: {e}")
        return False

    # Step 2: Create test evaluations
    print("\n2. Creating test evaluations...")
    test_evaluations = pd.DataFrame([
        {
            "task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
            "predicted_answer": "3",
            "actual_answer": "3",
            "exact_match": True,
            "similarity_score": 1.0,
            "contains_answer": True,
            "error": None,
        },
        {
            "task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
            "predicted_answer": "5",
            "actual_answer": "3",
            "exact_match": False,
            "similarity_score": 0.2,
            "contains_answer": False,
            "error": None,
        },
    ])
    print(f"✅ Created {len(test_evaluations)} test evaluations")

    # Step 3: Check existing spans
    print("\n3. Checking existing spans...")
    try:
        spans_df = client.get_spans_dataframe()
    except Exception as e:
        print(f"❌ Error getting spans: {e}")
        return False

    # The client may hand back None instead of an empty frame when there are
    # no spans; treat both the same so the helpful hint below is shown.
    span_count = 0 if spans_df is None else len(spans_df)
    print(f"📊 Found {span_count} existing spans")
    if span_count == 0:
        print("⚠️ No spans found - you need to run your agent first to create spans")
        return False

    # Step 4: Test logging
    print("\n4. Testing evaluation logging...")
    try:
        result = log_evaluations_to_phoenix(test_evaluations)
        if result is None:
            print("❌ Evaluation logging failed")
            return False

        print(f"✅ Successfully logged {len(result)} evaluations to Phoenix")
        first = result.iloc[0]
        print("Sample evaluation:")
        print(f" - Score: {first['score']}")
        print(f" - Label: {first['label']}")
        print(f" - Explanation: {first['explanation'][:100]}...")

        # Step 5: Verify evaluations were logged
        return _verify_logged_evaluations(client)
    except Exception as e:
        print(f"❌ Error during logging: {e}")
        traceback.print_exc()
        return False


def main() -> bool:
    """Main test function.

    Runs ``test_phoenix_logging`` and prints a pass/fail summary with
    troubleshooting hints.

    Returns:
        The boolean result of ``test_phoenix_logging``.
    """
    success = test_phoenix_logging()

    print("\n" + "=" * 50)
    if success:
        print("🎉 Phoenix evaluations logging test PASSED!")
        print(f"You should now see '{_EVAL_NAME}' evaluations in Phoenix UI")
        print("🌐 Visit: http://localhost:6006")
    else:
        print("❌ Phoenix evaluations logging test FAILED!")
        print("Make sure:")
        print(" 1. Your agent app is running (it starts Phoenix)")
        print(" 2. You've run your agent at least once to create spans")
        print(" 3. Phoenix is accessible at http://localhost:6006")
    return success


if __name__ == "__main__":
    # Propagate the outcome as the process exit status so shells/CI can see it.
    sys.exit(0 if main() else 1)