"""
Test script to verify Phoenix evaluations logging.
"""

import os
import sys
import time
from datetime import datetime

sys.path.append(os.path.dirname(os.path.abspath(__file__)))

import pandas as pd
import phoenix as px

from comparison import AnswerComparator
from phoenix_evaluator import log_evaluations_to_phoenix

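# NOTE: This script assumes a Phoenix server is reachable at
# http://localhost:6006 (see the connection check below) and that
# comparison.py and phoenix_evaluator.py sit next to this file
# (hence the sys.path append above).
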
def test_phoenix_connection():
    """Test Phoenix connection and basic functionality."""
    print("🔍 Testing Phoenix Connection...")

    try:
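        # px.Client() is expected to pick up the local Phoenix instance
        # (http://localhost:6006 by default) unless an endpoint is
        # configured via environment variables.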
        client = px.Client()
        print("✅ Phoenix client connected successfully")

        spans_df = client.get_spans_dataframe()
        print(f"📊 Found {len(spans_df)} existing spans in Phoenix")

        return client, spans_df
    except Exception as e:
        print(f"❌ Phoenix connection failed: {e}")
        print("Make sure Phoenix is running and accessible at http://localhost:6006")
        return None, None


def create_test_evaluations():
    """Create test evaluations for logging."""
    print("\n🧪 Creating test evaluations...")

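    # Each record mirrors the fields this project's AnswerComparator is assumed
    # to produce (task_id, predicted/actual answer, exact_match, similarity_score,
    # contains_answer, error) so the logging path sees realistic input.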
    test_data = [
        {
            "task_id": "test-exact-match",
            "predicted_answer": "Paris",
            "actual_answer": "Paris",
            "exact_match": True,
            "similarity_score": 1.0,
            "contains_answer": True,
            "error": None
        },
        {
            "task_id": "test-partial-match",
            "predicted_answer": "The capital of France is Paris",
            "actual_answer": "Paris",
            "exact_match": False,
            "similarity_score": 0.75,
            "contains_answer": True,
            "error": None
        },
        {
            "task_id": "test-no-match",
            "predicted_answer": "London",
            "actual_answer": "Paris",
            "exact_match": False,
            "similarity_score": 0.2,
            "contains_answer": False,
            "error": None
        }
    ]

    evaluations_df = pd.DataFrame(test_data)
    print(f"Created {len(evaluations_df)} test evaluations")

    return evaluations_df


def create_mock_spans(client):
    """Create mock spans for testing (if no real spans exist)."""
    print("\n📝 Creating mock spans for testing...")

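    # The column names mimic the flattened layout of a Phoenix span dataframe
    # (context.span_id, input.value, output.value); these rows are only
    # stand-ins for real traced spans.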
    mock_spans = [
        {
            "context.span_id": "mock-span-1",
            "name": "test_agent_run",
            "input.value": "Question about test-exact-match",
            "output.value": "Paris",
            "start_time": datetime.now(),
            "end_time": datetime.now()
        },
        {
            "context.span_id": "mock-span-2",
            "name": "test_agent_run",
            "input.value": "Question about test-partial-match",
            "output.value": "The capital of France is Paris",
            "start_time": datetime.now(),
            "end_time": datetime.now()
        },
        {
            "context.span_id": "mock-span-3",
            "name": "test_agent_run",
            "input.value": "Question about test-no-match",
            "output.value": "London",
            "start_time": datetime.now(),
            "end_time": datetime.now()
        }
    ]

    print(f"Created {len(mock_spans)} mock spans")
    return pd.DataFrame(mock_spans)


def test_evaluation_logging():
    """Test the actual evaluation logging to Phoenix."""
    print("\n📋 Testing evaluation logging...")

    evaluations_df = create_test_evaluations()

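    # log_evaluations_to_phoenix (from phoenix_evaluator.py) is assumed to match
    # these records to spans and log them under the "gaia_ground_truth" eval name
    # that verify_logged_evaluations() looks for below.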
    try:
        result = log_evaluations_to_phoenix(evaluations_df)

        if result is not None:
            print("✅ Evaluation logging test successful!")
            print(f"Logged {len(result)} evaluations")
            return True
        else:
            print("❌ Evaluation logging test failed - no result returned")
            return False

    except Exception as e:
        print(f"❌ Evaluation logging test failed with error: {e}")
        import traceback
        traceback.print_exc()
        return False


def verify_logged_evaluations(client):
    """Verify that evaluations were actually logged to Phoenix."""
    print("\n🔍 Verifying logged evaluations...")

    try:
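        # Give Phoenix a moment to ingest the evaluations logged above
        # before querying them back.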
        time.sleep(2)

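        # Assumes the installed arize-phoenix client exposes
        # get_evaluations_dataframe() alongside get_spans_dataframe().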
        evals_df = client.get_evaluations_dataframe()
        print(f"📊 Found {len(evals_df)} total evaluations in Phoenix")

        if len(evals_df) > 0:
            gaia_evals = evals_df[evals_df['name'] == 'gaia_ground_truth']
            print(f"🎯 Found {len(gaia_evals)} GAIA ground truth evaluations")

            if len(gaia_evals) > 0:
                print("✅ Successfully verified evaluations in Phoenix!")

                sample_eval = gaia_evals.iloc[0]
                print("Sample evaluation:")
                print(f" - Score: {sample_eval.get('score', 'N/A')}")
                print(f" - Label: {sample_eval.get('label', 'N/A')}")
                print(f" - Explanation: {sample_eval.get('explanation', 'N/A')}")

                return True
            else:
                print("❌ No GAIA evaluations found after logging")
                return False
        else:
            print("❌ No evaluations found in Phoenix")
            return False

    except Exception as e:
        print(f"❌ Error verifying evaluations: {e}")
        return False


def test_with_real_gaia_data():
    """Test with actual GAIA data if available."""
    print("\n🌍 Testing with real GAIA data...")

    try:
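        # AnswerComparator is assumed to load the GAIA ground truth as a dict
        # keyed by task_id (see the lookup below) and to return an evaluation
        # dict compatible with log_evaluations_to_phoenix.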
        comparator = AnswerComparator()

        if len(comparator.ground_truth) == 0:
            print("⚠️ No GAIA ground truth data available")
            return False

        real_task_id = list(comparator.ground_truth.keys())[0]
        real_ground_truth = comparator.ground_truth[real_task_id]

        real_evaluation = comparator.evaluate_answer(real_task_id, "test answer")

        real_eval_df = pd.DataFrame([real_evaluation])

        result = log_evaluations_to_phoenix(real_eval_df)

        if result is not None:
            print("✅ Real GAIA data logging successful!")
            print(f"Task ID: {real_task_id}")
            print(f"Ground Truth: {real_ground_truth}")
            print(f"Similarity Score: {real_evaluation['similarity_score']:.3f}")
            return True
        else:
            print("❌ Real GAIA data logging failed")
            return False

    except Exception as e:
        print(f"❌ Error testing with real GAIA data: {e}")
        return False


def main():
    """Main test function."""
    print("🚀 Phoenix Evaluations Logging Test")
    print("=" * 50)

    client, spans_df = test_phoenix_connection()
    if not client:
        print("❌ Cannot proceed without Phoenix connection")
        return

    tests_passed = 0
    total_tests = 3

    print(f"\n🧪 Running {total_tests} tests...")

    if test_evaluation_logging():
        tests_passed += 1

    if verify_logged_evaluations(client):
        tests_passed += 1

    if test_with_real_gaia_data():
        tests_passed += 1

    print("\n" + "=" * 50)
    print(f"🎯 Test Results: {tests_passed}/{total_tests} tests passed")

    if tests_passed == total_tests:
        print("🎉 All tests passed! Phoenix evaluations logging is working correctly.")
        print("You should now see 'gaia_ground_truth' evaluations in the Phoenix UI.")
    else:
        print("⚠️ Some tests failed. Check the output above for details.")

    print("\n🌐 Phoenix UI: http://localhost:6006")
    print("Look for 'Evaluations' or 'Evals' tab to see the logged evaluations.")


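# If no Phoenix server is running yet, one way to start it is px.launch_app()
# in a separate Python process; this script then connects to it via px.Client().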
if __name__ == "__main__":
    main()