|
|
|
|
|
""" |
|
|
Simple test for Phoenix evaluations logging. |
|
|
""" |
|
|
|
|
|
import sys |
|
|
import os |
|
|
sys.path.append(os.path.dirname(os.path.abspath(__file__))) |
|
|
|
|
|
import phoenix as px |
|
|
import pandas as pd |
|
|
from comparison import AnswerComparator |
|
|
from phoenix_evaluator import log_evaluations_to_phoenix |
|
|
|
|
|
|
|
|
def test_phoenix_logging():
    """Run an end-to-end smoke test of logging evaluations to Phoenix.

    Steps, each announced on stdout:
      1. Create a ``px.Client``.
      2. Build a two-row DataFrame of sample evaluations.
      3. Fetch the existing spans and require at least one.
      4. Pass the sample evaluations to ``log_evaluations_to_phoenix``.
      5. Read evaluations back and look for rows named ``gaia_ground_truth``.

    Returns:
        bool: ``True`` when logging returned at least one row and the
        read-back either found ``gaia_ground_truth`` rows or could not be
        performed (it raised). ``False`` on connection failure, missing
        spans, a ``None``/empty logging result, any exception while logging,
        or a read-back that succeeds but finds no ``gaia_ground_truth`` rows.
    """
    # NOTE(review): the status glyphs below were restored from mis-decoded
    # bytes; confirm they match the emoji originally intended.
    print("🧪 Testing Phoenix Evaluations Logging")
    print("=" * 50)

    # Step 1: a client is needed for every later step, so bail out early.
    print("1. Checking Phoenix connection...")
    try:
        client = px.Client()
        print("✅ Phoenix connected successfully")
    except Exception as e:
        print(f"❌ Phoenix connection failed: {e}")
        return False

    # Step 2: one matching and one non-matching sample evaluation.
    print("\n2. Creating test evaluations...")
    test_evaluations = pd.DataFrame([
        {
            "task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
            "predicted_answer": "3",
            "actual_answer": "3",
            "exact_match": True,
            "similarity_score": 1.0,
            "contains_answer": True,
            "error": None,
        },
        {
            "task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
            "predicted_answer": "5",
            "actual_answer": "3",
            "exact_match": False,
            "similarity_score": 0.2,
            "contains_answer": False,
            "error": None,
        },
    ])
    print(f"✅ Created {len(test_evaluations)} test evaluations")

    # Step 3: without existing spans there is nothing to proceed with.
    print("\n3. Checking existing spans...")
    try:
        spans_df = client.get_spans_dataframe()
        # Treat a missing (None) result the same as an empty one instead of
        # letting len(None) surface as an opaque "Error getting spans".
        span_count = 0 if spans_df is None else len(spans_df)
        print(f"📊 Found {span_count} existing spans")
        if span_count == 0:
            print("⚠️ No spans found - you need to run your agent first to create spans")
            return False
    except Exception as e:
        print(f"❌ Error getting spans: {e}")
        return False

    # Step 4: log the sample evaluations and show the first returned row.
    print("\n4. Testing evaluation logging...")
    try:
        result = log_evaluations_to_phoenix(test_evaluations)

        # len() rather than truthiness: a DataFrame has no unambiguous bool.
        # An empty result is a failure too; indexing row 0 would raise.
        if result is None or len(result) == 0:
            print("❌ Evaluation logging failed")
            return False

        print(f"✅ Successfully logged {len(result)} evaluations to Phoenix")
        first_row = result.iloc[0]
        print("Sample evaluation:")
        print(f"  - Score: {first_row['score']}")
        print(f"  - Label: {first_row['label']}")
        # str() keeps a non-string explanation from turning a successful
        # log into a reported failure.
        print(f"  - Explanation: {str(first_row['explanation'])[:100]}...")
    except Exception as e:
        print(f"❌ Error during logging: {e}")
        import traceback
        traceback.print_exc()
        return False

    # Step 5: best-effort read-back; a failure here does not fail the test.
    print("\n5. Verifying evaluations in Phoenix...")
    try:
        import time
        time.sleep(2)  # Pause before reading back what was just logged.

        # NOTE(review): confirm px.Client exposes get_evaluations_dataframe();
        # if it raises, the handler below still reports success.
        evals_df = client.get_evaluations_dataframe()
        gaia_evals = evals_df[evals_df['name'] == 'gaia_ground_truth']

        print(f"📊 Found {len(gaia_evals)} GAIA evaluations in Phoenix")

        if len(gaia_evals) > 0:
            print("✅ Evaluations successfully verified in Phoenix!")
            return True

        print("⚠️ No GAIA evaluations found in Phoenix")
        return False
    except Exception as e:
        print(f"⚠️ Could not verify evaluations: {e}")
        print("✅ Logging appeared successful though")
        return True
|
|
|
|
|
|
|
|
def main():
    """Run the Phoenix logging smoke test and print a pass/fail summary.

    Calls ``test_phoenix_logging()`` and, depending on the truthiness of its
    result, prints either a success banner pointing at the Phoenix UI or a
    troubleshooting checklist. Returns ``None``.
    """
    success = test_phoenix_logging()

    # NOTE(review): the status glyphs below were restored from mis-decoded
    # bytes; confirm they match the emoji originally intended.
    print("\n" + "=" * 50)
    if success:
        print("🎉 Phoenix evaluations logging test PASSED!")
        print("You should now see 'gaia_ground_truth' evaluations in Phoenix UI")
        print("🌐 Visit: http://localhost:6006")
    else:
        print("❌ Phoenix evaluations logging test FAILED!")
        print("Make sure:")
        print("  1. Your agent app is running (it starts Phoenix)")
        print("  2. You've run your agent at least once to create spans")
        print("  3. Phoenix is accessible at http://localhost:6006")
|
|
|
|
|
|
|
|
# Run the smoke test only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|
|