Final_Assignment_Template

Sleeping

File size: 8,158 Bytes

f9cf36d

#!/usr/bin/env python3
"""
Test script to verify Phoenix evaluations logging.
"""

import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

import phoenix as px
import pandas as pd
from comparison import AnswerComparator
from phoenix_evaluator import log_evaluations_to_phoenix
from datetime import datetime
import time


def test_phoenix_connection():
    """Open a Phoenix client and report how many spans are already stored.

    Returns:
        A ``(client, spans_df)`` tuple when both the client construction and
        the span query succeed; ``(None, None)`` if either step raises.
    """
    print("🔍 Testing Phoenix Connection...")

    try:
        phoenix_client = px.Client()
        print("✅ Phoenix client connected successfully")

        # Issue a span query as well, to check that Phoenix is actually
        # running rather than relying on client construction alone.
        existing_spans = phoenix_client.get_spans_dataframe()
        span_count = len(existing_spans)
        print(f"📊 Found {span_count} existing spans in Phoenix")
    except Exception as exc:
        print(f"❌ Phoenix connection failed: {exc}")
        print("Make sure Phoenix is running and accessible at http://localhost:6006")
        return None, None
    else:
        return phoenix_client, existing_spans


def create_test_evaluations():
    """Build a fixed three-row DataFrame of sample evaluation records.

    The rows cover, in order: an exact match, a longer prediction that
    contains the answer, and a prediction that does not match at all.

    Returns:
        pandas.DataFrame with the columns ``task_id``, ``predicted_answer``,
        ``actual_answer``, ``exact_match``, ``similarity_score``,
        ``contains_answer`` and ``error`` (``error`` is ``None`` in every row).
    """
    print("\n🧪 Creating test evaluations...")

    field_names = (
        "task_id",
        "predicted_answer",
        "actual_answer",
        "exact_match",
        "similarity_score",
        "contains_answer",
        "error",
    )
    sample_rows = (
        ("test-exact-match", "Paris", "Paris", True, 1.0, True, None),
        (
            "test-partial-match",
            "The capital of France is Paris",
            "Paris",
            False,
            0.75,
            True,
            None,
        ),
        ("test-no-match", "London", "Paris", False, 0.2, False, None),
    )

    # Zip each row with the field names so the DataFrame is still built from
    # a list of dicts, keeping column order and dtype inference unchanged.
    records = [dict(zip(field_names, row)) for row in sample_rows]

    frame = pd.DataFrame(records)
    print(f"Created {len(frame)} test evaluations")

    return frame


def create_mock_spans(client):
    """Build a DataFrame of three stand-in spans for testing.

    This is a simplified mock; in real usage spans are created by agent runs.

    Args:
        client: Accepted for interface compatibility; not used by this
            function.

    Returns:
        pandas.DataFrame with one row per mock span and the columns
        ``context.span_id``, ``name``, ``input.value``, ``output.value``,
        ``start_time`` and ``end_time``. Both timestamps are taken from
        ``datetime.now()`` at creation time.
    """
    print("\n🎭 Creating mock spans for testing...")

    # (task label embedded in the question text, agent output), in span order.
    scenarios = (
        ("test-exact-match", "Paris"),
        ("test-partial-match", "The capital of France is Paris"),
        ("test-no-match", "London"),
    )

    span_rows = [
        {
            "context.span_id": f"mock-span-{position}",
            "name": "test_agent_run",
            "input.value": f"Question about {task_label}",
            "output.value": agent_output,
            "start_time": datetime.now(),
            "end_time": datetime.now(),
        }
        for position, (task_label, agent_output) in enumerate(scenarios, start=1)
    ]

    print(f"Created {len(span_rows)} mock spans")
    return pd.DataFrame(span_rows)


def test_evaluation_logging():
    """Log the sample evaluations to Phoenix and report the outcome.

    Returns:
        True when ``log_evaluations_to_phoenix`` returns a non-None result;
        False when it returns None or raises (the traceback is printed).
    """
    print("\n📝 Testing evaluation logging...")

    # Sample data is built outside the try block, so a failure here
    # propagates instead of being reported as a logging failure.
    sample_evals = create_test_evaluations()

    try:
        logged = log_evaluations_to_phoenix(sample_evals)

        if logged is None:
            print("❌ Evaluation logging test failed - no result returned")
            return False

        print("✅ Evaluation logging test successful!")
        print(f"Logged {len(logged)} evaluations")
        return True

    except Exception as exc:
        print(f"❌ Evaluation logging test failed with error: {exc}")
        import traceback
        traceback.print_exc()
        return False


def verify_logged_evaluations(client, wait_seconds=2.0):
    """Check that 'gaia_ground_truth' evaluations can be read back from Phoenix.

    Args:
        client: Object exposing ``get_evaluations_dataframe()``, whose result
            is filtered on a ``name`` column.
        wait_seconds: Delay, in seconds, before querying so Phoenix has a
            moment to process freshly logged evaluations. Defaults to the
            previous fixed 2-second pause; a falsy or non-positive value
            skips the wait.

    Returns:
        True when at least one evaluation row named ``gaia_ground_truth`` is
        found. False when no evaluations exist, none carry that name, or any
        exception is raised while querying (the error is printed).
    """
    print("\n🔍 Verifying logged evaluations...")

    try:
        # Give Phoenix a moment to process
        if wait_seconds and wait_seconds > 0:
            time.sleep(wait_seconds)

        # NOTE(review): confirm that the installed Phoenix client provides
        # get_evaluations_dataframe(); if it does not, the AttributeError is
        # caught below and this check reports failure.
        evals_df = client.get_evaluations_dataframe()
        print(f"📊 Found {len(evals_df)} total evaluations in Phoenix")

        if len(evals_df) == 0:
            print("❌ No evaluations found in Phoenix")
            return False

        # Look for our specific evaluations
        gaia_evals = evals_df[evals_df['name'] == 'gaia_ground_truth']
        print(f"🎯 Found {len(gaia_evals)} GAIA ground truth evaluations")

        if len(gaia_evals) == 0:
            print("❌ No GAIA evaluations found after logging")
            return False

        print("✅ Successfully verified evaluations in Phoenix!")

        # Show sample evaluation; .get() tolerates missing columns.
        sample_eval = gaia_evals.iloc[0]
        print("Sample evaluation:")
        print(f"  - Score: {sample_eval.get('score', 'N/A')}")
        print(f"  - Label: {sample_eval.get('label', 'N/A')}")
        print(f"  - Explanation: {sample_eval.get('explanation', 'N/A')}")

        return True

    except Exception as e:
        print(f"❌ Error verifying evaluations: {e}")
        return False


def test_with_real_gaia_data():
    """Evaluate a placeholder answer against one GAIA task and log it.

    The first task id in the comparator's ground truth is evaluated with the
    literal answer ``"test answer"`` and the single result is sent to
    Phoenix.

    Returns:
        True when logging returns a non-None result. False when no ground
        truth is loaded, logging returns None, or any step raises.
    """
    print("\n📚 Testing with real GAIA data...")

    try:
        comparator = AnswerComparator()

        if len(comparator.ground_truth) == 0:
            print("⚠️ No GAIA ground truth data available")
            return False

        # Use the first available task as the real-data sample.
        first_task_id = next(iter(comparator.ground_truth.keys()))
        expected_answer = comparator.ground_truth[first_task_id]

        evaluation = comparator.evaluate_answer(first_task_id, "test answer")
        single_row_df = pd.DataFrame([evaluation])

        logged = log_evaluations_to_phoenix(single_row_df)

        if logged is None:
            print("❌ Real GAIA data logging failed")
            return False

        print("✅ Real GAIA data logging successful!")
        print(f"Task ID: {first_task_id}")
        print(f"Ground Truth: {expected_answer}")
        print(f"Similarity Score: {evaluation['similarity_score']:.3f}")
        return True

    except Exception as exc:
        print(f"❌ Error testing with real GAIA data: {exc}")
        return False


def main():
    """Run the Phoenix evaluation-logging checks and print a summary.

    Stops early (returning None) when no Phoenix client can be obtained.
    Otherwise runs the logging, verification and real-GAIA-data checks in
    that order and prints how many of them passed.
    """
    separator = "=" * 50

    print("🚀 Phoenix Evaluations Logging Test")
    print(separator)

    # Nothing else can run without a client, so bail out early.
    client, _existing_spans = test_phoenix_connection()
    if not client:
        print("❌ Cannot proceed without Phoenix connection")
        return

    # Order matters: verification reads back what the first check logged.
    checks = (
        test_evaluation_logging,
        lambda: verify_logged_evaluations(client),
        test_with_real_gaia_data,
    )
    total_tests = len(checks)

    print(f"\n🧪 Running {total_tests} tests...")

    tests_passed = 0
    for run_check in checks:
        if run_check():
            tests_passed += 1

    # Summary
    print("\n" + separator)
    print(f"🎯 Test Results: {tests_passed}/{total_tests} tests passed")

    if tests_passed != total_tests:
        print("⚠️ Some tests failed. Check the output above for details.")
    else:
        print("🎉 All tests passed! Phoenix evaluations logging is working correctly.")
        print("You should now see 'gaia_ground_truth' evaluations in the Phoenix UI.")

    print("\n🌐 Phoenix UI: http://localhost:6006")
    print("Look for 'Evaluations' or 'Evals' tab to see the logged evaluations.")


# Run the checks only when executed as a script, not when imported.
if __name__ == "__main__":
    main()