File size: 4,299 Bytes
f9cf36d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/usr/bin/env python3
"""
Simple test for Phoenix evaluations logging.
"""

import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

import phoenix as px
import pandas as pd
from comparison import AnswerComparator
from phoenix_evaluator import log_evaluations_to_phoenix


def _connect_to_phoenix():
    """Step 1: connect to the running Phoenix server.

    Returns:
        px.Client | None: a connected client, or None when the connection
        attempt raises (e.g. Phoenix is not running on its default port).
    """
    print("1. Checking Phoenix connection...")
    try:
        client = px.Client()
        print("βœ… Phoenix connected successfully")
        return client
    except Exception as e:
        # Broad catch is deliberate: any connection failure should fail the
        # test with a readable message, not a traceback.
        print(f"❌ Phoenix connection failed: {e}")
        return None


def _build_test_evaluations():
    """Step 2: build a tiny fixed DataFrame of fake evaluation rows.

    Returns:
        pd.DataFrame: two rows (one exact match, one mismatch) shaped like
        the output of AnswerComparator, keyed by GAIA task_id.
    """
    print("\n2. Creating test evaluations...")
    test_evaluations = pd.DataFrame([
        {
            "task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
            "predicted_answer": "3",
            "actual_answer": "3",
            "exact_match": True,
            "similarity_score": 1.0,
            "contains_answer": True,
            "error": None
        },
        {
            "task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
            "predicted_answer": "5",
            "actual_answer": "3",
            "exact_match": False,
            "similarity_score": 0.2,
            "contains_answer": False,
            "error": None
        }
    ])
    print(f"βœ… Created {len(test_evaluations)} test evaluations")
    return test_evaluations


def _has_existing_spans(client):
    """Step 3: check that Phoenix already holds at least one span.

    Evaluations attach to spans, so an empty span store means the agent has
    never run and logging would have nothing to attach to.

    Returns:
        bool: True when at least one span exists, False on empty or error.
    """
    print("\n3. Checking existing spans...")
    try:
        spans_df = client.get_spans_dataframe()
        print(f"πŸ“Š Found {len(spans_df)} existing spans")

        if len(spans_df) == 0:
            print("⚠️ No spans found - you need to run your agent first to create spans")
            return False
        return True

    except Exception as e:
        print(f"❌ Error getting spans: {e}")
        return False


def _verify_evaluations(client):
    """Step 5: query Phoenix back and look for 'gaia_ground_truth' rows.

    Returns:
        bool: True when GAIA evaluations are found, or when verification
        itself errors (logging already appeared successful by then);
        False when the query succeeds but finds no GAIA evaluations.
    """
    print("\n5. Verifying evaluations in Phoenix...")
    try:
        import time
        time.sleep(2)  # Give Phoenix time to process

        evals_df = client.get_evaluations_dataframe()
        gaia_evals = evals_df[evals_df['name'] == 'gaia_ground_truth']

        print(f"πŸ“Š Found {len(gaia_evals)} GAIA evaluations in Phoenix")

        if len(gaia_evals) > 0:
            print("βœ… Evaluations successfully verified in Phoenix!")
            return True
        print("⚠️ No GAIA evaluations found in Phoenix")
        return False

    except Exception as e:
        # Verification is best-effort: a read-back failure does not imply
        # the write failed, so treat it as a (soft) pass.
        print(f"⚠️ Could not verify evaluations: {e}")
        print("βœ… Logging appeared successful though")
        return True


def _log_and_verify(client, test_evaluations):
    """Step 4: log the test evaluations, then verify them (step 5).

    Returns:
        bool: True when logging succeeded (verification result decides the
        final value), False when logging returned None or raised.
    """
    print("\n4. Testing evaluation logging...")
    try:
        result = log_evaluations_to_phoenix(test_evaluations)

        if result is None:
            print("❌ Evaluation logging failed")
            return False

        print(f"βœ… Successfully logged {len(result)} evaluations to Phoenix")
        print("Sample evaluation:")
        print(f"  - Score: {result.iloc[0]['score']}")
        print(f"  - Label: {result.iloc[0]['label']}")
        print(f"  - Explanation: {result.iloc[0]['explanation'][:100]}...")

        return _verify_evaluations(client)

    except Exception as e:
        print(f"❌ Error during logging: {e}")
        import traceback
        traceback.print_exc()
        return False


def test_phoenix_logging():
    """Test Phoenix evaluations logging with simple data.

    Runs five sequential steps — connect, build fixtures, check spans,
    log evaluations, verify — and stops at the first failing step.

    Returns:
        bool: True when evaluations were logged (and verified when
        possible), False on any connection, span, or logging failure.
    """
    print("πŸ§ͺ Testing Phoenix Evaluations Logging")
    print("=" * 50)

    client = _connect_to_phoenix()
    if client is None:
        return False

    test_evaluations = _build_test_evaluations()

    if not _has_existing_spans(client):
        return False

    return _log_and_verify(client, test_evaluations)


def main():
    """Run the Phoenix logging test and print a pass/fail summary banner."""
    passed = test_phoenix_logging()

    print("\n" + "=" * 50)
    if not passed:
        # Failure path: point the user at the usual setup mistakes.
        print("❌ Phoenix evaluations logging test FAILED!")
        print("Make sure:")
        print("  1. Your agent app is running (it starts Phoenix)")
        print("  2. You've run your agent at least once to create spans")
        print("  3. Phoenix is accessible at http://localhost:6006")
        return

    print("πŸŽ‰ Phoenix evaluations logging test PASSED!")
    print("You should now see 'gaia_ground_truth' evaluations in Phoenix UI")
    print("🌐 Visit: http://localhost:6006")


# Script entry point: run the test only when executed directly, not on import.
if __name__ == "__main__":
    main()