#!/usr/bin/env python3
"""
Enhanced debug script to check Phoenix status and evaluations.
"""

import time

import pandas as pd
import phoenix as px

from comparison import AnswerComparator
from phoenix_evaluator import log_evaluations_to_phoenix
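# comparison and phoenix_evaluator are project-local modules: AnswerComparator
# loads the GAIA ground-truth answers, and log_evaluations_to_phoenix pushes
# evaluation rows into Phoenix (exercised in test_evaluation_creation_and_logging).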


def check_phoenix_connection():
    """Check if Phoenix is running and accessible."""
    try:
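        # px.Client() attaches to the already-running Phoenix instance
        # (by default the local server at http://localhost:6006).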
        client = px.Client()
        print("βœ… Phoenix client connected successfully")

        # Try to get basic info
        try:
            client.get_spans_dataframe()  # smoke-test that the spans API responds
            print("βœ… Phoenix API working - can retrieve spans")
            return client
        except Exception as e:
            print(f"⚠️ Phoenix connected but API might have issues: {e}")
            return client

    except Exception as e:
        print(f"❌ Phoenix connection failed: {e}")
        print("Make sure Phoenix is running. You should see a message like:")
        print("🌍 To view the Phoenix app in your browser, visit http://localhost:6006")
        return None


def check_spans(client):
    """Check spans in Phoenix."""
    try:
        spans_df = client.get_spans_dataframe()
        print(f"πŸ“Š Found {len(spans_df)} spans in Phoenix")

        if len(spans_df) > 0:
            print("Recent spans:")
            for i, (_, span) in enumerate(spans_df.head(5).iterrows()):
                span_id = span.get('context.span_id', 'no-id')
                span_name = span.get('name', 'unnamed')
                start_time = span.get('start_time', 'unknown')
                print(f"  {i+1}. {span_name} ({span_id[:8]}...) - {start_time}")

            # Show input/output samples
            print("\nSpan content samples:")
            for i, (_, span) in enumerate(spans_df.head(3).iterrows()):
                input_val = str(span.get('input.value', ''))[:100]
                output_val = str(span.get('output.value', ''))[:100]
                print(f"  Span {i+1}:")
                print(f"    Input: {input_val}...")
                print(f"    Output: {output_val}...")

        else:
            print("⚠️ No spans found. Run your agent first to generate traces.")

        return spans_df

    except Exception as e:
        print(f"❌ Error getting spans: {e}")
        return pd.DataFrame()


def check_evaluations(client):
    """Check evaluations in Phoenix."""
    try:
        print("πŸ” Checking evaluations...")

        # Pull evaluations as a dataframe; this client method is not available in
        # every Phoenix version (handled by the AttributeError branch below).
        try:
            evals_df = client.get_evaluations_dataframe()
            print(f"πŸ“Š Found {len(evals_df)} evaluations in Phoenix")

            if len(evals_df) > 0:
                print("Evaluation breakdown:")
                eval_names = evals_df['name'].value_counts()
                for name, count in eval_names.items():
                    print(f"  - {name}: {count} evaluations")

                # Check for GAIA evaluations specifically
                gaia_evals = evals_df[evals_df['name'] == 'gaia_ground_truth']
                if len(gaia_evals) > 0:
                    print(f"βœ… Found {len(gaia_evals)} GAIA ground truth evaluations")

                    # Show sample evaluation
                    sample = gaia_evals.iloc[0]
                    print("Sample GAIA evaluation:")
                    print(f"  - Score: {sample.get('score', 'N/A')}")
                    print(f"  - Label: {sample.get('label', 'N/A')}")
                    print(f"  - Explanation: {sample.get('explanation', 'N/A')[:100]}...")

                    # Show metadata if available
                    metadata = sample.get('metadata', {})
                    if metadata:
                        print(f"  - Metadata keys: {list(metadata.keys())}")

                else:
                    print("❌ No GAIA ground truth evaluations found")
                    print("Available evaluation types:", list(eval_names.keys()))

            else:
                print("⚠️ No evaluations found in Phoenix")

            return evals_df

        except AttributeError as e:
            print(f"⚠️ get_evaluations_dataframe not available: {e}")
            print("This might be a Phoenix version issue")
            return pd.DataFrame()

    except Exception as e:
        print(f"❌ Error getting evaluations: {e}")
        return pd.DataFrame()


def test_evaluation_creation_and_logging():
    """Test creating and logging evaluations."""
    print("\nπŸ§ͺ Testing evaluation creation and logging...")

    # Create sample evaluations
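    # Each row carries task_id, the predicted/actual answers, and the
    # comparison metrics that log_evaluations_to_phoenix consumes.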
    sample_data = [
        {
            "task_id": "debug-test-1",
            "predicted_answer": "test answer 1",
            "actual_answer": "correct answer 1",
            "exact_match": False,
            "similarity_score": 0.75,
            "contains_answer": True,
            "error": None
        },
        {
            "task_id": "debug-test-2",
            "predicted_answer": "exact match",
            "actual_answer": "exact match",
            "exact_match": True,
            "similarity_score": 1.0,
            "contains_answer": True,
            "error": None
        }
    ]

    evaluations_df = pd.DataFrame(sample_data)
    print(f"Created {len(evaluations_df)} test evaluations")

    # Try to log to Phoenix
    try:
        print("Attempting to log evaluations to Phoenix...")
        result = log_evaluations_to_phoenix(evaluations_df)

        if result is not None:
            print("βœ… Test evaluation logging successful")
            print(f"Logged {len(result)} evaluations")
            return True
        else:
            print("❌ Test evaluation logging failed - no result returned")
            return False

    except Exception as e:
        print(f"❌ Test evaluation logging error: {e}")
        import traceback
        traceback.print_exc()
        return False


def check_gaia_data():
    """Check GAIA ground truth data availability."""
    print("\nπŸ“š Checking GAIA ground truth data...")

    try:
        comparator = AnswerComparator()

        print(f"βœ… Loaded {len(comparator.ground_truth)} GAIA ground truth answers")

        if len(comparator.ground_truth) > 0:
            # Show sample
            sample_task_id = list(comparator.ground_truth.keys())[0]
            sample_answer = comparator.ground_truth[sample_task_id]
            print(f"Sample: {sample_task_id} -> '{sample_answer}'")

            # Test evaluation
            test_eval = comparator.evaluate_answer(sample_task_id, "test answer")
            print(f"Test evaluation result: {test_eval}")

            return True
        else:
            print("❌ No GAIA ground truth data found")
            return False

    except Exception as e:
        print(f"❌ Error checking GAIA data: {e}")
        return False


def show_phoenix_ui_info():
    """Show information about Phoenix UI."""
    print("\n🌐 Phoenix UI Information:")
    print("-" * 30)
    print("Phoenix UI should be available at: http://localhost:6006")
    print("")
    print("In the Phoenix UI, look for:")
    print("  β€’ 'Evaluations' tab or section")
    print("  β€’ 'Evals' section")
    print("  β€’ 'Annotations' tab")
    print("  β€’ In 'Spans' view, look for evaluation badges on spans")
    print("")
    print("If you see evaluations, they should be named 'gaia_ground_truth'")
    print("Each evaluation should show:")
    print("  - Score (similarity score 0-1)")
    print("  - Label (correct/incorrect)")
    print("  - Explanation (predicted vs ground truth)")
    print("  - Metadata (task_id, exact_match, etc.)")


def main():
    """Main debug function."""
    print("πŸ” Enhanced Phoenix Debug Script")
    print("=" * 50)

    # Check Phoenix connection
    client = check_phoenix_connection()
    if not client:
        print("\n❌ Cannot proceed without Phoenix connection")
        print("Make sure your agent app is running (it starts Phoenix)")
        return

    print("\nπŸ“‹ Checking Phoenix Data:")
    print("-" * 30)

    # Check spans
    spans_df = check_spans(client)

    # Check evaluations
    evals_df = check_evaluations(client)

    # Test evaluation creation
    test_success = test_evaluation_creation_and_logging()

    # Wait a moment and recheck evaluations
    if test_success:
        print("\n⏳ Waiting for evaluations to be processed...")
        time.sleep(3)

        print("πŸ” Rechecking evaluations after test logging...")
        evals_df_after = check_evaluations(client)

        if len(evals_df_after) > len(evals_df):
            print("βœ… New evaluations detected after test!")
        else:
            print("⚠️ No new evaluations detected")

    # Check GAIA data
    gaia_available = check_gaia_data()

    # Show Phoenix UI info
    show_phoenix_ui_info()

    # Final summary
    print("\n" + "=" * 50)
    print("πŸ“Š Summary:")
    print(f"  β€’ Phoenix connected: {'βœ…' if client else '❌'}")
    print(f"  β€’ Spans available: {len(spans_df)} spans")
    print(f"  β€’ Evaluations found: {len(evals_df)} evaluations")
    print(f"  β€’ GAIA data available: {'βœ…' if gaia_available else '❌'}")
    print(f"  β€’ Test logging worked: {'βœ…' if test_success else '❌'}")

    print("\nπŸ’‘ Next Steps:")
    if len(spans_df) == 0:
        print("  β€’ Run your agent to generate traces first")
    if len(evals_df) == 0:
        print("  β€’ Check if evaluations are being logged correctly")
        print("  β€’ Verify Phoenix version compatibility")
    if not gaia_available:
        print("  β€’ Check that data/metadata.jsonl exists and is readable")

    print(f"\n🌐 Phoenix UI: http://localhost:6006")


if __name__ == "__main__":
    main()