#!/usr/bin/env python3
"""
Enhanced debug script to check Phoenix status and evaluations.
"""
import phoenix as px
import pandas as pd
from comparison import AnswerComparator
from phoenix_evaluator import log_evaluations_to_phoenix
import time
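# NOTE: This script assumes a Phoenix instance on its default local endpoint
# (http://localhost:6006) and that comparison.py and phoenix_evaluator.py from this
# project are importable from the working directory.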
def check_phoenix_connection():
"""Check if Phoenix is running and accessible."""
try:
client = px.Client()
print("βœ… Phoenix client connected successfully")
# Try to get basic info
try:
            spans_df = client.get_spans_dataframe()
            print(f"✅ Phoenix API working - retrieved {len(spans_df)} spans")
return client
except Exception as e:
print(f"⚠️ Phoenix connected but API might have issues: {e}")
return client
except Exception as e:
print(f"❌ Phoenix connection failed: {e}")
print("Make sure Phoenix is running. You should see a message like:")
print("🌍 To view the Phoenix app in your browser, visit http://localhost:6006")
return None
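# If the default client cannot reach your Phoenix instance, many arize-phoenix
# versions also accept an explicit endpoint (and an API key via the PHOENIX_API_KEY
# environment variable for authenticated deployments). A minimal sketch, assuming a
# local server; exact parameters depend on your installed version:
#
#     client = px.Client(endpoint="http://localhost:6006")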
def check_spans(client):
"""Check spans in Phoenix."""
try:
spans_df = client.get_spans_dataframe()
print(f"πŸ“Š Found {len(spans_df)} spans in Phoenix")
if len(spans_df) > 0:
print("Recent spans:")
for i, (_, span) in enumerate(spans_df.head(5).iterrows()):
span_id = span.get('context.span_id', 'no-id')
span_name = span.get('name', 'unnamed')
start_time = span.get('start_time', 'unknown')
print(f" {i+1}. {span_name} ({span_id[:8]}...) - {start_time}")
# Show input/output samples
print("\nSpan content samples:")
for i, (_, span) in enumerate(spans_df.head(3).iterrows()):
input_val = str(span.get('input.value', ''))[:100]
output_val = str(span.get('output.value', ''))[:100]
print(f" Span {i+1}:")
print(f" Input: {input_val}...")
print(f" Output: {output_val}...")
else:
print("⚠️ No spans found. Run your agent first to generate traces.")
return spans_df
except Exception as e:
print(f"❌ Error getting spans: {e}")
return pd.DataFrame()
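# For busy projects it can help to narrow the span query. Depending on your
# arize-phoenix version, get_spans_dataframe() accepts an optional filter-condition
# string; a sketch (verify the syntax against your installed docs):
#
#     llm_spans = client.get_spans_dataframe("span_kind == 'LLM'")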
def check_evaluations(client):
"""Check evaluations in Phoenix."""
try:
# Try different methods to get evaluations
print("πŸ” Checking evaluations...")
# Method 1: Direct evaluation dataframe
try:
evals_df = client.get_evaluations_dataframe()
print(f"πŸ“Š Found {len(evals_df)} evaluations in Phoenix")
if len(evals_df) > 0:
print("Evaluation breakdown:")
eval_names = evals_df['name'].value_counts()
for name, count in eval_names.items():
print(f" - {name}: {count} evaluations")
# Check for GAIA evaluations specifically
gaia_evals = evals_df[evals_df['name'] == 'gaia_ground_truth']
if len(gaia_evals) > 0:
print(f"βœ… Found {len(gaia_evals)} GAIA ground truth evaluations")
# Show sample evaluation
sample = gaia_evals.iloc[0]
print("Sample GAIA evaluation:")
print(f" - Score: {sample.get('score', 'N/A')}")
print(f" - Label: {sample.get('label', 'N/A')}")
print(f" - Explanation: {sample.get('explanation', 'N/A')[:100]}...")
# Show metadata if available
metadata = sample.get('metadata', {})
if metadata:
print(f" - Metadata keys: {list(metadata.keys())}")
else:
print("❌ No GAIA ground truth evaluations found")
print("Available evaluation types:", list(eval_names.keys()))
else:
print("⚠️ No evaluations found in Phoenix")
return evals_df
except AttributeError as e:
print(f"⚠️ get_evaluations_dataframe not available: {e}")
print("This might be a Phoenix version issue")
return pd.DataFrame()
except Exception as e:
print(f"❌ Error getting evaluations: {e}")
return pd.DataFrame()
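# Some arize-phoenix versions expose evaluations via client.get_evaluations(), which
# returns evaluation objects rather than a single dataframe. A hedged fallback sketch
# if get_evaluations_dataframe() is unavailable in your version:
#
#     for evals in client.get_evaluations():
#         print(type(evals).__name__, getattr(evals, "eval_name", "?"))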
def test_evaluation_creation_and_logging():
"""Test creating and logging evaluations."""
print("\nπŸ§ͺ Testing evaluation creation and logging...")
# Create sample evaluations
sample_data = [
{
"task_id": "debug-test-1",
"predicted_answer": "test answer 1",
"actual_answer": "correct answer 1",
"exact_match": False,
"similarity_score": 0.75,
"contains_answer": True,
"error": None
},
{
"task_id": "debug-test-2",
"predicted_answer": "exact match",
"actual_answer": "exact match",
"exact_match": True,
"similarity_score": 1.0,
"contains_answer": True,
"error": None
}
]
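    # Assumption: these columns mirror what comparison.AnswerComparator.evaluate_answer()
    # produces per task; adjust the sample schema here if that helper changes.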
evaluations_df = pd.DataFrame(sample_data)
print(f"Created {len(evaluations_df)} test evaluations")
# Try to log to Phoenix
try:
print("Attempting to log evaluations to Phoenix...")
result = log_evaluations_to_phoenix(evaluations_df)
if result is not None:
print("βœ… Test evaluation logging successful")
print(f"Logged {len(result)} evaluations")
return True
else:
print("❌ Test evaluation logging failed - no result returned")
return False
except Exception as e:
print(f"❌ Test evaluation logging error: {e}")
import traceback
traceback.print_exc()
return False
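# For reference, logging evaluations against spans in Phoenix commonly follows the
# pattern sketched below; the actual task_id-to-span mapping is presumably handled
# inside phoenix_evaluator.log_evaluations_to_phoenix (names here are illustrative):
#
#     from phoenix.trace import SpanEvaluations
#     client.log_evaluations(
#         SpanEvaluations(eval_name="gaia_ground_truth", dataframe=eval_df)
#     )
#
# where eval_df is indexed by context.span_id and has score/label/explanation columns.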
def check_gaia_data():
"""Check GAIA ground truth data availability."""
print("\nπŸ“š Checking GAIA ground truth data...")
try:
comparator = AnswerComparator()
print(f"βœ… Loaded {len(comparator.ground_truth)} GAIA ground truth answers")
if len(comparator.ground_truth) > 0:
# Show sample
sample_task_id = list(comparator.ground_truth.keys())[0]
sample_answer = comparator.ground_truth[sample_task_id]
print(f"Sample: {sample_task_id} -> '{sample_answer}'")
# Test evaluation
test_eval = comparator.evaluate_answer(sample_task_id, "test answer")
print(f"Test evaluation result: {test_eval}")
return True
else:
print("❌ No GAIA ground truth data found")
return False
except Exception as e:
print(f"❌ Error checking GAIA data: {e}")
return False
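# AnswerComparator is assumed to read GAIA-style data/metadata.jsonl: one JSON object
# per line with at least a task id and its final answer, roughly like
#
#     {"task_id": "<uuid>", "Question": "...", "Final answer": "..."}
#
# If loading fails above, check that the file exists and follows this layout.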
def show_phoenix_ui_info():
"""Show information about Phoenix UI."""
print("\n🌐 Phoenix UI Information:")
print("-" * 30)
print("Phoenix UI should be available at: http://localhost:6006")
print("")
print("In the Phoenix UI, look for:")
print(" β€’ 'Evaluations' tab or section")
print(" β€’ 'Evals' section")
print(" β€’ 'Annotations' tab")
print(" β€’ In 'Spans' view, look for evaluation badges on spans")
print("")
print("If you see evaluations, they should be named 'gaia_ground_truth'")
print("Each evaluation should show:")
print(" - Score (similarity score 0-1)")
print(" - Label (correct/incorrect)")
print(" - Explanation (predicted vs ground truth)")
print(" - Metadata (task_id, exact_match, etc.)")
def main():
"""Main debug function."""
print("πŸ” Enhanced Phoenix Debug Script")
print("=" * 50)
# Check Phoenix connection
client = check_phoenix_connection()
if not client:
print("\n❌ Cannot proceed without Phoenix connection")
print("Make sure your agent app is running (it starts Phoenix)")
return
print("\nπŸ“‹ Checking Phoenix Data:")
print("-" * 30)
# Check spans
spans_df = check_spans(client)
# Check evaluations
evals_df = check_evaluations(client)
# Test evaluation creation
test_success = test_evaluation_creation_and_logging()
# Wait a moment and recheck evaluations
if test_success:
print("\n⏳ Waiting for evaluations to be processed...")
time.sleep(3)
print("πŸ” Rechecking evaluations after test logging...")
evals_df_after = check_evaluations(client)
if len(evals_df_after) > len(evals_df):
print("βœ… New evaluations detected after test!")
else:
print("⚠️ No new evaluations detected")
# Check GAIA data
gaia_available = check_gaia_data()
# Show Phoenix UI info
show_phoenix_ui_info()
# Final summary
print("\n" + "=" * 50)
print("πŸ“Š Summary:")
print(f" β€’ Phoenix connected: {'βœ…' if client else '❌'}")
print(f" β€’ Spans available: {len(spans_df)} spans")
print(f" β€’ Evaluations found: {len(evals_df)} evaluations")
print(f" β€’ GAIA data available: {'βœ…' if gaia_available else '❌'}")
print(f" β€’ Test logging worked: {'βœ…' if test_success else '❌'}")
print("\nπŸ’‘ Next Steps:")
if len(spans_df) == 0:
print(" β€’ Run your agent to generate traces first")
if len(evals_df) == 0:
print(" β€’ Check if evaluations are being logged correctly")
print(" β€’ Verify Phoenix version compatibility")
if not gaia_available:
print(" β€’ Check that data/metadata.jsonl exists and is readable")
print(f"\n🌐 Phoenix UI: http://localhost:6006")
if __name__ == "__main__":
main()