#!/usr/bin/env python3
"""
Enhanced debug script to check Phoenix status and evaluations.
"""
import phoenix as px
import pandas as pd
from comparison import AnswerComparator
from phoenix_evaluator import log_evaluations_to_phoenix
import time
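# NOTE: This script assumes a Phoenix instance on its default local endpoint
# (http://localhost:6006) and that comparison.py and phoenix_evaluator.py from this
# project are importable from the working directory.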
def check_phoenix_connection():
"""Check if Phoenix is running and accessible."""
try:
client = px.Client()
print("βœ… Phoenix client connected successfully")
# Try to get basic info
try:
            spans_df = client.get_spans_dataframe()
            print(f"✅ Phoenix API working - retrieved {len(spans_df)} spans")
return client
except Exception as e:
print(f"⚠️ Phoenix connected but API might have issues: {e}")
return client
except Exception as e:
print(f"❌ Phoenix connection failed: {e}")
print("Make sure Phoenix is running. You should see a message like:")
print("🌍 To view the Phoenix app in your browser, visit http://localhost:6006")
return None
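# If the default client cannot reach your Phoenix instance, many arize-phoenix
# versions also accept an explicit endpoint (and an API key via the PHOENIX_API_KEY
# environment variable for authenticated deployments). A minimal sketch, assuming a
# local server; exact parameters depend on your installed version:
#
#     client = px.Client(endpoint="http://localhost:6006")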
def check_spans(client):
"""Check spans in Phoenix."""
try:
spans_df = client.get_spans_dataframe()
print(f"πŸ“Š Found {len(spans_df)} spans in Phoenix")
if len(spans_df) > 0:
print("Recent spans:")
for i, (_, span) in enumerate(spans_df.head(5).iterrows()):
span_id = span.get('context.span_id', 'no-id')
span_name = span.get('name', 'unnamed')
start_time = span.get('start_time', 'unknown')
print(f" {i+1}. {span_name} ({span_id[:8]}...) - {start_time}")
# Show input/output samples
print("\nSpan content samples:")
for i, (_, span) in enumerate(spans_df.head(3).iterrows()):
input_val = str(span.get('input.value', ''))[:100]
output_val = str(span.get('output.value', ''))[:100]
print(f" Span {i+1}:")
print(f" Input: {input_val}...")
print(f" Output: {output_val}...")
else:
print("⚠️ No spans found. Run your agent first to generate traces.")
return spans_df
except Exception as e:
print(f"❌ Error getting spans: {e}")
return pd.DataFrame()
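# For busy projects it can help to narrow the span query. Depending on your
# arize-phoenix version, get_spans_dataframe() accepts an optional filter-condition
# string; a sketch (verify the syntax against your installed docs):
#
#     llm_spans = client.get_spans_dataframe("span_kind == 'LLM'")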
def check_evaluations(client):
"""Check evaluations in Phoenix."""
try:
# Try different methods to get evaluations
print("πŸ” Checking evaluations...")
# Method 1: Direct evaluation dataframe
try:
evals_df = client.get_evaluations_dataframe()
print(f"πŸ“Š Found {len(evals_df)} evaluations in Phoenix")
if len(evals_df) > 0:
print("Evaluation breakdown:")
eval_names = evals_df['name'].value_counts()
for name, count in eval_names.items():
print(f" - {name}: {count} evaluations")
# Check for GAIA evaluations specifically
gaia_evals = evals_df[evals_df['name'] == 'gaia_ground_truth']
if len(gaia_evals) > 0:
print(f"βœ… Found {len(gaia_evals)} GAIA ground truth evaluations")
# Show sample evaluation
sample = gaia_evals.iloc[0]
print("Sample GAIA evaluation:")
print(f" - Score: {sample.get('score', 'N/A')}")
print(f" - Label: {sample.get('label', 'N/A')}")
print(f" - Explanation: {sample.get('explanation', 'N/A')[:100]}...")
# Show metadata if available
metadata = sample.get('metadata', {})
if metadata:
print(f" - Metadata keys: {list(metadata.keys())}")
else:
print("❌ No GAIA ground truth evaluations found")
print("Available evaluation types:", list(eval_names.keys()))
else:
print("⚠️ No evaluations found in Phoenix")
return evals_df
except AttributeError as e:
print(f"⚠️ get_evaluations_dataframe not available: {e}")
print("This might be a Phoenix version issue")
return pd.DataFrame()
except Exception as e:
print(f"❌ Error getting evaluations: {e}")
return pd.DataFrame()
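# Some arize-phoenix versions expose evaluations via client.get_evaluations(), which
# returns evaluation objects rather than a single dataframe. A hedged fallback sketch
# if get_evaluations_dataframe() is unavailable in your version:
#
#     for evals in client.get_evaluations():
#         print(type(evals).__name__, getattr(evals, "eval_name", "?"))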
def test_evaluation_creation_and_logging():
"""Test creating and logging evaluations."""
print("\nπŸ§ͺ Testing evaluation creation and logging...")
# Create sample evaluations
sample_data = [
{
"task_id": "debug-test-1",
"predicted_answer": "test answer 1",
"actual_answer": "correct answer 1",
"exact_match": False,
"similarity_score": 0.75,
"contains_answer": True,
"error": None
},
{
"task_id": "debug-test-2",
"predicted_answer": "exact match",
"actual_answer": "exact match",
"exact_match": True,
"similarity_score": 1.0,
"contains_answer": True,
"error": None
}
]
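    # Assumption: these columns mirror what comparison.AnswerComparator.evaluate_answer()
    # produces per task; adjust the sample schema here if that helper changes.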
evaluations_df = pd.DataFrame(sample_data)
print(f"Created {len(evaluations_df)} test evaluations")
# Try to log to Phoenix
try:
print("Attempting to log evaluations to Phoenix...")
result = log_evaluations_to_phoenix(evaluations_df)
if result is not None:
print("βœ… Test evaluation logging successful")
print(f"Logged {len(result)} evaluations")
return True
else:
print("❌ Test evaluation logging failed - no result returned")
return False
except Exception as e:
print(f"❌ Test evaluation logging error: {e}")
import traceback
traceback.print_exc()
return False
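# For reference, logging evaluations against spans in Phoenix commonly follows the
# pattern sketched below; the actual task_id-to-span mapping is presumably handled
# inside phoenix_evaluator.log_evaluations_to_phoenix (names here are illustrative):
#
#     from phoenix.trace import SpanEvaluations
#     client.log_evaluations(
#         SpanEvaluations(eval_name="gaia_ground_truth", dataframe=eval_df)
#     )
#
# where eval_df is indexed by context.span_id and has score/label/explanation columns.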
def check_gaia_data():
"""Check GAIA ground truth data availability."""
print("\nπŸ“š Checking GAIA ground truth data...")
try:
comparator = AnswerComparator()
print(f"βœ… Loaded {len(comparator.ground_truth)} GAIA ground truth answers")
if len(comparator.ground_truth) > 0:
# Show sample
sample_task_id = list(comparator.ground_truth.keys())[0]
sample_answer = comparator.ground_truth[sample_task_id]
print(f"Sample: {sample_task_id} -> '{sample_answer}'")
# Test evaluation
test_eval = comparator.evaluate_answer(sample_task_id, "test answer")
print(f"Test evaluation result: {test_eval}")
return True
else:
print("❌ No GAIA ground truth data found")
return False
except Exception as e:
print(f"❌ Error checking GAIA data: {e}")
return False
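# AnswerComparator is assumed to read GAIA-style data/metadata.jsonl: one JSON object
# per line with at least a task id and its final answer, roughly like
#
#     {"task_id": "<uuid>", "Question": "...", "Final answer": "..."}
#
# If loading fails above, check that the file exists and follows this layout.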
def show_phoenix_ui_info():
"""Show information about Phoenix UI."""
print("\n🌐 Phoenix UI Information:")
print("-" * 30)
print("Phoenix UI should be available at: http://localhost:6006")
print("")
print("In the Phoenix UI, look for:")
print(" β€’ 'Evaluations' tab or section")
print(" β€’ 'Evals' section")
print(" β€’ 'Annotations' tab")
print(" β€’ In 'Spans' view, look for evaluation badges on spans")
print("")
print("If you see evaluations, they should be named 'gaia_ground_truth'")
print("Each evaluation should show:")
print(" - Score (similarity score 0-1)")
print(" - Label (correct/incorrect)")
print(" - Explanation (predicted vs ground truth)")
print(" - Metadata (task_id, exact_match, etc.)")
def main():
"""Main debug function."""
print("πŸ” Enhanced Phoenix Debug Script")
print("=" * 50)
# Check Phoenix connection
client = check_phoenix_connection()
if not client:
print("\n❌ Cannot proceed without Phoenix connection")
print("Make sure your agent app is running (it starts Phoenix)")
return
print("\nπŸ“‹ Checking Phoenix Data:")
print("-" * 30)
# Check spans
spans_df = check_spans(client)
# Check evaluations
evals_df = check_evaluations(client)
# Test evaluation creation
test_success = test_evaluation_creation_and_logging()
# Wait a moment and recheck evaluations
if test_success:
print("\n⏳ Waiting for evaluations to be processed...")
time.sleep(3)
print("πŸ” Rechecking evaluations after test logging...")
evals_df_after = check_evaluations(client)
if len(evals_df_after) > len(evals_df):
print("βœ… New evaluations detected after test!")
else:
print("⚠️ No new evaluations detected")
# Check GAIA data
gaia_available = check_gaia_data()
# Show Phoenix UI info
show_phoenix_ui_info()
# Final summary
print("\n" + "=" * 50)
print("πŸ“Š Summary:")
print(f" β€’ Phoenix connected: {'βœ…' if client else '❌'}")
print(f" β€’ Spans available: {len(spans_df)} spans")
print(f" β€’ Evaluations found: {len(evals_df)} evaluations")
print(f" β€’ GAIA data available: {'βœ…' if gaia_available else '❌'}")
print(f" β€’ Test logging worked: {'βœ…' if test_success else '❌'}")
print("\nπŸ’‘ Next Steps:")
if len(spans_df) == 0:
print(" β€’ Run your agent to generate traces first")
if len(evals_df) == 0:
print(" β€’ Check if evaluations are being logged correctly")
print(" β€’ Verify Phoenix version compatibility")
if not gaia_available:
print(" β€’ Check that data/metadata.jsonl exists and is readable")
print(f"\n🌐 Phoenix UI: http://localhost:6006")
if __name__ == "__main__":
main()