"""
Test script to verify Phoenix evaluations logging.
"""

import os
import sys
import time
from datetime import datetime

sys.path.append(os.path.dirname(os.path.abspath(__file__)))

import pandas as pd
import phoenix as px

from comparison import AnswerComparator
from phoenix_evaluator import log_evaluations_to_phoenix

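# NOTE: This script assumes a Phoenix server is reachable at
# http://localhost:6006 (see the connection check below) and that
# comparison.py and phoenix_evaluator.py sit next to this file
# (hence the sys.path append above).
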
def test_phoenix_connection():
    """Test Phoenix connection and basic functionality."""
    print("🔍 Testing Phoenix Connection...")

    try:
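        # px.Client() is expected to pick up the local Phoenix instance
        # (http://localhost:6006 by default) unless an endpoint is
        # configured via environment variables.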
        client = px.Client()
        print("✅ Phoenix client connected successfully")

        spans_df = client.get_spans_dataframe()
        print(f"📊 Found {len(spans_df)} existing spans in Phoenix")

        return client, spans_df
    except Exception as e:
        print(f"❌ Phoenix connection failed: {e}")
        print("Make sure Phoenix is running and accessible at http://localhost:6006")
        return None, None


def create_test_evaluations():
    """Create test evaluations for logging."""
    print("\n🧪 Creating test evaluations...")

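    # Each record mirrors the fields this project's AnswerComparator is assumed
    # to produce (task_id, predicted/actual answer, exact_match, similarity_score,
    # contains_answer, error) so the logging path sees realistic input.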
    test_data = [
        {
            "task_id": "test-exact-match",
            "predicted_answer": "Paris",
            "actual_answer": "Paris",
            "exact_match": True,
            "similarity_score": 1.0,
            "contains_answer": True,
            "error": None
        },
        {
            "task_id": "test-partial-match",
            "predicted_answer": "The capital of France is Paris",
            "actual_answer": "Paris",
            "exact_match": False,
            "similarity_score": 0.75,
            "contains_answer": True,
            "error": None
        },
        {
            "task_id": "test-no-match",
            "predicted_answer": "London",
            "actual_answer": "Paris",
            "exact_match": False,
            "similarity_score": 0.2,
            "contains_answer": False,
            "error": None
        }
    ]

    evaluations_df = pd.DataFrame(test_data)
    print(f"Created {len(evaluations_df)} test evaluations")

    return evaluations_df


def create_mock_spans(client):
    """Create mock spans for testing (if no real spans exist)."""
    print("\n📝 Creating mock spans for testing...")

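    # The column names mimic the flattened layout of a Phoenix span dataframe
    # (context.span_id, input.value, output.value); these rows are only
    # stand-ins for real traced spans.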
    mock_spans = [
        {
            "context.span_id": "mock-span-1",
            "name": "test_agent_run",
            "input.value": "Question about test-exact-match",
            "output.value": "Paris",
            "start_time": datetime.now(),
            "end_time": datetime.now()
        },
        {
            "context.span_id": "mock-span-2",
            "name": "test_agent_run",
            "input.value": "Question about test-partial-match",
            "output.value": "The capital of France is Paris",
            "start_time": datetime.now(),
            "end_time": datetime.now()
        },
        {
            "context.span_id": "mock-span-3",
            "name": "test_agent_run",
            "input.value": "Question about test-no-match",
            "output.value": "London",
            "start_time": datetime.now(),
            "end_time": datetime.now()
        }
    ]

    print(f"Created {len(mock_spans)} mock spans")
    return pd.DataFrame(mock_spans)


def test_evaluation_logging():
    """Test the actual evaluation logging to Phoenix."""
    print("\n📋 Testing evaluation logging...")

    evaluations_df = create_test_evaluations()

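    # log_evaluations_to_phoenix (from phoenix_evaluator.py) is assumed to match
    # these records to spans and log them under the "gaia_ground_truth" eval name
    # that verify_logged_evaluations() looks for below.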
    try:
        result = log_evaluations_to_phoenix(evaluations_df)

        if result is not None:
            print("✅ Evaluation logging test successful!")
            print(f"Logged {len(result)} evaluations")
            return True
        else:
            print("❌ Evaluation logging test failed - no result returned")
            return False

    except Exception as e:
        print(f"❌ Evaluation logging test failed with error: {e}")
        import traceback
        traceback.print_exc()
        return False


def verify_logged_evaluations(client):
    """Verify that evaluations were actually logged to Phoenix."""
    print("\n🔍 Verifying logged evaluations...")

    try:
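        # Give Phoenix a moment to ingest the evaluations logged above
        # before querying them back.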
        time.sleep(2)

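        # Assumes the installed arize-phoenix client exposes
        # get_evaluations_dataframe() alongside get_spans_dataframe().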
        evals_df = client.get_evaluations_dataframe()
        print(f"📊 Found {len(evals_df)} total evaluations in Phoenix")

        if len(evals_df) > 0:
            gaia_evals = evals_df[evals_df['name'] == 'gaia_ground_truth']
            print(f"🎯 Found {len(gaia_evals)} GAIA ground truth evaluations")

            if len(gaia_evals) > 0:
                print("✅ Successfully verified evaluations in Phoenix!")

                sample_eval = gaia_evals.iloc[0]
                print("Sample evaluation:")
                print(f" - Score: {sample_eval.get('score', 'N/A')}")
                print(f" - Label: {sample_eval.get('label', 'N/A')}")
                print(f" - Explanation: {sample_eval.get('explanation', 'N/A')}")

                return True
            else:
                print("❌ No GAIA evaluations found after logging")
                return False
        else:
            print("❌ No evaluations found in Phoenix")
            return False

    except Exception as e:
        print(f"❌ Error verifying evaluations: {e}")
        return False


def test_with_real_gaia_data():
    """Test with actual GAIA data if available."""
    print("\n🌍 Testing with real GAIA data...")

    try:
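        # AnswerComparator is assumed to load the GAIA ground truth as a dict
        # keyed by task_id (see the lookup below) and to return an evaluation
        # dict compatible with log_evaluations_to_phoenix.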
        comparator = AnswerComparator()

        if len(comparator.ground_truth) == 0:
            print("⚠️ No GAIA ground truth data available")
            return False

        real_task_id = list(comparator.ground_truth.keys())[0]
        real_ground_truth = comparator.ground_truth[real_task_id]

        real_evaluation = comparator.evaluate_answer(real_task_id, "test answer")

        real_eval_df = pd.DataFrame([real_evaluation])

        result = log_evaluations_to_phoenix(real_eval_df)

        if result is not None:
            print("✅ Real GAIA data logging successful!")
            print(f"Task ID: {real_task_id}")
            print(f"Ground Truth: {real_ground_truth}")
            print(f"Similarity Score: {real_evaluation['similarity_score']:.3f}")
            return True
        else:
            print("❌ Real GAIA data logging failed")
            return False

    except Exception as e:
        print(f"❌ Error testing with real GAIA data: {e}")
        return False


def main():
    """Main test function."""
    print("🚀 Phoenix Evaluations Logging Test")
    print("=" * 50)

    client, spans_df = test_phoenix_connection()
    if not client:
        print("❌ Cannot proceed without Phoenix connection")
        return

    tests_passed = 0
    total_tests = 3

    print(f"\n🧪 Running {total_tests} tests...")

    if test_evaluation_logging():
        tests_passed += 1

    if verify_logged_evaluations(client):
        tests_passed += 1

    if test_with_real_gaia_data():
        tests_passed += 1

    print("\n" + "=" * 50)
    print(f"🎯 Test Results: {tests_passed}/{total_tests} tests passed")

    if tests_passed == total_tests:
        print("🎉 All tests passed! Phoenix evaluations logging is working correctly.")
        print("You should now see 'gaia_ground_truth' evaluations in the Phoenix UI.")
    else:
        print("⚠️ Some tests failed. Check the output above for details.")

    print("\n🌐 Phoenix UI: http://localhost:6006")
    print("Look for 'Evaluations' or 'Evals' tab to see the logged evaluations.")


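# If no Phoenix server is running yet, one way to start it is px.launch_app()
# in a separate Python process; this script then connects to it via px.Client().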
if __name__ == "__main__":
    main()