Final_Assignment_Template

Running

Final_Assignment_Template / test_phoenix_simple.py

Romain Fayoux

Added ground evaluation and phoenix login

f9cf36d 4 months ago

4.3 kB

	#!/usr/bin/env python3
	"""
	Simple test for Phoenix evaluations logging.
	"""

	import sys
	import os
	sys.path.append(os.path.dirname(os.path.abspath(__file__)))

	import phoenix as px
	import pandas as pd
	from comparison import AnswerComparator
	from phoenix_evaluator import log_evaluations_to_phoenix


	def test_phoenix_logging():
	    """Smoke-test logging evaluations to Phoenix, printing progress to stdout.

	    The steps are: connect with ``px.Client()``, build a two-row evaluations
	    DataFrame, confirm that spans already exist, call
	    ``log_evaluations_to_phoenix`` and finally try to read the logged
	    evaluations back.

	    Returns:
	        bool: ``True`` when logging produced a result and the read-back
	        either found ``gaia_ground_truth`` evaluations or could not be
	        performed (verification is best-effort). ``False`` when the
	        connection fails, no spans exist, logging fails or returns nothing,
	        or the read-back finds no ``gaia_ground_truth`` evaluations.
	    """
	    import time
	    import traceback

	    print("🧪 Testing Phoenix Evaluations Logging")
	    print("=" * 50)

	    # Step 1: Check Phoenix connection
	    print("1. Checking Phoenix connection...")
	    try:
	        client = px.Client()
	        print("✅ Phoenix connected successfully")
	    except Exception as e:
	        print(f"❌ Phoenix connection failed: {e}")
	        return False

	    # Step 2: Create test evaluations (one exact match, one mismatch)
	    print("\n2. Creating test evaluations...")
	    test_evaluations = pd.DataFrame([
	        {
	            "task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
	            "predicted_answer": "3",
	            "actual_answer": "3",
	            "exact_match": True,
	            "similarity_score": 1.0,
	            "contains_answer": True,
	            "error": None,
	        },
	        {
	            "task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
	            "predicted_answer": "5",
	            "actual_answer": "3",
	            "exact_match": False,
	            "similarity_score": 0.2,
	            "contains_answer": False,
	            "error": None,
	        },
	    ])
	    print(f"✅ Created {len(test_evaluations)} test evaluations")

	    # Step 3: Check existing spans
	    print("\n3. Checking existing spans...")
	    try:
	        spans_df = client.get_spans_dataframe()
	        # Treat a None result like an empty frame so the guidance below is
	        # shown instead of a TypeError raised by len(None).
	        span_count = 0 if spans_df is None else len(spans_df)
	        print(f"📊 Found {span_count} existing spans")

	        if span_count == 0:
	            print("⚠️ No spans found - you need to run your agent first to create spans")
	            return False
	    except Exception as e:
	        print(f"❌ Error getting spans: {e}")
	        return False

	    # Step 4: Test logging
	    print("\n4. Testing evaluation logging...")
	    try:
	        result = log_evaluations_to_phoenix(test_evaluations)

	        # An empty result has no row to sample, so report it as a failure
	        # rather than tripping an IndexError on iloc[0] below.
	        if result is None or len(result) == 0:
	            print("❌ Evaluation logging failed")
	            return False

	        print(f"✅ Successfully logged {len(result)} evaluations to Phoenix")
	        sample = result.iloc[0]
	        print("Sample evaluation:")
	        print(f" - Score: {sample['score']}")
	        print(f" - Label: {sample['label']}")
	        # str() keeps the preview working when the explanation is not a string
	        print(f" - Explanation: {str(sample['explanation'])[:100]}...")
	    except Exception as e:
	        print(f"❌ Error during logging: {e}")
	        traceback.print_exc()
	        return False

	    # Step 5: Verify evaluations were logged (best-effort)
	    print("\n5. Verifying evaluations in Phoenix...")
	    try:
	        time.sleep(2)  # Give Phoenix time to process

	        # NOTE(review): confirm that px.Client exposes
	        # get_evaluations_dataframe in the installed Phoenix version; any
	        # error raised here is swallowed below and reported as success.
	        evals_df = client.get_evaluations_dataframe()
	        gaia_evals = evals_df[evals_df['name'] == 'gaia_ground_truth']

	        print(f"📊 Found {len(gaia_evals)} GAIA evaluations in Phoenix")

	        if len(gaia_evals) > 0:
	            print("✅ Evaluations successfully verified in Phoenix!")
	            return True

	        print("⚠️ No GAIA evaluations found in Phoenix")
	        return False
	    except Exception as e:
	        # Logging itself succeeded, so a failed read-back is not fatal.
	        print(f"⚠️ Could not verify evaluations: {e}")
	        print("✅ Logging appeared successful though")
	        return True


	def main():
	    """Run the Phoenix logging test and print a pass/fail summary.

	    Returns:
	        bool: the value returned by ``test_phoenix_logging()``, so the
	        caller can turn it into a process exit status.
	    """
	    success = test_phoenix_logging()

	    print("\n" + "=" * 50)
	    if success:
	        print("🎉 Phoenix evaluations logging test PASSED!")
	        print("You should now see 'gaia_ground_truth' evaluations in Phoenix UI")
	        print("🌐 Visit: http://localhost:6006")
	    else:
	        print("❌ Phoenix evaluations logging test FAILED!")
	        print("Make sure:")
	        print(" 1. Your agent app is running (it starts Phoenix)")
	        print(" 2. You've run your agent at least once to create spans")
	        print(" 3. Phoenix is accessible at http://localhost:6006")

	    return success


	if __name__ == "__main__":
	    # Exit non-zero on failure so shells and CI can detect a failed run.
	    sys.exit(0 if main() else 1)