#!/usr/bin/env python3 """ Enhanced debug script to check Phoenix status and evaluations. """ import phoenix as px import pandas as pd from comparison import AnswerComparator from phoenix_evaluator import log_evaluations_to_phoenix import time from datetime import datetime def check_phoenix_connection(): """Check if Phoenix is running and accessible.""" try: client = px.Client() print("✅ Phoenix client connected successfully") # Try to get basic info try: spans_df = client.get_spans_dataframe() print(f"✅ Phoenix API working - can retrieve spans") return client except Exception as e: print(f"⚠️ Phoenix connected but API might have issues: {e}") return client except Exception as e: print(f"❌ Phoenix connection failed: {e}") print("Make sure Phoenix is running. You should see a message like:") print("🌍 To view the Phoenix app in your browser, visit http://localhost:6006") return None def check_spans(client): """Check spans in Phoenix.""" try: spans_df = client.get_spans_dataframe() print(f"📊 Found {len(spans_df)} spans in Phoenix") if len(spans_df) > 0: print("Recent spans:") for i, (_, span) in enumerate(spans_df.head(5).iterrows()): span_id = span.get('context.span_id', 'no-id') span_name = span.get('name', 'unnamed') start_time = span.get('start_time', 'unknown') print(f" {i+1}. {span_name} ({span_id[:8]}...) - {start_time}") # Show input/output samples print("\nSpan content samples:") for i, (_, span) in enumerate(spans_df.head(3).iterrows()): input_val = str(span.get('input.value', ''))[:100] output_val = str(span.get('output.value', ''))[:100] print(f" Span {i+1}:") print(f" Input: {input_val}...") print(f" Output: {output_val}...") else: print("⚠️ No spans found. Run your agent first to generate traces.") return spans_df except Exception as e: print(f"❌ Error getting spans: {e}") return pd.DataFrame() def check_evaluations(client): """Check evaluations in Phoenix.""" try: # Try different methods to get evaluations print("🔍 Checking evaluations...") # Method 1: Direct evaluation dataframe try: evals_df = client.get_evaluations_dataframe() print(f"📊 Found {len(evals_df)} evaluations in Phoenix") if len(evals_df) > 0: print("Evaluation breakdown:") eval_names = evals_df['name'].value_counts() for name, count in eval_names.items(): print(f" - {name}: {count} evaluations") # Check for GAIA evaluations specifically gaia_evals = evals_df[evals_df['name'] == 'gaia_ground_truth'] if len(gaia_evals) > 0: print(f"✅ Found {len(gaia_evals)} GAIA ground truth evaluations") # Show sample evaluation sample = gaia_evals.iloc[0] print("Sample GAIA evaluation:") print(f" - Score: {sample.get('score', 'N/A')}") print(f" - Label: {sample.get('label', 'N/A')}") print(f" - Explanation: {sample.get('explanation', 'N/A')[:100]}...") # Show metadata if available metadata = sample.get('metadata', {}) if metadata: print(f" - Metadata keys: {list(metadata.keys())}") else: print("❌ No GAIA ground truth evaluations found") print("Available evaluation types:", list(eval_names.keys())) else: print("⚠️ No evaluations found in Phoenix") return evals_df except AttributeError as e: print(f"⚠️ get_evaluations_dataframe not available: {e}") print("This might be a Phoenix version issue") return pd.DataFrame() except Exception as e: print(f"❌ Error getting evaluations: {e}") return pd.DataFrame() def test_evaluation_creation_and_logging(): """Test creating and logging evaluations.""" print("\n🧪 Testing evaluation creation and logging...") # Create sample evaluations sample_data = [ { "task_id": "debug-test-1", 
"predicted_answer": "test answer 1", "actual_answer": "correct answer 1", "exact_match": False, "similarity_score": 0.75, "contains_answer": True, "error": None }, { "task_id": "debug-test-2", "predicted_answer": "exact match", "actual_answer": "exact match", "exact_match": True, "similarity_score": 1.0, "contains_answer": True, "error": None } ] evaluations_df = pd.DataFrame(sample_data) print(f"Created {len(evaluations_df)} test evaluations") # Try to log to Phoenix try: print("Attempting to log evaluations to Phoenix...") result = log_evaluations_to_phoenix(evaluations_df) if result is not None: print("✅ Test evaluation logging successful") print(f"Logged {len(result)} evaluations") return True else: print("❌ Test evaluation logging failed - no result returned") return False except Exception as e: print(f"❌ Test evaluation logging error: {e}") import traceback traceback.print_exc() return False def check_gaia_data(): """Check GAIA ground truth data availability.""" print("\n📚 Checking GAIA ground truth data...") try: comparator = AnswerComparator() print(f"✅ Loaded {len(comparator.ground_truth)} GAIA ground truth answers") if len(comparator.ground_truth) > 0: # Show sample sample_task_id = list(comparator.ground_truth.keys())[0] sample_answer = comparator.ground_truth[sample_task_id] print(f"Sample: {sample_task_id} -> '{sample_answer}'") # Test evaluation test_eval = comparator.evaluate_answer(sample_task_id, "test answer") print(f"Test evaluation result: {test_eval}") return True else: print("❌ No GAIA ground truth data found") return False except Exception as e: print(f"❌ Error checking GAIA data: {e}") return False def show_phoenix_ui_info(): """Show information about Phoenix UI.""" print("\n🌐 Phoenix UI Information:") print("-" * 30) print("Phoenix UI should be available at: http://localhost:6006") print("") print("In the Phoenix UI, look for:") print(" • 'Evaluations' tab or section") print(" • 'Evals' section") print(" • 'Annotations' tab") print(" • In 'Spans' view, look for evaluation badges on spans") print("") print("If you see evaluations, they should be named 'gaia_ground_truth'") print("Each evaluation should show:") print(" - Score (similarity score 0-1)") print(" - Label (correct/incorrect)") print(" - Explanation (predicted vs ground truth)") print(" - Metadata (task_id, exact_match, etc.)") def main(): """Main debug function.""" print("🔍 Enhanced Phoenix Debug Script") print("=" * 50) # Check Phoenix connection client = check_phoenix_connection() if not client: print("\n❌ Cannot proceed without Phoenix connection") print("Make sure your agent app is running (it starts Phoenix)") return print("\n📋 Checking Phoenix Data:") print("-" * 30) # Check spans spans_df = check_spans(client) # Check evaluations evals_df = check_evaluations(client) # Test evaluation creation test_success = test_evaluation_creation_and_logging() # Wait a moment and recheck evaluations if test_success: print("\n⏳ Waiting for evaluations to be processed...") time.sleep(3) print("🔍 Rechecking evaluations after test logging...") evals_df_after = check_evaluations(client) if len(evals_df_after) > len(evals_df): print("✅ New evaluations detected after test!") else: print("⚠️ No new evaluations detected") # Check GAIA data gaia_available = check_gaia_data() # Show Phoenix UI info show_phoenix_ui_info() # Final summary print("\n" + "=" * 50) print("📊 Summary:") print(f" • Phoenix connected: {'✅' if client else '❌'}") print(f" • Spans available: {len(spans_df)} spans") print(f" • Evaluations found: 
{len(evals_df)} evaluations") print(f" • GAIA data available: {'✅' if gaia_available else '❌'}") print(f" • Test logging worked: {'✅' if test_success else '❌'}") print("\n💡 Next Steps:") if len(spans_df) == 0: print(" • Run your agent to generate traces first") if len(evals_df) == 0: print(" • Check if evaluations are being logged correctly") print(" • Verify Phoenix version compatibility") if not gaia_available: print(" • Check that data/metadata.jsonl exists and is readable") print(f"\n🌐 Phoenix UI: http://localhost:6006") if __name__ == "__main__": main()