|
|
|
|
|
""" |
|
|
Enhanced debug script to check Phoenix status and evaluations. |
|
|
""" |
|
|
|
|
|
import phoenix as px |
|
|
import pandas as pd |
|
|
from comparison import AnswerComparator |
|
|
from phoenix_evaluator import log_evaluations_to_phoenix |
|
|
import time |
|
|
from datetime import datetime |
|
|
|
|
|
|
|
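# NOTE: `comparison` and `phoenix_evaluator` are local project modules (not
# published packages); this script assumes they sit alongside it on the import
# path, and that a Phoenix server is reachable at the default
# http://localhost:6006 endpoint referenced throughout.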
|
def check_phoenix_connection():
    """Check if Phoenix is running and accessible."""
    try:
        client = px.Client()
        print("✅ Phoenix client connected successfully")

        try:
            client.get_spans_dataframe()
            print("✅ Phoenix API working - can retrieve spans")
            return client
        except Exception as e:
            print(f"⚠️ Phoenix connected but API might have issues: {e}")
            return client

    except Exception as e:
        print(f"❌ Phoenix connection failed: {e}")
        print("Make sure Phoenix is running. You should see a message like:")
        print("🌍 To view the Phoenix app in your browser, visit http://localhost:6006")
        return None
|
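# A minimal usage sketch (hypothetical caller, reusing only the calls above):
# any other debug script can follow the same connect-then-probe pattern before
# querying data.
#
#   client = check_phoenix_connection()
#   if client is not None:
#       spans = client.get_spans_dataframe()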
|
def check_spans(client):
    """Check spans in Phoenix."""
    try:
        spans_df = client.get_spans_dataframe()
        print(f"📊 Found {len(spans_df)} spans in Phoenix")

        if len(spans_df) > 0:
            print("Recent spans:")
            for i, (_, span) in enumerate(spans_df.head(5).iterrows()):
                span_id = span.get('context.span_id', 'no-id')
                span_name = span.get('name', 'unnamed')
                start_time = span.get('start_time', 'unknown')
                print(f"  {i+1}. {span_name} ({span_id[:8]}...) - {start_time}")

            print("\nSpan content samples:")
            for i, (_, span) in enumerate(spans_df.head(3).iterrows()):
                input_val = str(span.get('input.value', ''))[:100]
                output_val = str(span.get('output.value', ''))[:100]
                print(f"  Span {i+1}:")
                print(f"    Input: {input_val}...")
                print(f"    Output: {output_val}...")
        else:
            print("⚠️ No spans found. Run your agent first to generate traces.")

        return spans_df

    except Exception as e:
        print(f"❌ Error getting spans: {e}")
        return pd.DataFrame()
|
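# The flat column names used above ('context.span_id', 'input.value',
# 'output.value') match what this project's get_spans_dataframe() export
# returns; other Phoenix versions may prefix span attributes differently,
# so treat these keys as an assumption rather than a stable contract.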
|
def check_evaluations(client):
    """Check evaluations in Phoenix."""
    try:
        print("🔍 Checking evaluations...")

        try:
            evals_df = client.get_evaluations_dataframe()
            print(f"📊 Found {len(evals_df)} evaluations in Phoenix")

            if len(evals_df) > 0:
                print("Evaluation breakdown:")
                eval_names = evals_df['name'].value_counts()
                for name, count in eval_names.items():
                    print(f"  - {name}: {count} evaluations")

                gaia_evals = evals_df[evals_df['name'] == 'gaia_ground_truth']
                if len(gaia_evals) > 0:
                    print(f"✅ Found {len(gaia_evals)} GAIA ground truth evaluations")

                    sample = gaia_evals.iloc[0]
                    print("Sample GAIA evaluation:")
                    print(f"  - Score: {sample.get('score', 'N/A')}")
                    print(f"  - Label: {sample.get('label', 'N/A')}")
                    print(f"  - Explanation: {str(sample.get('explanation', 'N/A'))[:100]}...")

                    metadata = sample.get('metadata', {})
                    if metadata:
                        print(f"  - Metadata keys: {list(metadata.keys())}")
                else:
                    print("❌ No GAIA ground truth evaluations found")
                    print("Available evaluation types:", list(eval_names.keys()))
            else:
                print("⚠️ No evaluations found in Phoenix")

            return evals_df

        except AttributeError as e:
            print(f"⚠️ get_evaluations_dataframe not available: {e}")
            print("This might be a Phoenix version issue")
            return pd.DataFrame()

    except Exception as e:
        print(f"❌ Error getting evaluations: {e}")
        return pd.DataFrame()
|
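# The AttributeError fallback above exists because not every Phoenix release
# exposes get_evaluations_dataframe(); on mismatched client versions the call
# may simply be absent, which the script reports as a version issue rather
# than a hard failure.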
|
def test_evaluation_creation_and_logging():
    """Test creating and logging evaluations."""
    print("\n🧪 Testing evaluation creation and logging...")

    sample_data = [
        {
            "task_id": "debug-test-1",
            "predicted_answer": "test answer 1",
            "actual_answer": "correct answer 1",
            "exact_match": False,
            "similarity_score": 0.75,
            "contains_answer": True,
            "error": None
        },
        {
            "task_id": "debug-test-2",
            "predicted_answer": "exact match",
            "actual_answer": "exact match",
            "exact_match": True,
            "similarity_score": 1.0,
            "contains_answer": True,
            "error": None
        }
    ]

    evaluations_df = pd.DataFrame(sample_data)
    print(f"Created {len(evaluations_df)} test evaluations")

    try:
        print("Attempting to log evaluations to Phoenix...")
        result = log_evaluations_to_phoenix(evaluations_df)

        if result is not None:
            print("✅ Test evaluation logging successful")
            print(f"Logged {len(result)} evaluations")
            return True
        else:
            print("❌ Test evaluation logging failed - no result returned")
            return False

    except Exception as e:
        print(f"❌ Test evaluation logging error: {e}")
        import traceback
        traceback.print_exc()
        return False
|
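# Record schema inferred from the sample data above: each row passed to
# log_evaluations_to_phoenix() carries task_id, predicted_answer,
# actual_answer, exact_match, similarity_score, contains_answer, and error.
# log_evaluations_to_phoenix() is a project-local helper assumed to return
# the dataframe it logged (hence the `result is not None` check above).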
|
def check_gaia_data():
    """Check GAIA ground truth data availability."""
    print("\n📋 Checking GAIA ground truth data...")

    try:
        comparator = AnswerComparator()
        print(f"✅ Loaded {len(comparator.ground_truth)} GAIA ground truth answers")

        if len(comparator.ground_truth) > 0:
            sample_task_id = list(comparator.ground_truth.keys())[0]
            sample_answer = comparator.ground_truth[sample_task_id]
            print(f"Sample: {sample_task_id} -> '{sample_answer}'")

            test_eval = comparator.evaluate_answer(sample_task_id, "test answer")
            print(f"Test evaluation result: {test_eval}")

            return True
        else:
            print("❌ No GAIA ground truth data found")
            return False

    except Exception as e:
        print(f"❌ Error checking GAIA data: {e}")
        return False
|
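# AnswerComparator is a project-local class; judging by the next-steps hint in
# main(), it presumably loads its ground-truth map from data/metadata.jsonl.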
|
def show_phoenix_ui_info():
    """Show information about Phoenix UI."""
    print("\n🌐 Phoenix UI Information:")
    print("-" * 30)
    print("Phoenix UI should be available at: http://localhost:6006")
    print("")
    print("In the Phoenix UI, look for:")
    print("  • 'Evaluations' tab or section")
    print("  • 'Evals' section")
    print("  • 'Annotations' tab")
    print("  • In 'Spans' view, look for evaluation badges on spans")
    print("")
    print("If you see evaluations, they should be named 'gaia_ground_truth'")
    print("Each evaluation should show:")
    print("  - Score (similarity score 0-1)")
    print("  - Label (correct/incorrect)")
    print("  - Explanation (predicted vs ground truth)")
    print("  - Metadata (task_id, exact_match, etc.)")
|
|
def main():
    """Main debug function."""
    print("🚀 Enhanced Phoenix Debug Script")
    print("=" * 50)

    client = check_phoenix_connection()
    if not client:
        print("\n❌ Cannot proceed without Phoenix connection")
        print("Make sure your agent app is running (it starts Phoenix)")
        return

    print("\n📊 Checking Phoenix Data:")
    print("-" * 30)

    spans_df = check_spans(client)

    evals_df = check_evaluations(client)

    test_success = test_evaluation_creation_and_logging()

    if test_success:
        print("\n⏳ Waiting for evaluations to be processed...")
        time.sleep(3)

        print("🔄 Rechecking evaluations after test logging...")
        evals_df_after = check_evaluations(client)

        if len(evals_df_after) > len(evals_df):
            print("✅ New evaluations detected after test!")
        else:
            print("⚠️ No new evaluations detected")

    gaia_available = check_gaia_data()

    show_phoenix_ui_info()

    print("\n" + "=" * 50)
    print("📋 Summary:")
    print(f"  • Phoenix connected: {'✅' if client else '❌'}")
    print(f"  • Spans available: {len(spans_df)} spans")
    print(f"  • Evaluations found: {len(evals_df)} evaluations")
    print(f"  • GAIA data available: {'✅' if gaia_available else '❌'}")
    print(f"  • Test logging worked: {'✅' if test_success else '❌'}")

    print("\n💡 Next Steps:")
    if len(spans_df) == 0:
        print("  • Run your agent to generate traces first")
    if len(evals_df) == 0:
        print("  • Check if evaluations are being logged correctly")
        print("  • Verify Phoenix version compatibility")
    if not gaia_available:
        print("  • Check that data/metadata.jsonl exists and is readable")

    print("\n🌐 Phoenix UI: http://localhost:6006")
|
|
if __name__ == "__main__": |
|
|
main() |
|
|
|