#!/usr/bin/env python3
"""
Enhanced debug script to check Phoenix status and evaluations.
"""

import time

import pandas as pd
import phoenix as px

from comparison import AnswerComparator
from phoenix_evaluator import log_evaluations_to_phoenix
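# comparison and phoenix_evaluator are project-local modules: AnswerComparator
# loads the GAIA ground-truth answers, and log_evaluations_to_phoenix pushes
# evaluation rows into Phoenix (exercised in test_evaluation_creation_and_logging).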


def check_phoenix_connection():
    """Check if Phoenix is running and accessible."""
    try:
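        # px.Client() attaches to the already-running Phoenix instance
        # (by default the local server at http://localhost:6006).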
        client = px.Client()
        print("βœ… Phoenix client connected successfully")

        # Try to get basic info
        try:
            client.get_spans_dataframe()  # smoke-test that the spans API responds
            print("βœ… Phoenix API working - can retrieve spans")
            return client
        except Exception as e:
            print(f"⚠️ Phoenix connected but API might have issues: {e}")
            return client

    except Exception as e:
        print(f"❌ Phoenix connection failed: {e}")
        print("Make sure Phoenix is running. You should see a message like:")
        print("🌍 To view the Phoenix app in your browser, visit http://localhost:6006")
        return None


def check_spans(client):
    """Check spans in Phoenix."""
    try:
        spans_df = client.get_spans_dataframe()
        print(f"πŸ“Š Found {len(spans_df)} spans in Phoenix")

        if len(spans_df) > 0:
            print("Recent spans:")
            for i, (_, span) in enumerate(spans_df.head(5).iterrows()):
                span_id = span.get('context.span_id', 'no-id')
                span_name = span.get('name', 'unnamed')
                start_time = span.get('start_time', 'unknown')
                print(f"  {i+1}. {span_name} ({span_id[:8]}...) - {start_time}")

            # Show input/output samples
            print("\nSpan content samples:")
            for i, (_, span) in enumerate(spans_df.head(3).iterrows()):
                input_val = str(span.get('input.value', ''))[:100]
                output_val = str(span.get('output.value', ''))[:100]
                print(f"  Span {i+1}:")
                print(f"    Input: {input_val}...")
                print(f"    Output: {output_val}...")

        else:
            print("⚠️ No spans found. Run your agent first to generate traces.")

        return spans_df

    except Exception as e:
        print(f"❌ Error getting spans: {e}")
        return pd.DataFrame()


def check_evaluations(client):
    """Check evaluations in Phoenix."""
    try:
        print("πŸ” Checking evaluations...")

        # Pull evaluations as a dataframe; this client method is not available in
        # every Phoenix version (handled by the AttributeError branch below).
        try:
            evals_df = client.get_evaluations_dataframe()
            print(f"πŸ“Š Found {len(evals_df)} evaluations in Phoenix")

            if len(evals_df) > 0:
                print("Evaluation breakdown:")
                eval_names = evals_df['name'].value_counts()
                for name, count in eval_names.items():
                    print(f"  - {name}: {count} evaluations")

                # Check for GAIA evaluations specifically
                gaia_evals = evals_df[evals_df['name'] == 'gaia_ground_truth']
                if len(gaia_evals) > 0:
                    print(f"βœ… Found {len(gaia_evals)} GAIA ground truth evaluations")

                    # Show sample evaluation
                    sample = gaia_evals.iloc[0]
                    print("Sample GAIA evaluation:")
                    print(f"  - Score: {sample.get('score', 'N/A')}")
                    print(f"  - Label: {sample.get('label', 'N/A')}")
                    print(f"  - Explanation: {sample.get('explanation', 'N/A')[:100]}...")

                    # Show metadata if available
                    metadata = sample.get('metadata', {})
                    if metadata:
                        print(f"  - Metadata keys: {list(metadata.keys())}")

                else:
                    print("❌ No GAIA ground truth evaluations found")
                    print("Available evaluation types:", list(eval_names.keys()))

            else:
                print("⚠️ No evaluations found in Phoenix")

            return evals_df

        except AttributeError as e:
            print(f"⚠️ get_evaluations_dataframe not available: {e}")
            print("This might be a Phoenix version issue")
            return pd.DataFrame()

    except Exception as e:
        print(f"❌ Error getting evaluations: {e}")
        return pd.DataFrame()


def test_evaluation_creation_and_logging():
    """Test creating and logging evaluations."""
    print("\nπŸ§ͺ Testing evaluation creation and logging...")

    # Create sample evaluations
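    # Each row carries task_id, the predicted/actual answers, and the
    # comparison metrics that log_evaluations_to_phoenix consumes.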
    sample_data = [
        {
            "task_id": "debug-test-1",
            "predicted_answer": "test answer 1",
            "actual_answer": "correct answer 1",
            "exact_match": False,
            "similarity_score": 0.75,
            "contains_answer": True,
            "error": None
        },
        {
            "task_id": "debug-test-2",
            "predicted_answer": "exact match",
            "actual_answer": "exact match",
            "exact_match": True,
            "similarity_score": 1.0,
            "contains_answer": True,
            "error": None
        }
    ]

    evaluations_df = pd.DataFrame(sample_data)
    print(f"Created {len(evaluations_df)} test evaluations")

    # Try to log to Phoenix
    try:
        print("Attempting to log evaluations to Phoenix...")
        result = log_evaluations_to_phoenix(evaluations_df)

        if result is not None:
            print("βœ… Test evaluation logging successful")
            print(f"Logged {len(result)} evaluations")
            return True
        else:
            print("❌ Test evaluation logging failed - no result returned")
            return False

    except Exception as e:
        print(f"❌ Test evaluation logging error: {e}")
        import traceback
        traceback.print_exc()
        return False


def check_gaia_data():
    """Check GAIA ground truth data availability."""
    print("\nπŸ“š Checking GAIA ground truth data...")

    try:
        comparator = AnswerComparator()

        print(f"βœ… Loaded {len(comparator.ground_truth)} GAIA ground truth answers")

        if len(comparator.ground_truth) > 0:
            # Show sample
            sample_task_id = list(comparator.ground_truth.keys())[0]
            sample_answer = comparator.ground_truth[sample_task_id]
            print(f"Sample: {sample_task_id} -> '{sample_answer}'")

            # Test evaluation
            test_eval = comparator.evaluate_answer(sample_task_id, "test answer")
            print(f"Test evaluation result: {test_eval}")

            return True
        else:
            print("❌ No GAIA ground truth data found")
            return False

    except Exception as e:
        print(f"❌ Error checking GAIA data: {e}")
        return False


def show_phoenix_ui_info():
    """Show information about Phoenix UI."""
    print("\n🌐 Phoenix UI Information:")
    print("-" * 30)
    print("Phoenix UI should be available at: http://localhost:6006")
    print("")
    print("In the Phoenix UI, look for:")
    print("  β€’ 'Evaluations' tab or section")
    print("  β€’ 'Evals' section")
    print("  β€’ 'Annotations' tab")
    print("  β€’ In 'Spans' view, look for evaluation badges on spans")
    print("")
    print("If you see evaluations, they should be named 'gaia_ground_truth'")
    print("Each evaluation should show:")
    print("  - Score (similarity score 0-1)")
    print("  - Label (correct/incorrect)")
    print("  - Explanation (predicted vs ground truth)")
    print("  - Metadata (task_id, exact_match, etc.)")


def main():
    """Main debug function."""
    print("πŸ” Enhanced Phoenix Debug Script")
    print("=" * 50)

    # Check Phoenix connection
    client = check_phoenix_connection()
    if not client:
        print("\n❌ Cannot proceed without Phoenix connection")
        print("Make sure your agent app is running (it starts Phoenix)")
        return

    print("\nπŸ“‹ Checking Phoenix Data:")
    print("-" * 30)

    # Check spans
    spans_df = check_spans(client)

    # Check evaluations
    evals_df = check_evaluations(client)

    # Test evaluation creation
    test_success = test_evaluation_creation_and_logging()

    # Wait a moment and recheck evaluations
    if test_success:
        print("\n⏳ Waiting for evaluations to be processed...")
        time.sleep(3)

        print("πŸ” Rechecking evaluations after test logging...")
        evals_df_after = check_evaluations(client)

        if len(evals_df_after) > len(evals_df):
            print("βœ… New evaluations detected after test!")
        else:
            print("⚠️ No new evaluations detected")

    # Check GAIA data
    gaia_available = check_gaia_data()

    # Show Phoenix UI info
    show_phoenix_ui_info()

    # Final summary
    print("\n" + "=" * 50)
    print("πŸ“Š Summary:")
    print(f"  β€’ Phoenix connected: {'βœ…' if client else '❌'}")
    print(f"  β€’ Spans available: {len(spans_df)} spans")
    print(f"  β€’ Evaluations found: {len(evals_df)} evaluations")
    print(f"  β€’ GAIA data available: {'βœ…' if gaia_available else '❌'}")
    print(f"  β€’ Test logging worked: {'βœ…' if test_success else '❌'}")

    print("\nπŸ’‘ Next Steps:")
    if len(spans_df) == 0:
        print("  β€’ Run your agent to generate traces first")
    if len(evals_df) == 0:
        print("  β€’ Check if evaluations are being logged correctly")
        print("  β€’ Verify Phoenix version compatibility")
    if not gaia_available:
        print("  β€’ Check that data/metadata.jsonl exists and is readable")

    print(f"\n🌐 Phoenix UI: http://localhost:6006")


if __name__ == "__main__":
    main()