Romain Fayoux committed on
Commit
16c91c0
Β·
1 Parent(s): f9cf36d

Trying to debug phoenix evals

Browse files
Files changed (3) hide show
  1. debug_spans.py +77 -0
  2. phoenix_evaluator.py +95 -36
  3. test_phoenix_simple.py +11 -4
debug_spans.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Debug script to see Phoenix spans column structure.
4
+ """
5
+
6
+ import sys
7
+ import os
8
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
9
+
10
+ import phoenix as px
11
+ import pandas as pd
12
+
13
+
14
def debug_spans_structure(task_id_sample="8e867cd7-cff9-4e6c-867a-ff5ddc2550be"):
    """Inspect and print the structure of the Phoenix spans DataFrame.

    Connects to a running Phoenix instance, lists every span column,
    dumps a truncated sample of the first span, highlights input/output/
    ID-related columns, and searches all string-like columns for
    ``task_id_sample`` to discover where task IDs are stored.

    Args:
        task_id_sample: Task ID to search for across span columns.
            Defaults to a known sample GAIA task ID (previously
            hard-coded; parameterized for reuse with other IDs).

    Returns:
        None. All findings are reported via ``print``.
    """
    print("πŸ” Debugging Phoenix Spans Structure")
    print("=" * 50)

    try:
        client = px.Client()
        print("βœ… Phoenix connected successfully")
    except Exception as e:
        # No Phoenix server reachable: nothing else can work, bail out.
        print(f"❌ Phoenix connection failed: {e}")
        return

    try:
        spans_df = client.get_spans_dataframe()
        print(f"πŸ“Š Found {len(spans_df)} spans in Phoenix")

        if len(spans_df) == 0:
            print("⚠️ No spans found. Run your agent first to create spans.")
            return

        print(f"\nπŸ“‹ Available Columns ({len(spans_df.columns)} total):")
        for i, col in enumerate(spans_df.columns):
            print(f" {i+1:2d}. {col}")

        print("\nπŸ” Sample Data (first span):")
        sample_span = spans_df.iloc[0]
        for col in spans_df.columns:
            value = sample_span.get(col)
            if value is not None:
                # Truncate long values so the dump stays readable.
                value_str = str(value)[:100] + "..." if len(str(value)) > 100 else str(value)
                print(f" {col}: {value_str}")

        # Heuristic column discovery by substring of the column name.
        input_cols = [col for col in spans_df.columns if 'input' in col.lower()]
        output_cols = [col for col in spans_df.columns if 'output' in col.lower()]

        print(f"\n🎯 Input-related columns: {input_cols}")
        print(f"🎯 Output-related columns: {output_cols}")

        id_cols = [col for col in spans_df.columns if 'id' in col.lower()]
        print(f"🎯 ID-related columns: {id_cols}")

        print("\nπŸ” Searching for task IDs in spans...")

        for col in spans_df.columns:
            if spans_df[col].dtype == 'object':  # string-like columns only
                try:
                    matches = spans_df[spans_df[col].astype(str).str.contains(task_id_sample, na=False, case=False)]
                    if len(matches) > 0:
                        print(f" βœ… Found task ID in column '{col}': {len(matches)} matches")
                except Exception:
                    # Best-effort scan: columns whose values cannot be
                    # string-searched are skipped, not fatal.
                    continue

    except Exception as e:
        print(f"❌ Error debugging spans: {e}")
        import traceback
        traceback.print_exc()
74
+
75
+
76
# Allow running this debug helper directly as a standalone script.
if __name__ == "__main__":
    debug_spans_structure()
phoenix_evaluator.py CHANGED
@@ -136,47 +136,100 @@ def log_evaluations_to_phoenix(evaluations_df: pd.DataFrame, session_id: Optiona
136
  print("No spans found to attach evaluations to")
137
  return None
138
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  # Create evaluation records for Phoenix
140
  evaluation_records = []
141
  spans_with_evals = []
142
 
143
  for _, eval_row in evaluations_df.iterrows():
144
  task_id = eval_row["task_id"]
145
-
146
- # Try to find matching span by searching for task_id in span input
147
- matching_spans = spans_df[
148
- spans_df['input.value'].astype(str).str.contains(task_id, na=False, case=False)
149
- ]
150
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  if len(matching_spans) == 0:
152
- # Try alternative search in span attributes or name
153
- matching_spans = spans_df[
154
- spans_df['name'].astype(str).str.contains(task_id, na=False, case=False)
155
- ]
 
 
 
 
 
 
 
 
 
156
 
157
  if len(matching_spans) > 0:
158
- span_id = matching_spans.iloc[0]['context.span_id']
159
-
160
- # Create evaluation record in Phoenix format
161
- evaluation_record = {
162
- "span_id": span_id,
163
- "name": "gaia_ground_truth",
164
- "score": eval_row["similarity_score"],
165
- "label": "correct" if bool(eval_row["exact_match"]) else "incorrect",
166
- "explanation": f"Predicted: '{eval_row['predicted_answer']}' | Ground Truth: '{eval_row['actual_answer']}' | Similarity: {eval_row['similarity_score']:.3f} | Exact Match: {eval_row['exact_match']}",
167
- "annotator_kind": "HUMAN",
168
- "metadata": {
169
- "task_id": task_id,
170
- "exact_match": eval_row["exact_match"],
171
- "similarity_score": eval_row["similarity_score"],
172
- "contains_answer": eval_row["contains_answer"],
173
- "predicted_answer": eval_row["predicted_answer"],
174
- "ground_truth": eval_row["actual_answer"]
 
 
175
  }
176
- }
177
 
178
- evaluation_records.append(evaluation_record)
179
- spans_with_evals.append(span_id)
 
 
 
 
180
 
181
  if evaluation_records:
182
  # Convert to DataFrame for Phoenix
@@ -192,19 +245,25 @@ def log_evaluations_to_phoenix(evaluations_df: pd.DataFrame, session_id: Optiona
192
  try:
193
  # Try the newer Phoenix API
194
  px.log_evaluations(span_evaluations)
195
- print(f"βœ… Successfully logged {len(evaluation_records)} evaluations to Phoenix")
196
  except AttributeError:
197
- # Fallback for older Phoenix versions
198
- client.log_evaluations(span_evaluations)
199
- print(f"βœ… Successfully logged {len(evaluation_records)} evaluations to Phoenix (fallback)")
 
 
 
 
 
200
 
201
  return eval_df
202
  else:
203
- print("⚠️ No matching spans found for evaluations")
204
  if spans_df is not None:
205
  print(f"Available spans: {len(spans_df)}")
206
  if len(spans_df) > 0:
207
- print("Sample span names:", spans_df['name'].head(3).tolist())
 
208
  return None
209
 
210
  except Exception as e:
 
136
  print("No spans found to attach evaluations to")
137
  return None
138
 
139
+ # Debug: Show available columns
140
+ print(f"πŸ“Š Available span columns: {list(spans_df.columns)}")
141
+
142
+ # Get possible input/output column names
143
+ input_columns = [col for col in spans_df.columns if 'input' in col.lower()]
144
+ output_columns = [col for col in spans_df.columns if 'output' in col.lower()]
145
+ name_columns = [col for col in spans_df.columns if 'name' in col.lower()]
146
+
147
+ print(f"πŸ“Š Input columns found: {input_columns}")
148
+ print(f"πŸ“Š Output columns found: {output_columns}")
149
+ print(f"πŸ“Š Name columns found: {name_columns}")
150
+
151
  # Create evaluation records for Phoenix
152
  evaluation_records = []
153
  spans_with_evals = []
154
 
155
  for _, eval_row in evaluations_df.iterrows():
156
  task_id = eval_row["task_id"]
157
+ matching_spans = pd.DataFrame()
158
+
159
+ # Try different strategies to find matching spans
160
+
161
+ # Strategy 1: Search in all string columns for task_id
162
+ for col in spans_df.columns:
163
+ if spans_df[col].dtype == 'object': # String-like columns
164
+ try:
165
+ matches = spans_df[
166
+ spans_df[col].astype(str).str.contains(task_id, na=False, case=False)
167
+ ]
168
+ if len(matches) > 0:
169
+ matching_spans = matches
170
+ print(f"βœ… Found match for {task_id} in column '{col}'")
171
+ break
172
+ except Exception as e:
173
+ continue
174
+
175
+ # Strategy 2: If no matches found, try searching in input columns specifically
176
+ if len(matching_spans) == 0 and input_columns:
177
+ for input_col in input_columns:
178
+ try:
179
+ matches = spans_df[
180
+ spans_df[input_col].astype(str).str.contains(task_id, na=False, case=False)
181
+ ]
182
+ if len(matches) > 0:
183
+ matching_spans = matches
184
+ print(f"βœ… Found match for {task_id} in input column '{input_col}'")
185
+ break
186
+ except Exception as e:
187
+ continue
188
+
189
+ # Strategy 3: If still no matches, try with partial task_id (last 8 characters)
190
  if len(matching_spans) == 0:
191
+ short_task_id = task_id[-8:] if len(task_id) > 8 else task_id
192
+ for col in spans_df.columns:
193
+ if spans_df[col].dtype == 'object':
194
+ try:
195
+ matches = spans_df[
196
+ spans_df[col].astype(str).str.contains(short_task_id, na=False, case=False)
197
+ ]
198
+ if len(matches) > 0:
199
+ matching_spans = matches
200
+ print(f"βœ… Found match for {task_id} using short ID in column '{col}'")
201
+ break
202
+ except Exception as e:
203
+ continue
204
 
205
  if len(matching_spans) > 0:
206
+ span_id = matching_spans.iloc[0].get('context.span_id') or matching_spans.iloc[0].get('span_id')
207
+
208
+ if span_id:
209
+ # Create evaluation record in Phoenix format
210
+ evaluation_record = {
211
+ "span_id": span_id,
212
+ "name": "gaia_ground_truth",
213
+ "score": eval_row["similarity_score"],
214
+ "label": "correct" if bool(eval_row["exact_match"]) else "incorrect",
215
+ "explanation": f"Predicted: '{eval_row['predicted_answer']}' | Ground Truth: '{eval_row['actual_answer']}' | Similarity: {eval_row['similarity_score']:.3f} | Exact Match: {eval_row['exact_match']}",
216
+ "annotator_kind": "HUMAN",
217
+ "metadata": {
218
+ "task_id": task_id,
219
+ "exact_match": bool(eval_row["exact_match"]),
220
+ "similarity_score": float(eval_row["similarity_score"]),
221
+ "contains_answer": bool(eval_row["contains_answer"]),
222
+ "predicted_answer": str(eval_row["predicted_answer"]),
223
+ "ground_truth": str(eval_row["actual_answer"])
224
+ }
225
  }
 
226
 
227
+ evaluation_records.append(evaluation_record)
228
+ spans_with_evals.append(span_id)
229
+ else:
230
+ print(f"⚠️ No span_id found for matching span with task {task_id}")
231
+ else:
232
+ print(f"⚠️ No matching span found for task {task_id}")
233
 
234
  if evaluation_records:
235
  # Convert to DataFrame for Phoenix
 
245
  try:
246
  # Try the newer Phoenix API
247
  px.log_evaluations(span_evaluations)
248
+ print(f"βœ… Successfully logged {len(evaluation_records)} evaluations to Phoenix using px.log_evaluations")
249
  except AttributeError:
250
+ try:
251
+ # Fallback for older Phoenix versions
252
+ client.log_evaluations(span_evaluations)
253
+ print(f"βœ… Successfully logged {len(evaluation_records)} evaluations to Phoenix using client.log_evaluations")
254
+ except Exception as e:
255
+ print(f"⚠️ Could not log evaluations using either method: {e}")
256
+ # Still return the DataFrame so we know what would have been logged
257
+ print("Evaluation records created but not logged to Phoenix")
258
 
259
  return eval_df
260
  else:
261
+ print("⚠️ No matching spans found for any evaluations")
262
  if spans_df is not None:
263
  print(f"Available spans: {len(spans_df)}")
264
  if len(spans_df) > 0:
265
+ available_cols = [col for col in spans_df.columns if spans_df[col].dtype == 'object'][:5]
266
+ print(f"Sample searchable columns: {available_cols}")
267
  return None
268
 
269
  except Exception as e:
test_phoenix_simple.py CHANGED
@@ -61,6 +61,11 @@ def test_phoenix_logging():
61
  print("⚠️ No spans found - you need to run your agent first to create spans")
62
  return False
63
 
 
 
 
 
 
64
  except Exception as e:
65
  print(f"❌ Error getting spans: {e}")
66
  return False
@@ -72,10 +77,11 @@ def test_phoenix_logging():
72
 
73
  if result is not None:
74
  print(f"βœ… Successfully logged {len(result)} evaluations to Phoenix")
75
- print("Sample evaluation:")
76
- print(f" - Score: {result.iloc[0]['score']}")
77
- print(f" - Label: {result.iloc[0]['label']}")
78
- print(f" - Explanation: {result.iloc[0]['explanation'][:100]}...")
 
79
 
80
  # Step 5: Verify evaluations were logged
81
  print("\n5. Verifying evaluations in Phoenix...")
@@ -126,6 +132,7 @@ def main():
126
  print(" 1. Your agent app is running (it starts Phoenix)")
127
  print(" 2. You've run your agent at least once to create spans")
128
  print(" 3. Phoenix is accessible at http://localhost:6006")
 
129
 
130
 
131
  if __name__ == "__main__":
 
61
  print("⚠️ No spans found - you need to run your agent first to create spans")
62
  return False
63
 
64
+ # Debug: Show available columns
65
+ print(f"πŸ“Š Available span columns: {list(spans_df.columns)}")
66
+ input_columns = [col for col in spans_df.columns if 'input' in col.lower()]
67
+ print(f"πŸ“Š Input columns found: {input_columns}")
68
+
69
  except Exception as e:
70
  print(f"❌ Error getting spans: {e}")
71
  return False
 
77
 
78
  if result is not None:
79
  print(f"βœ… Successfully logged {len(result)} evaluations to Phoenix")
80
+ if len(result) > 0:
81
+ print("Sample evaluation:")
82
+ print(f" - Score: {result.iloc[0]['score']}")
83
+ print(f" - Label: {result.iloc[0]['label']}")
84
+ print(f" - Explanation: {result.iloc[0]['explanation'][:100]}...")
85
 
86
  # Step 5: Verify evaluations were logged
87
  print("\n5. Verifying evaluations in Phoenix...")
 
132
  print(" 1. Your agent app is running (it starts Phoenix)")
133
  print(" 2. You've run your agent at least once to create spans")
134
  print(" 3. Phoenix is accessible at http://localhost:6006")
135
+ print(" 4. Run 'python debug_spans.py' to see span column structure")
136
 
137
 
138
  if __name__ == "__main__":