Final_Assignment_Template

Sleeping

App Files Files Community

Romain Fayoux committed on Jul 11

Commit

16c91c0

1 Parent(s): f9cf36d

Trying to debug phoenix evals

Browse files

Files changed (3) hide show

debug_spans.py +77 -0
phoenix_evaluator.py +95 -36
test_phoenix_simple.py +11 -4

debug_spans.py ADDED Viewed

	@@ -0,0 +1,77 @@

+#!/usr/bin/env python3
+"""
+Debug script to see Phoenix spans column structure.
+"""
+import sys
+import os
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+import phoenix as px
+import pandas as pd
+def debug_spans_structure():
+    """Debug the structure of Phoenix spans."""
+    print("🔍 Debugging Phoenix Spans Structure")
+    print("=" * 50)
+    try:
+        client = px.Client()
+        print("✅ Phoenix connected successfully")
+    except Exception as e:
+        print(f"❌ Phoenix connection failed: {e}")
+        return
+    try:
+        spans_df = client.get_spans_dataframe()
+        print(f"📊 Found {len(spans_df)} spans in Phoenix")
+        if len(spans_df) == 0:
+            print("⚠️ No spans found. Run your agent first to create spans.")
+            return
+        print(f"\n📋 Available Columns ({len(spans_df.columns)} total):")
+        for i, col in enumerate(spans_df.columns):
+            print(f"  {i+1:2d}. {col}")
+        print(f"\n🔍 Sample Data (first span):")
+        sample_span = spans_df.iloc[0]
+        for col in spans_df.columns:
+            value = sample_span.get(col)
+            if value is not None:
+                value_str = str(value)[:100] + "..." if len(str(value)) > 100 else str(value)
+                print(f"  {col}: {value_str}")
+        # Look for input/output related columns
+        input_cols = [col for col in spans_df.columns if 'input' in col.lower()]
+        output_cols = [col for col in spans_df.columns if 'output' in col.lower()]
+        print(f"\n🎯 Input-related columns: {input_cols}")
+        print(f"🎯 Output-related columns: {output_cols}")
+        # Look for span ID columns
+        id_cols = [col for col in spans_df.columns if 'id' in col.lower()]
+        print(f"🎯 ID-related columns: {id_cols}")
+        # Look for columns that might contain task IDs
+        print(f"\n🔍 Searching for task IDs in spans...")
+        task_id_sample = "8e867cd7-cff9-4e6c-867a-ff5ddc2550be"
+        for col in spans_df.columns:
+            if spans_df[col].dtype == 'object':  # String-like columns
+                try:
+                    matches = spans_df[spans_df[col].astype(str).str.contains(task_id_sample, na=False, case=False)]
+                    if len(matches) > 0:
+                        print(f"  ✅ Found task ID in column '{col}': {len(matches)} matches")
+                except:
+                    pass
+    except Exception as e:
+        print(f"❌ Error debugging spans: {e}")
+        import traceback
+        traceback.print_exc()
+# Script entry point: run the diagnostic only on direct execution, not on import.
+if __name__ == "__main__":
+    debug_spans_structure()

phoenix_evaluator.py CHANGED Viewed

@@ -136,47 +136,100 @@ def log_evaluations_to_phoenix(evaluations_df: pd.DataFrame, session_id: Optiona
             print("No spans found to attach evaluations to")
             return None
         # Create evaluation records for Phoenix
         evaluation_records = []
         spans_with_evals = []
         for _, eval_row in evaluations_df.iterrows():
             task_id = eval_row["task_id"]
-            # Try to find matching span by searching for task_id in span input
-            matching_spans = spans_df[
-                spans_df['input.value'].astype(str).str.contains(task_id, na=False, case=False)
-            ]
             if len(matching_spans) == 0:
-                # Try alternative search in span attributes or name
-                matching_spans = spans_df[
-                    spans_df['name'].astype(str).str.contains(task_id, na=False, case=False)
-                ]
             if len(matching_spans) > 0:
-                span_id = matching_spans.iloc[0]['context.span_id']
-                # Create evaluation record in Phoenix format
-                evaluation_record = {
-                    "span_id": span_id,
-                    "name": "gaia_ground_truth",
-                    "score": eval_row["similarity_score"],
-                    "label": "correct" if bool(eval_row["exact_match"]) else "incorrect",
-                    "explanation": f"Predicted: '{eval_row['predicted_answer']}' | Ground Truth: '{eval_row['actual_answer']}' | Similarity: {eval_row['similarity_score']:.3f} | Exact Match: {eval_row['exact_match']}",
-                    "annotator_kind": "HUMAN",
-                    "metadata": {
-                        "task_id": task_id,
-                        "exact_match": eval_row["exact_match"],
-                        "similarity_score": eval_row["similarity_score"],
-                        "contains_answer": eval_row["contains_answer"],
-                        "predicted_answer": eval_row["predicted_answer"],
-                        "ground_truth": eval_row["actual_answer"]
                     }
-                }
-                evaluation_records.append(evaluation_record)
-                spans_with_evals.append(span_id)
         if evaluation_records:
             # Convert to DataFrame for Phoenix
@@ -192,19 +245,25 @@ def log_evaluations_to_phoenix(evaluations_df: pd.DataFrame, session_id: Optiona
             try:
                 # Try the newer Phoenix API
                 px.log_evaluations(span_evaluations)
-                print(f"✅ Successfully logged {len(evaluation_records)} evaluations to Phoenix")
             except AttributeError:
-                # Fallback for older Phoenix versions
-                client.log_evaluations(span_evaluations)
-                print(f"✅ Successfully logged {len(evaluation_records)} evaluations to Phoenix (fallback)")
             return eval_df
         else:
-            print("⚠️ No matching spans found for evaluations")
             if spans_df is not None:
                 print(f"Available spans: {len(spans_df)}")
                 if len(spans_df) > 0:
-                    print("Sample span names:", spans_df['name'].head(3).tolist())
             return None
     except Exception as e:

             print("No spans found to attach evaluations to")
             return None
+        # Debug: Show available columns
+        print(f"📊 Available span columns: {list(spans_df.columns)}")
+        # Get possible input/output column names
+        input_columns = [col for col in spans_df.columns if 'input' in col.lower()]
+        output_columns = [col for col in spans_df.columns if 'output' in col.lower()]
+        name_columns = [col for col in spans_df.columns if 'name' in col.lower()]
+        print(f"📊 Input columns found: {input_columns}")
+        print(f"📊 Output columns found: {output_columns}")
+        print(f"📊 Name columns found: {name_columns}")
         # Create evaluation records for Phoenix
         evaluation_records = []
         spans_with_evals = []
         for _, eval_row in evaluations_df.iterrows():
             task_id = eval_row["task_id"]
+            matching_spans = pd.DataFrame()
+            # Try different strategies to find matching spans
+            # Strategy 1: Search in all string columns for task_id
+            for col in spans_df.columns:
+                if spans_df[col].dtype == 'object':  # String-like columns
+                    try:
+                        matches = spans_df[
+                            spans_df[col].astype(str).str.contains(task_id, na=False, case=False)
+                        ]
+                        if len(matches) > 0:
+                            matching_spans = matches
+                            print(f"✅ Found match for {task_id} in column '{col}'")
+                            break
+                    except Exception as e:
+                        continue
+            # Strategy 2: If no matches found, try searching in input columns specifically
+            if len(matching_spans) == 0 and input_columns:
+                for input_col in input_columns:
+                    try:
+                        matches = spans_df[
+                            spans_df[input_col].astype(str).str.contains(task_id, na=False, case=False)
+                        ]
+                        if len(matches) > 0:
+                            matching_spans = matches
+                            print(f"✅ Found match for {task_id} in input column '{input_col}'")
+                            break
+                    except Exception as e:
+                        continue
+            # Strategy 3: If still no matches, try with partial task_id (last 8 characters)
             if len(matching_spans) == 0:
+                short_task_id = task_id[-8:] if len(task_id) > 8 else task_id
+                for col in spans_df.columns:
+                    if spans_df[col].dtype == 'object':
+                        try:
+                            matches = spans_df[
+                                spans_df[col].astype(str).str.contains(short_task_id, na=False, case=False)
+                            ]
+                            if len(matches) > 0:
+                                matching_spans = matches
+                                print(f"✅ Found match for {task_id} using short ID in column '{col}'")
+                                break
+                        except Exception as e:
+                            continue
             if len(matching_spans) > 0:
+                span_id = matching_spans.iloc[0].get('context.span_id') or matching_spans.iloc[0].get('span_id')
+                if span_id:
+                    # Create evaluation record in Phoenix format
+                    evaluation_record = {
+                        "span_id": span_id,
+                        "name": "gaia_ground_truth",
+                        "score": eval_row["similarity_score"],
+                        "label": "correct" if bool(eval_row["exact_match"]) else "incorrect",
+                        "explanation": f"Predicted: '{eval_row['predicted_answer']}' | Ground Truth: '{eval_row['actual_answer']}' | Similarity: {eval_row['similarity_score']:.3f} | Exact Match: {eval_row['exact_match']}",
+                        "annotator_kind": "HUMAN",
+                        "metadata": {
+                            "task_id": task_id,
+                            "exact_match": bool(eval_row["exact_match"]),
+                            "similarity_score": float(eval_row["similarity_score"]),
+                            "contains_answer": bool(eval_row["contains_answer"]),
+                            "predicted_answer": str(eval_row["predicted_answer"]),
+                            "ground_truth": str(eval_row["actual_answer"])
+                        }
                     }
+                    evaluation_records.append(evaluation_record)
+                    spans_with_evals.append(span_id)
+                else:
+                    print(f"⚠️ No span_id found for matching span with task {task_id}")
+            else:
+                print(f"⚠️ No matching span found for task {task_id}")
         if evaluation_records:
             # Convert to DataFrame for Phoenix
             try:
                 # Try the newer Phoenix API
                 px.log_evaluations(span_evaluations)
+                print(f"✅ Successfully logged {len(evaluation_records)} evaluations to Phoenix using px.log_evaluations")
             except AttributeError:
+                try:
+                    # Fallback for older Phoenix versions
+                    client.log_evaluations(span_evaluations)
+                    print(f"✅ Successfully logged {len(evaluation_records)} evaluations to Phoenix using client.log_evaluations")
+                except Exception as e:
+                    print(f"⚠️ Could not log evaluations using either method: {e}")
+                    # Still return the DataFrame so we know what would have been logged
+                    print("Evaluation records created but not logged to Phoenix")
             return eval_df
         else:
+            print("⚠️ No matching spans found for any evaluations")
             if spans_df is not None:
                 print(f"Available spans: {len(spans_df)}")
                 if len(spans_df) > 0:
+                    available_cols = [col for col in spans_df.columns if spans_df[col].dtype == 'object'][:5]
+                    print(f"Sample searchable columns: {available_cols}")
             return None
     except Exception as e:

test_phoenix_simple.py CHANGED Viewed

@@ -61,6 +61,11 @@ def test_phoenix_logging():
             print("⚠️ No spans found - you need to run your agent first to create spans")
             return False
     except Exception as e:
         print(f"❌ Error getting spans: {e}")
         return False
@@ -72,10 +77,11 @@ def test_phoenix_logging():
         if result is not None:
             print(f"✅ Successfully logged {len(result)} evaluations to Phoenix")
-            print("Sample evaluation:")
-            print(f"  - Score: {result.iloc[0]['score']}")
-            print(f"  - Label: {result.iloc[0]['label']}")
-            print(f"  - Explanation: {result.iloc[0]['explanation'][:100]}...")
             # Step 5: Verify evaluations were logged
             print("\n5. Verifying evaluations in Phoenix...")
@@ -126,6 +132,7 @@ def main():
         print("  1. Your agent app is running (it starts Phoenix)")
         print("  2. You've run your agent at least once to create spans")
         print("  3. Phoenix is accessible at http://localhost:6006")
 if __name__ == "__main__":

             print("⚠️ No spans found - you need to run your agent first to create spans")
             return False
+        # Debug: Show available columns
+        print(f"📊 Available span columns: {list(spans_df.columns)}")
+        input_columns = [col for col in spans_df.columns if 'input' in col.lower()]
+        print(f"📊 Input columns found: {input_columns}")
     except Exception as e:
         print(f"❌ Error getting spans: {e}")
         return False
         if result is not None:
             print(f"✅ Successfully logged {len(result)} evaluations to Phoenix")
+            if len(result) > 0:
+                print("Sample evaluation:")
+                print(f"  - Score: {result.iloc[0]['score']}")
+                print(f"  - Label: {result.iloc[0]['label']}")
+                print(f"  - Explanation: {result.iloc[0]['explanation'][:100]}...")
             # Step 5: Verify evaluations were logged
             print("\n5. Verifying evaluations in Phoenix...")
         print("  1. Your agent app is running (it starts Phoenix)")
         print("  2. You've run your agent at least once to create spans")
         print("  3. Phoenix is accessible at http://localhost:6006")
+        print("  4. Run 'python debug_spans.py' to see span column structure")
 if __name__ == "__main__":