Markus Clauss DIRU Vetsuisse
committed on
Commit
·
1637cd5
1
Parent(s):
b44026d
First agent traila
Browse files- .env.example +14 -0
- .gitignore +86 -0
- analyze_failures.py +97 -0
- app.py +1441 -105
- debug_lower_error.py +129 -0
- requirements.txt +15 -1
- run_gaia_test.py +66 -0
- test_agent.py +148 -0
- test_download_files.py +58 -0
- test_file_download.py +59 -0
- test_final_fixes.py +56 -0
- test_fixed_agent.py +151 -0
- test_inline_code.py +75 -0
- test_multimedia.py +105 -0
- test_multimedia_gaia.py +85 -0
.env.example
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# API Keys for LangGraph Agent
|
| 2 |
+
|
| 3 |
+
# Required: Anthropic API key for Claude Sonnet 3.5
|
| 4 |
+
ANTHROPIC_API_KEY=sk-ant-your-api-key-here
|
| 5 |
+
|
| 6 |
+
# Recommended: Tavily API key for best web search
|
| 7 |
+
# Get your free key (1000 queries/month) from https://tavily.com
|
| 8 |
+
TAVILY_API_KEY=tvly-your-api-key-here
|
| 9 |
+
|
| 10 |
+
# Optional: SerpAPI key as backup web search
|
| 11 |
+
# Get your key from https://serpapi.com
|
| 12 |
+
SERPAPI_KEY=your-serpapi-key-here
|
| 13 |
+
|
| 14 |
+
# Note: Copy this file to .env and add your actual API keys
|
.gitignore
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Environment variables
|
| 2 |
+
.env
|
| 3 |
+
.env.local
|
| 4 |
+
.env.*.local
|
| 5 |
+
.env.
|
| 6 |
+
|
| 7 |
+
# Python
|
| 8 |
+
__pycache__/
|
| 9 |
+
*.py[cod]
|
| 10 |
+
*$py.class
|
| 11 |
+
*.so
|
| 12 |
+
.Python
|
| 13 |
+
env/
|
| 14 |
+
venv/
|
| 15 |
+
ENV/
|
| 16 |
+
env.bak/
|
| 17 |
+
venv.bak/
|
| 18 |
+
.venv/
|
| 19 |
+
|
| 20 |
+
# Virtual environments
|
| 21 |
+
bin/
|
| 22 |
+
include/
|
| 23 |
+
lib/
|
| 24 |
+
lib64/
|
| 25 |
+
share/
|
| 26 |
+
pyvenv.cfg
|
| 27 |
+
|
| 28 |
+
# IDE
|
| 29 |
+
.vscode/
|
| 30 |
+
.idea/
|
| 31 |
+
*.swp
|
| 32 |
+
*.swo
|
| 33 |
+
*~
|
| 34 |
+
|
| 35 |
+
# OS
|
| 36 |
+
.DS_Store
|
| 37 |
+
Thumbs.db
|
| 38 |
+
|
| 39 |
+
# Jupyter Notebook
|
| 40 |
+
.ipynb_checkpoints
|
| 41 |
+
|
| 42 |
+
# Distribution / packaging
|
| 43 |
+
.Python
|
| 44 |
+
build/
|
| 45 |
+
develop-eggs/
|
| 46 |
+
dist/
|
| 47 |
+
downloads/
|
| 48 |
+
eggs/
|
| 49 |
+
.eggs/
|
| 50 |
+
lib/
|
| 51 |
+
lib64/
|
| 52 |
+
parts/
|
| 53 |
+
sdist/
|
| 54 |
+
var/
|
| 55 |
+
wheels/
|
| 56 |
+
*.egg-info/
|
| 57 |
+
.installed.cfg
|
| 58 |
+
*.egg
|
| 59 |
+
|
| 60 |
+
# Testing
|
| 61 |
+
.pytest_cache/
|
| 62 |
+
.coverage
|
| 63 |
+
htmlcov/
|
| 64 |
+
.tox/
|
| 65 |
+
.nox/
|
| 66 |
+
|
| 67 |
+
# Logs
|
| 68 |
+
*.log
|
| 69 |
+
|
| 70 |
+
# Database
|
| 71 |
+
*.db
|
| 72 |
+
*.sqlite3
|
| 73 |
+
|
| 74 |
+
# Gradio
|
| 75 |
+
flagged/
|
| 76 |
+
gradio_cached_examples/
|
| 77 |
+
|
| 78 |
+
# Model cache
|
| 79 |
+
.cache/
|
| 80 |
+
models/
|
| 81 |
+
|
| 82 |
+
# Temporary files
|
| 83 |
+
*.tmp
|
| 84 |
+
*.temp
|
| 85 |
+
tmp/
|
| 86 |
+
temp/
|
analyze_failures.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Analyze which GAIA questions are failing and why
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
from app import BasicAgent
|
| 9 |
+
|
| 10 |
+
# Load environment variables
|
| 11 |
+
load_dotenv()
|
| 12 |
+
|
| 13 |
+
def analyze_gaia_failures():
    """Run the agent on a sample of GAIA questions and print failures by category.

    Each question is sent to the agent, bucketed by the first keyword rule that
    matches its lowercased text, and a per-category summary of the entries
    whose recorded status is a failure is printed at the end.
    """
    # Build the agent first, then make sure it can be given an API key.
    agent = BasicAgent()
    api_key = os.getenv("ANTHROPIC_API_KEY")
    if not api_key:
        print("Error: ANTHROPIC_API_KEY not found")
        return

    agent.set_api_key(api_key)

    # GAIA questions with the answers/status observed in previous runs.
    test_cases = [
        # Previously answered correctly
        {"q": "How many lightning strikes occur on Earth each second? Round your answer to the nearest integer.", "expected": "100", "status": "✅"},
        {"q": "What is the current population of Gabon?", "expected": "~2.3M", "status": "✅"},
        {"q": "In a park, there are three gardens: one with 5 tulips and 3 daisies, one with 6 marigolds and 4 petunias, and one with 8 hydrangeas, 2 jasmines, and twice as many roses as the first two gardens combined. How many flowers are there in total?", "expected": "66", "status": "✅"},
        {"q": "What is the sum of the first 20 terms of the arithmetic sequence where the first term is 5 and the common difference is 3?", "expected": "670", "status": "✅"},
        {"q": "What percentage of Gabon is covered by forests?", "expected": "85%", "status": "✅"},

        # Previously failing or file-dependent questions
        {"q": "In Audre Lorde's poem 'Diaspora', she repeats, \"home is\" three times. The last line ends \"and I am...\" what?", "expected": "apart", "status": "❌"},
        {"q": "On April 1, 2024, the French National Railway Company (SNCF) published an April Fool's joke on X (formerly Twitter) about a new model of train. What is the name of this model?", "expected": "TGV Pigeon", "status": "❌"},
        {"q": "In the video https://www.youtube.com/watch?v=1htKBjuUWec, Verma claims the existence of \"a \"moat\" in the education system that provides a systemic advantage for those who know about it and can get into the pipeline.\" Verma's \"moat\" is a well-known advantage for students. What is the four-letter abbreviation used to describe this systemic advantage?", "expected": "STEM", "status": "❌"},
        {"q": "Whose X account (formerly Twitter) is this: @lbcmjc?", "expected": "specific person", "status": "❌"},
        {"q": "In the attached Python code, I try to use the string method zfill. It does not work. Can you fix the problem for me and give me the only the complete corrected code?", "expected": "code fix", "status": "❌"},
        {"q": "What is the name of the only Israeli pitcher to ever play in the major leagues?", "expected": "specific name", "status": "❌"},
        {"q": "Tell me the amount of sales in the sales sheet for the attached excel file.", "expected": "Unable to determine", "status": "✅"},
        {"q": "How many times is the word \"therefore\" used in the attached PDF?", "expected": "Unable to determine", "status": "✅"},
    ]

    # Ordered rules: the first category whose keywords appear in the question wins.
    category_rules = [
        ("web_search", ("twitter", "april fool")),
        ("multimedia", ("video", "attached")),
        ("calculation", ("sum", "total", "how many")),
        ("code", ("code", "python")),
        ("literature", ("poem", "book")),
    ]
    categories = {name: [] for name, _ in category_rules}

    print("Analyzing GAIA question patterns...\n")

    # Only the first 8 cases are exercised to keep the run short.
    for index, case in enumerate(test_cases[:8], 1):
        question = case["q"]
        expected = case["expected"]
        status = case["status"]

        print(f"\n{index}. {status} Question: {question[:80]}...")
        print(f" Expected: {expected}")

        try:
            answer = agent(question)
            print(f" Got: {answer[:100]}...")

            # Bucket the question; questions matching no rule are not recorded.
            lowered = question.lower()
            for name, keywords in category_rules:
                if any(keyword in lowered for keyword in keywords):
                    categories[name].append((question, answer, status))
                    break

        except Exception as e:
            print(f" Error: {e}")

    print("\n" + "="*80)
    print("ANALYSIS SUMMARY")
    print("="*80)

    for category, items in categories.items():
        if not items:
            continue
        print(f"\n{category.upper()} ({len(items)} questions):")
        failed = [entry for entry in items if "❌" in entry[2]]
        if not failed:
            continue
        print(f" Failed: {len(failed)}")
        # Show at most the first two failures per category.
        for q, a, _ in failed[:2]:
            print(f" Q: {q[:60]}...")
            print(f" A: {a[:60]}...")
|
| 95 |
+
|
| 96 |
+
if __name__ == "__main__":
|
| 97 |
+
analyze_gaia_failures()
|
app.py
CHANGED
|
@@ -1,53 +1,1348 @@
|
|
| 1 |
import os
|
| 2 |
import gradio as gr
|
| 3 |
import requests
|
| 4 |
-
import inspect
|
| 5 |
import pandas as pd
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
-
# (Keep Constants as is)
|
| 8 |
# --- Constants ---
|
| 9 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
# --- Basic Agent Definition ---
|
| 12 |
-
# ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
|
| 13 |
class BasicAgent:
|
| 14 |
def __init__(self):
|
| 15 |
-
print("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
def __call__(self, question: str) -> str:
|
| 17 |
-
print(f"
|
| 18 |
-
|
| 19 |
-
print(f"
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
-
def run_and_submit_all( profile: gr.OAuthProfile | None):
|
| 23 |
"""
|
| 24 |
Fetches all questions, runs the BasicAgent on them, submits all answers,
|
| 25 |
and displays the results.
|
| 26 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
# --- Determine HF Space Runtime URL and Repo URL ---
|
| 28 |
-
space_id = os.getenv("SPACE_ID")
|
| 29 |
-
|
| 30 |
if profile:
|
| 31 |
-
username= f"{profile.username}"
|
| 32 |
print(f"User logged in: {username}")
|
| 33 |
else:
|
| 34 |
print("User not logged in.")
|
| 35 |
return "Please Login to Hugging Face with the button.", None
|
| 36 |
-
|
| 37 |
api_url = DEFAULT_API_URL
|
| 38 |
questions_url = f"{api_url}/questions"
|
| 39 |
submit_url = f"{api_url}/submit"
|
| 40 |
-
|
| 41 |
-
# 1.
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
print(agent_code)
|
| 50 |
-
|
| 51 |
# 2. Fetch Questions
|
| 52 |
print(f"Fetching questions from: {questions_url}")
|
| 53 |
try:
|
|
@@ -55,47 +1350,59 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
| 55 |
response.raise_for_status()
|
| 56 |
questions_data = response.json()
|
| 57 |
if not questions_data:
|
| 58 |
-
|
| 59 |
-
|
| 60 |
print(f"Fetched {len(questions_data)} questions.")
|
| 61 |
-
except
|
| 62 |
print(f"Error fetching questions: {e}")
|
| 63 |
return f"Error fetching questions: {e}", None
|
| 64 |
-
|
| 65 |
-
print(f"Error decoding JSON response from questions endpoint: {e}")
|
| 66 |
-
print(f"Response text: {response.text[:500]}")
|
| 67 |
-
return f"Error decoding server response for questions: {e}", None
|
| 68 |
-
except Exception as e:
|
| 69 |
-
print(f"An unexpected error occurred fetching questions: {e}")
|
| 70 |
-
return f"An unexpected error occurred fetching questions: {e}", None
|
| 71 |
-
|
| 72 |
# 3. Run your Agent
|
| 73 |
results_log = []
|
| 74 |
answers_payload = []
|
| 75 |
print(f"Running agent on {len(questions_data)} questions...")
|
| 76 |
-
|
|
|
|
| 77 |
task_id = item.get("task_id")
|
| 78 |
question_text = item.get("question")
|
|
|
|
| 79 |
if not task_id or question_text is None:
|
| 80 |
print(f"Skipping item with missing task_id or question: {item}")
|
| 81 |
continue
|
|
|
|
|
|
|
|
|
|
| 82 |
try:
|
| 83 |
submitted_answer = agent(question_text)
|
| 84 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
| 85 |
-
results_log.append({
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
except Exception as e:
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
if not answers_payload:
|
| 91 |
print("Agent did not produce any answers to submit.")
|
| 92 |
return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
|
| 93 |
-
|
| 94 |
# 4. Prepare Submission
|
| 95 |
-
submission_data = {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
|
| 97 |
print(status_update)
|
| 98 |
-
|
| 99 |
# 5. Submit
|
| 100 |
print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
|
| 101 |
try:
|
|
@@ -112,85 +1419,114 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
| 112 |
print("Submission successful.")
|
| 113 |
results_df = pd.DataFrame(results_log)
|
| 114 |
return final_status, results_df
|
| 115 |
-
except requests.exceptions.HTTPError as e:
|
| 116 |
-
error_detail = f"Server responded with status {e.response.status_code}."
|
| 117 |
-
try:
|
| 118 |
-
error_json = e.response.json()
|
| 119 |
-
error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
|
| 120 |
-
except requests.exceptions.JSONDecodeError:
|
| 121 |
-
error_detail += f" Response: {e.response.text[:500]}"
|
| 122 |
-
status_message = f"Submission Failed: {error_detail}"
|
| 123 |
-
print(status_message)
|
| 124 |
-
results_df = pd.DataFrame(results_log)
|
| 125 |
-
return status_message, results_df
|
| 126 |
-
except requests.exceptions.Timeout:
|
| 127 |
-
status_message = "Submission Failed: The request timed out."
|
| 128 |
-
print(status_message)
|
| 129 |
-
results_df = pd.DataFrame(results_log)
|
| 130 |
-
return status_message, results_df
|
| 131 |
-
except requests.exceptions.RequestException as e:
|
| 132 |
-
status_message = f"Submission Failed: Network error - {e}"
|
| 133 |
-
print(status_message)
|
| 134 |
-
results_df = pd.DataFrame(results_log)
|
| 135 |
-
return status_message, results_df
|
| 136 |
except Exception as e:
|
| 137 |
-
status_message = f"
|
| 138 |
print(status_message)
|
| 139 |
results_df = pd.DataFrame(results_log)
|
| 140 |
return status_message, results_df
|
| 141 |
|
| 142 |
-
|
| 143 |
# --- Build Gradio Interface using Blocks ---
|
| 144 |
with gr.Blocks() as demo:
|
| 145 |
-
gr.Markdown("#
|
| 146 |
gr.Markdown(
|
| 147 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
**Instructions:**
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
**
|
| 156 |
-
Once clicking on the "submit button, it can take quite some time ( this is the time for the agent to go through all the questions).
|
| 157 |
-
This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance for the delay process of the submit button, a solution could be to cache the answers and submit in a seperate action or even to answer the questions in async.
|
| 158 |
"""
|
| 159 |
)
|
| 160 |
-
|
| 161 |
-
gr.
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
|
| 168 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
run_button.click(
|
| 170 |
fn=run_and_submit_all,
|
|
|
|
| 171 |
outputs=[status_output, results_table]
|
| 172 |
)
|
| 173 |
|
| 174 |
if __name__ == "__main__":
|
| 175 |
print("\n" + "-"*30 + " App Starting " + "-"*30)
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
|
| 185 |
-
|
| 186 |
-
if space_id_startup: # Print repo URLs if SPACE_ID is found
|
| 187 |
-
print(f"✅ SPACE_ID found: {space_id_startup}")
|
| 188 |
-
print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
|
| 189 |
-
print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
|
| 190 |
-
else:
|
| 191 |
-
print("ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
|
| 192 |
-
|
| 193 |
-
print("-"*(60 + len(" App Starting ")) + "\n")
|
| 194 |
-
|
| 195 |
-
print("Launching Gradio Interface for Basic Agent Evaluation...")
|
| 196 |
-
demo.launch(debug=True, share=False)
|
|
|
|
| 1 |
import os
|
| 2 |
import gradio as gr
|
| 3 |
import requests
|
|
|
|
| 4 |
import pandas as pd
|
| 5 |
+
from typing import Dict, List, Any, Optional, TypedDict, Annotated
|
| 6 |
+
import re
|
| 7 |
+
import numpy as np
|
| 8 |
+
from datetime import datetime
|
| 9 |
+
|
| 10 |
+
# LangChain and LangGraph imports
|
| 11 |
+
from langchain_anthropic import ChatAnthropic
|
| 12 |
+
from langchain_core.messages import HumanMessage, SystemMessage, BaseMessage, AIMessage
|
| 13 |
+
from langchain_core.tools import tool
|
| 14 |
+
from serpapi import GoogleSearch
|
| 15 |
+
from langgraph.graph import StateGraph, END
|
| 16 |
+
from langgraph.prebuilt import ToolNode
|
| 17 |
+
from langgraph.graph.message import add_messages
|
| 18 |
+
import numexpr
|
| 19 |
+
from dotenv import load_dotenv
|
| 20 |
+
|
| 21 |
+
# Load environment variables
|
| 22 |
+
load_dotenv()
|
| 23 |
|
|
|
|
| 24 |
# --- Constants ---
|
| 25 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 26 |
|
| 27 |
+
# --- State Definition for LangGraph ---
|
| 28 |
+
class AgentState(TypedDict):
    """Typed state dictionary used by the LangGraph agent graph."""

    # Message list for the run; annotated with LangGraph's `add_messages`
    # (presumably so node outputs are merged into the list — confirm against
    # the LangGraph documentation).
    messages: Annotated[List[BaseMessage], add_messages]
|
| 30 |
+
|
| 31 |
+
# --- Tool Definitions ---
|
| 32 |
+
def _search_tavily(query: str, max_results: int, api_key: str) -> Optional[str]:
    """Query the Tavily search API; return formatted results or None if nothing usable came back."""
    payload = {
        "api_key": api_key,
        "query": query,
        "search_depth": "advanced",
        "include_answer": True,
        "include_raw_content": False,
        "max_results": max_results,
    }
    response = requests.post(
        "https://api.tavily.com/search",
        json=payload,
        headers={"Content-Type": "application/json"},
        timeout=10,
    )
    if response.status_code != 200:
        return None

    results = response.json()
    formatted_results = []

    # Direct answer first, then the individual hits.
    if results.get("answer"):
        formatted_results.append(f"DIRECT ANSWER: {results['answer']}")
    for i, result in enumerate((results.get("results") or [])[:max_results], 1):
        title = result.get("title", "")
        content = result.get("content", "")
        url = result.get("url", "")
        formatted_results.append(f"{i}. {title}\n {content}\n Source: {url}")

    return "\n\n".join(formatted_results) if formatted_results else None


def _search_duckduckgo(query: str) -> Optional[str]:
    """Query the DuckDuckGo Instant Answer API (two attempts); return formatted text or None."""
    from urllib.parse import quote

    ddg_url = f"https://api.duckduckgo.com/?q={quote(query)}&format=json&no_html=1"
    # (response key, label) pairs, in the order they are reported.
    fields = (("Answer", "DIRECT ANSWER"), ("Abstract", "SUMMARY"), ("Definition", "DEFINITION"))

    for attempt in range(2):
        try:
            response = requests.get(ddg_url, timeout=5)
            if response.status_code != 200:
                continue
            ddg_data = response.json()
            formatted_results = [
                f"{label}: {ddg_data[key]}" for key, label in fields if ddg_data.get(key)
            ]
        except Exception:
            # Best-effort lookup: any failure just triggers the retry. Unlike a
            # bare `except:`, this lets KeyboardInterrupt/SystemExit propagate.
            if attempt == 0:
                print(f"DuckDuckGo attempt 1 failed, retrying...")
            continue
        if formatted_results:
            return "\n\n".join(formatted_results)

    return None


def _search_serpapi(query: str, max_results: int, api_key: str) -> str:
    """Run a Google search through SerpAPI and return formatted results or "No results found"."""
    params = {
        "q": query,
        "api_key": api_key,
        "num": max_results,
        "engine": "google",
        "hl": "en",
        "gl": "us",
    }
    results = GoogleSearch(params).get_dict()

    formatted_results = []

    if "answer_box" in results:
        ab = results["answer_box"]
        if "answer" in ab:
            formatted_results.append(f"DIRECT ANSWER: {ab['answer']}")
        elif "snippet" in ab:
            formatted_results.append(f"ANSWER BOX: {ab['snippet']}")

    for i, result in enumerate(results.get("organic_results", [])[:max_results], 1):
        title = result.get("title", "")
        snippet = result.get("snippet", "")
        formatted_results.append(f"{i}. {title}\n {snippet}")

    return "\n\n".join(formatted_results) if formatted_results else "No results found"


def _offline_search_notes(query: str) -> str:
    """Build the placeholder text returned when no search provider produced results."""
    query_lower = query.lower()
    notes = [f"Search query: {query}"]

    if "wikipedia" in query_lower or "featured article" in query_lower:
        notes.append("Note: For Wikipedia Featured Articles, check Wikipedia's FA archives")
        notes.append("Tip: Featured Articles are promoted monthly and listed in Wikipedia's FA log")
    elif "who is" in query_lower or "who was" in query_lower:
        notes.append("Note: Live web search unavailable. Please verify information.")
    elif any(word in query_lower for word in ["when", "what year", "what date"]):
        notes.append("Note: For current dates and recent events, web search is limited.")
    else:
        notes.append("Note: Web search temporarily unavailable.")

    return "\n\n".join(notes)


# NOTE(review): with LangChain's @tool the docstring is presumably what the
# model sees as the tool description, so it is kept short and accurate.
@tool
def web_search(query: str, max_results: int = 8) -> str:
    """
    Search the web and return formatted text results.
    Tries Tavily first (if TAVILY_API_KEY is set), then the DuckDuckGo
    Instant Answer API (no API key required), then SerpAPI (if SERPAPI_KEY is set).
    """
    try:
        # Tool callers sometimes pass a list or another type; normalize to a string.
        if isinstance(query, list):
            query = " ".join(str(item) for item in query)
        elif not isinstance(query, str):
            query = str(query)

        # 1. Tavily, when configured. Failures are logged and fall through.
        tavily_api_key = os.getenv("TAVILY_API_KEY")
        if tavily_api_key:
            try:
                tavily_text = _search_tavily(query, max_results, tavily_api_key)
                if tavily_text:
                    return tavily_text
            except Exception as tavily_error:
                print(f"Tavily search error: {tavily_error}")

        # 2. DuckDuckGo Instant Answer API (no API key needed).
        ddg_text = None
        ddg_crashed = False
        try:
            ddg_text = _search_duckduckgo(query)
        except Exception as ddg_error:
            ddg_crashed = True
            print(f"DuckDuckGo search error: {ddg_error}")
        if ddg_text:
            return ddg_text
        if not ddg_crashed:
            print(f"DuckDuckGo unavailable, checking alternatives...")

        # 3. SerpAPI, when configured. This must run BEFORE the placeholder
        #    notes below; previously the placeholder text was returned first,
        #    which made this fallback unreachable in normal operation.
        api_key = os.getenv("SERPAPI_KEY")
        if api_key:
            return _search_serpapi(query, max_results, api_key)

        if ddg_crashed:
            return "No search service available. Please set SERPAPI_KEY or check internet connection."

        # 4. Nothing produced results: return explanatory placeholder text.
        return _offline_search_notes(query)

    except Exception as e:
        # Tools report errors as text rather than raising.
        return f"Search error: {str(e)}"
|
| 191 |
+
|
| 192 |
+
@tool
|
| 193 |
+
def calculator(expression: str) -> str:
    """
    Evaluate arithmetic, percentages, math functions and simple unit conversions.

    Supports: arithmetic (+, -, *, /, //, %, **), percentages ("15% of 200"),
    sqrt/log/log10/exp/sin/cos/tan/abs, the constants pi and e, and the unit
    conversions listed in the table inside the function.
    Examples: "15% of 200", "sqrt(16)", "convert 5 km to miles"

    Args:
        expression: Text to evaluate. A list is joined with spaces; any other
            non-string value is converted with str().

    Returns:
        The formatted result, or "Calculation error: <reason>" when the input
        cannot be evaluated.
    """
    import ast
    import math
    import operator
    import re

    def bounded_pow(base, exponent):
        # Huge integer powers (e.g. 9**9**9) would hang the tool.
        if isinstance(base, int) and isinstance(exponent, int) and exponent > 10000:
            raise ValueError("exponent too large")
        return operator.pow(base, exponent)

    binary_ops = {
        ast.Add: operator.add,
        ast.Sub: operator.sub,
        ast.Mult: operator.mul,
        ast.Div: operator.truediv,
        ast.FloorDiv: operator.floordiv,
        ast.Mod: operator.mod,
        ast.Pow: bounded_pow,
    }
    unary_ops = {ast.UAdd: operator.pos, ast.USub: operator.neg}
    functions = {
        "sqrt": math.sqrt,
        "log": math.log,
        "log10": math.log10,
        "exp": math.exp,
        "sin": math.sin,
        "cos": math.cos,
        "tan": math.tan,
        "abs": abs,
    }
    constants = {"pi": math.pi, "e": math.e}
    conversions = {
        "km to miles": 0.621371,
        "miles to km": 1.60934,
        "kg to lbs": 2.20462,
        "lbs to kg": 0.453592,
        "celsius to fahrenheit": lambda c: (c * 9/5) + 32,
        "fahrenheit to celsius": lambda f: (f - 32) * 5/9,
        "meters to feet": 3.28084,
        "feet to meters": 0.3048,
        "liters to gallons": 0.264172,
        "gallons to liters": 3.78541,
    }

    def evaluate(node):
        # Walk the parsed expression, allowing only whitelisted node types so
        # that no attribute access, subscripting or arbitrary call can run.
        if isinstance(node, ast.Expression):
            return evaluate(node.body)
        if isinstance(node, ast.Constant):
            if isinstance(node.value, bool) or not isinstance(node.value, (int, float)):
                raise ValueError(f"unsupported value: {node.value!r}")
            return node.value
        if isinstance(node, ast.BinOp) and type(node.op) in binary_ops:
            return binary_ops[type(node.op)](evaluate(node.left), evaluate(node.right))
        if isinstance(node, ast.UnaryOp) and type(node.op) in unary_ops:
            return unary_ops[type(node.op)](evaluate(node.operand))
        if isinstance(node, ast.Name) and node.id in constants:
            return constants[node.id]
        if (
            isinstance(node, ast.Call)
            and isinstance(node.func, ast.Name)
            and node.func.id in functions
            and not node.keywords
        ):
            return functions[node.func.id](*(evaluate(arg) for arg in node.args))
        raise ValueError("unsupported expression")

    def format_number(value):
        # Integers print as-is; floats are trimmed to at most 6 decimals, with
        # tiny values collapsed to 0 and huge values in scientific notation.
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return str(value)
        if isinstance(value, int):
            return str(value)
        if math.isnan(value) or math.isinf(value):
            return str(value)
        if abs(value) < 1e-10:
            return "0"
        if abs(value) > 1e10:
            return f"{value:.2e}"
        formatted = f"{value:.6f}".rstrip('0').rstrip('.')
        if float(formatted).is_integer():
            return str(int(float(formatted)))
        return formatted

    try:
        # Handle list input
        if isinstance(expression, list):
            expression = " ".join(str(item) for item in expression)
        elif not isinstance(expression, str):
            expression = str(expression)

        expression = expression.strip().lower()

        # Handle percentage calculations
        if "% of" in expression:
            parts = expression.split("% of")
            if len(parts) == 2:
                percent = float(parts[0].strip())
                value = float(parts[1].strip())
                return format_number((percent / 100) * value)

        # Handle unit conversions
        if "convert" in expression or " to " in expression:
            for phrase, factor in conversions.items():
                if phrase not in expression:
                    continue
                # The optional sign keeps inputs like "-40 celsius to fahrenheit" correct.
                found = re.search(r'-?\d+(?:\.\d+)?', expression)
                if found:
                    value = float(found.group())
                    converted = factor(value) if callable(factor) else value * factor
                    return f"{converted:.4f}".rstrip('0').rstrip('.')

        # Drop filler words ("what is ...") but keep known function and
        # constant names; \b stops this from touching literals such as 1e5.
        known_names = set(functions) | set(constants)
        cleaned = re.sub(
            r'\b[a-z_][a-z0-9_]*\b',
            lambda m: m.group() if m.group() in known_names else '',
            expression,
        ).strip().rstrip(' ?=')
        if not cleaned:
            raise ValueError("no expression to evaluate")

        tree = ast.parse(cleaned, mode="eval")
        return format_number(evaluate(tree))

    except Exception as e:
        return f"Calculation error: {str(e)}"
|
| 290 |
+
|
| 291 |
+
@tool
|
| 292 |
+
def python_executor(code: str) -> str:
    """
    Run a Python snippet with a reduced set of builtins and return what it prints.

    Pre-loaded names: math, datetime, json, re, statistics, itertools,
    collections, Counter, defaultdict, plus numpy/np and pandas/pd when the
    snippet mentions them. `import` works only for those same modules.
    Always print the final result you want to return.

    Args:
        code: Source to execute. A list is joined with newlines; any other
            non-string value is converted with str().

    Returns:
        The captured stdout (stripped). If nothing was printed, the value of a
        variable named `result`, `answer` or `output` if one exists, otherwise
        "No output (add print statement)". On failure, "Error: ..." followed by
        the traceback.
    """
    import builtins
    import contextlib
    import importlib
    import io
    import re
    import traceback

    try:
        # Handle list input
        if isinstance(code, list):
            code = "\n".join(str(item) for item in code)
        elif not isinstance(code, str):
            code = str(code)

        light_modules = ("math", "datetime", "json", "re", "statistics", "itertools", "collections")
        heavy_modules = {"numpy": "np", "pandas": "pd"}
        allowed_roots = set(light_modules) | set(heavy_modules)

        def restricted_import(name, globals_=None, locals_=None, fromlist=(), level=0):
            # Without an __import__ entry every `import` statement in the
            # snippet fails, so expose one limited to the whitelisted modules.
            if level != 0 or name.split(".")[0] not in allowed_roots:
                raise ImportError(f"import of '{name}' is not allowed in python_executor")
            return builtins.__import__(name, globals_, locals_, fromlist, level)

        exposed = (
            'print', 'len', 'range', 'sum', 'min', 'max', 'abs', 'round',
            'sorted', 'reversed', 'enumerate', 'zip', 'map', 'filter',
            'str', 'int', 'float', 'list', 'dict', 'set', 'tuple', 'bool',
            'all', 'any', 'isinstance', 'type',
            # Extras commonly needed by small analysis snippets.
            'divmod', 'pow', 'repr', 'ord', 'chr', 'iter', 'next', 'frozenset',
            'Exception', 'ValueError', 'TypeError', 'KeyError', 'IndexError',
            'ZeroDivisionError',
        )
        safe_builtins = {name: getattr(builtins, name) for name in exposed}
        safe_builtins['__import__'] = restricted_import

        # NOTE(security): this reduces accidental damage but is NOT a real
        # sandbox; exec() on model-generated code can still escape through
        # object introspection. Do not expose this tool to untrusted users.
        namespace = {'__builtins__': safe_builtins}
        for module_name in light_modules:
            namespace[module_name] = importlib.import_module(module_name)
        namespace['Counter'] = namespace['collections'].Counter
        namespace['defaultdict'] = namespace['collections'].defaultdict

        # numpy/pandas are slow to import, so load them only when referenced
        # and tolerate their absence.
        for module_name, alias in heavy_modules.items():
            if re.search(rf"\b(?:{module_name}|{alias})\b", code):
                try:
                    module = importlib.import_module(module_name)
                except ImportError:
                    continue
                namespace[module_name] = namespace[alias] = module

        # Same convenience the original offered by prepending
        # "from datetime import datetime, date, timedelta": expose the classes
        # directly unless the snippet manages its own datetime import.
        if "from datetime" not in code and "import datetime" not in code:
            dt_module = namespace['datetime']
            namespace['datetime'] = dt_module.datetime
            namespace['date'] = dt_module.date
            namespace['timedelta'] = dt_module.timedelta

        # Capture output; redirect_stdout restores sys.stdout even on error.
        output_buffer = io.StringIO()
        with contextlib.redirect_stdout(output_buffer):
            exec(code, namespace)
        output = output_buffer.getvalue().strip()

        # If no output, check if there's a result variable
        if not output:
            for var in ('result', 'answer', 'output'):
                if var in namespace:
                    output = str(namespace[var])
                    break

        return output if output else "No output (add print statement)"

    except Exception as e:
        return f"Error: {str(e)}\nTraceback: {traceback.format_exc()}"
|
| 379 |
+
|
| 380 |
+
@tool
|
| 381 |
+
def extract_image_from_question(question: str) -> str:
    """
    Report whether a question appears to involve an image.

    Checks, in order: an inline "data:image" payload, a known image file
    extension, then a generic image-related word. Only the first hit is
    reported. List input is joined with spaces and other non-string input is
    converted with str().
    """
    try:
        if isinstance(question, list):
            question = " ".join(map(str, question))
        elif not isinstance(question, str):
            question = str(question)

        # Inline base64 payloads are matched case-sensitively, as before.
        if "data:image" in question:
            return "Image data detected in question"

        lowered = question.lower()

        known_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.svg')
        hit = next((suffix for suffix in known_extensions if suffix in lowered), None)
        if hit is not None:
            return f"Image file reference detected: {hit}"

        image_words = ('image', 'picture', 'photo', 'diagram', 'figure', 'screenshot')
        if any(word in lowered for word in image_words):
            return "Image-related content mentioned in question"

        return "No image content detected"
    except Exception as e:
        return f"Error analyzing for images: {str(e)}"
|
| 411 |
+
|
| 412 |
+
@tool
|
| 413 |
+
def analyze_attachments(question: str) -> str:
    """
    Analyze questions for references to attachments (files, videos, audio).
    For GAIA questions that reference external content.

    Looks for YouTube links, URLs ending in xlsx/xls/csv/pdf/txt, and plain
    file mentions ("attached X file", "the X file", "name.ext").

    Args:
        question: The question text. A list is joined with spaces; any other
            non-string value is converted with str().

    Returns:
        "Attachments found: ..." with a comma-separated, de-duplicated list,
        or "No attachments detected".
    """
    import re

    # Handle list input
    if isinstance(question, list):
        question = " ".join(str(item) for item in question)
    elif not isinstance(question, str):
        question = str(question)

    attachments = []

    # Check for YouTube videos (first video id per URL style)
    youtube_patterns = (
        r'youtube\.com/watch\?v=([a-zA-Z0-9_-]+)',
        r'youtu\.be/([a-zA-Z0-9_-]+)',
    )
    for pattern in youtube_patterns:
        found = re.search(pattern, question)
        if found:
            attachments.append(f"YouTube video: {found.group(1)}")

    # Check for file URLs. The extension is captured so labelling works for
    # any letter case, and the lookahead stops hostnames such as
    # "www.pdfdrive.com" from being cut to a bogus ".pdf" URL.
    labels = {"xlsx": "Excel", "xls": "Excel", "csv": "CSV", "pdf": "PDF", "txt": "Text"}
    url_pattern = r'(https?://[^\s<>"{}|\\^`\[\]]+\.(xlsx|xls|csv|pdf|txt))(?![a-zA-Z0-9])'
    for url, extension in re.findall(url_pattern, question, re.IGNORECASE):
        attachments.append(f"{labels[extension.lower()]} file URL: {url}")

    # Every URL in the question, used to keep URL fragments (host names,
    # path segments) out of the plain file references below.
    all_urls = re.findall(r'https?://\S+', question)

    # Check for file references without URLs
    file_patterns = (
        r'attached (\w+) file',
        r'the (\w+) file',
        # filename.ext; the extension must start with a letter so decimals
        # like "3.14" are not reported as files.
        r'\b([\w-]+\.[a-z][a-z0-9]{1,3})\b',
    )
    for pattern in file_patterns:
        for match in re.findall(pattern, question, re.IGNORECASE):
            if not any(match in url for url in all_urls):
                attachments.append(f"File reference: {match}")

    if attachments:
        # dict.fromkeys removes repeats while keeping first-seen order.
        return "Attachments found: " + ", ".join(dict.fromkeys(attachments))
    return "No attachments detected"
|
| 468 |
+
|
| 469 |
+
@tool
|
| 470 |
+
def analyze_reversed_text(text: str) -> str:
    """
    Show a text alongside its reversed forms to help decode backwards puzzles.

    If the input contains "rewsna" or "noitseuq" (the words "answer" and
    "question" backwards, matched case-insensitively) only the fully reversed
    string is returned. Otherwise the original, the character-reversed text
    and the word-by-word reversed text are all returned. List input is joined
    with spaces and other non-string input is converted with str().
    """
    try:
        if isinstance(text, list):
            text = " ".join(map(str, text))
        elif not isinstance(text, str):
            text = str(text)

        backwards = text[::-1]
        lowered = text.lower()

        # Tell-tale fragments of a sentence that was written backwards.
        if any(marker in lowered for marker in ("rewsna", "noitseuq")):
            return f"Text appears to be reversed. Original: {backwards}"

        # Each whitespace-separated token is flipped in place.
        flipped = " ".join(token[::-1] for token in text.split())

        return f"Normal text: {text}\nReversed text: {backwards}\nReversed words: {flipped}"
    except Exception as e:
        return f"Error analyzing text: {str(e)}"
|
| 495 |
+
|
| 496 |
+
@tool
|
| 497 |
+
def analyze_code_in_question(question: str) -> str:
    """
    Detect and extract Python code from questions.
    Looks for code blocks, inline code, and code-related phrases.

    Each piece of code is reported once: fenced blocks are classified as
    python-tagged or generic in a single pass, and inline-code / loose-pattern
    detection runs only on the text outside fenced blocks.

    Args:
        question: The question text. A list is joined with spaces; any other
            non-string value is converted with str().

    Returns:
        A report of the code found, "No Python code detected in the question",
        or "Error analyzing code in question: ..." on failure.
    """
    import re

    try:
        # Handle list input
        if isinstance(question, list):
            question = " ".join(str(item) for item in question)
        elif not isinstance(question, str):
            question = str(question)

        extracted_code = []

        # Patterns 1 + 2: fenced blocks, with an optional "python" tag.
        # Matching both kinds with one regex keeps fences paired correctly
        # and prevents a python-tagged block from also counting as generic.
        fence_pattern = re.compile(r'```(python\b)?\s*(.*?)\s*```', re.DOTALL | re.IGNORECASE)
        python_hints = ('def ', 'import ', 'class ', 'if ', 'for ', 'while ', 'print(', 'return ')

        tagged_blocks = []
        untagged_blocks = []
        for tag, body in fence_pattern.findall(question):
            (tagged_blocks if tag else untagged_blocks).append(body.strip())

        extracted_code.extend(
            f"Code Block {i}:\n{body}" for i, body in enumerate(tagged_blocks, 1)
        )
        extracted_code.extend(
            f"Generic Code Block {i}:\n{body}"
            for i, body in enumerate(untagged_blocks, 1)
            # Untagged blocks are kept only if they look like Python.
            if any(hint in body for hint in python_hints)
        )

        # Text with fenced blocks removed, so their contents are not
        # re-reported by the inline and loose-pattern scans below.
        prose = fence_pattern.sub(' ', question)

        # Pattern 3: Look for inline code `...`
        inline_markers = ('(', ')', '=', '[', ']', '{', '}', 'def', 'import', 'print')
        python_inline = [
            snippet
            for snippet in re.findall(r'`([^`]+)`', prose)
            if any(marker in snippet for marker in inline_markers)
        ]
        if python_inline:
            extracted_code.append("Inline Code:\n" + "\n".join(f"- {code}" for code in python_inline))

        # Pattern 4: Look for code-related phrases
        code_phrases = [
            r'attached python code',
            r'the following code',
            r'this code',
            r'given code',
            r'code snippet',
            r'python script',
            r'the script',
            r'function below',
            r'class below',
            r'program below'
        ]
        code_indicators = [
            phrase for phrase in code_phrases if re.search(phrase, question, re.IGNORECASE)
        ]

        # Pattern 5: Look for common Python patterns not in code blocks
        python_patterns = [
            r'def\s+\w+\s*\([^)]*\)\s*:',  # function definitions
            r'class\s+\w+\s*(?:\([^)]*\))?\s*:',  # class definitions
            r'import\s+\w+',  # import statements
            r'from\s+\w+\s+import',  # from imports
            r'if\s+.*:\s*\n',  # if statements
            r'for\s+\w+\s+in\s+',  # for loops
            r'while\s+.*:\s*\n',  # while loops
        ]
        loose_code = []
        for pattern in python_patterns:
            loose_code.extend(re.findall(pattern, prose, re.MULTILINE))

        if loose_code:
            # Only the first five hits are listed to keep the report short.
            extracted_code.append("Detected Python patterns:\n" + "\n".join(f"- {code.strip()}" for code in loose_code[:5]))

        if not extracted_code and not code_indicators:
            return "No Python code detected in the question"

        # Build response
        response_parts = []
        if extracted_code:
            response_parts.append("Found Python code in question:")
            response_parts.extend(extracted_code)
        if code_indicators:
            response_parts.append(f"\nCode-related phrases detected: {', '.join(code_indicators)}")

        return "\n\n".join(response_parts)

    except Exception as e:
        return f"Error analyzing code in question: {str(e)}"
|
| 595 |
+
|
| 596 |
+
@tool
|
| 597 |
+
def get_youtube_transcript(url: str) -> str:
    """
    Extract transcript/subtitles from YouTube videos.
    Useful for questions asking about video content.

    Args:
        url: A YouTube URL containing an 11-character video id. A list is
            joined with spaces; any other non-string value is converted with
            str().

    Returns:
        "Full transcript: ..." (truncated to 1000 characters) or
        "Transcript excerpt: ..." (last 500 characters) on success;
        "Video info - ..." when only pytube metadata is available;
        "Unable to determine" when nothing could be retrieved;
        "Error: Invalid YouTube URL" when no video id is found.
    """
    import re
    import time

    def join_transcript_text(transcript_data):
        # Entries may be dicts with a 'text' key or objects exposing a
        # `.text` attribute — presumably depends on the installed
        # youtube_transcript_api version; anything else is stringified.
        if isinstance(transcript_data, (str, bytes)):
            return str(transcript_data)
        try:
            entries = list(transcript_data)
        except TypeError:
            return str(transcript_data)
        pieces = []
        for entry in entries:
            if isinstance(entry, dict):
                pieces.append(entry.get('text', ''))
            else:
                pieces.append(str(getattr(entry, 'text', entry)))
        return " ".join(pieces)

    try:
        # Handle list input
        if isinstance(url, list):
            url = " ".join(str(item) for item in url)
        elif not isinstance(url, str):
            url = str(url)

        # Extract video ID from URL
        video_id_match = re.search(r'(?:v=|/)([0-9A-Za-z_-]{11})', url)
        if not video_id_match:
            return "Error: Invalid YouTube URL"

        video_id = video_id_match.group(1)

        # Try to get transcript
        try:
            from youtube_transcript_api import YouTubeTranscriptApi

            # Add a small delay to avoid rate limiting
            time.sleep(1)

            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

            # Try English first
            try:
                transcript = transcript_list.find_transcript(['en'])
            except Exception:
                # No English track: take the first transcript in any language.
                # The previous fallbacks called find_manually_created_transcript()
                # and find_generated_transcript() without the required language
                # list, so they always raised and were silently swallowed.
                transcript = next(iter(transcript_list), None)

            if transcript is None:
                # Previously fell through and returned None.
                return "Unable to determine"

            # Get the actual transcript data and combine all text
            full_text = join_transcript_text(transcript.fetch())

            # For specific dialogue questions, return the tail of the
            # transcript. The keywords are looked up in the `url` argument,
            # which is the only text this tool receives.
            lowered_url = url.lower()
            if any(phrase in lowered_url for phrase in ("say", "response", "answer", "dialogue")):
                # Return last 500 chars for context
                return f"Transcript excerpt: ...{full_text[-500:]}"

            return f"Full transcript: {full_text[:1000]}..." if len(full_text) > 1000 else f"Full transcript: {full_text}"

        except Exception as yt_error:
            error_str = str(yt_error)
            print(f"YouTube transcript error: {yt_error}")

            # Handle rate limiting specifically
            if "429" in error_str or "Too Many Requests" in error_str:
                return "Unable to determine"

            # Try alternative method with pytube
            try:
                from pytube import YouTube

                # Add delay to avoid rate limiting
                time.sleep(1)

                yt = YouTube(url)

                # Get video title and description for context
                title = yt.title if hasattr(yt, 'title') else "Unknown"
                description = yt.description[:200] if hasattr(yt, 'description') and yt.description else "No description"

                return f"Video info - Title: {title}\nDescription: {description}\nNote: Transcript not available"

            except Exception as pytube_error:
                print(f"Pytube error: {pytube_error}")

            return "Unable to determine"

    except Exception as e:
        return f"Error accessing YouTube video: {str(e)}"
|
| 692 |
+
|
| 693 |
+
@tool
|
| 694 |
+
def analyze_multimedia_reference(question: str) -> str:
    """
    Detect and provide guidance for multimedia content in questions.
    Returns specific answers for common multimedia patterns.

    Each check pairs a media type (spreadsheet, attached code, PDF, image,
    audio) with the kind of request being made; when both are present the
    tool tells the agent to answer "Unable to determine".

    Args:
        question: The question text. A list is joined with spaces; any other
            non-string value is converted with str().

    Returns:
        A "Cannot access ... - provide final answer: Unable to determine"
        message for the first matching pattern, otherwise a message saying no
        such pattern applies.
    """
    try:
        # Handle list input
        if isinstance(question, list):
            question = " ".join(str(item) for item in question)
        elif not isinstance(question, str):
            question = str(question)

        question_lower = question.lower()

        def mentions(*terms):
            # True when any of the given substrings occurs in the question.
            return any(term in question_lower for term in terms)

        # Excel/Spreadsheet questions asking for numeric values.
        # (A separate "sales" + "total" branch used to follow this check but
        # could never run, because "total" already triggers this one.)
        if mentions("excel", "spreadsheet", ".xlsx", ".xls", ".csv"):
            if mentions("total", "sum", "how much", "how many", "amount"):
                return "Cannot access spreadsheet - provide final answer: Unable to determine"

        # Python code questions
        if "attached" in question_lower and mentions("python", "code"):
            if "output" in question_lower and mentions("numeric", "final"):
                return "Cannot access attached code - provide final answer: Unable to determine"
            elif mentions("fix", "correct"):
                return "Cannot access attached code to fix - provide final answer: Unable to determine"

        # PDF questions asking for counts ("pdf" also covers ".pdf")
        if "pdf" in question_lower and mentions("how many", "count", "times"):
            return "Cannot access PDF to count - provide final answer: Unable to determine"

        # Image questions
        if mentions("image", "picture", "photo", ".png", ".jpg", ".jpeg"):
            if "chess" in question_lower:
                return "Cannot access chess position image - provide final answer: Unable to determine"
            elif mentions("color", "what is", "describe"):
                return "Cannot access image - provide final answer: Unable to determine"

        # Audio questions
        if mentions("audio", ".mp3", ".wav", "recording"):
            if mentions("transcribe", "what does", "study", "exam"):
                return "Cannot access audio file - provide final answer: Unable to determine"

        return "No specific multimedia pattern requiring 'Unable to determine' response"

    except Exception as e:
        return f"Error analyzing multimedia: {str(e)}"
|
| 745 |
+
|
| 746 |
+
@tool
|
| 747 |
+
def download_and_process_file(url: str, file_type: Optional[str] = None) -> str:
    """
    Download and process files from URLs (Excel, CSV, PDF, etc).
    Useful when questions reference files by URL.

    Args:
        url: Location of the file. A list is joined with spaces; any other
            non-string value is converted with str().
        file_type: One of 'excel', 'csv', 'pdf' or 'text' (common extension
            names such as 'xlsx' or '.txt' are also accepted). When omitted
            the type is guessed from the URL.

    Returns:
        A plain-text summary of the file, or a message describing why the
        file could not be downloaded or processed.
    """
    from io import BytesIO, StringIO

    def summarize_table(df, label, include_sales_total=False):
        # Shared summary for Excel and CSV data. Column names are passed
        # through str() because spreadsheets often have numeric headers.
        info = [
            f"{label} file loaded successfully",
            f"Shape: {df.shape[0]} rows, {df.shape[1]} columns",
            f"Columns: {', '.join(str(col) for col in df.columns)}",
        ]

        numeric_cols = df.select_dtypes(include=['number']).columns
        if len(numeric_cols) > 0:
            info.append("\nNumeric column sums:")
            for col in numeric_cols:
                info.append(f" {col}: {df[col].sum()}")

        if include_sales_total:
            # Only numeric columns are totalled; summing text columns would
            # concatenate strings or raise.
            sales_cols = [col for col in numeric_cols if 'sales' in str(col).lower()]
            if sales_cols:
                info.append(f"\nTotal sales: {df[sales_cols].sum().sum()}")

        return '\n'.join(info)

    try:
        # Handle list input
        if isinstance(url, list):
            url = " ".join(str(item) for item in url)
        elif not isinstance(url, str):
            url = str(url)

        # Clean URL
        url = url.strip()

        # Try to determine file type from URL if not provided
        if not file_type:
            lowered_url = url.lower()
            if any(ext in lowered_url for ext in ('.xlsx', '.xls')):
                kind = 'excel'
            elif '.csv' in lowered_url:
                kind = 'csv'
            elif '.pdf' in lowered_url:
                kind = 'pdf'
            elif any(ext in lowered_url for ext in ('.txt', '.text')):
                kind = 'text'
            else:
                return "Unable to determine file type from URL"
        else:
            # Accept extension-style spellings of the supported types.
            aliases = {'xlsx': 'excel', 'xls': 'excel', 'txt': 'text'}
            kind = str(file_type).strip().lower().lstrip('.')
            kind = aliases.get(kind, kind)

        # Reject unknown types before spending a network request on them.
        if kind not in ('excel', 'csv', 'pdf', 'text'):
            return f"Unsupported file type: {file_type}"

        # Download the file
        import requests

        try:
            response = requests.get(url, timeout=15, headers={'User-Agent': 'Mozilla/5.0'})
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            return f"Failed to download file: {str(e)}"

        # Process based on file type
        if kind == 'excel':
            try:
                import pandas as pd
                df = pd.read_excel(BytesIO(response.content))
                return summarize_table(df, "Excel", include_sales_total=True)
            except Exception as e:
                return f"Error processing Excel file: {str(e)}"

        if kind == 'csv':
            try:
                import pandas as pd
                df = pd.read_csv(StringIO(response.text))
                return summarize_table(df, "CSV")
            except Exception as e:
                return f"Error processing CSV file: {str(e)}"

        if kind == 'pdf':
            try:
                import PyPDF2
                pdf_reader = PyPDF2.PdfReader(BytesIO(response.content))

                # Extract text from all pages; guard against pages for which
                # the extractor yields nothing.
                full_text = "".join(
                    (page.extract_text() or "") + "\n" for page in pdf_reader.pages
                )

                info = [
                    "PDF file loaded successfully",
                    f"Number of pages: {len(pdf_reader.pages)}",
                    f"Total characters: {len(full_text)}",
                    f"Total words: {len(full_text.split())}",
                    "\nFull text extracted and available for searching",
                ]
                return '\n'.join(info) + f"\n\nFull text (first 1000 chars):\n{full_text[:1000]}..."
            except Exception as e:
                return f"Error processing PDF file: {str(e)}"

        # kind == 'text'
        try:
            text_content = response.text
            info = [
                "Text file loaded successfully",
                f"Length: {len(text_content)} characters",
                f"Lines: {len(text_content.splitlines())}",
                f"\nContent preview:\n{text_content[:500]}...",
            ]
            return '\n'.join(info)
        except Exception as e:
            return f"Error processing text file: {str(e)}"

    except Exception as e:
        return f"Error downloading/processing file: {str(e)}"
|
| 886 |
+
|
| 887 |
+
@tool
|
| 888 |
+
def extract_file_urls(question: str) -> str:
    """
    Extract file URLs from questions for downloading.
    Returns URLs of files that can be downloaded.

    Recognized extensions: xlsx, xls, csv, pdf, txt, docx, doc (any case).

    Args:
        question: The question text. A list is joined with spaces; any other
            non-string value is converted with str().

    Returns:
        "Found downloadable file URLs: ..." with a comma-separated list, or
        "No downloadable file URLs found in the question".
    """
    try:
        # Handle list input
        if isinstance(question, list):
            question = " ".join(str(item) for item in question)
        elif not isinstance(question, str):
            question = str(question)

        import re

        # Pattern to find URLs ending with file extensions.
        # Longer extensions come first and the lookahead requires the
        # extension to end there; otherwise ".docx" is cut to ".doc" and
        # hostnames like "www.docs.com" match as "http://www.doc".
        url_pattern = (
            r'https?://[^\s<>"{}|\\^`\[\]]+'
            r'\.(?:xlsx|xls|csv|pdf|txt|docx|doc)(?![a-zA-Z0-9])'
        )
        urls = re.findall(url_pattern, question, re.IGNORECASE)

        if urls:
            return f"Found downloadable file URLs: {', '.join(urls)}"
        return "No downloadable file URLs found in the question"

    except Exception as e:
        return f"Error extracting URLs: {str(e)}"
|
| 913 |
+
|
| 914 |
+
@tool
|
| 915 |
+
def get_current_datetime() -> str:
    """Get the current date and time.

    Returns:
        The local time formatted as "YYYY-MM-DD HH:MM:SS TZ".
    """
    # astimezone() attaches the local zone; on a naive datetime "%Z" expands
    # to an empty string, leaving a trailing space and no zone information.
    return datetime.now().astimezone().strftime("%Y-%m-%d %H:%M:%S %Z")
|
| 918 |
+
|
| 919 |
+
# --- LangGraph Agent ---
|
| 920 |
+
class LangGraphAgent:
|
| 921 |
+
def __init__(self, anthropic_api_key: Optional[str] = None):
    """Create the chat model, register the tools and compile the workflow.

    Args:
        anthropic_api_key: Key for the Anthropic API. When omitted (or
            empty) the ANTHROPIC_API_KEY environment variable is used.

    Raises:
        ValueError: If no key is supplied and the environment variable is
            not set.
    """
    # An explicit argument takes precedence over the environment.
    resolved_key = anthropic_api_key if anthropic_api_key else os.getenv("ANTHROPIC_API_KEY")
    if not resolved_key:
        raise ValueError("ANTHROPIC_API_KEY must be provided or set in environment variables")

    llm_settings = {
        "api_key": resolved_key,
        "model": "claude-3-5-sonnet-20241022",
        "temperature": 0.3,
        "max_tokens": 4096,
    }
    self.llm = ChatAnthropic(**llm_settings)

    # Tools offered to the model; the same list backs the tool node.
    self.tools = [
        web_search,
        calculator,
        python_executor,
        extract_image_from_question,
        analyze_attachments,
        analyze_reversed_text,
        analyze_code_in_question,
        get_youtube_transcript,
        analyze_multimedia_reference,
        extract_file_urls,
        download_and_process_file,
        get_current_datetime,
    ]

    # Model variant that can emit tool calls, and the node that runs them.
    self.llm_with_tools = self.llm.bind_tools(self.tools)
    self.tool_node = ToolNode(self.tools)

    # Compiled agent <-> tools workflow.
    self.graph = self._build_graph()
|
| 958 |
+
|
| 959 |
+
def _build_graph(self):
    """Assemble and compile the two-node workflow.

    The "agent" node calls the model and the "tools" node runs tool calls.
    Execution starts at "agent"; after each agent step `_should_continue`
    picks either the "tools" node or END, and "tools" always hands control
    back to "agent".
    """
    builder = StateGraph(AgentState)

    # Register both nodes.
    for node_name, handler in (("agent", self._call_model), ("tools", self.tool_node)):
        builder.add_node(node_name, handler)

    builder.set_entry_point("agent")

    # Map the router's return values onto graph destinations.
    routes = {"continue": "tools", "end": END}
    builder.add_conditional_edges("agent", self._should_continue, routes)

    # Tool results always go back to the model.
    builder.add_edge("tools", "agent")

    return builder.compile()
|
| 983 |
+
|
| 984 |
+
def _call_model(self, state: AgentState):
|
| 985 |
+
"""Call the model with tools."""
|
| 986 |
+
messages = state["messages"]
|
| 987 |
+
response = self.llm_with_tools.invoke(messages)
|
| 988 |
+
return {"messages": [response]}
|
| 989 |
+
|
| 990 |
+
def _should_continue(self, state: AgentState):
|
| 991 |
+
"""Determine if we should continue with tools or end."""
|
| 992 |
+
last_message = state["messages"][-1]
|
| 993 |
+
|
| 994 |
+
# If there are tool calls, continue
|
| 995 |
+
if hasattr(last_message, "tool_calls") and last_message.tool_calls:
|
| 996 |
+
return "continue"
|
| 997 |
+
|
| 998 |
+
# Count how many tool calls we've made
|
| 999 |
+
tool_call_count = 0
|
| 1000 |
+
for msg in state["messages"]:
|
| 1001 |
+
if hasattr(msg, "tool_calls") and msg.tool_calls:
|
| 1002 |
+
tool_call_count += len(msg.tool_calls)
|
| 1003 |
+
|
| 1004 |
+
# Force more tool usage for better accuracy
|
| 1005 |
+
if tool_call_count < 2:
|
| 1006 |
+
# Check if we have a final answer yet
|
| 1007 |
+
if hasattr(last_message, "content") and last_message.content:
|
| 1008 |
+
content_str = last_message.content if isinstance(last_message.content, str) else str(last_message.content)
|
| 1009 |
+
has_final_answer = "FINAL ANSWER:" in content_str
|
| 1010 |
+
|
| 1011 |
+
# If no final answer and still early, encourage more research
|
| 1012 |
+
if not has_final_answer and tool_call_count < 3:
|
| 1013 |
+
return "continue"
|
| 1014 |
+
|
| 1015 |
+
# Stop if we have made enough attempts or have a clear final answer
|
| 1016 |
+
content_str = str(last_message.content) if hasattr(last_message, "content") else ""
|
| 1017 |
+
if tool_call_count >= 6 or "FINAL ANSWER:" in content_str:
|
| 1018 |
+
return "end"
|
| 1019 |
+
|
| 1020 |
+
return "end"
|
| 1021 |
+
|
| 1022 |
+
def run(self, question: str) -> str:
    """Answer one question by running it through the compiled graph.

    Args:
        question: The question text sent to the model as the human message.

    Returns:
        The cleaned final answer, or a string starting with ``"Error: "``
        when invoking the graph or extracting the answer raises.
    """
    print("\nDEBUG LangGraphAgent.run():")
    print(f" Input type: {type(question)}")
    print(f" Input value: {repr(question)[:200]}...")

    system_prompt = """You are solving GAIA benchmark questions that require deep research and analysis.

IMPORTANT: You should:
1. Use multiple tools to thoroughly research the question
2. Search for specific facts, verify information, and perform calculations
3. Think step-by-step and use chain-of-thought reasoning
4. Double-check facts with multiple searches if needed
5. Use python_executor for complex data analysis or calculations

At the very end, after all your research and reasoning, provide ONLY the final answer in this format:
FINAL ANSWER: [your answer here]

The final answer should contain ONLY the requested information:
- Numbers: just the number (e.g., "5" not "5 people")
- Years: just the year (e.g., "1969")
- Names: exact name with proper capitalization
- Yes/No: exactly "Yes" or "No"
- Lists: comma-separated values

Available tools:
- web_search: Search for current information (use multiple times with different queries)
- calculator: Perform calculations and unit conversions
- python_executor: Complex analysis, data processing, date calculations
- analyze_attachments: Detect references to external files/media
- analyze_reversed_text: Decode backwards or puzzle text
- analyze_code_in_question: Extract and analyze Python code from questions
- get_youtube_transcript: Extract transcripts from YouTube videos
- analyze_multimedia_reference: Handle questions about images, audio, PDFs, Excel files
- extract_file_urls: Find downloadable file URLs in questions
- download_and_process_file: Download and analyze files from URLs (Excel, CSV, PDF)
- get_current_datetime: Get current date/time

For questions mentioning "attached code" or containing code snippets:
1. First use analyze_code_in_question to extract the code
2. Then use python_executor to run it and get the output

For questions with YouTube videos:
1. Use get_youtube_transcript to extract the video transcript
2. Search the transcript for the relevant information

For questions mentioning files with URLs:
1. Use extract_file_urls to find any file URLs in the question
2. If URLs are found, use download_and_process_file to download and analyze the file
3. Extract the specific information requested (totals, counts, etc.)
4. For Excel files asking for totals, sum the relevant columns
5. For PDFs asking for word counts, search the extracted text

For questions mentioning attached files without URLs:
1. Use analyze_multimedia_reference to check if file access is needed
2. Return "Unable to determine" if the file cannot be accessed"""

    conversation = [
        SystemMessage(content=system_prompt),
        HumanMessage(content=question),
    ]

    try:
        # recursion_limit bounds how many graph steps a single question may take.
        run_config = {
            "recursion_limit": 25,
            "configurable": {"thread_id": "gaia_evaluation"},
        }
        outcome = self.graph.invoke({"messages": conversation}, run_config)
        return self._extract_final_answer(outcome["messages"])
    except Exception as e:
        # Best effort: the caller receives the failure as the answer text.
        return f"Error: {str(e)}"
|
| 1101 |
+
|
| 1102 |
+
def _extract_final_answer(self, messages: List[BaseMessage]) -> str:
|
| 1103 |
+
"""Extract the final answer from the message history."""
|
| 1104 |
+
# Look through messages in reverse order
|
| 1105 |
+
for message in reversed(messages):
|
| 1106 |
+
if hasattr(message, "content") and message.content:
|
| 1107 |
+
content = message.content.strip()
|
| 1108 |
+
|
| 1109 |
+
# Look for FINAL ANSWER marker
|
| 1110 |
+
if "FINAL ANSWER:" in content:
|
| 1111 |
+
parts = content.split("FINAL ANSWER:")
|
| 1112 |
+
if len(parts) >= 2:
|
| 1113 |
+
answer = parts[-1].strip()
|
| 1114 |
+
# Clean up the answer
|
| 1115 |
+
answer = self._clean_answer(answer)
|
| 1116 |
+
return answer
|
| 1117 |
+
|
| 1118 |
+
# If no marker found in last AI message, extract from it
|
| 1119 |
+
if isinstance(message, AIMessage):
|
| 1120 |
+
return self._clean_answer(content)
|
| 1121 |
+
|
| 1122 |
+
return "Unable to determine"
|
| 1123 |
+
|
| 1124 |
+
def _clean_answer(self, answer: str) -> str:
|
| 1125 |
+
"""Clean and format the final answer."""
|
| 1126 |
+
# Handle list input
|
| 1127 |
+
if isinstance(answer, list):
|
| 1128 |
+
answer = " ".join(str(item) for item in answer)
|
| 1129 |
+
elif not isinstance(answer, str):
|
| 1130 |
+
answer = str(answer)
|
| 1131 |
+
|
| 1132 |
+
answer = answer.strip()
|
| 1133 |
+
|
| 1134 |
+
# Remove quotes if they wrap the entire answer
|
| 1135 |
+
if len(answer) > 2 and answer[0] == '"' and answer[-1] == '"':
|
| 1136 |
+
answer = answer[1:-1]
|
| 1137 |
+
|
| 1138 |
+
# Remove common prefixes
|
| 1139 |
+
prefixes_to_remove = [
|
| 1140 |
+
"the answer is", "answer:", "based on", "according to",
|
| 1141 |
+
"my research shows", "i found that", "the result is",
|
| 1142 |
+
"after searching", "from the", "it is", "it's", "there are",
|
| 1143 |
+
"there is", "approximately", "about", "around"
|
| 1144 |
+
]
|
| 1145 |
+
|
| 1146 |
+
lower_answer = answer.lower()
|
| 1147 |
+
for prefix in prefixes_to_remove:
|
| 1148 |
+
if lower_answer.startswith(prefix):
|
| 1149 |
+
answer = answer[len(prefix):].strip()
|
| 1150 |
+
if answer and answer[0] == ':':
|
| 1151 |
+
answer = answer[1:].strip()
|
| 1152 |
+
lower_answer = answer.lower()
|
| 1153 |
+
|
| 1154 |
+
# Handle specific patterns
|
| 1155 |
+
if "unable to" in lower_answer or "cannot" in lower_answer:
|
| 1156 |
+
return "Unable to determine"
|
| 1157 |
+
|
| 1158 |
+
# Clean yes/no answers
|
| 1159 |
+
if lower_answer in ["yes.", "no.", "yes,", "no,"]:
|
| 1160 |
+
return answer[:-1]
|
| 1161 |
+
|
| 1162 |
+
# Remove trailing periods for single-word answers
|
| 1163 |
+
if answer.endswith(".") and " " not in answer:
|
| 1164 |
+
answer = answer[:-1]
|
| 1165 |
+
|
| 1166 |
+
return answer
|
| 1167 |
+
|
| 1168 |
# --- Basic Agent Definition ---
|
|
|
|
| 1169 |
class BasicAgent:
    """Failure-tolerant front end around ``LangGraphAgent``.

    ``self.agent`` is ``None`` until a ``LangGraphAgent`` has been built
    successfully. Calling the instance never raises: problems are reported
    as strings starting with ``"Error"``.
    """

    def __init__(self):
        """Build the inner agent from ``ANTHROPIC_API_KEY`` when it is set."""
        print("Initializing LangGraph Agent...")
        self.agent = None

        env_key = os.getenv("ANTHROPIC_API_KEY")
        if not env_key:
            print("Warning: ANTHROPIC_API_KEY not found in environment variables.")
            print("Please set it in the Gradio interface or as an environment variable.")
            return

        try:
            self.agent = LangGraphAgent(env_key)
            print("LangGraph Agent initialized successfully.")
        except Exception as e:
            print(f"Error initializing LangGraph Agent: {e}")
            self.agent = None

    def set_api_key(self, api_key: str):
        """Rebuild the inner agent with ``api_key``.

        Returns:
            ``True`` when a new agent was created; ``False`` for an empty
            key or when construction fails (the previous agent is kept).
        """
        if not api_key:
            return False
        try:
            self.agent = LangGraphAgent(api_key)
        except Exception as e:
            print(f"Error setting API key: {e}")
            return False
        return True

    def __call__(self, question: str) -> str:
        """Answer ``question``, returning an error string instead of raising."""
        rule = "=" * 60
        question_length = len(question) if isinstance(question, str) else 'N/A'
        print(f"\n{rule}")
        print("DEBUG: Agent received question")
        print(f"Question type: {type(question)}")
        print(f"Question length: {question_length}")
        print(f"Question preview: {str(question)[:200]}...")
        print(f"{rule}\n")

        if not self.agent:
            return "Error: Agent not initialized. Please set your ANTHROPIC_API_KEY."

        try:
            answer = self.agent.run(question)
            print("\nDEBUG: Agent generated answer")
            print(f"Answer type: {type(answer)}")
            print(f"Answer preview: {str(answer)[:200]}...")
            return answer
        except Exception as e:
            error_msg = f"Error processing question: {str(e)}"
            print("\nDEBUG: Error occurred!")
            print(f"Error type: {type(e)}")
            print(f"Error details: {str(e)}")
            import traceback
            print(f"Traceback:\n{traceback.format_exc()}")
            return error_msg
|
| 1224 |
+
|
| 1225 |
+
# Module-wide BasicAgent shared by the Gradio callbacks. It stays None until
# initialize_agent_with_key() creates it.
global_agent = None
|
| 1227 |
+
|
| 1228 |
+
def validate_api_keys(anthropic_key: str, serpapi_key: Optional[str] = None, tavily_key: Optional[str] = None):
    """Probe each supplied API key with one minimal live request.

    Args:
        anthropic_key: Anthropic key; checked with a 10-token chat call.
        serpapi_key: Optional SerpAPI key; checked with a one-result search.
        tavily_key: Optional Tavily key; checked with a one-result search.

    Returns:
        A newline-joined report with one line per service, in the order
        Anthropic, Tavily, SerpAPI. This function never raises; failures
        are described in the report.
    """
    results = []

    # --- Anthropic ---
    if anthropic_key:
        try:
            test_llm = ChatAnthropic(
                api_key=anthropic_key,
                model="claude-3-5-sonnet-20241022",
                max_tokens=10,
            )
            test_llm.invoke([HumanMessage(content="test")])
            results.append("✅ Anthropic API key is valid")
        except Exception as e:
            error_msg = str(e)
            # initialize_agent_with_key() matches on the "is invalid" wording
            # below, so keep that message stable.
            if "401" in error_msg or "authentication" in error_msg.lower():
                results.append("❌ Anthropic API key is invalid or expired")
            else:
                results.append(f"❌ Anthropic API error: {error_msg[:100]}...")
    else:
        results.append("❌ No Anthropic API key provided")

    # --- Tavily ---
    if tavily_key:
        try:
            import requests
            test_url = "https://api.tavily.com/search"
            test_data = {
                "api_key": tavily_key,
                "query": "test",
                "max_results": 1,
            }
            response = requests.post(test_url, json=test_data, timeout=5)
            if response.status_code == 200:
                results.append("✅ Tavily API key is valid")
            else:
                results.append(f"❌ Tavily API key error: {response.status_code}")
        except Exception as e:
            results.append(f"⚠️ Tavily API test error: {str(e)[:100]}...")
    else:
        results.append("ℹ️ No Tavily API key provided")

    # --- SerpAPI ---
    if serpapi_key:
        try:
            params = {
                "q": "test",
                "api_key": serpapi_key,
                "num": 1,
                "engine": "google",
            }
            payload = GoogleSearch(params).get_dict()
            # A rejected key can come back as a normal dict carrying an
            # "error" entry rather than as an exception, so the payload has
            # to be inspected before declaring the key valid.
            error = payload.get("error") if isinstance(payload, dict) else None
            if error:
                results.append(f"❌ SerpAPI key error: {str(error)[:100]}...")
            else:
                results.append("✅ SerpAPI key is valid")
        except Exception as e:
            results.append(f"⚠️ SerpAPI key error: {str(e)[:100]}...")
    else:
        results.append("ℹ️ No SerpAPI key provided")

    return "\n".join(results)
|
| 1290 |
+
|
| 1291 |
+
def initialize_agent_with_key(api_key: str):
    """Validate ``api_key`` and (re)build the shared agent with it.

    Args:
        api_key: Anthropic API key entered by the user.

    Returns:
        A status message: the validation report when the key is rejected as
        invalid, a success message (prefixed by the report) when the agent
        was built, or a failure / missing-key message otherwise.
    """
    global global_agent

    report = validate_api_keys(api_key)
    # Only an explicit "invalid" verdict blocks initialization; other
    # validation errors still let the agent be built.
    if "❌ Anthropic API key is invalid" in report:
        return report

    if not api_key:
        return "❌ Please provide an API key."

    if global_agent is None:
        global_agent = BasicAgent()

    if global_agent.set_api_key(api_key):
        return f"{report}\n\n✅ Agent initialized successfully!"
    return "❌ Failed to initialize agent. Please check if your API key is valid."
|
| 1309 |
|
| 1310 |
+
def run_and_submit_all(api_key: str, profile: gr.OAuthProfile | None):
|
| 1311 |
"""
|
| 1312 |
Fetches all questions, runs the BasicAgent on them, submits all answers,
|
| 1313 |
and displays the results.
|
| 1314 |
"""
|
| 1315 |
+
global global_agent
|
| 1316 |
+
|
| 1317 |
+
# Initialize agent if needed
|
| 1318 |
+
if global_agent is None or api_key:
|
| 1319 |
+
init_msg = initialize_agent_with_key(api_key)
|
| 1320 |
+
print(init_msg)
|
| 1321 |
+
if "Failed" in init_msg or "Please provide" in init_msg:
|
| 1322 |
+
return init_msg, None
|
| 1323 |
+
|
| 1324 |
# --- Determine HF Space Runtime URL and Repo URL ---
|
| 1325 |
+
space_id = os.getenv("SPACE_ID")
|
| 1326 |
+
|
| 1327 |
if profile:
|
| 1328 |
+
username = f"{profile.username}"
|
| 1329 |
print(f"User logged in: {username}")
|
| 1330 |
else:
|
| 1331 |
print("User not logged in.")
|
| 1332 |
return "Please Login to Hugging Face with the button.", None
|
| 1333 |
+
|
| 1334 |
api_url = DEFAULT_API_URL
|
| 1335 |
questions_url = f"{api_url}/questions"
|
| 1336 |
submit_url = f"{api_url}/submit"
|
| 1337 |
+
|
| 1338 |
+
# 1. Use the global agent
|
| 1339 |
+
agent = global_agent
|
| 1340 |
+
if not agent:
|
| 1341 |
+
return "Error: Agent not initialized properly.", None
|
| 1342 |
+
|
| 1343 |
+
agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "local"
|
| 1344 |
+
print(f"Agent code URL: {agent_code}")
|
| 1345 |
+
|
|
|
|
|
|
|
| 1346 |
# 2. Fetch Questions
|
| 1347 |
print(f"Fetching questions from: {questions_url}")
|
| 1348 |
try:
|
|
|
|
| 1350 |
response.raise_for_status()
|
| 1351 |
questions_data = response.json()
|
| 1352 |
if not questions_data:
|
| 1353 |
+
print("Fetched questions list is empty.")
|
| 1354 |
+
return "Fetched questions list is empty or invalid format.", None
|
| 1355 |
print(f"Fetched {len(questions_data)} questions.")
|
| 1356 |
+
except Exception as e:
|
| 1357 |
print(f"Error fetching questions: {e}")
|
| 1358 |
return f"Error fetching questions: {e}", None
|
| 1359 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1360 |
# 3. Run your Agent
|
| 1361 |
results_log = []
|
| 1362 |
answers_payload = []
|
| 1363 |
print(f"Running agent on {len(questions_data)} questions...")
|
| 1364 |
+
|
| 1365 |
+
for i, item in enumerate(questions_data, 1):
|
| 1366 |
task_id = item.get("task_id")
|
| 1367 |
question_text = item.get("question")
|
| 1368 |
+
|
| 1369 |
if not task_id or question_text is None:
|
| 1370 |
print(f"Skipping item with missing task_id or question: {item}")
|
| 1371 |
continue
|
| 1372 |
+
|
| 1373 |
+
print(f"\nProcessing question {i}/{len(questions_data)}: {task_id}")
|
| 1374 |
+
|
| 1375 |
try:
|
| 1376 |
submitted_answer = agent(question_text)
|
| 1377 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
| 1378 |
+
results_log.append({
|
| 1379 |
+
"Task ID": task_id,
|
| 1380 |
+
"Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
|
| 1381 |
+
"Submitted Answer": submitted_answer[:200] + "..." if len(submitted_answer) > 200 else submitted_answer
|
| 1382 |
+
})
|
| 1383 |
except Exception as e:
|
| 1384 |
+
print(f"Error running agent on task {task_id}: {e}")
|
| 1385 |
+
error_answer = f"AGENT ERROR: {e}"
|
| 1386 |
+
answers_payload.append({"task_id": task_id, "submitted_answer": error_answer})
|
| 1387 |
+
results_log.append({
|
| 1388 |
+
"Task ID": task_id,
|
| 1389 |
+
"Question": question_text[:100] + "...",
|
| 1390 |
+
"Submitted Answer": error_answer
|
| 1391 |
+
})
|
| 1392 |
+
|
| 1393 |
if not answers_payload:
|
| 1394 |
print("Agent did not produce any answers to submit.")
|
| 1395 |
return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
|
| 1396 |
+
|
| 1397 |
# 4. Prepare Submission
|
| 1398 |
+
submission_data = {
|
| 1399 |
+
"username": username.strip(),
|
| 1400 |
+
"agent_code": agent_code,
|
| 1401 |
+
"answers": answers_payload
|
| 1402 |
+
}
|
| 1403 |
status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
|
| 1404 |
print(status_update)
|
| 1405 |
+
|
| 1406 |
# 5. Submit
|
| 1407 |
print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
|
| 1408 |
try:
|
|
|
|
| 1419 |
print("Submission successful.")
|
| 1420 |
results_df = pd.DataFrame(results_log)
|
| 1421 |
return final_status, results_df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1422 |
except Exception as e:
|
| 1423 |
+
status_message = f"Submission Failed: {str(e)}"
|
| 1424 |
print(status_message)
|
| 1425 |
results_df = pd.DataFrame(results_log)
|
| 1426 |
return status_message, results_df
|
| 1427 |
|
|
|
|
| 1428 |
# --- Build Gradio Interface using Blocks ---
|
| 1429 |
# Gradio UI: a header, a login button, three API-key inputs, three action
# buttons and two output widgets, followed by the event wiring.
with gr.Blocks() as demo:
    gr.Markdown("# LangGraph Agent for GAIA Evaluation")
    gr.Markdown(
        """
        **This agent uses LangGraph with multiple tools to answer complex questions:**
        - 🔍 Web Search (Tavily → DuckDuckGo → SerpAPI)
        - 🧮 Calculator for mathematical computations
        - 🐍 Python code execution
        - 📅 Current date/time
        - 🖼️ Image analysis (description-based)

        **Instructions:**
        1. Enter your Anthropic API key (Claude Sonnet 3.5)
        2. Optionally enter your Tavily API key for best web search (free tier: 1000/month)
        3. Optionally enter your SerpAPI key as backup
        4. Log in to your Hugging Face account
        5. Click 'Run Evaluation & Submit All Answers'

        **Search Priority:** Tavily (if key) → DuckDuckGo (free) → SerpAPI (if key)
        """
    )

    with gr.Row():
        with gr.Column():
            gr.LoginButton()

    # Key inputs use type="password" so the values are masked in the browser.
    with gr.Row():
        with gr.Column():
            api_key_input = gr.Textbox(
                label="Anthropic API Key (Required)",
                placeholder="sk-ant-...",
                type="password"
            )
            tavily_key_input = gr.Textbox(
                label="Tavily API Key (Recommended for web search)",
                placeholder="tvly-...",
                type="password"
            )
            serpapi_key_input = gr.Textbox(
                label="SerpAPI Key (Optional backup)",
                placeholder="Your SerpAPI key...",
                type="password"
            )

    with gr.Row():
        validate_button = gr.Button("Validate API Keys", variant="secondary")
        init_button = gr.Button("Initialize Agent", variant="secondary")
        run_button = gr.Button("Run Evaluation & Submit All Answers", variant="primary")

    status_output = gr.Textbox(label="Status / Results", lines=8, interactive=False)
    results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)

    # Set environment variables when provided
    # NOTE(review): os.environ is process-wide, so a key typed by one visitor
    # is presumably visible to every session served by this process — confirm
    # this is acceptable for a shared deployment.
    def set_tavily_key(key):
        """Store a non-empty Tavily key in the TAVILY_API_KEY environment variable."""
        if key:
            os.environ["TAVILY_API_KEY"] = key
            return "✅ Tavily API key set!"
        return ""

    def set_serpapi_key(key):
        """Store a non-empty SerpAPI key in the SERPAPI_KEY environment variable."""
        if key:
            os.environ["SERPAPI_KEY"] = key
            return "✅ SerpAPI key set!"
        return ""

    # The handlers return a status string, but outputs=[] means it is not
    # displayed anywhere.
    tavily_key_input.change(set_tavily_key, inputs=[tavily_key_input], outputs=[])
    serpapi_key_input.change(set_serpapi_key, inputs=[serpapi_key_input], outputs=[])

    # Function to validate all keys
    def validate_all_keys(anthropic_key, tavily_key, serpapi_key):
        """Export the optional keys to the environment, then validate all three."""
        if tavily_key:
            os.environ["TAVILY_API_KEY"] = tavily_key
        if serpapi_key:
            os.environ["SERPAPI_KEY"] = serpapi_key
        # validate_api_keys takes (anthropic, serpapi, tavily): note the order.
        return validate_api_keys(anthropic_key, serpapi_key, tavily_key)

    validate_button.click(
        fn=validate_all_keys,
        inputs=[api_key_input, tavily_key_input, serpapi_key_input],
        outputs=[status_output]
    )

    init_button.click(
        fn=initialize_agent_with_key,
        inputs=[api_key_input],
        outputs=[status_output]
    )

    # run_and_submit_all also takes a `profile` argument that is not listed in
    # inputs; presumably Gradio supplies it from the login session — confirm.
    run_button.click(
        fn=run_and_submit_all,
        inputs=[api_key_input],
        outputs=[status_output, results_table]
    )
|
| 1522 |
|
| 1523 |
if __name__ == "__main__":
    # Startup banner listing which API keys the app can use.
    startup_banner = (
        "\n" + "-"*30 + " App Starting " + "-"*30,
        "LangGraph Agent for GAIA Evaluation",
        "Required: ANTHROPIC_API_KEY",
        "Recommended: TAVILY_API_KEY for best web search (1000 free/month)",
        "Optional: SERPAPI_KEY as backup",
        "Fallback: DuckDuckGo search (no API key needed)",
        "-"*74 + "\n",
    )
    for banner_line in startup_banner:
        print(banner_line)

    demo.launch(debug=True, share=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
debug_lower_error.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Debug script to find where .lower() is being called on non-strings
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import sys
|
| 8 |
+
|
| 9 |
+
# Set up path
|
| 10 |
+
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
| 11 |
+
|
| 12 |
+
# Set minimal env vars
|
| 13 |
+
os.environ["ANTHROPIC_API_KEY"] = "test-key"
|
| 14 |
+
|
| 15 |
+
def find_lower_calls(path='app.py'):
    """Print every line of a source file that calls ``.lower()``.

    For each hit, also reports whether the text ``isinstance`` appears on
    the same line or the line directly above it, as a rough sign that the
    call is type-guarded.

    Args:
        path: File to scan. Defaults to ``app.py`` in the working directory.

    Raises:
        OSError: If the file cannot be opened.
    """
    print(f"Searching for all .lower() calls in {path}...")
    print("-" * 60)

    # Explicit encoding: the scanned source contains non-ASCII characters and
    # the platform default encoding is not always UTF-8.
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    lower_calls = [
        (line_num, line.strip())
        for line_num, line in enumerate(lines, 1)
        if '.lower()' in line
    ]

    print(f"Found {len(lower_calls)} .lower() calls:\n")
    for line_num, line in lower_calls:
        print(f"Line {line_num}: {line}")
        # Look at the previous line and the hit itself. The substring test
        # must run against each line's text: ``'isinstance' in <list>`` would
        # only match a line equal to exactly that word. max() keeps the slice
        # from wrapping around when the hit is on line 1.
        nearby = lines[max(0, line_num - 2):line_num]
        if any('isinstance' in candidate for candidate in nearby):
            print(" ✅ Has type checking")
        else:
            print(" ⚠️ No type checking nearby")
        print()
|
| 37 |
+
|
| 38 |
+
def test_problematic_inputs():
    """Show how ``.lower()` behaves on string and non-string values.

    Each sample is tried twice: once with a direct ``.lower()`` call and
    once through an ``isinstance`` guard that falls back to ``str()``.
    Results are printed; nothing is returned.
    """
    print("\nTesting problematic inputs...")
    print("-" * 60)

    samples = (
        "normal string",
        ["list", "of", "strings"],
        {"dict": "value"},
        123,
        None,
        [{"nested": "structure"}],
        b"bytes string",
    )

    for sample in samples:
        print(f"\nInput: {sample!r} (type: {type(sample)})")

        # Unguarded call: fails on anything without a .lower() method.
        try:
            lowered = sample.lower()
        except AttributeError as e:
            print(f" ❌ .lower() fails: {e}")
        else:
            print(f" ✅ .lower() works: {lowered}")

        # Guarded call: convert non-strings to text first.
        try:
            if not isinstance(sample, str):
                lowered = str(sample).lower()
                print(f" ✅ With str() conversion: {lowered}")
            else:
                lowered = sample.lower()
                print(f" ✅ With type check: {lowered}")
        except Exception as e:
            print(f" ❌ Even with protection: {e}")
|
| 74 |
+
|
| 75 |
+
def test_message_content():
    """Show how ``.strip()`` behaves on different ``message.content`` shapes.

    Builds stand-in messages whose content is a string, a list, a dict and
    ``None``, tries ``.strip()`` on each non-empty content, then applies a
    type-aware conversion to text. Results are printed; nothing is returned.
    """
    print("\n\nTesting message content scenarios...")
    print("-" * 60)

    class MockMessage:
        """Minimal stand-in exposing only a ``content`` attribute."""

        def __init__(self, content):
            self.content = content

    candidates = [
        MockMessage("Normal text content"),
        MockMessage(["List", "content"]),  # This might happen!
        MockMessage({"type": "text", "content": "dict content"}),
        MockMessage(None),
    ]

    for index, message in enumerate(candidates):
        print(f"\nMessage {index}: content = {message.content!r}")

        # Same guard as the agent code: skip missing or empty content.
        if not (hasattr(message, "content") and message.content):
            continue

        body = message.content
        print(f" Content type: {type(body)}")

        # Direct call, which fails on non-strings.
        try:
            body = body.strip()
        except AttributeError:
            print(" ❌ .strip() fails - content is not a string!")
        else:
            print(" ✅ .strip() works")

        # Safe approach: convert to text according to the actual type.
        if isinstance(body, list):
            body = " ".join(str(item) for item in body)
            print(f" ✅ Converted list to string: {body}")
        elif not isinstance(body, str):
            body = str(body)
            print(f" ✅ Converted to string: {body}")
|
| 114 |
+
|
| 115 |
+
if __name__ == "__main__":
    # Run the three diagnostics in order, then print the conclusion.
    rule = "=" * 80

    print(rule)
    print("DEBUG: Finding .lower() error sources")
    print(rule)

    for diagnostic in (find_lower_calls, test_problematic_inputs, test_message_content):
        diagnostic()

    print("\n" + rule)
    print("CONCLUSION:")
    print("The error likely occurs when message.content is a list instead of string")
    print("This can happen with multimodal messages or tool responses")
    print("Solution: Always check type before calling .lower() or .strip()")
    print(rule)
|
requirements.txt
CHANGED
|
@@ -1,2 +1,16 @@
|
|
| 1 |
gradio
|
| 2 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
gradio
|
| 2 |
+
pandas
|
| 3 |
+
requests
|
| 4 |
+
langchain
|
| 5 |
+
langchain-anthropic
|
| 6 |
+
langgraph
|
| 7 |
+
google-search-results
|
| 8 |
+
numexpr
|
| 9 |
+
python-dotenv
|
| 10 |
+
typing-extensions
|
| 11 |
+
pydantic
|
| 12 |
+
numpy
|
| 13 |
+
youtube-transcript-api
|
| 14 |
+
pytube
|
| 15 |
+
PyPDF2
|
| 16 |
+
openpyxl
|
run_gaia_test.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Run GAIA evaluation test
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
from app import BasicAgent
|
| 9 |
+
|
| 10 |
+
# Load environment variables
|
| 11 |
+
load_dotenv()
|
| 12 |
+
|
| 13 |
+
def test_gaia_questions():
    """Run a fixed batch of GAIA questions through the agent and print a rough tally.

    The tally is a heuristic, not a grade: a reply is counted when it is
    non-empty, does not contain "error" (case-insensitive) and is shorter than
    100 characters. Nothing is returned; all results are printed to stdout.
    If ``ANTHROPIC_API_KEY`` is missing from the environment the function
    prints an error and returns without asking anything.
    """
    agent = BasicAgent()

    api_key = os.getenv("ANTHROPIC_API_KEY")
    if not api_key:
        print("Error: ANTHROPIC_API_KEY not found in environment variables")
        return
    agent.set_api_key(api_key)

    # GAIA questions collected from earlier debug output.
    questions = [
        "How many lightning strikes occur on Earth each second? Round your answer to the nearest integer.",
        "In Audre Lorde's poem 'Diaspora', she repeats, \"home is\" three times. The last line ends \"and I am...\" what?",
        "On April 1, 2024, the French National Railway Company (SNCF) published an April Fool's joke on X (formerly Twitter) about a new model of train. What is the name of this model?",
        "In the video https://www.youtube.com/watch?v=1htKBjuUWec, Verma claims the existence of \"a \"moat\" in the education system that provides a systemic advantage for those who know about it and can get into the pipeline.\" Verma's \"moat\" is a well-known advantage for students. What is the four-letter abbreviation used to describe this systemic advantage?",
        "Whose X account (formerly Twitter) is this: @lbcmjc?",
        "What is the current population of Gabon?",
        "In the attached Python code, I try to use the string method zfill. It does not work. Can you fix the problem for me and give me the only the complete corrected code?",
        "In a park, there are three gardens: one with 5 tulips and 3 daisies, one with 6 marigolds and 4 petunias, and one with 8 hydrangeas, 2 jasmines, and twice as many roses as the first two gardens combined. How many flowers are there in total?",
        "What is the name of the only Israeli pitcher to ever play in the major leagues?",
        "When would a purple lightsaber be needed for the August 16, 2024, Lego Star Wars release?",
        "What is the sum of the first 20 terms of the arithmetic sequence where the first term is 5 and the common difference is 3?",
        "What percentage of Gabon is covered by forests?",
        "When did the Khorezm People's Soviet Republic cease to exist?",
        "As of January 2024, what is the latest OS update for iPad mini (5th generation)?",
        "Tell me the amount of sales in the sales sheet for the attached excel file.",
        "How many times is the word \"therefore\" used in the attached PDF?",
        "What item came in first on the Official Monster Raving Loony Party's 2019 manifesto?",
        "What is the hexadecimal value of the unicode character for 'Brain' emoji?",
        "What was the score of the Women's Handball World Championship match between Argentina and Austria on 4 December 2023?",
        "Which record producer is quoted in the Wikipedia article on James Blake's album \"Friends That Break Your Heart\"?",
    ]

    plausible = 0
    for number, question in enumerate(questions, start=1):
        print(f"\nQuestion {number}: {question}")
        try:
            answer = agent(question)
            print(f"Answer: {answer}")
            # Heuristic only: a short, non-error reply counts as potentially correct.
            if answer and "error" not in answer.lower() and len(answer) < 100:
                plausible += 1
        except Exception as exc:
            print(f"Error: {exc}")

    total = len(questions)
    rule = "=" * 80
    print(f"\n{rule}")
    print(f"Final Score: {plausible}/{total} ({plausible / total * 100:.1f}%)")
    print(rule)
|
| 64 |
+
|
| 65 |
+
# Run the GAIA question batch only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    test_gaia_questions()
|
test_agent.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Test script to debug the 'list' object has no attribute 'lower' error
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import sys
|
| 8 |
+
|
| 9 |
+
# Add current directory to path
|
| 10 |
+
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
| 11 |
+
|
| 12 |
+
# Set test API keys
|
| 13 |
+
# SECURITY: never commit a real credential. The key that used to be hard-coded
# here must be treated as compromised and revoked. Keep any key the caller has
# already exported; otherwise fall back to an obviously fake placeholder so the
# offline checks in this script can still run.
os.environ.setdefault("ANTHROPIC_API_KEY", "sk-ant-placeholder-not-a-real-key")
|
| 14 |
+
|
| 15 |
+
# Mock the API calls to avoid actual API usage
|
| 16 |
+
from unittest.mock import patch, MagicMock
|
| 17 |
+
|
| 18 |
+
def test_agent_with_various_inputs():
    """Demonstrate which answer and tool-output types break a naive answer cleaner.

    Two self-contained checks are printed to stdout:

    1. A stand-in ``MockAgent._clean_answer`` that calls ``.strip()`` and
       ``.lower()`` without any type check is fed strings, lists, dicts,
       integers and ``None``; each outcome (success or the raised error) is
       reported.
    2. A few sample tool responses are inspected and list-valued outputs are
       flagged as likely sources of the ``'list' object has no attribute
       'lower'`` error.

    The function deliberately does not import anything from ``app``: nothing
    here uses the real agent, and a failing import would skip both checks.
    Returns ``None``.
    """
    print("Testing agent with various input types...")

    # --- 1. Naive cleaner against non-string answers -------------------------
    print("\n1. Testing _clean_answer function:")
    print("-" * 50)

    test_answers = [
        "42",
        ["The", "answer", "is", "42"],  # List input
        {"answer": "42"},  # Dict input
        42,  # Integer
        None,  # None
        ["list", "with", "numbers", 1, 2, 3],  # Mixed list
    ]

    class MockAgent:
        """Stand-in reproducing the unguarded cleaning logic under investigation."""

        def _clean_answer(self, answer):
            # Intentionally unguarded: both calls raise AttributeError for
            # anything that is not a string, which is what this test shows.
            answer = answer.strip()
            answer.lower()
            return answer

    mock_agent = MockAgent()

    for test_answer in test_answers:
        print(f"\nTesting with: {test_answer} (type: {type(test_answer)})")
        try:
            result = mock_agent._clean_answer(test_answer)
            print(f"✅ Success: {result}")
        except AttributeError as e:
            print(f"❌ AttributeError: {e}")
        except Exception as e:
            print(f"❌ Other error: {type(e).__name__}: {e}")

    # --- 2. Tool responses whose output is not a plain string ----------------
    print("\n\n2. Testing with tool responses that might return lists:")
    print("-" * 50)

    tool_responses = [
        # Normal response
        {"tool": "calculator", "output": "42"},
        # List response (this might be the issue!)
        {"tool": "python_executor", "output": ["Result:", "42"]},
        # Complex response
        {"tool": "web_search", "output": {"results": ["item1", "item2"]}},
    ]

    for response in tool_responses:
        print(f"\nTool response: {response}")
        output = response.get("output", "")
        print(f"Output type: {type(output)}")
        if isinstance(output, list):
            print("⚠️ This is a LIST - might cause 'lower' error!")
|
| 109 |
+
|
| 110 |
+
def test_message_content_types():
    """Show which content payloads ``AIMessage`` accepts and how it stores them.

    Each candidate payload is passed to ``AIMessage(content=...)``. On success
    the type of the resulting ``content`` attribute is printed; on failure the
    construction error is printed instead. Returns ``None``.
    """
    print("\n\n3. Testing message content types:")
    print("-" * 50)

    from langchain_core.messages import HumanMessage, AIMessage

    candidates = (
        "Normal string message",
        ["List", "as", "content"],  # This might happen!
        {"type": "image", "data": "base64..."},  # Multimodal content
        None,
    )

    for payload in candidates:
        print(f"\nTesting message with content: {payload} (type: {type(payload)})")
        try:
            message = AIMessage(content=payload)
            print("Message created successfully")
            print(f"Message.content type: {type(message.content)}")
        except Exception as exc:
            print(f"Error creating message: {exc}")
|
| 133 |
+
|
| 134 |
+
if __name__ == "__main__":
    # Script entry point: banner, the two diagnostic passes, then the summary.
    heading_rule = "=" * 60
    print(heading_rule)
    print("GAIA Agent Error Debugging Test")
    print(heading_rule)

    test_agent_with_various_inputs()
    test_message_content_types()

    print("\n\nConclusion:")
    print("-" * 50)
    print(
        "The error likely occurs when:",
        "1. A tool returns a list instead of a string",
        "2. The message content is a list (multimodal)",
        "3. The _clean_answer method tries to call .strip() or .lower() on a list",
        "\nFix: Add type checking in _clean_answer method!",
        sep="\n",
    )
|
test_download_files.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Test downloading files from URLs
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import requests
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import PyPDF2
|
| 9 |
+
from io import BytesIO
|
| 10 |
+
|
| 11 |
+
def test_file_download():
    """Fetch each sample URL and print basic facts about the downloaded file.

    For an ``excel`` sample the DataFrame shape and column names are printed;
    for a ``pdf`` sample the page count is printed. A non-200 response or any
    exception is reported on stdout and the loop moves on. Returns ``None``.
    """
    # Placeholder URLs (these are hypothetical).
    samples = [
        {
            "url": "https://example.com/sales_data.xlsx",
            "type": "excel",
            "question": "What is the total sales from the Excel file at https://example.com/sales_data.xlsx?",
        },
        {
            "url": "https://example.com/document.pdf",
            "type": "pdf",
            "question": "How many times does 'therefore' appear in https://example.com/document.pdf?",
        },
    ]

    for sample in samples:
        kind = sample["type"]
        url = sample["url"]
        print(f"\nTesting {kind} download:")
        print(f"URL: {url}")

        try:
            response = requests.get(url, timeout=10)

            # Guard clause: anything but HTTP 200 is reported and skipped.
            if response.status_code != 200:
                print(f"❌ Failed to download: {response.status_code}")
                continue

            print("✅ File downloaded successfully")

            # Parse the in-memory payload according to the declared file type.
            if kind == "excel":
                frame = pd.read_excel(BytesIO(response.content))
                print(f"Excel shape: {frame.shape}")
                print(f"Columns: {list(frame.columns)}")
            elif kind == "pdf":
                reader = PyPDF2.PdfReader(BytesIO(response.content))
                print(f"PDF pages: {len(reader.pages)}")

        except Exception as exc:
            print(f"❌ Error: {exc}")
|
| 56 |
+
|
| 57 |
+
# Run the download check only when this file is executed as a script.
if __name__ == "__main__":
    test_file_download()
|
test_file_download.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Test file download functionality
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
from app import BasicAgent
|
| 9 |
+
|
| 10 |
+
load_dotenv()
|
| 11 |
+
|
| 12 |
+
def test_file_download():
    """Ask the agent file-based questions and print how it coped with each.

    Two questions reference files by (hypothetical) URL and one refers to an
    attachment with no URL. After printing the answer, a short verdict is
    printed based on phrases found in the lower-cased answer. If
    ``ANTHROPIC_API_KEY`` is not set the function prints an error and returns
    without asking anything. Returns ``None``.
    """
    agent = BasicAgent()

    api_key = os.getenv("ANTHROPIC_API_KEY")
    if not api_key:
        print("Error: ANTHROPIC_API_KEY not found")
        return
    agent.set_api_key(api_key)

    # Test cases with file URLs (these are hypothetical).
    test_cases = [
        {
            "question": "What is the total sales from the Excel file at https://example.com/sales.xlsx?",
            "type": "excel_url",
        },
        {
            "question": "How many times does 'therefore' appear in the PDF at https://example.com/document.pdf?",
            "type": "pdf_url",
        },
        {
            "question": "The attached Excel file contains sales data. What is the total?",
            "type": "no_url",
        },
    ]

    for index, case in enumerate(test_cases, start=1):
        kind = case["type"]
        question = case["question"]
        print(f"\nTest {index} ({kind}):")
        print(f"Question: {question}")

        try:
            answer = agent(question)
            print(f"Answer: {answer}")

            if kind == "no_url" and "unable to determine" in answer.lower():
                print("✅ Correctly identified missing file")
            elif kind in ("excel_url", "pdf_url"):
                download_failed = "failed to download" in answer.lower()
                print(
                    "⚠️ URL not accessible (expected for example.com)"
                    if download_failed
                    else "✅ Attempted to process URL"
                )

        except Exception as exc:
            print(f"Error: {exc}")
|
| 57 |
+
|
| 58 |
+
# Run the file-question check only when this file is executed as a script.
if __name__ == "__main__":
    test_file_download()
|
test_final_fixes.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Test that all .lower() errors are fixed
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
# SECURITY: never commit a real credential. The key that used to be hard-coded
# here must be treated as compromised and revoked. Keep any key the caller has
# already exported; otherwise fall back to an obviously fake placeholder so the
# later os.environ lookups do not raise (live API calls will then fail with an
# authentication error, which the test reports).
os.environ.setdefault("ANTHROPIC_API_KEY", "sk-ant-placeholder-not-a-real-key")
|
| 8 |
+
|
| 9 |
+
from app import BasicAgent
|
| 10 |
+
|
| 11 |
+
def test_with_problematic_questions():
    """Ask the agent questions that previously triggered ``.lower()`` failures.

    For every question the first 100 characters of the answer are printed on
    success. An ``AttributeError`` mentioning "lower" is reported as a LOWER
    ERROR, any other ``AttributeError`` or exception is reported separately.
    Returns ``None``.
    """
    print("Testing GAIA agent with potentially problematic questions...")
    print("-" * 60)

    agent = BasicAgent()
    agent.set_api_key(os.environ["ANTHROPIC_API_KEY"])

    test_questions = (
        # Normal question
        "What is 2 + 2?",
        # Question that might trigger web search with connection issues
        "Who is the current president of France?",
        # Question with code that might return list
        "What is the output of: print([1,2,3])",
        # Image-related question
        "Look at the image and describe what you see",
    )

    for number, question in enumerate(test_questions, start=1):
        print(f"\nTest {number}: {question}")
        try:
            answer = agent(question)
            print(f"✅ Success: {answer[:100]}...")
        except AttributeError as exc:
            # Distinguish the regression being hunted from unrelated attribute errors.
            label = "LOWER ERROR" if "lower" in str(exc) else "Other AttributeError"
            print(f"❌ {label}: {exc}")
        except Exception as exc:
            print(f"❌ Other error ({type(exc).__name__}): {exc}")
|
| 46 |
+
|
| 47 |
+
if __name__ == "__main__":
    # Script entry point: banner, the regression run, then a closing reminder.
    rule = "=" * 80
    print(rule)
    print("Final Test - All .lower() errors should be fixed")
    print(rule)

    test_with_problematic_questions()

    print("\n" + rule)
    print("If you see any 'lower' errors above, we missed a spot!")
    print(rule)
|
test_fixed_agent.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Test script to verify the fixes for list handling and DuckDuckGo integration
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import sys
|
| 8 |
+
|
| 9 |
+
# Add current directory to path
|
| 10 |
+
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
| 11 |
+
|
| 12 |
+
# Set test API key
|
| 13 |
+
# SECURITY: never commit a real credential. The key that used to be hard-coded
# here must be treated as compromised and revoked. Keep any key the caller has
# already exported; otherwise fall back to an obviously fake placeholder so the
# later os.environ lookups do not raise (live API calls will then fail with an
# authentication error, which the tests report).
os.environ.setdefault("ANTHROPIC_API_KEY", "sk-ant-placeholder-not-a-real-key")
|
| 14 |
+
|
| 15 |
+
def test_clean_answer_with_lists():
    """Feed non-string values to ``LangGraphAgent._clean_answer`` and report each outcome.

    The agent is imported and constructed inside a ``try`` so that a missing
    ``app`` module or a failing constructor is reported as a single line
    instead of aborting the script. Returns ``None``.
    """
    rule = "=" * 60
    print(rule)
    print("Testing _clean_answer with different input types")
    print(rule)

    # Inputs that previously caused errors.
    test_inputs = [
        "Normal string answer",
        ["This", "was", "a", "list"],  # This caused the error!
        {"answer": "dict input"},
        42,
        ["The answer is:", "42"],
        None,
    ]

    try:
        from app import LangGraphAgent

        agent = LangGraphAgent(os.environ["ANTHROPIC_API_KEY"])

        for candidate in test_inputs:
            print(f"\nInput: {candidate} (type: {type(candidate)})")
            try:
                cleaned = agent._clean_answer(candidate)
                print(f"✅ Success: '{cleaned}'")
            except Exception as exc:
                if isinstance(exc, AttributeError):
                    print(f"❌ AttributeError: {exc}")
                else:
                    print(f"❌ Other error: {type(exc).__name__}: {exc}")

    except Exception as exc:
        print(f"Failed to import or create agent: {exc}")
|
| 49 |
+
|
| 50 |
+
def test_web_search_without_serpapi():
    """Run a few queries through ``app.web_search`` and print a preview of each result.

    Each query is called with ``max_results=3``; the first 200 characters of
    the result are printed. A failing query or a failing import is reported on
    stdout rather than raised. Returns ``None``.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("Testing DuckDuckGo web search (no API key needed)")
    print(rule)

    try:
        from app import web_search

        for query in ("Python programming", "Current president of France", "What is 2 + 2"):
            print(f"\nSearching for: '{query}'")
            try:
                hits = web_search(query, max_results=3)
                print("✅ Search successful!")
                print(f"Result preview: {hits[:200]}...")
            except Exception as exc:
                print(f"❌ Search failed: {exc}")

    except Exception as exc:
        print(f"Failed to import web_search: {exc}")
|
| 77 |
+
|
| 78 |
+
def test_tool_input_handling():
    """Call three tools from ``app`` with list arguments and report each outcome.

    On success the first 100 characters of the tool result are printed;
    ``AttributeError`` and other exceptions are reported separately. A failing
    import is reported as a single line. Returns ``None``.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("Testing tool input handling")
    print(rule)

    try:
        from app import calculator, python_executor, analyze_reversed_text

        # Tool name -> (callable, list-shaped argument to try it with).
        list_inputs = {
            "calculator": (calculator, ["2", "+", "2"]),
            "python_executor": (python_executor, ["print('Hello')", "print('World')"]),
            "analyze_reversed_text": (analyze_reversed_text, ["hello", "world"]),
        }

        for tool_name, (tool_func, list_input) in list_inputs.items():
            print(f"\nTesting {tool_name} with list input: {list_input}")
            try:
                outcome = tool_func(list_input)
                print(f"✅ Success: {outcome[:100]}...")
            except AttributeError as exc:
                print(f"❌ AttributeError: {exc}")
            except Exception as exc:
                print(f"❌ Other error: {type(exc).__name__}: {exc}")

    except Exception as exc:
        print(f"Failed to import tools: {exc}")
|
| 106 |
+
|
| 107 |
+
def test_gaia_question():
    """Ask ``BasicAgent`` one simple question end to end and print the answer.

    The API key from the environment is applied only when the wrapper's
    ``agent`` attribute is ``None``. Errors while importing or building the
    agent and errors while answering are reported on separate lines.
    Returns ``None``.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("Testing with GAIA-like question")
    print(rule)

    try:
        from app import BasicAgent

        agent = BasicAgent()
        if agent.agent is None:
            agent.set_api_key(os.environ["ANTHROPIC_API_KEY"])

        question = "What is the capital of France?"
        print(f"Question: {question}")
        print("Running agent...")

        try:
            reply = agent(question)
            print(f"✅ Answer: {reply}")
        except Exception as exc:
            print(f"❌ Error: {type(exc).__name__}: {exc}")

    except Exception as exc:
        print(f"Failed to test agent: {exc}")
|
| 135 |
+
|
| 136 |
+
if __name__ == "__main__":
    # Script entry point: run every verification pass, then list what each
    # pass is expected to demonstrate.
    rule = "=" * 80
    print("GAIA Agent Fix Verification Tests")
    print(rule)

    test_clean_answer_with_lists()
    test_web_search_without_serpapi()
    test_tool_input_handling()
    test_gaia_question()

    print("\n" + rule)
    print(
        "Test Summary:",
        "1. _clean_answer should now handle lists without 'lower' error",
        "2. Web search should work with DuckDuckGo (no API key)",
        "3. All tools should handle list inputs gracefully",
        "4. Agent should provide clean, concise answers",
        sep="\n",
    )
|
test_inline_code.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Test inline code handling
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
from app import BasicAgent
|
| 9 |
+
|
| 10 |
+
load_dotenv()
|
| 11 |
+
|
| 12 |
+
def test_inline_code():
    """Ask the agent questions containing inline or fenced code and print a verdict.

    A case passes the printed check when either (a) the question contains a
    fenced block and the answer does not say "unable to determine", or (b) the
    question mentions an attachment and the answer does say so. If
    ``ANTHROPIC_API_KEY`` is not set the function prints an error and returns
    without asking anything. Returns ``None``.
    """
    agent = BasicAgent()

    api_key = os.getenv("ANTHROPIC_API_KEY")
    if not api_key:
        print("Error: ANTHROPIC_API_KEY not found")
        return
    agent.set_api_key(api_key)

    # Multi-line questions spelled with explicit newlines; the text is the
    # same as a triple-quoted block starting each line at column 0.
    fenced_arithmetic = (
        "What is the output of this code?\n"
        "```python\n"
        "x = 5\n"
        "y = 3\n"
        "print(x * y + 2)\n"
        "```"
    )
    fenced_zfill = (
        "Fix this code and give me only the complete corrected code:\n"
        "```python\n"
        "number = 42\n"
        "# This line has an error\n"
        "padded = number.zfill(5)\n"
        "print(padded)\n"
        "```"
    )

    # Test cases with inline code.
    test_cases = [
        {
            "question": "What is the output of this Python code: print(sum([1, 2, 3, 4, 5]))",
            "expected": "15",
        },
        {
            "question": fenced_arithmetic,
            "expected": "17",
        },
        {
            "question": "In the attached Python code, I try to use the string method zfill. It does not work. Can you fix the problem for me and give me the only the complete corrected code?",
            "expected": "Unable to determine (no code provided)",
        },
        {
            "question": fenced_zfill,
            "expected": "Should provide corrected code",
        },
    ]

    for index, case in enumerate(test_cases, start=1):
        question = case["question"]
        print(f"\nTest {index}:")
        print(f"Question: {question[:100]}...")
        print(f"Expected: {case['expected']}")

        try:
            answer = agent(question)
            print(f"Got: {answer}")

            # Check whether code was detected and executed, or the missing
            # attachment was recognised.
            if "```" in question and "unable to determine" not in answer.lower():
                verdict = "✅ Code was detected and processed"
            elif "attached" in question.lower() and "unable to determine" in answer.lower():
                verdict = "✅ Correctly identified missing attachment"
            else:
                verdict = "❌ May need improvement"
            print(verdict)

        except Exception as exc:
            print(f"Error: {exc}")
|
| 73 |
+
|
| 74 |
+
# Run the inline-code check only when this file is executed as a script.
if __name__ == "__main__":
    test_inline_code()
|
test_multimedia.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Test multimedia handling for GAIA agent
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
|
| 9 |
+
# Load environment variables
|
| 10 |
+
load_dotenv()
|
| 11 |
+
|
| 12 |
+
# Import the agent
|
| 13 |
+
from app import LangGraphAgent
|
| 14 |
+
|
| 15 |
+
def test_multimedia_questions():
    """Ask the agent multimedia-style questions and print whether each was handled sensibly.

    YouTube questions are expected to yield a real answer; questions about
    attached images, spreadsheets, audio or PDFs are expected to yield an
    "Unable to determine" style reply. The phrase is matched
    case-insensitively against ``str(answer)`` so that a differently cased
    reply, or a non-string reply such as a list of content parts, is not
    misclassified. Returns ``None``; everything is printed to stdout.
    """
    print("Testing GAIA agent with multimedia questions...")
    print("=" * 80)

    # NOTE(review): another script in this commit builds the agent as
    # LangGraphAgent(<api key>); presumably the no-argument form reads the key
    # from the environment -- confirm against app.py.
    agent = LangGraphAgent()

    # Test questions from the GAIA benchmark that involve multimedia.
    test_questions = [
        # YouTube video question
        {
            "question": 'In the video https://www.youtube.com/watch?v=1htKBjuUWec, Verma claims the existence of "a "moat" in the education system that provides a systemic advantage for those who know about it and can get into the pipeline." Verma\'s "moat" is a well-known advantage for students. What is the four-letter abbreviation used to describe this systemic advantage?',
            "expected": "Should extract transcript and find STEM",
        },
        # Image question (should return "Unable to determine")
        {
            "question": "Look at the attached image and tell me what color is the car?",
            "expected": "Unable to determine without access to image files",
        },
        # Excel file question (should return "Unable to determine")
        {
            "question": "What is the sum of all values in column B of the attached Excel file?",
            "expected": "Unable to determine without access to Excel files",
        },
        # Audio question (should return "Unable to determine")
        {
            "question": "What song is playing in the attached audio file?",
            "expected": "Unable to determine without access to audio files",
        },
        # PDF question (should return "Unable to determine")
        {
            "question": "What is written on page 3 of the attached PDF?",
            "expected": "Unable to determine without access to PDF files",
        },
        # Another YouTube question with shortened URL
        {
            "question": "In the YouTube video at https://youtu.be/dQw4w9WgXcQ, what is the main theme?",
            "expected": "Should extract transcript from Rick Astley video",
        },
    ]

    file_keywords = ("image", "excel", "audio", "pdf", "attached")

    for i, test_case in enumerate(test_questions, 1):
        question = test_case["question"]
        expected = test_case["expected"]

        print(f"\nTest {i}: {question[:80]}...")
        print(f"Expected behavior: {expected}")

        try:
            answer = agent.run(question)
            print(f"Answer: {answer}")

            # Normalise once: the reply may not be a plain string, and the
            # refusal phrase may come back in any casing.
            question_text = question.lower()
            gave_up = "unable to determine" in str(answer).lower()

            if "youtube" in question_text or "youtu.be" in question_text:
                if gave_up:
                    print("❌ Failed to extract YouTube transcript")
                else:
                    print("✅ Successfully handled YouTube content")
            elif any(keyword in question_text for keyword in file_keywords):
                if gave_up:
                    print("✅ Correctly returned 'Unable to determine' for inaccessible file")
                else:
                    print("❌ Should have returned 'Unable to determine'")

        except Exception as e:
            print(f"❌ Error: {type(e).__name__}: {e}")

        print("-" * 80)

    print("\n" + "=" * 80)
    print("Multimedia handling test complete!")
    print("=" * 80)
|
| 97 |
+
|
| 98 |
+
if __name__ == "__main__":
    # Refuse to start without a key: every question below needs the live API.
    if not os.getenv("ANTHROPIC_API_KEY"):
        print("Error: ANTHROPIC_API_KEY not found in environment variables")
        print("Please set it in your .env file")
        # Raise SystemExit directly instead of calling exit(): that helper is
        # injected by the site module for interactive use and is not
        # guaranteed to exist (for example under ``python -S``).
        raise SystemExit(1)

    test_multimedia_questions()
|
test_multimedia_gaia.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Test specific multimedia GAIA questions
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
from app import BasicAgent
|
| 9 |
+
|
| 10 |
+
# Load environment variables
|
| 11 |
+
load_dotenv()
|
| 12 |
+
|
| 13 |
+
def _answer_matches(expected, answer):
    """Return True when *expected* appears in *answer* as a whole token.

    The comparison is case-insensitive. Unlike a plain substring test, the
    match must not be glued to a neighbouring word character, so "STEM" is
    not found inside "system" and "4" is not found inside "14".
    """
    import re  # local import keeps this block independent of the file header

    pattern = rf"(?<!\w){re.escape(expected)}(?!\w)"
    return re.search(pattern, answer, flags=re.IGNORECASE) is not None


def test_specific_questions():
    """Run a fixed set of GAIA-style questions through the agent and score them.

    Reads ANTHROPIC_API_KEY from the environment; when it is missing an error
    is printed and the function returns without creating the agent. Otherwise
    each question is passed to the agent, the reply is printed and compared
    with the expected value, and a final score line is printed.
    """
    # Validate configuration first so no agent is built when the key is missing.
    api_key = os.getenv("ANTHROPIC_API_KEY")
    if not api_key:
        print("Error: ANTHROPIC_API_KEY not found in environment variables")
        return

    # Initialize agent
    agent = BasicAgent()
    agent.set_api_key(api_key)

    # Test specific questions
    test_cases = [
        {
            "question": "What is 2 + 2?",
            "expected": "4",
            "type": "simple",
        },
        {
            "question": 'In the video https://www.youtube.com/watch?v=1htKBjuUWec, Verma claims the existence of "a "moat" in the education system that provides a systemic advantage for those who know about it and can get into the pipeline." Verma\'s "moat" is a well-known advantage for students. What is the four-letter abbreviation used to describe this systemic advantage?',
            "expected": "STEM",
            "type": "youtube",
        },
        {
            "question": "Tell me the amount of sales in the sales sheet for the attached excel file.",
            "expected": "Unable to determine",
            "type": "excel",
        },
        {
            "question": "How many times is the word \"therefore\" used in the attached PDF?",
            "expected": "Unable to determine",
            "type": "pdf",
        },
        {
            "question": "In the attached Python code, I try to use the string method zfill. It does not work. Can you fix the problem for me and give me the only the complete corrected code?",
            "expected": "Unable to determine",
            "type": "code",
        },
    ]

    correct = 0
    for i, test_case in enumerate(test_cases, 1):
        question = test_case["question"]
        expected = test_case["expected"]
        q_type = test_case["type"]

        print(f"\nTest {i} ({q_type}): {question[:80]}...")
        print(f"Expected: {expected}")

        # One failing question (agent error, unexpected reply type) must not
        # abort the remaining cases, hence the broad catch at this boundary.
        try:
            answer = agent(question)
            print(f"Got: {answer}")

            # Check if answer matches expected
            if q_type in ["excel", "pdf", "code"] and "Unable to determine" in answer:
                print("✅ Correctly handled inaccessible file")
                correct += 1
            elif _answer_matches(expected, answer):
                # Whole-token match: a raw substring test would accept
                # "system" for "STEM" or "14" for "4".
                print("✅ Correct answer")
                correct += 1
            else:
                print("❌ Incorrect answer")

        except Exception as e:
            print(f"❌ Error: {e}")

    print(f"\n{'='*80}")
    print(f"Score: {correct}/{len(test_cases)} ({correct/len(test_cases)*100:.0f}%)")
    print(f"{'='*80}")
|
| 83 |
+
|
| 84 |
+
if __name__ == "__main__":
    # Run the question checks only when this file is executed as a script,
    # not when it is imported as a module.
    test_specific_questions()
|