|
|
|
|
|
""" |
|
|
Analyze which GAIA questions are failing and why.
|
|
""" |
|
|
|
|
|
import os |
|
|
from dotenv import load_dotenv |
|
|
from app import BasicAgent |
|
|
|
|
|
|
|
|
# Load settings from a local .env file into the process environment so the
# ANTHROPIC_API_KEY lookup performed by the analysis can find the key.
load_dotenv()
|
|
|
|
|
# Status markers stored with every sample question: whether the agent is
# already known to answer it correctly. They must stay distinct, because the
# summary selects failures by looking for the failing marker.
_PASSING = "✅"
_FAILING = "❌"

# Sample GAIA questions with the expected answer and the last known status.
_TEST_CASES = [
    # Questions the agent is known to answer correctly.
    {
        "q": "How many lightning strikes occur on Earth each second? Round your answer to the nearest integer.",
        "expected": "100",
        "status": _PASSING,
    },
    {
        "q": "What is the current population of Gabon?",
        "expected": "~2.3M",
        "status": _PASSING,
    },
    {
        "q": "In a park, there are three gardens: one with 5 tulips and 3 daisies, one with 6 marigolds and 4 petunias, and one with 8 hydrangeas, 2 jasmines, and twice as many roses as the first two gardens combined. How many flowers are there in total?",
        "expected": "66",
        "status": _PASSING,
    },
    {
        "q": "What is the sum of the first 20 terms of the arithmetic sequence where the first term is 5 and the common difference is 3?",
        "expected": "670",
        "status": _PASSING,
    },
    {
        "q": "What percentage of Gabon is covered by forests?",
        "expected": "85%",
        "status": _PASSING,
    },
    # Questions the agent is known to get wrong.
    {
        "q": "In Audre Lorde's poem 'Diaspora', she repeats, \"home is\" three times. The last line ends \"and I am...\" what?",
        "expected": "apart",
        "status": _FAILING,
    },
    {
        "q": "On April 1, 2024, the French National Railway Company (SNCF) published an April Fool's joke on X (formerly Twitter) about a new model of train. What is the name of this model?",
        "expected": "TGV Pigeon",
        "status": _FAILING,
    },
    {
        "q": "In the video https://www.youtube.com/watch?v=1htKBjuUWec, Verma claims the existence of \"a \"moat\" in the education system that provides a systemic advantage for those who know about it and can get into the pipeline.\" Verma's \"moat\" is a well-known advantage for students. What is the four-letter abbreviation used to describe this systemic advantage?",
        "expected": "STEM",
        "status": _FAILING,
    },
    {
        "q": "Whose X account (formerly Twitter) is this: @lbcmjc?",
        "expected": "specific person",
        "status": _FAILING,
    },
    {
        "q": "In the attached Python code, I try to use the string method zfill. It does not work. Can you fix the problem for me and give me the only the complete corrected code?",
        "expected": "code fix",
        "status": _FAILING,
    },
    {
        "q": "What is the name of the only Israeli pitcher to ever play in the major leagues?",
        "expected": "specific name",
        "status": _FAILING,
    },
    {
        "q": "Tell me the amount of sales in the sales sheet for the attached excel file.",
        "expected": "Unable to determine",
        "status": _PASSING,
    },
    {
        "q": "How many times is the word \"therefore\" used in the attached PDF?",
        "expected": "Unable to determine",
        "status": _PASSING,
    },
]

# Ordered keyword rules: the first rule with a keyword occurring in the
# lower-cased question decides the category, so the order is significant
# (e.g. "attached" is tested before "code").
_CATEGORY_RULES = (
    ("web_search", ("twitter", "april fool")),
    ("multimedia", ("video", "attached")),
    ("calculation", ("sum", "total", "how many")),
    ("code", ("code", "python")),
    ("literature", ("poem", "book")),
)

# Bucket for questions that match no keyword rule, so they still show up in
# the summary instead of being dropped.
_FALLBACK_CATEGORY = "other"


def _categorize(question):
    """Return the first category whose keyword occurs in *question*."""
    lowered = question.lower()
    for category, keywords in _CATEGORY_RULES:
        if any(keyword in lowered for keyword in keywords):
            return category
    return _FALLBACK_CATEGORY


def analyze_gaia_failures(*, limit=8, agent=None):
    """Run sample GAIA questions through an agent and summarize failures.

    Each selected question is printed with its expected answer and the
    agent's answer, then filed under a keyword-based category. A summary
    per non-empty category follows, listing how many of its questions carry
    the failing status marker and showing up to two of them.

    Args:
        limit: Number of leading sample questions to run. Defaults to 8;
            pass ``None`` to run every question.
        agent: Callable taking the question text and returning the answer.
            When omitted, a ``BasicAgent`` is created and configured with
            the ``ANTHROPIC_API_KEY`` environment variable.

    Returns:
        None. All results are written to standard output. If no agent is
        supplied and ``ANTHROPIC_API_KEY`` is unset, an error line is
        printed and nothing is run.
    """
    if agent is None:
        agent = BasicAgent()
        api_key = os.getenv("ANTHROPIC_API_KEY")
        if not api_key:
            print("Error: ANTHROPIC_API_KEY not found")
            return
        agent.set_api_key(api_key)

    categories = {name: [] for name, _ in _CATEGORY_RULES}
    categories[_FALLBACK_CATEGORY] = []

    print("Analyzing GAIA question patterns...\n")

    for i, test in enumerate(_TEST_CASES[:limit], 1):
        question = test["q"]
        expected = test["expected"]
        status = test["status"]

        print(f"\n{i}. {status} Question: {question[:80]}...")
        print(f" Expected: {expected}")

        # Best effort: one failing agent call must not abort the whole run.
        # Only the call itself is guarded so unrelated bugs are not hidden.
        try:
            # str() keeps the slicing below safe for non-string answers.
            answer = str(agent(question))
        except Exception as e:
            print(f" Error: {e}")
            continue

        print(f" Got: {answer[:100]}...")
        categories[_categorize(question)].append((question, answer, status))

    print("\n" + "=" * 80)
    print("ANALYSIS SUMMARY")
    print("=" * 80)

    for category, items in categories.items():
        if not items:
            continue
        print(f"\n{category.upper()} ({len(items)} questions):")
        failed = [item for item in items if _FAILING in item[2]]
        if failed:
            print(f" Failed: {len(failed)}")
            # Show at most two examples per category to keep the report short.
            for q, a, _ in failed[:2]:
                print(f" Q: {q[:60]}...")
                print(f" A: {a[:60]}...")
|
|
|
|
|
# Run the analysis only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    analyze_gaia_failures()