File size: 5,200 Bytes
1637cd5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/env python3
"""
Analyze which GAIA questions are failing and why
"""

import os
from dotenv import load_dotenv
from app import BasicAgent

# Load variables from a local .env file (if one exists) into the process
# environment so that os.getenv("ANTHROPIC_API_KEY") can find the key.
load_dotenv()

# Ordered keyword rules used to bucket a question. The first rule containing a
# matching keyword wins, so an "attached Python code" question is filed under
# "multimedia" rather than "code". Keywords are matched as lowercase substrings.
_CATEGORY_RULES = (
    ("web_search", ("twitter", "april fool")),
    ("multimedia", ("video", "attached")),
    ("calculation", ("sum", "total", "how many")),
    ("code", ("code", "python")),
    ("literature", ("poem", "book")),
)

# Bucket for questions that match none of the keyword rules, so they still
# show up in the summary instead of being silently dropped.
_FALLBACK_CATEGORY = "other"


def _categorize_question(question):
    """Return the category name for *question* using `_CATEGORY_RULES`."""
    lowered = question.lower()  # lower-case once instead of once per keyword
    for category, keywords in _CATEGORY_RULES:
        if any(keyword in lowered for keyword in keywords):
            return category
    return _FALLBACK_CATEGORY


def analyze_gaia_failures(max_questions=8):
    """Run the agent on sample GAIA questions and summarize failures by category.

    Each question is sent to a ``BasicAgent``; the answer is printed next to
    the expected answer and the question is bucketed by keyword. A summary of
    the questions marked as failing (status "❌") is printed per category.

    Args:
        max_questions: How many of the built-in test cases to run, counted
            from the start of the list. Defaults to 8 to save time. Pass
            ``None`` to run every case.

    Returns:
        None. All results are printed to stdout. If ``ANTHROPIC_API_KEY`` is
        not set, an error is printed and no agent is created.
    """
    # Check the key first so a missing key never triggers agent construction.
    api_key = os.getenv("ANTHROPIC_API_KEY")
    if not api_key:
        print("Error: ANTHROPIC_API_KEY not found")
        return

    agent = BasicAgent()
    agent.set_api_key(api_key)

    passed_mark = "✅"
    failed_mark = "❌"

    # GAIA questions with expected answers (based on previous runs)
    test_cases = [
        # Marked correct in previous runs (original note: 10/20 correct overall)
        {"q": "How many lightning strikes occur on Earth each second? Round your answer to the nearest integer.", "expected": "100", "status": passed_mark},
        {"q": "What is the current population of Gabon?", "expected": "~2.3M", "status": passed_mark},
        {"q": "In a park, there are three gardens: one with 5 tulips and 3 daisies, one with 6 marigolds and 4 petunias, and one with 8 hydrangeas, 2 jasmines, and twice as many roses as the first two gardens combined. How many flowers are there in total?", "expected": "66", "status": passed_mark},
        {"q": "What is the sum of the first 20 terms of the arithmetic sequence where the first term is 5 and the common difference is 3?", "expected": "670", "status": passed_mark},
        {"q": "What percentage of Gabon is covered by forests?", "expected": "85%", "status": passed_mark},

        # Marked failed in previous runs; these need improvement
        {"q": "In Audre Lorde's poem 'Diaspora', she repeats, \"home is\" three times. The last line ends \"and I am...\" what?", "expected": "apart", "status": failed_mark},
        {"q": "On April 1, 2024, the French National Railway Company (SNCF) published an April Fool's joke on X (formerly Twitter) about a new model of train. What is the name of this model?", "expected": "TGV Pigeon", "status": failed_mark},
        {"q": "In the video https://www.youtube.com/watch?v=1htKBjuUWec, Verma claims the existence of \"a \"moat\" in the education system that provides a systemic advantage for those who know about it and can get into the pipeline.\" Verma's \"moat\" is a well-known advantage for students. What is the four-letter abbreviation used to describe this systemic advantage?", "expected": "STEM", "status": failed_mark},
        {"q": "Whose X account (formerly Twitter) is this: @lbcmjc?", "expected": "specific person", "status": failed_mark},
        {"q": "In the attached Python code, I try to use the string method zfill. It does not work. Can you fix the problem for me and give me the only the complete corrected code?", "expected": "code fix", "status": failed_mark},
        {"q": "What is the name of the only Israeli pitcher to ever play in the major leagues?", "expected": "specific name", "status": failed_mark},

        # Attachment questions whose expected answer is "Unable to determine"
        # (presumably because the attachment is not available to the agent);
        # they are marked as passing.
        {"q": "Tell me the amount of sales in the sales sheet for the attached excel file.", "expected": "Unable to determine", "status": passed_mark},
        {"q": "How many times is the word \"therefore\" used in the attached PDF?", "expected": "Unable to determine", "status": passed_mark},
    ]

    # One bucket per rule, in rule order, plus the fallback bucket.
    categories = {name: [] for name, _ in _CATEGORY_RULES}
    categories[_FALLBACK_CATEGORY] = []

    print("Analyzing GAIA question patterns...\n")

    # A slice bound of None selects the whole list.
    for i, test in enumerate(test_cases[:max_questions], 1):
        question = test["q"]
        expected = test["expected"]
        status = test["status"]

        print(f"\n{i}. {status} Question: {question[:80]}...")
        print(f"   Expected: {expected}")

        # Best-effort: one failing agent call must not abort the whole run.
        # Only the agent call is guarded so other bugs are not masked.
        try:
            answer = agent(question)
        except Exception as e:
            print(f"   Error: {e}")
            continue

        # Agents may return non-string values (e.g. None); normalize so the
        # truncated previews below cannot raise.
        answer = str(answer)
        print(f"   Got: {answer[:100]}...")

        categories[_categorize_question(question)].append((question, answer, status))

    print("\n" + "="*80)
    print("ANALYSIS SUMMARY")
    print("="*80)

    for category, items in categories.items():
        if not items:
            continue
        print(f"\n{category.upper()} ({len(items)} questions):")
        failures = [item for item in items if failed_mark in item[2]]
        if failures:
            print(f"  Failed: {len(failures)}")
            for q, a, _ in failures[:2]:  # Show first 2 failures
                print(f"    Q: {q[:60]}...")
                print(f"    A: {a[:60]}...")

# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    analyze_gaia_failures()