#!/usr/bin/env python3
"""
Run GAIA evaluation test
"""

import os

from dotenv import load_dotenv

from app import BasicAgent

# Load environment variables
load_dotenv()

# Answers at or above this many characters are not counted by the heuristic.
_MAX_PLAUSIBLE_ANSWER_LENGTH = 100

# GAIA questions from previous debug output.
# Each question is kept verbatim on a single line so the text sent to the
# agent is exactly what was captured.
GAIA_QUESTIONS = (
    "How many lightning strikes occur on Earth each second? Round your answer to the nearest integer.",
    "In Audre Lorde's poem 'Diaspora', she repeats, \"home is\" three times. The last line ends \"and I am...\" what?",
    "On April 1, 2024, the French National Railway Company (SNCF) published an April Fool's joke on X (formerly Twitter) about a new model of train. What is the name of this model?",
    "In the video https://www.youtube.com/watch?v=1htKBjuUWec, Verma claims the existence of \"a \"moat\" in the education system that provides a systemic advantage for those who know about it and can get into the pipeline.\" Verma's \"moat\" is a well-known advantage for students. What is the four-letter abbreviation used to describe this systemic advantage?",
    "Whose X account (formerly Twitter) is this: @lbcmjc?",
    "What is the current population of Gabon?",
    "In the attached Python code, I try to use the string method zfill. It does not work. Can you fix the problem for me and give me the only the complete corrected code?",
    # NOTE(review): the original literal contained a raw line-break character
    # between "combined. " and "How many"; it is written here as an explicit
    # "\n" escape. Confirm the intended character against the GAIA dataset.
    "In a park, there are three gardens: one with 5 tulips and 3 daisies, one with 6 marigolds and 4 petunias, and one with 8 hydrangeas, 2 jasmines, and twice as many roses as the first two gardens combined. \nHow many flowers are there in total?",
    "What is the name of the only Israeli pitcher to ever play in the major leagues?",
    "When would a purple lightsaber be needed for the August 16, 2024, Lego Star Wars release?",
    "What is the sum of the first 20 terms of the arithmetic sequence where the first term is 5 and the common difference is 3?",
    "What percentage of Gabon is covered by forests?",
    "When did the Khorezm People's Soviet Republic cease to exist?",
    "As of January 2024, what is the latest OS update for iPad mini (5th generation)?",
    "Tell me the amount of sales in the sales sheet for the attached excel file.",
    "How many times is the word \"therefore\" used in the attached PDF?",
    "What item came in first on the Official Monster Raving Loony Party's 2019 manifesto?",
    "What is the hexadecimal value of the unicode character for 'Brain' emoji?",
    "What was the score of the Women's Handball World Championship match between Argentina and Austria on 4 December 2023?",
    "Which record producer is quoted in the Wikipedia article on James Blake's album \"Friends That Break Your Heart\"?",
)


def _looks_plausible(answer) -> bool:
    """Return True if *answer* passes the rough "potentially correct" heuristic.

    An answer passes when it is truthy, its text does not contain "error"
    (case-insensitive), and its text is shorter than
    ``_MAX_PLAUSIBLE_ANSWER_LENGTH`` characters. Answers are never compared
    against ground truth.
    """
    if not answer:
        return False
    # Judge non-string answers (e.g. a bare number) by their text form rather
    # than letting ``.lower()`` / ``len()`` raise on them.
    text = answer if isinstance(answer, str) else str(answer)
    return "error" not in text.lower() and len(text) < _MAX_PLAUSIBLE_ANSWER_LENGTH


def test_gaia_questions() -> None:
    """Test with GAIA questions.

    Creates a ``BasicAgent``, configures it with the ``ANTHROPIC_API_KEY``
    environment variable, asks it every question in ``GAIA_QUESTIONS`` and
    prints each answer followed by a summary line.

    The reported score is a heuristic count (see ``_looks_plausible``), not a
    comparison against reference answers. If the API key is missing, an error
    message is printed and the function returns without asking anything.
    """
    # Initialize agent
    agent = BasicAgent()
    api_key = os.getenv("ANTHROPIC_API_KEY")
    if not api_key:
        print("Error: ANTHROPIC_API_KEY not found in environment variables")
        return

    agent.set_api_key(api_key)

    questions = GAIA_QUESTIONS
    correct = 0
    for i, question in enumerate(questions, 1):
        print(f"\nQuestion {i}: {question}")
        try:
            answer = agent(question)
        except Exception as e:
            # Best-effort run: one failing question must not abort the rest.
            print(f"Error: {e}")
            continue

        print(f"Answer: {answer}")
        # Simple heuristic - if answer is not an error and not too long,
        # count as potentially correct
        if _looks_plausible(answer):
            correct += 1

    print(f"\n{'='*80}")
    print(f"Final Score: {correct}/{len(questions)} ({correct/len(questions)*100:.1f}%)")
    print(f"{'='*80}")


if __name__ == "__main__":
    test_gaia_questions()