Spaces:

AbdullahIsaMarkus
/

Final_Assignment_Agent

Sleeping

Final_Assignment_Agent / analyze_failures.py

Markus Clauss DIRU Vetsuisse

First agent trial

1637cd5 4 months ago

5.2 kB

	#!/usr/bin/env python3
	"""
	Analyze which GAIA questions are failing and why
	"""

	import os
	from dotenv import load_dotenv
	from app import BasicAgent

	# Load environment variables via python-dotenv before anything reads them;
	# analyze_gaia_failures() later looks up ANTHROPIC_API_KEY with os.getenv.
	# NOTE(review): presumably the key lives in a local .env file - confirm.
	load_dotenv()

	# Keyword rules used to bucket a question. Rules are checked in order and the
	# first rule with any keyword present in the lower-cased question wins, so the
	# order below is significant (e.g. "attached Python code" lands in multimedia).
	_CATEGORY_RULES = (
	    ("web_search", ("twitter", "april fool")),
	    ("multimedia", ("video", "attached")),
	    ("calculation", ("sum", "total", "how many")),
	    ("code", ("code", "python")),
	    ("literature", ("poem", "book")),
	)

	# Bucket for questions that match none of the keyword rules, so they still
	# show up in the summary instead of being silently dropped.
	_FALLBACK_CATEGORY = "other"


	def _categorize_question(question):
	    """Return the first category whose keywords occur in *question*, else "other"."""
	    lowered = question.lower()  # lower-case once instead of once per keyword test
	    for category, keywords in _CATEGORY_RULES:
	        if any(keyword in lowered for keyword in keywords):
	            return category
	    return _FALLBACK_CATEGORY


	def analyze_gaia_failures(max_questions=8):
	    """Run the agent on sample GAIA questions and print a per-category summary.

	    Each question is sent to a ``BasicAgent``; the question, the expected
	    answer and the agent's answer are printed, and the result is filed under
	    a keyword-based category. A summary then lists, per non-empty category,
	    how many entries carry the failing status marker and shows up to two of
	    them. The status markers are the hard-coded ones from ``test_cases``; the
	    agent's answer is not compared against the expected answer.

	    Args:
	        max_questions: How many of the listed questions to run, counted from
	            the start of the list. Defaults to 8 to keep a run short; pass
	            ``None`` to run every listed question.

	    Returns:
	        None. All results are written to stdout. If ``ANTHROPIC_API_KEY`` is
	        not set, an error line is printed and the function returns early.
	    """
	    # Fail fast: validate the key before constructing the agent so a missing
	    # key is reported cleanly and no agent is built that cannot be used.
	    api_key = os.getenv("ANTHROPIC_API_KEY")
	    if not api_key:
	        print("Error: ANTHROPIC_API_KEY not found")
	        return

	    # Initialize agent
	    agent = BasicAgent()
	    agent.set_api_key(api_key)

	    # GAIA questions with expected answers (based on previous runs)
	    test_cases = [
	        # Marked correct in previous runs (the original note said 10/20 were
	        # correct; only these five are listed here)
	        {"q": "How many lightning strikes occur on Earth each second? Round your answer to the nearest integer.", "expected": "100", "status": "✅"},
	        {"q": "What is the current population of Gabon?", "expected": "~2.3M", "status": "✅"},
	        {"q": "In a park, there are three gardens: one with 5 tulips and 3 daisies, one with 6 marigolds and 4 petunias, and one with 8 hydrangeas, 2 jasmines, and twice as many roses as the first two gardens combined. How many flowers are there in total?", "expected": "66", "status": "✅"},
	        {"q": "What is the sum of the first 20 terms of the arithmetic sequence where the first term is 5 and the common difference is 3?", "expected": "670", "status": "✅"},
	        {"q": "What percentage of Gabon is covered by forests?", "expected": "85%", "status": "✅"},

	        # Listed as needing improvement (note: the last two entries are
	        # marked ✅ with "Unable to determine" as the expected answer)
	        {"q": "In Audre Lorde's poem 'Diaspora', she repeats, \"home is\" three times. The last line ends \"and I am...\" what?", "expected": "apart", "status": "❌"},
	        {"q": "On April 1, 2024, the French National Railway Company (SNCF) published an April Fool's joke on X (formerly Twitter) about a new model of train. What is the name of this model?", "expected": "TGV Pigeon", "status": "❌"},
	        {"q": "In the video https://www.youtube.com/watch?v=1htKBjuUWec, Verma claims the existence of \"a \"moat\" in the education system that provides a systemic advantage for those who know about it and can get into the pipeline.\" Verma's \"moat\" is a well-known advantage for students. What is the four-letter abbreviation used to describe this systemic advantage?", "expected": "STEM", "status": "❌"},
	        {"q": "Whose X account (formerly Twitter) is this: @lbcmjc?", "expected": "specific person", "status": "❌"},
	        {"q": "In the attached Python code, I try to use the string method zfill. It does not work. Can you fix the problem for me and give me the only the complete corrected code?", "expected": "code fix", "status": "❌"},
	        {"q": "What is the name of the only Israeli pitcher to ever play in the major leagues?", "expected": "specific name", "status": "❌"},
	        {"q": "Tell me the amount of sales in the sales sheet for the attached excel file.", "expected": "Unable to determine", "status": "✅"},
	        {"q": "How many times is the word \"therefore\" used in the attached PDF?", "expected": "Unable to determine", "status": "✅"},
	    ]

	    # One bucket per keyword rule, plus the fallback bucket.
	    categories = {name: [] for name, _ in _CATEGORY_RULES}
	    categories[_FALLBACK_CATEGORY] = []

	    print("Analyzing GAIA question patterns...\n")

	    # Slicing with None keeps the whole list, so max_questions=None runs all.
	    for i, test in enumerate(test_cases[:max_questions], 1):
	        question = test["q"]
	        expected = test["expected"]
	        status = test["status"]

	        print(f"\n{i}. {status} Question: {question[:80]}...")
	        print(f" Expected: {expected}")

	        # Keep the try body to the one call that talks to the agent, so
	        # unrelated errors are not mistaken for agent failures.
	        try:
	            answer = agent(question)
	        except Exception as e:
	            # Best-effort run: report the error and move on to the next
	            # question; the failed call is not added to any category.
	            print(f" Error: {e}")
	            continue

	        # The agent's return type is not visible from this file; coerce to
	        # str so the truncated previews below cannot raise.
	        answer = str(answer)
	        print(f" Got: {answer[:100]}...")

	        # Categorize question type
	        categories[_categorize_question(question)].append((question, answer, status))

	    print("\n" + "="*80)
	    print("ANALYSIS SUMMARY")
	    print("="*80)

	    for category, items in categories.items():
	        if not items:
	            continue
	        print(f"\n{category.upper()} ({len(items)} questions):")
	        failed = [item for item in items if "❌" in item[2]]
	        if failed:
	            print(f" Failed: {len(failed)}")
	            for q, a, _ in failed[:2]:  # Show first 2 failures
	                print(f" Q: {q[:60]}...")
	                print(f" A: {a[:60]}...")

	# Script entry point: run the analysis only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	    analyze_gaia_failures()