#!/usr/bin/env python3
"""Test specific multimedia GAIA questions."""

import os

from dotenv import load_dotenv

from app import BasicAgent

# Load environment variables
load_dotenv()


def test_specific_questions():
    """Run a fixed set of GAIA-style questions through the agent and print a score.

    The agent is a ``BasicAgent`` configured with the ``ANTHROPIC_API_KEY``
    environment variable. If the key is missing, an error is printed and the
    function returns without running any question.

    Grading rules, applied per test case:

    * For the "excel", "pdf" and "code" cases, an answer containing the exact
      text "Unable to determine" counts as correctly handling a file the
      agent cannot access.
    * Otherwise the answer is correct when the expected text occurs in it,
      compared case-insensitively.

    An exception raised by the agent is printed and the case counts as
    failed. Nothing is returned; all results are written to stdout.
    """
    # Initialize agent
    agent = BasicAgent()
    api_key = os.getenv("ANTHROPIC_API_KEY")
    if not api_key:
        print("Error: ANTHROPIC_API_KEY not found in environment variables")
        return

    agent.set_api_key(api_key)

    # Test specific questions
    test_cases = [
        {
            "question": "What is 2 + 2?",
            "expected": "4",
            "type": "simple",
        },
        {
            "question": (
                'In the video https://www.youtube.com/watch?v=1htKBjuUWec, Verma claims '
                'the existence of "a "moat" in the education system that provides a '
                'systemic advantage for those who know about it and can get into the '
                'pipeline." Verma\'s "moat" is a well-known advantage for students. '
                'What is the four-letter abbreviation used to describe this systemic '
                'advantage?'
            ),
            "expected": "STEM",
            "type": "youtube",
        },
        {
            "question": "Tell me the amount of sales in the sales sheet for the attached excel file.",
            "expected": "Unable to determine",
            "type": "excel",
        },
        {
            "question": 'How many times is the word "therefore" used in the attached PDF?',
            "expected": "Unable to determine",
            "type": "pdf",
        },
        {
            # Kept as adjacent literals: a raw line break inside a quoted
            # string is not valid Python, so the text is joined here instead.
            "question": (
                "In the attached Python code, I try to use the string method zfill. "
                "It does not work. "
                "Can you fix the problem for me and give me the only the complete "
                "corrected code?"
            ),
            "expected": "Unable to determine",
            "type": "code",
        },
    ]

    # Question types that refer to an attachment the agent is not given.
    file_types = ("excel", "pdf", "code")

    correct = 0
    for i, test_case in enumerate(test_cases, 1):
        question = test_case["question"]
        expected = test_case["expected"]
        q_type = test_case["type"]

        print(f"\nTest {i} ({q_type}): {question[:80]}...")
        print(f"Expected: {expected}")

        # Only the agent call is guarded; one failing question must not
        # abort the remaining ones.
        try:
            answer = agent(question)
        except Exception as e:
            print(f"❌ Error: {e}")
            continue

        print(f"Got: {answer}")

        # Normalise to text so a None or non-string answer is graded as
        # incorrect instead of raising during the comparison.
        answer_text = "" if answer is None else str(answer)

        # Check if answer matches expected
        if q_type in file_types and "Unable to determine" in answer_text:
            print("✅ Correctly handled inaccessible file")
            correct += 1
        elif expected.lower() in answer_text.lower():
            print("✅ Correct answer")
            correct += 1
        else:
            print("❌ Incorrect answer")

    print(f"\n{'='*80}")
    print(f"Score: {correct}/{len(test_cases)} ({correct/len(test_cases)*100:.0f}%)")
    print(f"{'='*80}")


if __name__ == "__main__":
    test_specific_questions()