#!/usr/bin/env python3
"""
Test multimedia handling for GAIA agent
"""

import os
import sys
from typing import Optional

from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Import the agent
from app import LangGraphAgent

# Phrase whose presence in an answer is treated as "the agent declined".
_REFUSAL_MARKER = "Unable to determine"

# Substrings that mark a question as being about a YouTube video.
_YOUTUBE_MARKERS = ("youtube", "youtu.be")

# Substrings that mark a question as referring to an attached file.
_ATTACHMENT_KEYWORDS = ("image", "excel", "audio", "pdf", "attached")


def _verdict_for(question: str, answer: str) -> Optional[str]:
    """Return the pass/fail line to print for *answer*, or None.

    YouTube questions are checked first: they pass when the answer does
    not contain the refusal marker. Questions mentioning an attachment
    keyword pass when the answer does contain it. A question matching
    neither category yields None, so no verdict is printed.
    """
    lowered = question.lower()
    refused = _REFUSAL_MARKER in answer

    if any(marker in lowered for marker in _YOUTUBE_MARKERS):
        if refused:
            return "❌ Failed to extract YouTube transcript"
        return "✅ Successfully handled YouTube content"

    if any(keyword in lowered for keyword in _ATTACHMENT_KEYWORDS):
        if refused:
            return "✅ Correctly returned 'Unable to determine' for inaccessible file"
        return "❌ Should have returned 'Unable to determine'"

    return None


def test_multimedia_questions():
    """Test questions that involve multimedia content

    Builds a LangGraphAgent, runs each sample question through
    ``agent.run`` and prints the answer together with a heuristic
    pass/fail line. The "expected" text of each case is only printed for
    the reader; it is never compared against the answer. Any exception
    raised while handling a question is reported and the loop moves on
    to the next question.
    """
    print("Testing GAIA agent with multimedia questions...")
    print("=" * 80)

    # Initialize agent
    agent = LangGraphAgent()

    # Test questions from the GAIA benchmark that involve multimedia
    test_questions = [
        # YouTube video question
        {
            "question": (
                'In the video https://www.youtube.com/watch?v=1htKBjuUWec, '
                'Verma claims the existence of "a "moat" in the education '
                'system that provides a systemic advantage for those who '
                'know about it and can get into the pipeline." Verma\'s '
                '"moat" is a well-known advantage for students. What is the '
                'four-letter abbreviation used to describe this systemic '
                'advantage?'
            ),
            "expected": "Should extract transcript and find STEM",
        },
        # Image question (should return "Unable to determine")
        {
            "question": "Look at the attached image and tell me what color is the car?",
            "expected": "Unable to determine without access to image files",
        },
        # Excel file question (should return "Unable to determine")
        {
            "question": "What is the sum of all values in column B of the attached Excel file?",
            "expected": "Unable to determine without access to Excel files",
        },
        # Audio question (should return "Unable to determine")
        {
            "question": "What song is playing in the attached audio file?",
            "expected": "Unable to determine without access to audio files",
        },
        # PDF question (should return "Unable to determine")
        {
            "question": "What is written on page 3 of the attached PDF?",
            "expected": "Unable to determine without access to PDF files",
        },
        # Another YouTube question with shortened URL
        {
            "question": "In the YouTube video at https://youtu.be/dQw4w9WgXcQ, what is the main theme?",
            "expected": "Should extract transcript from Rick Astley video",
        },
    ]

    # Test each question
    for i, test_case in enumerate(test_questions, 1):
        question = test_case["question"]
        expected = test_case["expected"]

        print(f"\nTest {i}: {question[:80]}...")
        print(f"Expected behavior: {expected}")

        # Deliberately broad: one failing question must not abort the
        # remaining ones, so every error is reported and the run goes on.
        try:
            # Get the answer
            answer = agent.run(question)
            print(f"Answer: {answer}")

            # Check if multimedia was handled appropriately
            verdict = _verdict_for(question, answer)
            if verdict is not None:
                print(verdict)
        except Exception as e:
            print(f"❌ Error: {type(e).__name__}: {e}")

        print("-" * 80)

    print("\n" + "=" * 80)
    print("Multimedia handling test complete!")
    print("=" * 80)


if __name__ == "__main__":
    # Check for API key
    if not os.getenv("ANTHROPIC_API_KEY"):
        print("Error: ANTHROPIC_API_KEY not found in environment variables")
        print("Please set it in your .env file")
        # sys.exit rather than the site-provided exit(): the latter is an
        # interactive convenience and is not guaranteed to be defined.
        sys.exit(1)

    test_multimedia_questions()