Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Validate our multi-agent system answers against known GAIA results | |
| """ | |
| import json | |
| import requests | |
| from gaia_web_loader import GAIAQuestionLoaderWeb | |
| from main import GAIASolver | |
| from question_classifier import QuestionClassifier | |
# Hand-collected reference data for a few GAIA validation questions, keyed by
# question id. Each entry stores the question text, the answer we expect, the
# answer our system produced, and the question category.
# Entries flagged "unverified" carry an expected answer that has not been
# confirmed yet.
KNOWN_ANSWERS = {
    "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8": {
        "question": (
            "Who nominated the only Featured Article on English Wikipedia "
            "about a dinosaur that was promoted in November 2016?"
        ),
        "expected_answer": "FunkMonk",  # unverified: still needs checking
        "our_answer": "JuraForm",
        "category": "research",
    },
    "2d83110e-a098-4ebb-9987-066c06fa42d0": {
        # The question text is intentionally written backwards.
        "question": (
            '.rewsna eht sa "tfel" drow eht fo etisoppo eht etirw '
            ",ecnetnes siht dnatsrednu uoy fI"
        ),
        "expected_answer": "right",
        "our_answer": "right",
        "category": "logic_math",
    },
    "cca530fc-4052-43b2-b130-b30968d8aa44": {
        "question": (
            "Review the chess position provided in the image. It is black's "
            "turn. Provide the correct next move for black which guarantees "
            "a win. Please provide your response in algebraic notation."
        ),
        # unverified: needs confirming with actual chess analysis
        "expected_answer": "Qxg2#",
        "our_answer": "Qxg2#",
        "category": "multimedia",
    },
}
def validate_answer(question_id: str, our_answer: str, expected_answer: str) -> dict:
    """Grade a produced answer against the expected answer.

    Both answers are converted with ``str()``, stripped of surrounding
    whitespace and lower-cased before any comparison is made.

    Args:
        question_id: Identifier of the question. Accepted so callers can pass
            it along, but it does not take part in the comparison.
        our_answer: The answer produced by the system.
        expected_answer: The reference answer to grade against.

    Returns:
        A dict with the keys:

        * ``exact_match`` -- the normalized answers are equal.
        * ``contains_match`` -- the answers are equal, or both are non-empty
          and one normalized answer is a substring of the other.
        * ``similarity_score`` -- number of shared whitespace-separated words
          divided by the size of the larger word set (0.0 when there are no
          words at all).
        * ``our_answer`` / ``expected_answer`` -- the values as passed in.
        * ``status`` -- ``"CORRECT"`` on an exact match, ``"PARTIAL"`` on a
          contains match, otherwise ``"INCORRECT"``.
    """
    ours = str(our_answer).strip().lower()
    expected = str(expected_answer).strip().lower()

    exact_match = ours == expected

    # The empty string is a substring of every string, so without the
    # non-empty guard a blank answer would be graded as a partial match
    # against any expected answer (and vice versa).
    contains_match = exact_match or (
        bool(ours)
        and bool(expected)
        and (expected in ours or ours in expected)
    )

    # Rough word-overlap score; the trailing 1 in max() avoids dividing by
    # zero when both answers contain no words.
    our_words = set(ours.split())
    expected_words = set(expected.split())
    largest_set = max(len(our_words), len(expected_words), 1)
    similarity = len(our_words & expected_words) / largest_set

    if exact_match:
        status = "CORRECT"
    elif contains_match:
        status = "PARTIAL"
    else:
        status = "INCORRECT"

    return {
        "exact_match": exact_match,
        "contains_match": contains_match,
        "similarity_score": similarity,
        "our_answer": our_answer,
        "expected_answer": expected_answer,
        "status": status,
    }
def test_validation_system():
    """Print a validation report for every entry in ``KNOWN_ANSWERS``.

    For each known question the stored ``our_answer`` is graded against the
    stored ``expected_answer`` with ``validate_answer`` and the individual
    result is printed. After all questions, a summary with the counts and
    percentages of correct, partial and incorrect answers is printed.

    Returns nothing; all results go to standard output.
    """
    rule = "=" * 60
    print("π§ͺ GAIA ANSWER VALIDATION SYSTEM")
    print(rule)

    question_total = len(KNOWN_ANSWERS)
    # Only these two statuses are counted; anything else is "incorrect" and
    # is derived from the total in the summary.
    tally = {"CORRECT": 0, "PARTIAL": 0}
    banners = {"CORRECT": "β CORRECT!", "PARTIAL": "π‘ PARTIAL MATCH"}
    # Result fields that are printed verbatim, in display order.
    plain_fields = (
        ("Our Answer", "our_answer"),
        ("Expected", "expected_answer"),
        ("Status", "status"),
        ("Exact Match", "exact_match"),
        ("Contains Match", "contains_match"),
    )

    for qid, entry in KNOWN_ANSWERS.items():
        print(f"\nπ Testing Question: {qid[:8]}...")
        print(f"Category: {entry['category']}")
        print(f"Question: {entry['question'][:80]}...")

        report = validate_answer(
            qid,
            entry['our_answer'],
            entry['expected_answer'],
        )

        print("\nπ VALIDATION RESULTS:")
        for label, key in plain_fields:
            print(f"{label}: {report[key]}")
        print(f"Similarity: {report['similarity_score']:.2f}")

        outcome = report['status']
        if outcome in tally:
            tally[outcome] += 1
        print(banners.get(outcome, "β INCORRECT"))

    def share(count):
        # Percentage of all tested questions, one decimal place.
        return f"{count/question_total*100:.1f}%"

    n_correct = tally["CORRECT"]
    n_partial = tally["PARTIAL"]

    print("\nπ OVERALL VALIDATION SUMMARY:")
    print(rule)
    print(f"Total Questions Tested: {question_total}")
    print(f"Correct Answers: {n_correct} ({share(n_correct)})")
    print(f"Partial Matches: {n_partial} ({share(n_partial)})")
    print(f"Incorrect: {question_total - n_correct - n_partial}")
    print(f"Overall Success Rate: {share(n_correct + n_partial)}")
def research_correct_answer():
    """Print research notes for the Wikipedia dinosaur question.

    Writes a fixed checklist to standard output: the question text, the steps
    and strategy for finding the answer, the answer our system gave
    ("JuraForm"), and an alternative approach. Nothing is looked up or
    computed; the function only prints and returns nothing.
    """
    notes = (
        "\nπ RESEARCHING CORRECT ANSWER FOR DINOSAUR QUESTION",
        "=" * 60,
        "Question: Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?",
        "\nπ΅οΈ Research Process:",
        "1. Need to find Featured Articles promoted in November 2016",
        "2. Identify which one was about a dinosaur",
        "3. Find the nominator",
        "\nπ‘ Research Strategy:",
        "- Check Wikipedia's Featured Article log for November 2016",
        "- Look for dinosaur-related articles promoted that month",
        "- Find nomination information",
        "\nπ€ Our Answer: JuraForm",
        "β Need to verify: Was this correct?",
        "\nπ Alternative Research Approach:",
        "- Search for 'Spinosaurus' article on Wikipedia",
        "- Check its promotion history",
        "- Verify nomination details",
    )
    # One print call per note keeps the output identical to printing each
    # line individually.
    for note in notes:
        print(note)
if __name__ == "__main__":
    # When run as a script: print the validation report first, then the
    # research notes for the dinosaur question.
    for step in (test_validation_system, research_correct_answer):
        step()