# Spaces: Sleeping  (page-status residue captured by the web scrape; not part of the script)
| #!/usr/bin/env python3 | |
| """ | |
| Validate all GAIA questions with our multi-agent system | |
| """ | |
| import json | |
| import time | |
| from typing import Dict, List | |
| from gaia_web_loader import GAIAQuestionLoaderWeb | |
| from main import GAIASolver | |
| from question_classifier import QuestionClassifier | |
def solve_all_questions_with_validation(loader=None, classifier=None, solver=None,
                                        *, delay_seconds: float = 1.0) -> List[Dict]:
    """Classify and solve every loaded GAIA question, collecting one record each.

    Each question is first classified, then solved. A failure in either phase
    is recorded as an error entry and processing continues with the next
    question, so one bad question never aborts the whole run.

    Args:
        loader: Object exposing a ``questions`` sequence of question dicts
            (keys read here: ``task_id``, ``question``, ``file_name``).
            Defaults to a new ``GAIAQuestionLoaderWeb()``.
        classifier: Object providing ``classify_question(text, file_name)``
            and ``get_routing_recommendation(classification)``. Defaults to a
            new ``QuestionClassifier()``.
        solver: Object providing ``solve_question(question_data)``. Defaults
            to a new ``GAIASolver()``.
        delay_seconds: Pause after each question to avoid overwhelming the
            backing APIs. Values <= 0 disable the pause.

    Returns:
        One dict per question, in input order. Successful entries have
        ``status == 'completed'`` and include ``classification``, ``routing``,
        ``answer`` and ``solve_time``. Failed entries have
        ``status == 'error'``, ``solve_time == 0``, an ``"Error: ..."`` answer
        and the raw classification if that phase succeeded (otherwise ``None``).
    """
    print("π§ͺ COMPREHENSIVE GAIA VALIDATION - ALL 20 QUESTIONS")
    print("=" * 70)

    # Initialize components (only those the caller did not supply)
    print("π Initializing multi-agent system...")
    if loader is None:
        loader = GAIAQuestionLoaderWeb()
    if classifier is None:
        classifier = QuestionClassifier()
    if solver is None:
        solver = GAIASolver()

    questions = loader.questions
    total = len(questions)
    results = []
    print(f"π Found {total} questions to solve")

    for i, question_data in enumerate(questions, 1):
        task_id = question_data.get('task_id', 'unknown')
        question_text = question_data.get('question', '')
        file_name = question_data.get('file_name', '')
        # Reset per question: if classification fails, the error record must
        # not inherit the previous question's classification.
        classification = None

        print(f"\n{'='*60}")
        # Report progress against the real question count, not a fixed 20.
        print(f"QUESTION {i}/{total}: {task_id[:8]}...")
        print(f"{'='*60}")

        try:
            # Classification phase
            print(f"π§  CLASSIFICATION:")
            classification = classifier.classify_question(question_text, file_name)
            routing = classifier.get_routing_recommendation(classification)
            print(f" Primary Agent: {classification['primary_agent']}")
            print(f" Secondary: {classification.get('secondary_agents', [])}")
            print(f" Complexity: {classification['complexity']}/5")
            print(f" Confidence: {classification['confidence']:.3f}")

            # Solving phase
            print(f"\nπ€ SOLVING:")
            print(f" Question: {question_text[:100]}...")
            if file_name:
                print(f" File: {file_name}")
            start_time = time.time()
            answer = solver.solve_question(question_data)
            solve_time = time.time() - start_time
            print(f" β Answer: {answer[:100]}...")
            print(f" β±οΈ Time: {solve_time:.1f}s")

            # Store results
            results.append({
                'question_id': task_id,
                'question': question_text,
                'file_name': file_name,
                'classification': {
                    'primary_agent': classification['primary_agent'],
                    'secondary_agents': classification.get('secondary_agents', []),
                    'complexity': classification['complexity'],
                    'confidence': classification['confidence'],
                    'tools_needed': classification.get('tools_needed', []),
                },
                'routing': {
                    'coordination_needed': routing['requires_coordination'],
                    'duration_estimate': routing['estimated_duration'],
                },
                'answer': answer,
                'solve_time': solve_time,
                'status': 'completed',
            })
        except Exception as e:
            # Deliberate per-question boundary: record the failure and move on
            # so the remaining questions are still attempted.
            print(f" β Error: {e}")
            results.append({
                'question_id': task_id,
                'question': question_text,
                'file_name': file_name,
                'classification': classification,
                'answer': f"Error: {str(e)}",
                'solve_time': 0,
                'status': 'error',
            })

        # Small delay to avoid overwhelming APIs
        if delay_seconds > 0:
            time.sleep(delay_seconds)

    return results
def analyze_results(results: List[Dict]):
    """Print a summary of a validation run to stdout.

    The report covers overall completion/error rates, the average solve time
    of completed questions, the distribution of primary agents and complexity
    levels among classified questions, the mean classification confidence,
    and a one-line breakdown per question.

    Args:
        results: Result records as produced by
            ``solve_all_questions_with_validation``. Each needs ``status``,
            ``question_id`` and ``answer``; completed records also need
            ``solve_time``. ``classification`` may be missing or ``None``.

    Returns:
        None. An empty ``results`` list prints a short notice instead of
        raising ``ZeroDivisionError`` on the percentage calculations.
    """
    print(f"\nπ COMPREHENSIVE RESULTS ANALYSIS")
    print("=" * 70)

    total_questions = len(results)
    if total_questions == 0:
        # Every percentage below divides by the total; bail out early.
        print(" No results to analyze")
        return

    # Filter the completed subset once and reuse it for count and average.
    completed_results = [r for r in results if r['status'] == 'completed']
    completed = len(completed_results)
    errors = sum(1 for r in results if r['status'] == 'error')

    print(f"π OVERALL STATISTICS:")
    print(f" Total Questions: {total_questions}")
    print(f" Successfully Solved: {completed} ({completed/total_questions*100:.1f}%)")
    print(f" Errors: {errors} ({errors/total_questions*100:.1f}%)")
    if completed_results:
        avg_time = sum(r['solve_time'] for r in completed_results) / completed
        print(f" Average Solve Time: {avg_time:.1f}s")

    # Classification analysis
    print(f"\nπ― CLASSIFICATION ANALYSIS:")
    agent_counts = {}
    complexity_counts = {}
    confidence_scores = []
    for result in results:
        classification = result.get('classification')
        if not classification:
            # Questions whose classification failed carry None here.
            continue
        primary = classification['primary_agent']
        agent_counts[primary] = agent_counts.get(primary, 0) + 1
        complexity = classification['complexity']
        complexity_counts[complexity] = complexity_counts.get(complexity, 0) + 1
        confidence_scores.append(classification['confidence'])

    # Percentages are relative to all questions, including unclassified ones.
    print(f" Agent Distribution:")
    for agent, count in sorted(agent_counts.items()):
        percentage = (count / total_questions) * 100
        print(f" {agent}: {count} questions ({percentage:.1f}%)")

    print(f" Complexity Distribution:")
    for complexity, count in sorted(complexity_counts.items()):
        percentage = (count / total_questions) * 100
        print(f" Level {complexity}: {count} questions ({percentage:.1f}%)")

    if confidence_scores:
        avg_confidence = sum(confidence_scores) / len(confidence_scores)
        print(f" Average Classification Confidence: {avg_confidence:.3f}")

    # Question type analysis
    print(f"\nπ QUESTION BREAKDOWN:")
    for i, result in enumerate(results, 1):
        status_emoji = "β " if result['status'] == 'completed' else "β"
        task_id = result['question_id'][:8]
        classification = result.get('classification')
        primary_agent = classification['primary_agent'] if classification else 'unknown'
        answer = result['answer']
        # Keep the breakdown to one line per question.
        answer_preview = answer[:50] + "..." if len(answer) > 50 else answer
        print(f" {i:2d}. {status_emoji} {task_id}... [{primary_agent}] {answer_preview}")
def save_results(results: List[Dict], output_file: str = "gaia_validation_results.json"):
    """Write the result records to a JSON file for later comparison.

    Args:
        results: JSON-serialisable result records.
        output_file: Destination path. Defaults to
            ``gaia_validation_results.json`` in the current working directory.

    Raises:
        OSError: If the file cannot be opened or written.
        TypeError: If ``results`` contains a value ``json`` cannot serialise.
    """
    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened as UTF-8 explicitly; the platform default encoding may not be
    # able to represent them.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\nπΎ Results saved to: {output_file}")
    print(f"π Use this file to compare with official GAIA answers")
def main():
    """Run the full validation pipeline: solve, summarise, then persist."""
    opening_lines = (
        "π― Starting comprehensive GAIA validation...",
        "β οΈ This will take several minutes to complete all 20 questions",
    )
    for line in opening_lines:
        print(line)

    # Solve every question, then report on and store the same result list.
    outcomes = solve_all_questions_with_validation()
    analyze_results(outcomes)
    save_results(outcomes)

    closing_lines = (
        "\nβ VALIDATION COMPLETE!",
        "π Check gaia_validation_results.json for detailed results",
        "π Compare answers with official GAIA dataset when available",
    )
    for line in closing_lines:
        print(line)
# Script entry point: run the validation only on direct execution, not on import.
if __name__ == "__main__":
    main()