#!/usr/bin/env python3
"""
Comprehensive Accuracy Test - Full GAIA Benchmark Evaluation
Runs all 20 questions through the async batch processor for a complete accuracy assessment
"""

import asyncio
import sys
from pathlib import Path
from datetime import datetime
import json

# Add parent directory to path for imports
sys.path.append(str(Path(__file__).parent.parent))

from tests.async_batch_processor import BatchQuestionProcessor
from gaia_web_loader import GAIAQuestionLoaderWeb
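

# Illustrative sketch (hypothetical helper, never called below): based on how they are
# used in run_comprehensive_accuracy_test(), BatchQuestionProcessor is assumed to expose
# an async process_questions_batch(questions, solver_kwargs=...) coroutine and
# GAIAQuestionLoaderWeb a .questions list of question dicts. A one-question smoke test
# under those assumptions could look like this:
async def _smoke_test_single_question():
    """Run a single GAIA question through the batch processor (illustrative only)."""
    loader = GAIAQuestionLoaderWeb()
    processor = BatchQuestionProcessor(max_concurrent=1, question_timeout=300, progress_interval=15)
    return await processor.process_questions_batch(
        loader.questions[:1],
        solver_kwargs={"use_kluster": True, "kluster_model": "qwen3-235b"}
    )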


async def run_comprehensive_accuracy_test():
    """Run comprehensive accuracy test on all available GAIA questions"""

    print("🎯 COMPREHENSIVE GAIA ACCURACY TEST")
    print("=" * 80)
    print(f"📅 Start Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"🎯 Goal: Establish baseline accuracy and identify improvement areas")
    print()

    try:
        # Load all questions
        print("📋 Loading all GAIA questions...")
        loader = GAIAQuestionLoaderWeb()
        all_questions = loader.questions
        print(f"✅ Loaded {len(all_questions)} questions from GAIA benchmark")

        # Show question distribution by level
        level_counts = {}
        classification_preview = {}

        for q in all_questions:
            level = q.get('Level', 'Unknown')
            level_counts[level] = level_counts.get(level, 0) + 1

            # Quick classification preview (first 5 questions)
            if len(classification_preview) < 5:
                task_id = q.get('task_id', 'unknown')
                question_preview = q.get('question', '')[:60] + "..."
                has_file = "Yes" if q.get('file_name') else "No"
                classification_preview[task_id[:8]] = {
                    'question': question_preview,
                    'level': level,
                    'has_file': has_file
                }

        print(f"\n📊 Question Distribution:")
        for level, count in sorted(level_counts.items()):
            print(f"   Level {level}: {count} questions")

        print(f"\n📝 Sample Questions:")
        for task_id, info in classification_preview.items():
            print(f"   {task_id}... | L{info['level']} | File: {info['has_file']} | {info['question']}")

        # Initialize batch processor with production settings
        print(f"\n🚀 Initializing production-grade batch processor...")
        processor = BatchQuestionProcessor(
            max_concurrent=3,      # Balanced concurrency for stability
            question_timeout=900,  # 15 minutes per question for complex cases
            progress_interval=15   # Progress updates every 15 seconds
        )
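
        # Rough wall-clock math behind these settings (assuming ~3 minutes per question,
        # the same figure used in the estimates printed below): 20 questions * 3 min
        # spread over 3 concurrent workers is about 20 minutes end to end, while
        # question_timeout=900 s caps any single stuck question at 15 minutes.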

        print(f"⚙️ Configuration:")
        print(f"   - Max Concurrent: {processor.max_concurrent}")
        print(f"   - Question Timeout: {processor.question_timeout}s (15 minutes)")
        print(f"   - Progress Interval: {processor.progress_interval}s")
        # Estimate assumes roughly 3 minutes per question, spread across the concurrent workers
        print(f"   - Expected Duration: ~{len(all_questions) * 3 // processor.max_concurrent} minutes")

        # Confirm before starting
        print(f"\n⚠️ This will process ALL {len(all_questions)} questions concurrently.")
        print(f"🕐 Estimated time: {len(all_questions) * 3 // processor.max_concurrent} minutes")
        print(f"🚀 Starting comprehensive accuracy test...")
        print()

        # Process all questions
        start_time = datetime.now()
        results = await processor.process_questions_batch(
            all_questions,
            solver_kwargs={
                "use_kluster": True,
                "kluster_model": "qwen3-235b"
            }
        )
        end_time = datetime.now()
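
        # Expected shape of `results` (inferred from the accesses below; the exact schema
        # is defined by BatchQuestionProcessor, so treat this as an assumption):
        #   {
        #       "completed_questions": int,
        #       "accuracy_metrics": {"accuracy_rate", "success_rate", "correct_answers",
        #                            "partial_answers", "incorrect_answers", "timeouts", "errors"},
        #       "performance_metrics": {"average_duration"},
        #       "detailed_results": [objects with .task_id, .classification, .status,
        #                            .accuracy_score, .our_answer, .expected_answer,
        #                            .total_duration, .error_type],
        #   }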

        # Comprehensive results analysis
        print(f"\n" + "=" * 80)
        print(f"📊 COMPREHENSIVE TEST RESULTS")
        print(f"=" * 80)

        duration = (end_time - start_time).total_seconds()
        accuracy = results["accuracy_metrics"]["accuracy_rate"]
        success = results["accuracy_metrics"]["success_rate"]

        print(f"⏱️ Total Duration: {int(duration // 60)}m {int(duration % 60)}s")
        print(f"✅ Overall Accuracy: {accuracy:.1%} ({results['accuracy_metrics']['correct_answers']}/{results['completed_questions']})")
        print(f"🎯 Success Rate: {success:.1%} (including partial matches)")
        print(f"⚡ Average per Question: {results['performance_metrics']['average_duration']:.1f}s")

        # Detailed breakdown
        print(f"\n📋 DETAILED BREAKDOWN:")
        print(f"   ✅ CORRECT: {results['accuracy_metrics']['correct_answers']}")
        print(f"   🟡 PARTIAL: {results['accuracy_metrics']['partial_answers']}")
        print(f"   ❌ INCORRECT: {results['accuracy_metrics']['incorrect_answers']}")
        print(f"   ⏱️ TIMEOUT: {results['accuracy_metrics']['timeouts']}")
        print(f"   💥 ERROR: {results['accuracy_metrics']['errors']}")

        # Classification performance analysis
        print(f"\n🎯 CLASSIFICATION PERFORMANCE:")
        classification_performance = {}

        for result in results["detailed_results"]:
            classification = result.classification
            if classification not in classification_performance:
                classification_performance[classification] = {
                    'total': 0, 'correct': 0, 'partial': 0, 'incorrect': 0
                }

            classification_performance[classification]['total'] += 1
            if result.status == 'CORRECT':
                classification_performance[classification]['correct'] += 1
            elif result.status == 'PARTIAL':
                classification_performance[classification]['partial'] += 1
            elif result.status == 'INCORRECT':
                classification_performance[classification]['incorrect'] += 1

        # Sort by accuracy for prioritization
        sorted_classifications = sorted(
            classification_performance.items(),
            key=lambda x: (x[1]['correct'] + x[1]['partial'] * 0.5) / x[1]['total'] if x[1]['total'] > 0 else 0
        )
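
        # Worked example of the prioritization score used above: a classification with
        # 3 correct and 2 partial answers out of 10 questions scores (3 + 2 * 0.5) / 10 = 0.40,
        # so it sorts ahead of one scoring 0.60 and appears earlier (as worse-performing)
        # in the listing printed next.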
        for classification, perf in sorted_classifications:
            total = perf['total']
            if total > 0:
                accuracy_rate = perf['correct'] / total
                success_rate = (perf['correct'] + perf['partial']) / total
                print(f"   {classification:15} | {accuracy_rate:.1%} acc | {success_rate:.1%} success | {total:2d} questions")

        # Identify improvement priorities
        print(f"\n🔧 IMPROVEMENT PRIORITIES:")
        improvement_priorities = []

        for classification, perf in sorted_classifications:
            total = perf['total']
            if total > 0:
                accuracy_rate = perf['correct'] / total
                impact_score = total * (1 - accuracy_rate)  # Questions * failure rate

                if accuracy_rate < 0.7:  # Less than 70% accuracy
                    priority = "HIGH" if impact_score > 2 else "MEDIUM"
                    improvement_priorities.append({
                        'classification': classification,
                        'accuracy': accuracy_rate,
                        'total_questions': total,
                        'impact_score': impact_score,
                        'priority': priority
                    })

        # Use item-scoped names here so the overall `accuracy` computed above is not overwritten
        for priority_item in sorted(improvement_priorities, key=lambda x: x['impact_score'], reverse=True):
            item_classification = priority_item['classification']
            item_accuracy = priority_item['accuracy']
            item_total = priority_item['total_questions']
            item_priority = priority_item['priority']
            item_impact = priority_item['impact_score']
            print(f"   🔥 {item_priority:6} | {item_classification:15} | {item_accuracy:.1%} accuracy | {item_total} questions | Impact: {item_impact:.1f}")
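
        # Example of the impact heuristic above: 6 questions at 50% accuracy give an impact
        # of 6 * (1 - 0.5) = 3.0, which exceeds the threshold of 2 and is flagged HIGH, while
        # 2 questions at 50% accuracy give 1.0 and stay MEDIUM.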

        # Save detailed results
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        results_file = f"logs/comprehensive_accuracy_test_{timestamp}.json"
        Path("logs").mkdir(parents=True, exist_ok=True)  # ensure the logs/ directory exists before writing

        with open(results_file, 'w') as f:
            json.dump({
                'test_metadata': {
                    'timestamp': timestamp,
                    'total_questions': len(all_questions),
                    'duration_seconds': duration,
                    'configuration': {
                        'max_concurrent': processor.max_concurrent,
                        'question_timeout': processor.question_timeout,
                        'model': 'qwen3-235b'
                    }
                },
                'overall_metrics': results['accuracy_metrics'],
                'classification_performance': classification_performance,
                'improvement_priorities': improvement_priorities,
                'detailed_results': [
                    {
                        'task_id': r.task_id,
                        'classification': r.classification,
                        'status': r.status,
                        'accuracy_score': r.accuracy_score,
                        'our_answer': r.our_answer,
                        'expected_answer': r.expected_answer,
                        'duration': r.total_duration,
                        'error_type': r.error_type
                    } for r in results['detailed_results']
                ]
            }, f, indent=2)
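
        # The saved report can be reloaded later for offline analysis, e.g. (illustrative,
        # assuming the file written above):
        #
        #   with open(results_file) as f:
        #       report = json.load(f)
        #   print(report['overall_metrics']['accuracy_rate'])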

        print(f"\n📄 Detailed results saved to: {results_file}")

        # Summary and next steps
        print(f"\n🎯 NEXT STEPS RECOMMENDATION:")
        if accuracy >= 0.9:
            print(f"   🎉 EXCELLENT: {accuracy:.1%} accuracy achieved! Focus on edge cases.")
        elif accuracy >= 0.7:
            print(f"   ✅ GOOD: {accuracy:.1%} accuracy. Target specific classifications for 90%+.")
        elif accuracy >= 0.5:
            print(f"   🔧 MODERATE: {accuracy:.1%} accuracy. Implement targeted improvements.")
        else:
            print(f"   🚨 NEEDS WORK: {accuracy:.1%} accuracy. Focus on high-impact areas.")

        if improvement_priorities:
            top_priority = improvement_priorities[0]
            print(f"   🎯 TOP PRIORITY: {top_priority['classification']} ({top_priority['accuracy']:.1%} accuracy, {top_priority['total_questions']} questions)")

        return results

    except Exception as e:
        print(f"❌ Comprehensive test failed: {e}")
        import traceback
        traceback.print_exc()
        return None


async def main():
    """Run the comprehensive accuracy test"""
    results = await run_comprehensive_accuracy_test()

    if results:
        accuracy = results["accuracy_metrics"]["accuracy_rate"]
        print(f"\n🎉 Comprehensive accuracy test completed!")
        print(f"📊 Final Accuracy: {accuracy:.1%}")

        if accuracy >= 0.7:
            print(f"🎯 TARGET ACHIEVED: 70%+ accuracy reached!")
        else:
            gap = 0.7 - accuracy
            print(f"🔧 GAP TO TARGET: {gap:.1%} improvement needed for 70%")


if __name__ == "__main__":
    asyncio.run(main())
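
# Typical invocation (the path is illustrative; adjust it to wherever this file lives in
# the repository): run `python tests/test_comprehensive_accuracy.py` from the project root.
# The run assumes network access for GAIAQuestionLoaderWeb and whatever Kluster
# (use_kluster / kluster_model) configuration the solver expects for qwen3-235b.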