#!/usr/bin/env python3
"""
Run comprehensive GAIA tests across all classification groups.

This script orchestrates the complete testing workflow and analysis.
"""
import subprocess
import time
import json
from pathlib import Path
from datetime import datetime

def run_command(command, description, timeout=1800):
    """Run a command with timeout and capture output"""
    print(f"\n🚀 {description}")
    print(f"Command: {command}")
    print("-" * 60)

    try:
        result = subprocess.run(
            command,
            shell=True,
            capture_output=True,
            text=True,
            timeout=timeout
        )
        if result.returncode == 0:
            print("✅ SUCCESS")
            print(f"Output: {result.stdout[:500]}...")
            return True, result.stdout
        else:
            print("❌ FAILED")
            print(f"Error: {result.stderr[:500]}...")
            return False, result.stderr
    except subprocess.TimeoutExpired:
        print(f"⏰ TIMEOUT after {timeout}s")
        return False, "Command timed out"
    except Exception as e:
        print(f"💥 EXCEPTION: {e}")
        return False, str(e)
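
# Example (illustrative):
#   ok, out = run_command("echo hello", "smoke test", timeout=10)
# prints the command banner, then "✅ SUCCESS" with the captured output,
# and returns (True, "hello\n").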

def main():
    """Run comprehensive testing workflow"""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    print("🎯 COMPREHENSIVE GAIA TESTING WORKFLOW")
    print("=" * 70)
    print(f"Started: {datetime.now()}")

    # Command prefix that activates the virtual environment. Use the POSIX
    # '.' builtin rather than 'source': subprocess with shell=True runs
    # /bin/sh, which on many systems (e.g. dash) does not support 'source'.
    venv_prefix = ". venv/bin/activate &&"

    # Test plan - run each agent type separately for better error analysis
    test_plan = [
        {
            "name": "Research Questions",
            "command": f"{venv_prefix} python tests/test_by_classification.py --agent-types research",
            "timeout": 1800,
            "priority": "HIGH"
        },
        {
            "name": "Multimedia Questions",
            "command": f"{venv_prefix} python tests/test_by_classification.py --agent-types multimedia",
            "timeout": 2400,
            "priority": "HIGH"
        },
        {
            "name": "Logic/Math Questions",
            "command": f"{venv_prefix} python tests/test_by_classification.py --agent-types logic_math",
            "timeout": 1200,
            "priority": "MEDIUM"
        },
        {
            "name": "File Processing Questions",
            "command": f"{venv_prefix} python tests/test_by_classification.py --agent-types file_processing",
            "timeout": 900,
            "priority": "MEDIUM"
        },
        {
            "name": "All Agent Types (Complete)",
            "command": f"{venv_prefix} python tests/test_by_classification.py",
            "timeout": 3600,
            "priority": "LOW"
        }
    ]

    results = []

    # Execute test plan
    for i, test in enumerate(test_plan, 1):
        print(f"\n{'='*20} TEST {i}/{len(test_plan)} {'='*20}")
        print(f"Name: {test['name']}")
        print(f"Priority: {test['priority']}")

        start_time = time.time()
        success, output = run_command(
            test['command'],
            test['name'],
            test['timeout']
        )
        end_time = time.time()

        result = {
            'test_name': test['name'],
            'command': test['command'],
            'priority': test['priority'],
            'success': success,
            'duration': end_time - start_time,
            'output_preview': output[:200] if output else "",
            'timestamp': datetime.now().isoformat()
        }
        results.append(result)

        # Brief pause between tests
        time.sleep(5)

    # Generate summary report
    print("\n📊 COMPREHENSIVE TEST SUMMARY")
    print("=" * 70)

    total_tests = len(test_plan)
    successful_tests = len([r for r in results if r['success']])
    failed_tests = total_tests - successful_tests

    print(f"Total Tests: {total_tests}")
    print(f"Successful: {successful_tests} ({successful_tests/total_tests*100:.1f}%)")
    print(f"Failed: {failed_tests} ({failed_tests/total_tests*100:.1f}%)")

    print("\n📋 DETAILED RESULTS:")
    for result in results:
        status = "✅" if result['success'] else "❌"
        duration = result['duration']
        print(f"  {status} {result['test_name']}: {duration:.1f}s ({result['priority']} priority)")

    # Save comprehensive results
    results_file = f"comprehensive_test_results_{timestamp}.json"
    with open(results_file, 'w') as f:
        json.dump({
            'metadata': {
                'timestamp': timestamp,
                'total_tests': total_tests,
                'successful_tests': successful_tests,
                'failed_tests': failed_tests,
                'success_rate': successful_tests/total_tests*100
            },
            'test_results': results
        }, f, indent=2)

    print(f"\n💾 Results saved to: {results_file}")

    # Generate action items based on results
    print("\n📋 NEXT STEPS:")
    high_priority_failures = [r for r in results if not r['success'] and r['priority'] == 'HIGH']
    if high_priority_failures:
        print("🔴 HIGH PRIORITY FIXES NEEDED:")
        for failure in high_priority_failures:
            print(f"  - Fix {failure['test_name']}")
            print(f"    Command: {failure['command']}")

    medium_priority_failures = [r for r in results if not r['success'] and r['priority'] == 'MEDIUM']
    if medium_priority_failures:
        print("🟡 MEDIUM PRIORITY IMPROVEMENTS:")
        for failure in medium_priority_failures:
            print(f"  - Optimize {failure['test_name']}")

    if successful_tests == total_tests:
        print("🎉 ALL TESTS PASSED! Ready for production use.")
        print("💡 Consider running specific error analysis on individual results files")

    # Find the most recent results files for analysis
    log_files = list(Path("logs").glob("classification_test_*.log"))
    if log_files:
        latest_log = max(log_files, key=lambda x: x.stat().st_mtime)
        print(f"📄 Latest log file: {latest_log}")

    result_files = list(Path(".").glob("gaia_classification_test_results_*.json"))
    if result_files:
        latest_results = max(result_files, key=lambda x: x.stat().st_mtime)
        print(f"📊 Latest results: {latest_results}")
        print(f"🔍 Analyze with: python tests/analyze_test_results.py {latest_results}")
| print(f"\nβ COMPREHENSIVE TESTING COMPLETE!") | |
| print(f"Total Duration: {sum(r['duration'] for r in results):.1f}s") | |
| if __name__ == "__main__": | |
| main() |