#!/usr/bin/env python3
"""
Analyze GAIA test results and generate specific improvement recommendations
"""

import json
import argparse
from pathlib import Path
from collections import defaultdict, Counter
from typing import Dict, List, Optional
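
# NOTE (sketch, not authoritative): the structure of the results JSON below is
# inferred from the fields this script reads; the values are purely illustrative.
#
# {
#   "overall_stats": {"total_questions": 20, "success_rate": 85.0,
#                     "successful": 17, "errors": 3},
#   "agent_performance": {
#     "research": {"success_rate": 80.0, "successful": 4,
#                  "total_questions": 5, "average_solve_time": 12.3}
#   },
#   "error_patterns": {
#     "research": [{"error_type": "TIMEOUT", "question_id": "abc12345",
#                   "question_preview": "..."}]
#   },
#   "detailed_results": [
#     {"status": "error", "agent_type": "research", "question": "..."}
#   ]
# }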


class GAIAResultsAnalyzer:
    """Analyze test results and generate actionable improvement recommendations"""

    def __init__(self, results_file: str):
        self.results_file = results_file
        self.results_data = self.load_results()

    def load_results(self) -> Dict:
        """Load test results from JSON file"""
        try:
            with open(self.results_file, 'r') as f:
                return json.load(f)
        except FileNotFoundError:
            print(f"❌ Results file not found: {self.results_file}")
            return {}
        except json.JSONDecodeError:
            print(f"❌ Invalid JSON in results file: {self.results_file}")
            return {}

    def analyze_overall_performance(self):
        """Analyze overall testing performance"""
        if not self.results_data:
            return

        print("📊 OVERALL PERFORMANCE ANALYSIS")
        print("=" * 50)

        overall_stats = self.results_data.get('overall_stats', {})
        agent_performance = self.results_data.get('agent_performance', {})

        print(f"Total Questions: {overall_stats.get('total_questions', 0)}")
        print(f"Success Rate: {overall_stats.get('success_rate', 0):.1f}%")
        print(f"Successful: {overall_stats.get('successful', 0)}")
        print(f"Errors: {overall_stats.get('errors', 0)}")

        print(f"\n🎯 AGENT PERFORMANCE BREAKDOWN:")
        for agent_type, stats in sorted(agent_performance.items(), key=lambda x: x[1]['success_rate'], reverse=True):
            success_rate = stats['success_rate']
            status_emoji = "🟢" if success_rate >= 90 else "🟡" if success_rate >= 70 else "🔴"
            print(f"  {status_emoji} {agent_type}: {success_rate:.1f}% ({stats['successful']}/{stats['total_questions']})")
            if stats['average_solve_time'] > 0:
                print(f"     Average Time: {stats['average_solve_time']:.1f}s")

    def analyze_error_patterns(self):
        """Analyze error patterns across all agent types"""
        print(f"\n🔍 ERROR PATTERN ANALYSIS")
        print("=" * 50)

        error_patterns = self.results_data.get('error_patterns', {})
        if not error_patterns:
            print("🎉 No error patterns found!")
            return

        # Aggregate error types across all agents
        all_error_types = Counter()
        for agent_type, errors in error_patterns.items():
            print(f"\n🚨 {agent_type.upper()} ERRORS:")
            agent_error_types = Counter()
            for error in errors:
                error_type = error.get('error_type', 'UNKNOWN')
                agent_error_types[error_type] += 1
                all_error_types[error_type] += 1
            for error_type, count in agent_error_types.most_common():
                print(f"  - {error_type}: {count} occurrences")

        print(f"\n📈 MOST COMMON ERROR TYPES (All Agents):")
        for error_type, count in all_error_types.most_common(5):
            print(f"  {count}× {error_type}")

    def generate_specific_improvements(self):
        """Generate specific, actionable improvement recommendations"""
        print(f"\n💡 SPECIFIC IMPROVEMENT RECOMMENDATIONS")
        print("=" * 50)

        agent_performance = self.results_data.get('agent_performance', {})
        error_patterns = self.results_data.get('error_patterns', {})
        detailed_results = self.results_data.get('detailed_results', [])

        # Analyze each agent type
        for agent_type, stats in agent_performance.items():
            success_rate = stats['success_rate']
            print(f"\n🎯 {agent_type.upper()} AGENT IMPROVEMENTS:")

            if success_rate >= 95:
                print(f"   ✅ Excellent performance! Focus on optimization:")
                print(f"   - Fine-tune prompts for edge cases")
                print(f"   - Optimize solve time (current: {stats.get('average_solve_time', 0):.1f}s)")
            elif success_rate >= 80:
                print(f"   🟡 Good performance with improvement opportunities:")
                self.suggest_improvements_for_agent(agent_type, error_patterns.get(agent_type, []), detailed_results)
            elif success_rate >= 60:
                print(f"   🟠 Moderate performance - needs attention:")
                self.suggest_improvements_for_agent(agent_type, error_patterns.get(agent_type, []), detailed_results)
                print(f"   - Consider prompt engineering review")
                print(f"   - Add more robust error handling")
            else:
                print(f"   🔴 Poor performance - requires major overhaul:")
                self.suggest_improvements_for_agent(agent_type, error_patterns.get(agent_type, []), detailed_results)
                print(f"   - Review agent architecture and tool selection")
                print(f"   - Consider multi-agent coordination")
                print(f"   - Implement comprehensive testing for this agent type")

    def suggest_improvements_for_agent(self, agent_type: str, errors: List[Dict], all_results: List[Dict]):
        """Generate specific improvement suggestions for an agent type"""
        if not errors:
            print(f"   - No specific errors to address")
            return

        # Analyze error types for this agent
        error_type_counts = Counter()
        specific_errors = defaultdict(list)
        for error in errors:
            error_type = error.get('error_type', 'UNKNOWN')
            error_type_counts[error_type] += 1
            specific_errors[error_type].append(error)

        # Generate specific fixes for top error types
        for error_type, count in error_type_counts.most_common(3):
            print(f"   - Fix {error_type} errors ({count} occurrences):")
            self.suggest_fix_for_error_type(error_type, specific_errors[error_type])

    def suggest_fix_for_error_type(self, error_type: str, specific_errors: List[Dict]):
        """Suggest specific fixes for error types with examples"""
        fixes = {
            'API_OVERLOAD': [
                "Implement exponential backoff with retry logic",
                "Add multiple API endpoint fallbacks",
                "Implement request queuing and rate limiting"
            ],
            'TIMEOUT': [
                "Increase timeout limits in API calls",
                "Implement progress tracking for long operations",
                "Break down complex operations into smaller steps"
            ],
            'AUTHENTICATION': [
                "Verify all API keys are correctly configured",
                "Add API key validation at startup",
                "Implement automatic token refresh mechanisms"
            ],
            'WIKIPEDIA_TOOL': [
                "Enhance Wikipedia search with multiple search strategies",
                "Add fallback to direct HTTP requests",
                "Improve article name parsing and disambiguation"
            ],
            'CHESS_TOOL': [
                "Enhance FEN notation validation and correction",
                "Add multiple chess engine backends",
                "Implement position verification with multiple tools"
            ],
            'EXCEL_TOOL': [
                "Add support for more Excel formats (.xlsb, .csv)",
                "Implement better column detection algorithms",
                "Add data validation and error recovery"
            ],
            'VIDEO_TOOL': [
                "Implement video size and duration limits",
                "Add fallback to frame-only analysis",
                "Improve audio extraction and transcription"
            ],
            'GEMINI_API': [
                "Add Gemini API error handling and retries",
                "Implement fallback to other vision models",
                "Add request size validation and optimization"
            ],
            'FILE_PROCESSING': [
                "Enhance file download with retry logic",
                "Add file format validation before processing",
                "Implement temporary file cleanup mechanisms"
            ],
            'HALLUCINATION': [
                "Strengthen anti-hallucination prompts",
                "Force tool output usage over model reasoning",
                "Add response validation against tool outputs"
            ],
            'PARSING_ERROR': [
                "Improve output parsing with multiple regex patterns",
                "Add structured output validation",
                "Implement fallback parsing strategies"
            ]
        }

        suggestions = fixes.get(error_type, ["Investigate root cause and implement appropriate fix"])
        for suggestion in suggestions[:2]:  # Show top 2 suggestions
            print(f"     → {suggestion}")

        # Show example error if available
        if specific_errors:
            example = specific_errors[0]
            question_id = example.get('question_id', 'unknown')[:8]
            print(f"     Example: {question_id}... - {example.get('question_preview', '')[:50]}...")

    def generate_prompt_improvements(self):
        """Generate specific prompt improvement suggestions"""
        print(f"\n📝 PROMPT IMPROVEMENT SUGGESTIONS")
        print("=" * 50)

        detailed_results = self.results_data.get('detailed_results', [])
        failed_results = [r for r in detailed_results if r['status'] == 'error']

        if not failed_results:
            print("🎉 No failed results to analyze for prompt improvements!")
            return

        # Group failures by agent type
        failures_by_agent = defaultdict(list)
        for result in failed_results:
            failures_by_agent[result['agent_type']].append(result)

        for agent_type, failures in failures_by_agent.items():
            print(f"\n🎯 {agent_type.upper()} PROMPT IMPROVEMENTS:")

            # Analyze common failure patterns
            question_patterns = []
            for failure in failures:
                question = failure.get('question', '')
                if len(question) > 50:
                    question_patterns.append(question[:100] + "...")

            if agent_type == 'research':
                print(f"   - Add more specific Wikipedia search guidance")
                print(f"   - Strengthen temporal query parsing (e.g., 'as of July 2023')")
                print(f"   - Enhance data extraction and validation prompts")
            elif agent_type == 'multimedia':
                print(f"   - Improve video/audio analysis instructions")
                print(f"   - Add specific guidance for character dialogue extraction")
                print(f"   - Enhance image analysis with structured output requirements")
            elif agent_type == 'logic_math':
                print(f"   - Add step-by-step mathematical reasoning guidance")
                print(f"   - Strengthen calculation verification prompts")
                print(f"   - Improve pattern recognition instructions")
            elif agent_type == 'file_processing':
                print(f"   - Enhance Excel analysis with column filtering guidance")
                print(f"   - Add specific data aggregation instructions")
                print(f"   - Improve Python code execution safety prompts")

            # Show example failed questions
            if question_patterns:
                print(f"   Failed question examples:")
                for pattern in question_patterns[:2]:
                    print(f"     - {pattern}")

    def create_action_plan(self):
        """Create a prioritized action plan for improvements"""
        print(f"\n📋 PRIORITIZED ACTION PLAN")
        print("=" * 50)

        agent_performance = self.results_data.get('agent_performance', {})

        # Sort agents by success rate (lowest first - highest priority)
        sorted_agents = sorted(agent_performance.items(), key=lambda x: x[1]['success_rate'])

        print(f"Priority order (based on success rate):")
        for i, (agent_type, stats) in enumerate(sorted_agents, 1):
            success_rate = stats['success_rate']
            total_questions = stats['total_questions']

            print(f"\n{i}. {agent_type.upper()} AGENT (Success: {success_rate:.1f}%)")
            print(f"   Questions: {total_questions}")

            if success_rate < 70:
                print(f"   🔴 HIGH PRIORITY - Major improvements needed")
                print(f"   Actions: Review architecture, enhance tools, rewrite prompts")
            elif success_rate < 85:
                print(f"   🟡 MEDIUM PRIORITY - Targeted improvements")
                print(f"   Actions: Fix specific error patterns, optimize prompts")
            else:
                print(f"   🟢 LOW PRIORITY - Fine-tuning only")
                print(f"   Actions: Edge case handling, performance optimization")

        print(f"\n🔄 RECOMMENDED WORKFLOW:")
        print(f"1. Start with highest priority agent type")
        print(f"2. Implement suggested improvements")
        print(f"3. Re-test only that agent type: --agent-types {sorted_agents[0][0] if sorted_agents else 'unknown'}")
        print(f"4. Repeat until success rate > 85%")
        print(f"5. Move to next priority agent type")


def main():
    """Main CLI interface for results analysis"""
    parser = argparse.ArgumentParser(description="Analyze GAIA test results and generate improvement recommendations")
    parser.add_argument('results_file', help='Path to the test results JSON file')
    parser.add_argument('--detailed', action='store_true', help='Show detailed analysis including individual errors')

    args = parser.parse_args()

    if not Path(args.results_file).exists():
        print(f"❌ Results file not found: {args.results_file}")
        return

    analyzer = GAIAResultsAnalyzer(args.results_file)

    print("🔍 GAIA TEST RESULTS ANALYSIS")
    print("=" * 70)

    analyzer.analyze_overall_performance()
    analyzer.analyze_error_patterns()
    analyzer.generate_specific_improvements()
    analyzer.generate_prompt_improvements()
    analyzer.create_action_plan()

    print(f"\n✅ ANALYSIS COMPLETE!")
    print(f"💡 Use the action plan above to prioritize improvements")


if __name__ == "__main__":
    main()
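
# Example invocation (the script filename "analyze_results.py" is an assumed
# placeholder; only the positional results_file argument and the --detailed
# flag come from the argparse setup above, and --detailed is currently parsed
# but not otherwise used in this script):
#
#   python analyze_results.py gaia_test_results.json
#   python analyze_results.py gaia_test_results.json --detailed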