#!/usr/bin/env python3
"""
Analyze GAIA test results and generate specific improvement recommendations
"""

import json
import argparse
from pathlib import Path
from collections import defaultdict, Counter
from typing import Dict, List, Optional
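
# NOTE (sketch, not authoritative): the structure of the results JSON below is
# inferred from the fields this script reads; the values are purely illustrative.
#
# {
#   "overall_stats": {"total_questions": 20, "success_rate": 85.0,
#                     "successful": 17, "errors": 3},
#   "agent_performance": {
#     "research": {"success_rate": 80.0, "successful": 4,
#                  "total_questions": 5, "average_solve_time": 12.3}
#   },
#   "error_patterns": {
#     "research": [{"error_type": "TIMEOUT", "question_id": "abc12345",
#                   "question_preview": "..."}]
#   },
#   "detailed_results": [
#     {"status": "error", "agent_type": "research", "question": "..."}
#   ]
# }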


class GAIAResultsAnalyzer:
    """Analyze test results and generate actionable improvement recommendations"""

    def __init__(self, results_file: str):
        self.results_file = results_file
        self.results_data = self.load_results()

    def load_results(self) -> Dict:
        """Load test results from JSON file"""
        try:
            with open(self.results_file, 'r') as f:
                return json.load(f)
        except FileNotFoundError:
            print(f"❌ Results file not found: {self.results_file}")
            return {}
        except json.JSONDecodeError:
            print(f"❌ Invalid JSON in results file: {self.results_file}")
            return {}

    def analyze_overall_performance(self):
        """Analyze overall testing performance"""
        if not self.results_data:
            return

        print("📊 OVERALL PERFORMANCE ANALYSIS")
        print("=" * 50)

        overall_stats = self.results_data.get('overall_stats', {})
        agent_performance = self.results_data.get('agent_performance', {})

        print(f"Total Questions: {overall_stats.get('total_questions', 0)}")
        print(f"Success Rate: {overall_stats.get('success_rate', 0):.1f}%")
        print(f"Successful: {overall_stats.get('successful', 0)}")
        print(f"Errors: {overall_stats.get('errors', 0)}")

        print(f"\n🎯 AGENT PERFORMANCE BREAKDOWN:")
        for agent_type, stats in sorted(agent_performance.items(), key=lambda x: x[1]['success_rate'], reverse=True):
            success_rate = stats['success_rate']
            status_emoji = "🟢" if success_rate >= 90 else "🟡" if success_rate >= 70 else "🔴"
            print(f"  {status_emoji} {agent_type}: {success_rate:.1f}% ({stats['successful']}/{stats['total_questions']})")
            if stats['average_solve_time'] > 0:
                print(f"     Average Time: {stats['average_solve_time']:.1f}s")

    def analyze_error_patterns(self):
        """Analyze error patterns across all agent types"""
        print(f"\n🔍 ERROR PATTERN ANALYSIS")
        print("=" * 50)

        error_patterns = self.results_data.get('error_patterns', {})
        if not error_patterns:
            print("🎉 No error patterns found!")
            return

        # Aggregate error types across all agents
        all_error_types = Counter()
        for agent_type, errors in error_patterns.items():
            print(f"\n🚨 {agent_type.upper()} ERRORS:")
            agent_error_types = Counter()
            for error in errors:
                error_type = error.get('error_type', 'UNKNOWN')
                agent_error_types[error_type] += 1
                all_error_types[error_type] += 1
            for error_type, count in agent_error_types.most_common():
                print(f"  - {error_type}: {count} occurrences")

        print(f"\n📈 MOST COMMON ERROR TYPES (All Agents):")
        for error_type, count in all_error_types.most_common(5):
            print(f"  {count}× {error_type}")

    def generate_specific_improvements(self):
        """Generate specific, actionable improvement recommendations"""
        print(f"\n💡 SPECIFIC IMPROVEMENT RECOMMENDATIONS")
        print("=" * 50)

        agent_performance = self.results_data.get('agent_performance', {})
        error_patterns = self.results_data.get('error_patterns', {})
        detailed_results = self.results_data.get('detailed_results', [])

        # Analyze each agent type
        for agent_type, stats in agent_performance.items():
            success_rate = stats['success_rate']
            print(f"\n🎯 {agent_type.upper()} AGENT IMPROVEMENTS:")

            if success_rate >= 95:
                print(f"   ✅ Excellent performance! Focus on optimization:")
                print(f"   - Fine-tune prompts for edge cases")
                print(f"   - Optimize solve time (current: {stats.get('average_solve_time', 0):.1f}s)")
            elif success_rate >= 80:
                print(f"   🟡 Good performance with improvement opportunities:")
                self.suggest_improvements_for_agent(agent_type, error_patterns.get(agent_type, []), detailed_results)
            elif success_rate >= 60:
                print(f"   🟠 Moderate performance - needs attention:")
                self.suggest_improvements_for_agent(agent_type, error_patterns.get(agent_type, []), detailed_results)
                print(f"   - Consider prompt engineering review")
                print(f"   - Add more robust error handling")
            else:
                print(f"   🔴 Poor performance - requires major overhaul:")
                self.suggest_improvements_for_agent(agent_type, error_patterns.get(agent_type, []), detailed_results)
                print(f"   - Review agent architecture and tool selection")
                print(f"   - Consider multi-agent coordination")
                print(f"   - Implement comprehensive testing for this agent type")

    def suggest_improvements_for_agent(self, agent_type: str, errors: List[Dict], all_results: List[Dict]):
        """Generate specific improvement suggestions for an agent type"""
        if not errors:
            print(f"   - No specific errors to address")
            return

        # Analyze error types for this agent
        error_type_counts = Counter()
        specific_errors = defaultdict(list)
        for error in errors:
            error_type = error.get('error_type', 'UNKNOWN')
            error_type_counts[error_type] += 1
            specific_errors[error_type].append(error)

        # Generate specific fixes for top error types
        for error_type, count in error_type_counts.most_common(3):
            print(f"   - Fix {error_type} errors ({count} occurrences):")
            self.suggest_fix_for_error_type(error_type, specific_errors[error_type])

    def suggest_fix_for_error_type(self, error_type: str, specific_errors: List[Dict]):
        """Suggest specific fixes for error types with examples"""
        fixes = {
            'API_OVERLOAD': [
                "Implement exponential backoff with retry logic",
                "Add multiple API endpoint fallbacks",
                "Implement request queuing and rate limiting"
            ],
            'TIMEOUT': [
                "Increase timeout limits in API calls",
                "Implement progress tracking for long operations",
                "Break down complex operations into smaller steps"
            ],
            'AUTHENTICATION': [
                "Verify all API keys are correctly configured",
                "Add API key validation at startup",
                "Implement automatic token refresh mechanisms"
            ],
            'WIKIPEDIA_TOOL': [
                "Enhance Wikipedia search with multiple search strategies",
                "Add fallback to direct HTTP requests",
                "Improve article name parsing and disambiguation"
            ],
            'CHESS_TOOL': [
                "Enhance FEN notation validation and correction",
                "Add multiple chess engine backends",
                "Implement position verification with multiple tools"
            ],
            'EXCEL_TOOL': [
                "Add support for more Excel formats (.xlsb, .csv)",
                "Implement better column detection algorithms",
                "Add data validation and error recovery"
            ],
            'VIDEO_TOOL': [
                "Implement video size and duration limits",
                "Add fallback to frame-only analysis",
                "Improve audio extraction and transcription"
            ],
            'GEMINI_API': [
                "Add Gemini API error handling and retries",
                "Implement fallback to other vision models",
                "Add request size validation and optimization"
            ],
            'FILE_PROCESSING': [
                "Enhance file download with retry logic",
                "Add file format validation before processing",
                "Implement temporary file cleanup mechanisms"
            ],
            'HALLUCINATION': [
                "Strengthen anti-hallucination prompts",
                "Force tool output usage over model reasoning",
                "Add response validation against tool outputs"
            ],
            'PARSING_ERROR': [
                "Improve output parsing with multiple regex patterns",
                "Add structured output validation",
                "Implement fallback parsing strategies"
            ]
        }

        suggestions = fixes.get(error_type, ["Investigate root cause and implement appropriate fix"])
        for suggestion in suggestions[:2]:  # Show top 2 suggestions
            print(f"     → {suggestion}")

        # Show example error if available
        if specific_errors:
            example = specific_errors[0]
            question_id = example.get('question_id', 'unknown')[:8]
            print(f"     Example: {question_id}... - {example.get('question_preview', '')[:50]}...")

    def generate_prompt_improvements(self):
        """Generate specific prompt improvement suggestions"""
        print(f"\n📝 PROMPT IMPROVEMENT SUGGESTIONS")
        print("=" * 50)

        detailed_results = self.results_data.get('detailed_results', [])
        failed_results = [r for r in detailed_results if r['status'] == 'error']

        if not failed_results:
            print("🎉 No failed results to analyze for prompt improvements!")
            return

        # Group failures by agent type
        failures_by_agent = defaultdict(list)
        for result in failed_results:
            failures_by_agent[result['agent_type']].append(result)

        for agent_type, failures in failures_by_agent.items():
            print(f"\n🎯 {agent_type.upper()} PROMPT IMPROVEMENTS:")

            # Analyze common failure patterns
            question_patterns = []
            for failure in failures:
                question = failure.get('question', '')
                if len(question) > 50:
                    question_patterns.append(question[:100] + "...")

            if agent_type == 'research':
                print(f"   - Add more specific Wikipedia search guidance")
                print(f"   - Strengthen temporal query parsing (e.g., 'as of July 2023')")
                print(f"   - Enhance data extraction and validation prompts")
            elif agent_type == 'multimedia':
                print(f"   - Improve video/audio analysis instructions")
                print(f"   - Add specific guidance for character dialogue extraction")
                print(f"   - Enhance image analysis with structured output requirements")
            elif agent_type == 'logic_math':
                print(f"   - Add step-by-step mathematical reasoning guidance")
                print(f"   - Strengthen calculation verification prompts")
                print(f"   - Improve pattern recognition instructions")
            elif agent_type == 'file_processing':
                print(f"   - Enhance Excel analysis with column filtering guidance")
                print(f"   - Add specific data aggregation instructions")
                print(f"   - Improve Python code execution safety prompts")

            # Show example failed questions
            if question_patterns:
                print(f"   Failed question examples:")
                for pattern in question_patterns[:2]:
                    print(f"     - {pattern}")

    def create_action_plan(self):
        """Create a prioritized action plan for improvements"""
        print(f"\n📋 PRIORITIZED ACTION PLAN")
        print("=" * 50)

        agent_performance = self.results_data.get('agent_performance', {})

        # Sort agents by success rate (lowest first - highest priority)
        sorted_agents = sorted(agent_performance.items(), key=lambda x: x[1]['success_rate'])

        print(f"Priority order (based on success rate):")
        for i, (agent_type, stats) in enumerate(sorted_agents, 1):
            success_rate = stats['success_rate']
            total_questions = stats['total_questions']

            print(f"\n{i}. {agent_type.upper()} AGENT (Success: {success_rate:.1f}%)")
            print(f"   Questions: {total_questions}")

            if success_rate < 70:
                print(f"   🔴 HIGH PRIORITY - Major improvements needed")
                print(f"   Actions: Review architecture, enhance tools, rewrite prompts")
            elif success_rate < 85:
                print(f"   🟡 MEDIUM PRIORITY - Targeted improvements")
                print(f"   Actions: Fix specific error patterns, optimize prompts")
            else:
                print(f"   🟢 LOW PRIORITY - Fine-tuning only")
                print(f"   Actions: Edge case handling, performance optimization")

        print(f"\n🔄 RECOMMENDED WORKFLOW:")
        print(f"1. Start with highest priority agent type")
        print(f"2. Implement suggested improvements")
        print(f"3. Re-test only that agent type: --agent-types {sorted_agents[0][0] if sorted_agents else 'unknown'}")
        print(f"4. Repeat until success rate > 85%")
        print(f"5. Move to next priority agent type")


def main():
    """Main CLI interface for results analysis"""
    parser = argparse.ArgumentParser(description="Analyze GAIA test results and generate improvement recommendations")
    parser.add_argument('results_file', help='Path to the test results JSON file')
    parser.add_argument('--detailed', action='store_true', help='Show detailed analysis including individual errors')

    args = parser.parse_args()

    if not Path(args.results_file).exists():
        print(f"❌ Results file not found: {args.results_file}")
        return

    analyzer = GAIAResultsAnalyzer(args.results_file)

    print("🔍 GAIA TEST RESULTS ANALYSIS")
    print("=" * 70)

    analyzer.analyze_overall_performance()
    analyzer.analyze_error_patterns()
    analyzer.generate_specific_improvements()
    analyzer.generate_prompt_improvements()
    analyzer.create_action_plan()

    print(f"\n✅ ANALYSIS COMPLETE!")
    print(f"💡 Use the action plan above to prioritize improvements")


if __name__ == "__main__":
    main()
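
# Example invocation (the script filename "analyze_results.py" is an assumed
# placeholder; only the positional results_file argument and the --detailed
# flag come from the argparse setup above, and --detailed is currently parsed
# but not otherwise used in this script):
#
#   python analyze_results.py gaia_test_results.json
#   python analyze_results.py gaia_test_results.json --detailed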