| """ | |
| Test script to evaluate Qwen2.5 0.5B capabilities with MCP for RTS game | |
| Tests basic MCP understanding and complex planning capabilities | |
| """ | |
| import json | |
| import time | |
| from typing import Dict, Any, List | |
| from ai_analysis import query_llm | |
# Test prompts for different capability levels
TEST_PROMPTS = {
    "basic_mcp": [
        {
            "name": "Simple Tool Call",
            "prompt": """
You are an AI assistant controlling an RTS game through MCP (Model Context Protocol).
Available tools:
- get_game_state(): Get current game state
- move_units(unit_ids, target_x, target_y): Move units to position
- attack_unit(attacker_ids, target_id): Attack enemy unit
User command: "Show me the current game state"
Respond with a JSON object containing the tool call to execute.
""",
            "expected": {"tool": "get_game_state", "args": {}}
        },
        {
            "name": "Basic Action Translation",
            "prompt": """
You are an AI assistant controlling an RTS game through MCP.
Available tools:
- move_units(unit_ids, target_x, target_y): Move units to position
- attack_unit(attacker_ids, target_id): Attack enemy unit
User command: "Move my infantry to position 100, 200"
Respond with a JSON object containing the tool call to execute.
""",
            "expected": {"tool": "move_units", "args": {"unit_ids": ["infantry"], "target_x": 100, "target_y": 200}}
        }
    ],
    "complex_planning": [
        {
            "name": "Multi-step Strategy",
            "prompt": """
You are an AI assistant controlling an RTS game through MCP.
Available tools:
- get_game_state(): Get current game state
- move_units(unit_ids, target_x, target_y): Move units
- attack_unit(attacker_ids, target_id): Attack enemy
- build_building(building_type, position_x, position_y, player_id): Build building
User command: "I want to build a base near the ore field and defend it with turrets"
Break this down into a sequence of MCP tool calls. Respond with a JSON array of tool calls.
""",
            "expected": {"type": "sequence", "steps": ["get_game_state", "build_building", "build_building"]}
        },
        {
            "name": "Strategic Analysis",
            "prompt": """
You are an AI assistant controlling an RTS game through MCP.
Available tools:
- get_game_state(): Get current game state
- get_ai_analysis(language): Get tactical analysis
User command: "Analyze the battlefield and suggest the best strategy"
Respond with a JSON object containing the tool calls needed.
""",
            "expected": {"type": "analysis", "steps": ["get_game_state", "get_ai_analysis"]}
        }
    ],
    "advanced_mcp": [
        {
            "name": "Parameter Extraction",
            "prompt": """
You are an AI assistant controlling an RTS game through MCP.
Available tools:
- move_units(unit_ids, target_x, target_y): Move units
User command: "Move tanks 1, 3, and 7 to coordinates 150, 75"
Extract the parameters and respond with a JSON tool call.
""",
            "expected": {"tool": "move_units", "args": {"unit_ids": [1, 3, 7], "target_x": 150, "target_y": 75}}
        },
        {
            "name": "Error Handling",
            "prompt": """
You are an AI assistant controlling an RTS game through MCP.
Available tools:
- move_units(unit_ids, target_x, target_y): Move units
User command: "Move my units to the enemy base"
Since you don't know the exact coordinates, how would you handle this?
Respond with a JSON object showing your approach.
""",
            "expected": {"type": "needs_clarification", "message": "Need coordinates for enemy base"}
        }
    ]
}
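
# For reference, a well-formed model reply to the "Simple Tool Call" test would
# be a bare JSON object matching that test's "expected" entry, e.g.:
#   {"tool": "get_game_state", "args": {}}
# Note that any surrounding prose or markdown fencing will fail the strict
# json.loads() check in analyze_response() below.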
def test_qwen_capabilities():
    """Run comprehensive tests of Qwen2.5 0.5B's MCP capabilities"""
    print("=== Testing Qwen2.5 0.5B MCP Capabilities ===\n")
    results = {}
    for category, tests in TEST_PROMPTS.items():
        print(f"\n🔍 Testing {category.replace('_', ' ').title()}:")
        print("-" * 50)
        category_results = []
        for test in tests:
            print(f"\n🧪 Test: {test['name']}")
            print(f"Prompt: {test['prompt'][:100]}...")
            try:
                # Query the LLM
                start_time = time.time()
                response = query_llm(
                    prompt=test['prompt'],
                    max_tokens=500,
                    temperature=0.1,
                    system_message="You are an AI assistant that responds with JSON objects for MCP tool calls."
                )
                response_time = time.time() - start_time
                # Parse and analyze the response
                analysis = analyze_response(test, response, response_time)
                category_results.append(analysis)
                print(f"✅ Response time: {response_time:.2f}s")
                print(f"📝 Response: {response[:200]}...")
                print(f"📊 Analysis: {analysis['score']}/10")
            except Exception as e:
                print(f"❌ Error: {e}")
                category_results.append({
                    'test': test['name'],
                    'error': str(e),
                    'score': 0
                })
        results[category] = category_results
    # Generate summary report
    generate_summary_report(results)
    return results
def analyze_response(test: Dict[str, Any], response: str, response_time: float) -> Dict[str, Any]:
    """Analyze the LLM response and score its performance"""
    analysis = {
        'test': test['name'],
        'response': response,
        'response_time': response_time,
        'score': 0,
        'strengths': [],
        'weaknesses': [],
        'details': {}
    }
    # Basic response quality checks
    if not response or response.strip() == "":
        analysis['weaknesses'].append("Empty response")
        return analysis
    # Check for JSON structure
    try:
        # Try to parse as JSON
        parsed = json.loads(response)
        analysis['details']['json_valid'] = True
        analysis['strengths'].append("Valid JSON structure")
        # Check if it matches the expected structure; guard against non-dict
        # payloads (bare arrays, numbers, strings), which `in` would mishandle
        if 'expected' in test and isinstance(parsed, dict):
            expected = test['expected']
            if isinstance(expected, dict):
                if 'tool' in expected and 'tool' in parsed:
                    if parsed['tool'] == expected['tool']:
                        analysis['score'] += 4
                        analysis['strengths'].append("Correct tool selection")
                    else:
                        analysis['weaknesses'].append(f"Wrong tool: {parsed.get('tool')} vs {expected['tool']}")
                # Check arguments
                if 'args' in expected and 'args' in parsed:
                    arg_match = compare_arguments(parsed['args'], expected['args'])
                    analysis['score'] += arg_match * 3
                    if arg_match > 0.7:
                        analysis['strengths'].append("Good argument matching")
                    else:
                        analysis['weaknesses'].append("Poor argument matching")
    except json.JSONDecodeError:
        analysis['details']['json_valid'] = False
        analysis['weaknesses'].append("Invalid JSON format")
        # Check for tool-like patterns in the raw text
        if 'get_game_state' in response:
            analysis['score'] += 2
            analysis['strengths'].append("Mentions correct tool")
        if 'move_units' in response or 'attack_unit' in response:
            analysis['score'] += 1
    # Response time scoring
    if response_time < 5.0:
        analysis['score'] += 1
        analysis['strengths'].append("Fast response")
    elif response_time > 15.0:
        analysis['weaknesses'].append("Slow response")
    # Content relevance scoring
    if any(keyword in response.lower() for keyword in ['game', 'state', 'move', 'attack', 'build']):
        analysis['score'] += 1
        analysis['strengths'].append("Relevant content")
    # Cap the score at 10
    analysis['score'] = min(analysis['score'], 10)
    return analysis
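
# Small instruct models frequently wrap their JSON in markdown fences or add
# prose around it, which makes the strict json.loads() call above fail. The
# helper below is a hypothetical sketch of one way to salvage such replies
# before scoring; it is NOT called by analyze_response() and is included only
# as a possible extension point.
def extract_json_candidate(response: str):
    """Best-effort extraction of a JSON object or array from a model reply."""
    import re
    # Prefer the contents of a ```json ... ``` fence if one is present
    fenced = re.search(r"```(?:json)?\s*(.*?)```", response, re.DOTALL)
    text = fenced.group(1) if fenced else response
    # Fall back to the outermost {...} or [...] span in the text
    match = re.search(r"(\{.*\}|\[.*\])", text, re.DOTALL)
    if match:
        try:
            return json.loads(match.group(1))
        except json.JSONDecodeError:
            return None
    return None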
def compare_arguments(actual: Dict, expected: Dict) -> float:
    """Compare argument dictionaries and return the fraction of matching keys (0.0-1.0)"""
    if not actual or not expected:
        return 0.0
    matches = 0
    total = len(expected)
    for key, expected_value in expected.items():
        if key in actual:
            actual_value = actual[key]
            if isinstance(expected_value, list) and isinstance(actual_value, list):
                # Compare lists order-insensitively
                if set(expected_value) == set(actual_value):
                    matches += 1
            elif expected_value == actual_value:
                matches += 1
    return matches / total if total > 0 else 0.0
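
# For example, against the "Parameter Extraction" test's expected args, a reply
# of {"unit_ids": [1, 3, 7], "target_x": 150, "target_y": 75} scores 1.0, while
# {"unit_ids": [1, 3], "target_x": 150, "target_y": 75} scores ~0.67: two of
# the three expected keys match, and the list comparison ignores element order
# but requires the same members.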
def generate_summary_report(results: Dict[str, List[Dict]]):
    """Generate a comprehensive summary report"""
    print("\n" + "=" * 60)
    print("📊 QWEN2.5 0.5B MCP CAPABILITY ASSESSMENT REPORT")
    print("=" * 60)
    overall_scores = []
    for category, category_results in results.items():
        if not category_results:
            continue
        category_scores = [r.get('score', 0) for r in category_results if 'score' in r]
        avg_score = sum(category_scores) / len(category_scores) if category_scores else 0
        overall_scores.append(avg_score)
        print(f"\n📋 {category.replace('_', ' ').title()}:")
        print(f"   Average Score: {avg_score:.1f}/10")
        for result in category_results:
            if 'error' in result:
                print(f"   ❌ {result['test']}: ERROR - {result['error']}")
            else:
                print(f"   {'✅' if result['score'] >= 6 else '⚠️'} {result['test']}: {result['score']}/10")
                if result['strengths']:
                    print(f"      Strengths: {', '.join(result['strengths'][:2])}")
                if result['weaknesses']:
                    print(f"      Weaknesses: {', '.join(result['weaknesses'][:2])}")
    # Overall assessment
    if overall_scores:
        overall_avg = sum(overall_scores) / len(overall_scores)
        print(f"\n🎯 OVERALL ASSESSMENT: {overall_avg:.1f}/10")
        if overall_avg >= 8:
            print("💪 EXCELLENT - Qwen2.5 0.5B is highly capable for MCP tasks")
        elif overall_avg >= 6:
            print("👍 GOOD - Qwen2.5 0.5B is capable with some limitations")
        elif overall_avg >= 4:
            print("⚠️ MODERATE - Qwen2.5 0.5B has significant limitations")
        else:
            print("❌ POOR - Qwen2.5 0.5B is not suitable for MCP tasks")
        # Recommendations
        print("\n💡 RECOMMENDATIONS:")
        if overall_avg >= 7:
            print("- Use Qwen2.5 0.5B for MCP translation with confidence")
            print("- Implement prompt engineering for complex tasks")
            print("- Add a validation layer for safety")
        elif overall_avg >= 5:
            print("- Use Qwen2.5 0.5B for simple MCP tasks")
            print("- Implement strong validation and fallback mechanisms")
            print("- Consider using larger models for complex planning")
        else:
            print("- Consider upgrading to a larger model (1.5B+)")
            print("- Use Qwen2.5 0.5B only for very simple translations")
            print("- Implement extensive error handling")
if __name__ == "__main__":
    print("Starting Qwen2.5 0.5B MCP capability assessment...")
    print("This will test the model's ability to translate user commands into MCP tool calls.")
    print("Make sure the model is downloaded and available at: qwen2.5-0.5b-instruct-q4_0.gguf")
    try:
        results = test_qwen_capabilities()
        # Save detailed results to file
        with open("/home/luigi/rts/web/qwen_mcp_assessment.json", "w") as f:
            json.dump(results, f, indent=2)
        print("\n📁 Detailed results saved to: qwen_mcp_assessment.json")
    except Exception as e:
        print(f"❌ Assessment failed: {e}")
        print("Make sure the AI model is properly downloaded and configured.")