""" Évaluation complète MCP avec 9 modèles Inclut les modèles spécialisés MCP et les modèles généraux Test réaliste avec commandes RTS typiques """ import sys import os import json import time sys.path.append(os.path.dirname(os.path.abspath(__file__))) def test_model_comprehensive(model_path, model_name): """Test complet d'un modèle pour MCP""" try: from llama_cpp import Llama print(f"🧪 Test de {model_name}...") # Vérifier la taille du fichier file_size = os.path.getsize(model_path) / (1024*1024) print(f" 📏 Taille: {file_size:.0f} MB") # Initialiser avec timeout plus long pour les gros modèles llm = Llama( model_path=model_path, n_ctx=1024, n_threads=1, verbose=False, n_gpu_layers=0 # Forcer CPU pour comparaison juste ) # Commandes RTS réelles avec différents niveaux de complexité test_commands = [ { "name": "Commande simple", "command": "show game state", "expected": "get_game_state", "difficulty": "easy" }, { "name": "Action avec coordonnées", "command": "move infantry to 150,200", "expected": "move_units", "difficulty": "easy" }, { "name": "Attaque spécifique", "command": "attack enemy tank at position 300,150", "expected": "attack_unit", "difficulty": "medium" }, { "name": "Construction", "command": "build power plant near my base at 100,100", "expected": "build_building", "difficulty": "medium" }, { "name": "Commande complexe", "command": "defend base with all available units", "expected": "move_units", "difficulty": "hard" } ] results = [] total_score = 0 total_time = 0 for test in test_commands: prompt = f"""You are an AI assistant for an RTS game using MCP (Model Context Protocol). Available tools: - get_game_state() - move_units(unit_ids, target_x, target_y) - attack_unit(attacker_ids, target_id) - build_building(building_type, position_x, position_y) User command: "{test['command']}" Respond with JSON only: {{"tool": "tool_name", "args": {{}}}}""" start_time = time.time() try: response = llm( prompt, max_tokens=100, temperature=0.1, stop=["", "<|im_end|>", "```"] ) response_time = time.time() - start_time # Extraire la réponse try: response_text = response['choices'][0]['text'].strip() except: # Fallback pour différents formats de réponse if hasattr(response, 'get'): response_text = response.get('text', str(response)) else: response_text = str(response) # Évaluer la réponse score = evaluate_mcp_response(response_text, test) total_score += score total_time += response_time print(f" ✅ {test['name']}: {score}/10 ({response_time:.2f}s)") results.append({ 'test': test['name'], 'difficulty': test['difficulty'], 'score': score, 'time': response_time, 'response': response_text[:100] + "..." 
            except Exception as e:
                print(f"   ❌ {test['name']}: Error - {e}")
                results.append({
                    'test': test['name'],
                    'difficulty': test['difficulty'],
                    'score': 0,
                    'time': 0,
                    'error': str(e)
                })

        avg_score = total_score / len(test_commands)
        avg_time = total_time / len(test_commands)

        print(f"   📊 Average: {avg_score:.1f}/10 | Time: {avg_time:.2f}s")

        return {
            'name': model_name,
            'file_size_mb': file_size,
            'avg_score': avg_score,
            'avg_time': avg_time,
            'efficiency': avg_score / avg_time if avg_time > 0 else 0,
            'results': results
        }

    except Exception as e:
        print(f"❌ Critical error with {model_name}: {e}")
        return {
            'name': model_name,
            'error': str(e),
            'avg_score': 0,
            'avg_time': 0,
            'efficiency': 0
        }


def evaluate_mcp_response(response, test):
    """Standardized scoring of MCP responses."""
    if not response or response.strip() == "":
        return 0

    score = 0

    # Valid JSON (3 points)
    try:
        json.loads(response)
        score += 3
    except Exception:
        # Look for a JSON object embedded in the text
        json_match = re.search(r'\{[^}]*\}', response)
        if json_match:
            try:
                json.loads(json_match.group())
                score += 1
            except Exception:
                pass

    # Correct tool (3 points)
    expected_tool = test['expected']
    if expected_tool in response:
        score += 3

    # Appropriate parameters (2 points)
    if test['difficulty'] == 'easy':
        if '150,200' in response or 'game state' in response:
            score += 2
    elif test['difficulty'] == 'medium':
        if any(coord in response for coord in ['300,150', '100,100']):
            score += 2
    elif test['difficulty'] == 'hard':
        if 'units' in response and 'defend' in response:
            score += 2

    # Correct format (2 points)
    if 'tool' in response and 'args' in response:
        score += 2

    return min(score, 10)
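
# Worked example of the rubric above, using a hand-written reply (not an actual
# model output): a well-formed tool call for the "easy" move command scores the
# full 10 points.
#
#     evaluate_mcp_response(
#         '{"tool": "move_units", "args": {"target": "150,200"}}',
#         {'expected': 'move_units', 'difficulty': 'easy'}
#     )
#     # valid JSON (+3) + expected tool (+3) + '150,200' present (+2)
#     # + 'tool' and 'args' keys (+2) = 10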


def main():
    """Run the full evaluation across all models."""
    print("🚀 COMPREHENSIVE MCP EVALUATION - 9 MODELS")
    print("=" * 70)
    print("Testing general-purpose and MCP-specialized models")
    print("=" * 70)

    # All models to test
    models = [
        # General-purpose models (tested previously)
        {
            'name': 'Qwen2.5-0.5B',
            'path': 'qwen2.5-0.5b-instruct-q4_0.gguf',
            'type': 'general'
        },
        {
            'name': 'Qwen3-0.6B',
            'path': 'Qwen3-0.6B-Q8_0.gguf',
            'type': 'general'
        },
        {
            'name': 'Gemma-3-270M',
            'path': 'gemma-3-270m-it-qat-Q8_0.gguf',
            'type': 'general'
        },
        {
            'name': 'Qwen3-1.7B',
            'path': 'Qwen3-1.7B-Q4_0.gguf',
            'type': 'general'
        },
        # MCP-specialized models
        {
            'name': 'MCP-Instruct-v1',
            'path': 'mcp-instruct-v1.Q4_K_M.gguf',
            'type': 'mcp_specialized'
        },
        {
            'name': 'MCPRL-3B-Exa',
            'path': 'mcprl-3b-exa.Q2_K.gguf',
            'type': 'mcp_specialized'
        },
        {
            'name': 'Gemma-3n-E2B-it',
            'path': 'gemma-3n-E2B-it-UD-IQ2_XXS.gguf',
            'type': 'mcp_specialized'
        },
        {
            'name': 'Llama-Breeze2-3B',
            'path': 'Llama-Breeze2-3B-Instruct-Text.Q2_K.gguf',
            'type': 'general'
        },
        # Model specialized for code / structured output
        {
            'name': 'Qwen2.5-Coder-0.5B',
            'path': 'qwen2.5-coder-0.5b-instruct-q4_0.gguf',
            'type': 'code_specialized'
        }
    ]

    results = []

    for model in models:
        if os.path.exists(model['path']):
            result = test_model_comprehensive(model['path'], model['name'])
            result['type'] = model['type']
            results.append(result)
            print()
        else:
            print(f"❌ Model not found: {model['path']}")
            print()

    # Full analysis
    print("=" * 70)
    print("📊 FULL RESULTS")
    print("=" * 70)

    successful_results = [r for r in results if 'error' not in r and r['avg_score'] > 0]

    if successful_results:
        # Ranking by performance
        sorted_by_score = sorted(successful_results, key=lambda x: x['avg_score'], reverse=True)

        print("\n🏆 RANKING BY PERFORMANCE:")
        for i, result in enumerate(sorted_by_score, 1):
            print(f"   {i:2d}. {result['name']:20s} | {result['avg_score']:.1f}/10 | "
                  f"{result['avg_time']:.2f}s | {result['file_size_mb']:.0f}MB | {result['type']}")

        # Ranking by efficiency
        sorted_by_efficiency = sorted(successful_results, key=lambda x: x['efficiency'], reverse=True)

        print("\n⚡ RANKING BY EFFICIENCY:")
        for i, result in enumerate(sorted_by_efficiency, 1):
            print(f"   {i:2d}. {result['name']:20s} | {result['efficiency']:.2f} score/s | "
                  f"{result['file_size_mb']:.0f}MB")

        # Analysis by model type
        print("\n📈 ANALYSIS BY MODEL TYPE:")

        general_models = [r for r in successful_results if r['type'] == 'general']
        mcp_specialized = [r for r in successful_results if r['type'] == 'mcp_specialized']
        code_specialized = [r for r in successful_results if r['type'] == 'code_specialized']

        if general_models:
            avg_general = sum(r['avg_score'] for r in general_models) / len(general_models)
            print(f"   General models ({len(general_models)}): {avg_general:.1f}/10 average")

        if mcp_specialized:
            avg_mcp = sum(r['avg_score'] for r in mcp_specialized) / len(mcp_specialized)
            print(f"   MCP-specialized ({len(mcp_specialized)}): {avg_mcp:.1f}/10 average")

        if code_specialized:
            avg_code = sum(r['avg_score'] for r in code_specialized) / len(code_specialized)
            print(f"   Code-specialized ({len(code_specialized)}): {avg_code:.1f}/10 average")

        # Best model overall
        best = sorted_by_score[0]
        most_efficient = sorted_by_efficiency[0]

        print(f"\n🎯 BEST OVERALL MODEL: {best['name']}")
        print(f"   Score: {best['avg_score']:.1f}/10")
        print(f"   Time: {best['avg_time']:.2f}s")
        print(f"   Size: {best['file_size_mb']:.0f}MB")
        print(f"   Type: {best['type']}")

        print(f"\n⚡ MOST EFFICIENT MODEL: {most_efficient['name']}")
        print(f"   Efficiency: {most_efficient['efficiency']:.2f} score/s")

        # Final recommendations
        print("\n💡 FINAL RECOMMENDATIONS:")

        if best['avg_score'] >= 7:
            print(f"✅ {best['name']} is EXCELLENT for MCP production use")
        elif best['avg_score'] >= 5:
            print(f"👍 {best['name']} is GOOD for MCP production use")
        else:
            print(f"⚠️ {best['name']} needs further improvement")

        # Specialized vs general comparison
        if mcp_specialized and general_models:
            best_specialized = max(mcp_specialized, key=lambda x: x['avg_score'])
            best_general = max(general_models, key=lambda x: x['avg_score'])

            print("\n🔬 SPECIALIZED VS GENERAL:")
            print(f"   Best MCP-specialized: {best_specialized['name']} ({best_specialized['avg_score']:.1f}/10)")
            print(f"   Best general: {best_general['name']} ({best_general['avg_score']:.1f}/10)")

            if best_specialized['avg_score'] > best_general['avg_score']:
                print("   ✅ The MCP-specialized models come out ahead!")
            else:
                print("   🤔 The general models perform just as well")

        # Detailed breakdown for the best model
        print(f"\n📋 DETAILS FOR THE BEST MODEL ({best['name']}):")
        for result in best['results']:
            status = "✅" if result['score'] >= 6 else "⚠️" if result['score'] >= 4 else "❌"
            print(f"   {status} {result['test']}: {result['score']}/10 ({result['time']:.2f}s)")

    # Save full results
    comprehensive_results = {
        'evaluation_type': 'comprehensive_mcp_test',
        'total_models_tested': len(models),
        'successful_models': len(successful_results),
        'results': results,
        'ranking_by_score': sorted_by_score if successful_results else [],
        'ranking_by_efficiency': sorted_by_efficiency if successful_results else [],
        'best_overall': best if successful_results else None,
        'most_efficient': most_efficient if successful_results else None
    }

    with open("comprehensive_mcp_evaluation.json", "w", encoding="utf-8") as f:
        json.dump(comprehensive_results, f, indent=2, ensure_ascii=False)

    print("\n📄 Full results saved to: comprehensive_mcp_evaluation.json")


if __name__ == "__main__":
    main()
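
# Run the script directly to print the report and write comprehensive_mcp_evaluation.json
# next to it. A minimal sketch for inspecting the saved results afterwards (assumes the
# evaluation completed with at least one successful model):
#
#     import json
#     with open("comprehensive_mcp_evaluation.json", encoding="utf-8") as f:
#         data = json.load(f)
#     print(data["best_overall"]["name"], data["best_overall"]["avg_score"])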