| """ | |
| Évaluation complète MCP avec 9 modèles | |
| Inclut les modèles spécialisés MCP et les modèles généraux | |
| Test réaliste avec commandes RTS typiques | |
| """ | |
| import sys | |
| import os | |
| import json | |
| import time | |
| sys.path.append(os.path.dirname(os.path.abspath(__file__))) | |
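
# llama_cpp (llama-cpp-python) is imported lazily inside test_model_comprehensive()
# so a missing or broken installation is reported per model rather than crashing
# the whole script at import time.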


def test_model_comprehensive(model_path, model_name):
    """Run the full MCP test suite against a single model."""
    try:
        from llama_cpp import Llama

        print(f"🧪 Testing {model_name}...")

        # Check the file size
        file_size = os.path.getsize(model_path) / (1024 * 1024)
        print(f" 📏 Size: {file_size:.0f} MB")

        # Initialize the model (larger files simply take longer to load)
        llm = Llama(
            model_path=model_path,
            n_ctx=1024,
            n_threads=1,
            verbose=False,
            n_gpu_layers=0  # Force CPU for a fair comparison
        )
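
        # n_ctx=1024 is more than enough for these short tool-calling prompts, and a
        # single CPU thread keeps memory low and per-model timings comparable.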

        # Real RTS commands at different levels of complexity
        test_commands = [
            {
                "name": "Simple command",
                "command": "show game state",
                "expected": "get_game_state",
                "difficulty": "easy"
            },
            {
                "name": "Action with coordinates",
                "command": "move infantry to 150,200",
                "expected": "move_units",
                "difficulty": "easy"
            },
            {
                "name": "Targeted attack",
                "command": "attack enemy tank at position 300,150",
                "expected": "attack_unit",
                "difficulty": "medium"
            },
            {
                "name": "Construction",
                "command": "build power plant near my base at 100,100",
                "expected": "build_building",
                "difficulty": "medium"
            },
            {
                "name": "Complex command",
                "command": "defend base with all available units",
                "expected": "move_units",
                "difficulty": "hard"
            }
        ]
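
        # Each test case pairs a natural-language command with the MCP tool the model
        # is expected to choose; "difficulty" selects which parameter check
        # evaluate_mcp_response() applies.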

        results = []
        total_score = 0
        total_time = 0

        for test in test_commands:
            prompt = f"""You are an AI assistant for an RTS game using MCP (Model Context Protocol).
Available tools:
- get_game_state()
- move_units(unit_ids, target_x, target_y)
- attack_unit(attacker_ids, target_id)
- build_building(building_type, position_x, position_y)
User command: "{test['command']}"
Respond with JSON only: {{"tool": "tool_name", "args": {{}}}}"""

            start_time = time.time()
            try:
                response = llm(
                    prompt,
                    max_tokens=100,
                    temperature=0.1,
                    stop=["</s>", "<|im_end|>", "```"]
                )
                response_time = time.time() - start_time

                # Extract the generated text
                try:
                    response_text = response['choices'][0]['text'].strip()
                except Exception:
                    # Fallback for other response formats
                    if hasattr(response, 'get'):
                        response_text = response.get('text', str(response))
                    else:
                        response_text = str(response)

                # Score the response
                score = evaluate_mcp_response(response_text, test)
                total_score += score
                total_time += response_time

                print(f" ✅ {test['name']}: {score}/10 ({response_time:.2f}s)")
                results.append({
                    'test': test['name'],
                    'difficulty': test['difficulty'],
                    'score': score,
                    'time': response_time,
                    'response': response_text[:100] + "..." if len(response_text) > 100 else response_text
                })
            except Exception as e:
                print(f" ❌ {test['name']}: Error - {e}")
                results.append({
                    'test': test['name'],
                    'difficulty': test['difficulty'],
                    'score': 0,
                    'time': 0,
                    'error': str(e)
                })

        avg_score = total_score / len(test_commands)
        avg_time = total_time / len(test_commands)
        print(f" 📊 Average: {avg_score:.1f}/10 | Time: {avg_time:.2f}s")

        return {
            'name': model_name,
            'file_size_mb': file_size,
            'avg_score': avg_score,
            'avg_time': avg_time,
            'efficiency': avg_score / avg_time if avg_time > 0 else 0,
            'results': results
        }

    except Exception as e:
        print(f"❌ Critical error with {model_name}: {e}")
        return {
            'name': model_name,
            'error': str(e),
            'avg_score': 0,
            'avg_time': 0,
            'efficiency': 0
        }
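

# Example standalone use (assumes one of the GGUF files listed in main() is present
# in the working directory):
#   result = test_model_comprehensive("qwen2.5-0.5b-instruct-q4_0.gguf", "Qwen2.5-0.5B")
#   print(result["avg_score"], result["avg_time"], result["efficiency"])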


def evaluate_mcp_response(response, test):
    """Standardized scoring of an MCP tool-call response (0-10)."""
    if not response or response.strip() == "":
        return 0

    score = 0

    # Valid JSON (3 points)
    try:
        json.loads(response)
        score += 3
    except Exception:
        # Look for a JSON object embedded in the text (partial credit)
        json_match = re.search(r'\{.*\}', response, re.DOTALL)
        if json_match:
            try:
                json.loads(json_match.group())
                score += 1
            except Exception:
                pass

    # Correct tool (3 points)
    expected_tool = test['expected']
    if expected_tool in response:
        score += 3

    # Appropriate parameters (2 points)
    if test['difficulty'] == 'easy':
        if '150,200' in response or 'game state' in response:
            score += 2
    elif test['difficulty'] == 'medium':
        if any(coord in response for coord in ['300,150', '100,100']):
            score += 2
    elif test['difficulty'] == 'hard':
        if 'units' in response and 'defend' in response:
            score += 2

    # Correct format (2 points)
    if 'tool' in response and 'args' in response:
        score += 2

    return min(score, 10)
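
# Under this rubric, a response such as
#   {"tool": "move_units", "args": {"target": "150,200"}}
# to "move infantry to 150,200" scores 10/10: valid JSON (+3), expected tool name
# present (+3), the expected coordinates appear in the text (+2), and both "tool"
# and "args" are present (+2).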


def main():
    """Run the comprehensive evaluation across all models."""
    print("🚀 COMPREHENSIVE MCP EVALUATION - 9 MODELS")
    print("=" * 70)
    print("Testing general-purpose and MCP-specialized models")
    print("=" * 70)

    # All models to test
    models = [
        # General-purpose models (tested previously)
        {
            'name': 'Qwen2.5-0.5B',
            'path': 'qwen2.5-0.5b-instruct-q4_0.gguf',
            'type': 'general'
        },
        {
            'name': 'Qwen3-0.6B',
            'path': 'Qwen3-0.6B-Q8_0.gguf',
            'type': 'general'
        },
        {
            'name': 'Gemma-3-270M',
            'path': 'gemma-3-270m-it-qat-Q8_0.gguf',
            'type': 'general'
        },
        {
            'name': 'Qwen3-1.7B',
            'path': 'Qwen3-1.7B-Q4_0.gguf',
            'type': 'general'
        },
        # MCP-specialized models
        {
            'name': 'MCP-Instruct-v1',
            'path': 'mcp-instruct-v1.Q4_K_M.gguf',
            'type': 'mcp_specialized'
        },
        {
            'name': 'MCPRL-3B-Exa',
            'path': 'mcprl-3b-exa.Q2_K.gguf',
            'type': 'mcp_specialized'
        },
        {
            'name': 'Gemma-3n-E2B-it',
            'path': 'gemma-3n-E2B-it-UD-IQ2_XXS.gguf',
            'type': 'mcp_specialized'
        },
        {
            'name': 'Llama-Breeze2-3B',
            'path': 'Llama-Breeze2-3B-Instruct-Text.Q2_K.gguf',
            'type': 'general'
        },
        # Code/structured-output specialist
        {
            'name': 'Qwen2.5-Coder-0.5B',
            'path': 'qwen2.5-coder-0.5b-instruct-q4_0.gguf',
            'type': 'code_specialized'
        }
    ]
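
    # The GGUF files are expected in the current working directory; any model whose
    # file is missing is reported and skipped.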

    results = []
    for model in models:
        if os.path.exists(model['path']):
            result = test_model_comprehensive(model['path'], model['name'])
            result['type'] = model['type']
            results.append(result)
            print()
        else:
            print(f"❌ Model not found: {model['path']}")
            print()

    # Full analysis
    print("=" * 70)
    print("📊 FULL RESULTS")
    print("=" * 70)

    successful_results = [r for r in results if 'error' not in r and r['avg_score'] > 0]

    if successful_results:
        # Ranking by performance
        sorted_by_score = sorted(successful_results, key=lambda x: x['avg_score'], reverse=True)
        print(f"\n🏆 RANKING BY PERFORMANCE:")
        for i, result in enumerate(sorted_by_score, 1):
            print(f" {i:2d}. {result['name']:20s} | {result['avg_score']:.1f}/10 | {result['avg_time']:.2f}s | {result['file_size_mb']:.0f}MB | {result['type']}")

        # Ranking by efficiency
        sorted_by_efficiency = sorted(successful_results, key=lambda x: x['efficiency'], reverse=True)
        print(f"\n⚡ RANKING BY EFFICIENCY:")
        for i, result in enumerate(sorted_by_efficiency, 1):
            print(f" {i:2d}. {result['name']:20s} | {result['efficiency']:.2f} score/s | {result['file_size_mb']:.0f}MB")

        # Breakdown by model type
        print(f"\n📈 ANALYSIS BY MODEL TYPE:")
        general_models = [r for r in successful_results if r['type'] == 'general']
        mcp_specialized = [r for r in successful_results if r['type'] == 'mcp_specialized']
        code_specialized = [r for r in successful_results if r['type'] == 'code_specialized']

        if general_models:
            avg_general = sum(r['avg_score'] for r in general_models) / len(general_models)
            print(f" General models ({len(general_models)}): {avg_general:.1f}/10 average")
        if mcp_specialized:
            avg_mcp = sum(r['avg_score'] for r in mcp_specialized) / len(mcp_specialized)
            print(f" MCP-specialized ({len(mcp_specialized)}): {avg_mcp:.1f}/10 average")
        if code_specialized:
            avg_code = sum(r['avg_score'] for r in code_specialized) / len(code_specialized)
            print(f" Code-specialized ({len(code_specialized)}): {avg_code:.1f}/10 average")

        # Best model overall
        best = sorted_by_score[0]
        most_efficient = sorted_by_efficiency[0]

        print(f"\n🎯 BEST OVERALL MODEL: {best['name']}")
        print(f" Score: {best['avg_score']:.1f}/10")
        print(f" Time: {best['avg_time']:.2f}s")
        print(f" Size: {best['file_size_mb']:.0f}MB")
        print(f" Type: {best['type']}")

        print(f"\n⚡ MOST EFFICIENT MODEL: {most_efficient['name']}")
        print(f" Efficiency: {most_efficient['efficiency']:.2f} score/s")

        # Final recommendations
        print(f"\n💡 FINAL RECOMMENDATIONS:")
        if best['avg_score'] >= 7:
            print(f"✅ {best['name']} is EXCELLENT for MCP production use")
        elif best['avg_score'] >= 5:
            print(f"👍 {best['name']} is GOOD for MCP production use")
        else:
            print(f"⚠️ {best['name']} needs further improvement")

        # Specialized vs general comparison
        if mcp_specialized and general_models:
            best_specialized = max(mcp_specialized, key=lambda x: x['avg_score'])
            best_general = max(general_models, key=lambda x: x['avg_score'])
            print(f"\n🔬 SPECIALIZED VS GENERAL:")
            print(f" Best MCP-specialized: {best_specialized['name']} ({best_specialized['avg_score']:.1f}/10)")
            print(f" Best general-purpose: {best_general['name']} ({best_general['avg_score']:.1f}/10)")
            if best_specialized['avg_score'] > best_general['avg_score']:
                print(f" ✅ MCP-specialized models come out ahead!")
            else:
                print(f" 🤔 General-purpose models perform just as well")

        # Detailed breakdown for the best model
        print(f"\n📋 DETAILS FOR THE BEST MODEL ({best['name']}):")
        for result in best['results']:
            status = "✅" if result['score'] >= 6 else "⚠️" if result['score'] >= 4 else "❌"
            print(f" {status} {result['test']}: {result['score']}/10 ({result['time']:.2f}s)")

    # Save the full results
    comprehensive_results = {
        'evaluation_type': 'comprehensive_mcp_test',
        'total_models_tested': len(models),
        'successful_models': len(successful_results),
        'results': results,
        'ranking_by_score': sorted_by_score if successful_results else [],
        'ranking_by_efficiency': sorted_by_efficiency if successful_results else [],
        'best_overall': best if successful_results else None,
        'most_efficient': most_efficient if successful_results else None
    }
    with open("comprehensive_mcp_evaluation.json", "w", encoding="utf-8") as f:
        json.dump(comprehensive_results, f, indent=2, ensure_ascii=False)

    print(f"\n📄 Full results saved to: comprehensive_mcp_evaluation.json")


if __name__ == "__main__":
    main()