""" Évaluation finale comparative de tous les modèles pour MCP Inclut le nouveau Gemma-3-270M plus petit et potentiellement plus rapide """ import sys import os import json import time # Ajouter le chemin pour les imports sys.path.append(os.path.dirname(os.path.abspath(__file__))) def test_model_comprehensive(model_path, model_name): """Test complet d'un modèle pour les tâches MCP""" try: from llama_cpp import Llama print(f"🔄 Test de {model_name}...") # Initialiser avec des paramètres optimisés llm = Llama( model_path=model_path, n_ctx=1024, n_threads=1, verbose=False ) # Tests MCP variés tests = [ { "name": "Commande simple", "prompt": """Tu es un assistant IA pour un jeu RTS via MCP. Outils: get_game_state(), move_units(unit_ids, target_x, target_y) Commande: "Montre-moi l'état du jeu" Réponds avec JSON: {{"tool": "nom_outil", "args": {{}}}}""", "expected": "get_game_state" }, { "name": "Action avec paramètres", "prompt": """Outils: move_units(unit_ids, target_x, target_y) Commande: "Déplace l'infanterie vers 100,200" JSON: {{"tool": "move_units", "args": {{"unit_ids": ["infantry"], "target_x": 100, "target_y": 200}}}}""", "expected": "move_units" }, { "name": "Vitesse de réponse", "prompt": "Réponds simplement: OK", "expected": "OK" } ] total_score = 0 total_time = 0 results = [] for test in tests: start_time = time.time() response = llm( test['prompt'], max_tokens=80, temperature=0.1, stop=["", "<|im_end|>"] ) response_time = time.time() - start_time response_text = response['choices'][0]['text'].strip() # Noter la réponse score = 0 # JSON valide pour les tests MCP if test['name'] != "Vitesse de réponse": try: json.loads(response_text) score += 3 except: pass # Contenu attendu if test['expected'] in response_text: score += 4 # Format approprié if "tool" in response_text and test['name'] != "Vitesse de réponse": score += 2 # Cohérence if any(word in response_text.lower() for word in ['game', 'move', 'state']): score += 1 score = min(score, 10) total_score += score total_time += response_time results.append({ 'test': test['name'], 'score': score, 'time': response_time, 'response': response_text[:50] + "..." 
if len(response_text) > 50 else response_text }) avg_score = total_score / len(tests) avg_time = total_time / len(tests) print(f"✅ {model_name}: {avg_score:.1f}/10 | Temps: {avg_time:.2f}s") return { 'name': model_name, 'avg_score': avg_score, 'avg_time': avg_time, 'efficiency': avg_score / avg_time if avg_time > 0 else 0, 'tests': results } except Exception as e: print(f"❌ {model_name}: Erreur - {e}") return { 'name': model_name, 'avg_score': 0, 'avg_time': 0, 'efficiency': 0, 'error': str(e) } def main(): """Évaluation finale comparative""" print("🏁 ÉVALUATION FINALE COMPARATIVE MCP") print("=" * 60) # Tous les modèles à tester models = [ { 'name': 'Qwen2.5-0.5B', 'path': 'qwen2.5-0.5b-instruct-q4_0.gguf' }, { 'name': 'Qwen3-0.6B', 'path': 'Qwen3-0.6B-Q8_0.gguf' }, { 'name': 'Gemma-3-1B', 'path': 'google_gemma-3-1b-it-qat-Q4_0.gguf' }, { 'name': 'Gemma-3-270M', 'path': 'gemma-3-270m-it-qat-Q8_0.gguf' } ] results = [] for model in models: if os.path.exists(model['path']): result = test_model_comprehensive(model['path'], model['name']) results.append(result) else: print(f"❌ Fichier non trouvé: {model['path']}") # Analyse comparative print("\n" + "=" * 60) print("📊 RÉSULTATS FINAUX") print("=" * 60) successful_results = [r for r in results if 'error' not in r and r['avg_score'] > 0] if successful_results: # Classement par score sorted_by_score = sorted(successful_results, key=lambda x: x['avg_score'], reverse=True) print(f"\n🏆 CLASSEMENT PAR PERFORMANCE:") for i, result in enumerate(sorted_by_score, 1): file_size = os.path.getsize([m['path'] for m in models if m['name'] == result['name']][0]) / (1024*1024) print(f" {i}. {result['name']}: {result['avg_score']:.1f}/10 | {result['avg_time']:.2f}s | {file_size:.0f}MB") # Classement par efficacité (score/seconde) sorted_by_efficiency = sorted(successful_results, key=lambda x: x['efficiency'], reverse=True) print(f"\n⚡ CLASSEMENT PAR EFFICACITÉ:") for i, result in enumerate(sorted_by_efficiency, 1): file_size = os.path.getsize([m['path'] for m in models if m['name'] == result['name']][0]) / (1024*1024) print(f" {i}. 
{result['name']}: {result['efficiency']:.2f} score/s | {file_size:.0f}MB") # Meilleur modèle global best_overall = sorted_by_score[0] most_efficient = sorted_by_efficiency[0] print(f"\n🎯 MEILLEUR MODÈLE GLOBAL: {best_overall['name']}") print(f" Score: {best_overall['avg_score']:.1f}/10") print(f" Temps: {best_overall['avg_time']:.2f}s") print(f"\n⚡ MODÈLE LE PLUS EFFICACE: {most_efficient['name']}") print(f" Efficacité: {most_efficient['efficiency']:.2f} score/s") # Analyse détaillée print(f"\n📈 ANALYSE DÉTAILLÉE:") for result in successful_results: file_size = os.path.getsize([m['path'] for m in models if m['name'] == result['name']][0]) / (1024*1024) efficiency_per_mb = result['efficiency'] / file_size if file_size > 0 else 0 print(f"\n🔹 {result['name']}:") print(f" Score moyen: {result['avg_score']:.1f}/10") print(f" Temps moyen: {result['avg_time']:.2f}s") print(f" Efficacité: {result['efficiency']:.2f} score/s") print(f" Taille: {file_size:.0f}MB") print(f" Efficacité/MB: {efficiency_per_mb:.4f}") # Tests individuels for test in result['tests']: status = "✅" if test['score'] >= 6 else "⚠️" if test['score'] >= 4 else "❌" print(f" {status} {test['test']}: {test['score']}/10 ({test['time']:.2f}s)") # Recommandations finales print(f"\n💡 RECOMMANDATIONS FINALES:") if best_overall['avg_score'] >= 7: print(f"✅ {best_overall['name']} est EXCELLENT pour la production MCP") elif best_overall['avg_score'] >= 5: print(f"👍 {best_overall['name']} est BON pour la production MCP") else: print(f"⚠️ {best_overall['name']} nécessite des améliorations") # Recommandation basée sur l'usage print(f"\n🎯 RECOMMANDATIONS SPÉCIFIQUES:") if most_efficient['name'] != best_overall['name']: print(f"⚡ Pour les réponses rapides: {most_efficient['name']}") print(f"🏆 Pour la meilleure qualité: {best_overall['name']}") else: print(f"🎉 {best_overall['name']} est le meilleur choix pour la vitesse ET la qualité") # Vérifier si Gemma-3-270M est surprenant gemma_270m = next((r for r in successful_results if r['name'] == 'Gemma-3-270M'), None) if gemma_270m and gemma_270m['avg_score'] >= 5: print(f"🚀 Surprise: Gemma-3-270M offre un excellent rapport taille/performance!") # Sauvegarder résultats complets final_results = { 'all_results': results, 'successful_models': successful_results, 'ranking_by_score': sorted_by_score if successful_results else [], 'ranking_by_efficiency': sorted_by_efficiency if successful_results else [], 'best_overall': best_overall if successful_results else None, 'most_efficient': most_efficient if successful_results else None } with open("final_model_comparison.json", "w", encoding="utf-8") as f: json.dump(final_results, f, indent=2, ensure_ascii=False) print(f"\n📄 Résultats complets sauvegardés dans: final_model_comparison.json") if __name__ == "__main__": main()
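

# ---------------------------------------------------------------------------
# Illustrative sketch, not used by the evaluation above: one way a caller
# could validate the {"tool": ..., "args": {...}} payloads that the test
# prompts ask the models to produce. The tool names come from the prompts;
# the function itself is a hypothetical helper, not part of the benchmark.
# ---------------------------------------------------------------------------
def parse_tool_call(response_text, allowed_tools=("get_game_state", "move_units")):
    """Return (tool, args) if response_text is a well-formed MCP tool call, else None."""
    try:
        payload = json.loads(response_text)
    except json.JSONDecodeError:
        return None
    if not isinstance(payload, dict):
        return None
    tool = payload.get("tool")
    args = payload.get("args", {})
    if tool not in allowed_tools or not isinstance(args, dict):
        return None
    return tool, args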