| """ | |
| Évaluation finale comparative de tous les modèles pour MCP | |
| Inclut le nouveau Gemma-3-270M plus petit et potentiellement plus rapide | |
| """ | |
| import sys | |
| import os | |
| import json | |
| import time | |
| # Ajouter le chemin pour les imports | |
| sys.path.append(os.path.dirname(os.path.abspath(__file__))) | |
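# Note: this script assumes llama-cpp-python is installed (pip install llama-cpp-python)
# and that the GGUF model files listed in main() are present in the working directory.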
def test_model_comprehensive(model_path, model_name):
    """Run the full set of MCP tasks against one model."""
    try:
        from llama_cpp import Llama

        print(f"🔄 Testing {model_name}...")

        # Initialise with optimised parameters (small context, single thread)
        llm = Llama(
            model_path=model_path,
            n_ctx=1024,
            n_threads=1,
            verbose=False
        )
        # Assorted MCP tests
        tests = [
            {
                "name": "Simple command",
                "prompt": """You are an AI assistant for an RTS game controlled via MCP.
Tools: get_game_state(), move_units(unit_ids, target_x, target_y)
Command: "Show me the game state"
Reply with JSON: {"tool": "tool_name", "args": {}}""",
                "expected": "get_game_state"
            },
            {
                "name": "Action with parameters",
                "prompt": """Tools: move_units(unit_ids, target_x, target_y)
Command: "Move the infantry to 100,200"
JSON: {"tool": "move_units", "args": {"unit_ids": ["infantry"], "target_x": 100, "target_y": 200}}""",
                "expected": "move_units"
            },
            {
                "name": "Response speed",
                "prompt": "Simply reply: OK",
                "expected": "OK"
            }
        ]
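        # Each response is scored out of 10: valid JSON (+3), expected tool/content (+4),
        # a "tool" field in the output (+2), and a domain keyword such as game/move/state (+1).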
        total_score = 0
        total_time = 0
        results = []

        for test in tests:
            start_time = time.time()
            response = llm(
                test['prompt'],
                max_tokens=80,
                temperature=0.1,
                stop=["</s>", "<|im_end|>"]
            )
            response_time = time.time() - start_time
            response_text = response['choices'][0]['text'].strip()
            # Score the response
            score = 0

            # Valid JSON for the MCP tests
            if test['name'] != "Response speed":
                try:
                    json.loads(response_text)
                    score += 3
                except json.JSONDecodeError:
                    pass

            # Expected content
            if test['expected'] in response_text:
                score += 4

            # Appropriate format
            if "tool" in response_text and test['name'] != "Response speed":
                score += 2

            # Domain consistency
            if any(word in response_text.lower() for word in ['game', 'move', 'state']):
                score += 1

            score = min(score, 10)
            total_score += score
            total_time += response_time

            results.append({
                'test': test['name'],
                'score': score,
                'time': response_time,
                'response': response_text[:50] + "..." if len(response_text) > 50 else response_text
            })
        avg_score = total_score / len(tests)
        avg_time = total_time / len(tests)

        print(f"✅ {model_name}: {avg_score:.1f}/10 | Time: {avg_time:.2f}s")

        return {
            'name': model_name,
            'avg_score': avg_score,
            'avg_time': avg_time,
            'efficiency': avg_score / avg_time if avg_time > 0 else 0,
            'tests': results
        }
    except Exception as e:
        print(f"❌ {model_name}: Error - {e}")
        return {
            'name': model_name,
            'avg_score': 0,
            'avg_time': 0,
            'efficiency': 0,
            'error': str(e)
        }
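# Standalone usage sketch: the summary dict returned above can be inspected directly, e.g.
#   summary = test_model_comprehensive("qwen2.5-0.5b-instruct-q4_0.gguf", "Qwen2.5-0.5B")
#   print(summary['avg_score'], summary['avg_time'], summary['efficiency'])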
def main():
    """Final comparative evaluation."""
    print("🏁 FINAL COMPARATIVE MCP EVALUATION")
    print("=" * 60)

    # All models to test
    models = [
        {
            'name': 'Qwen2.5-0.5B',
            'path': 'qwen2.5-0.5b-instruct-q4_0.gguf'
        },
        {
            'name': 'Qwen3-0.6B',
            'path': 'Qwen3-0.6B-Q8_0.gguf'
        },
        {
            'name': 'Gemma-3-1B',
            'path': 'google_gemma-3-1b-it-qat-Q4_0.gguf'
        },
        {
            'name': 'Gemma-3-270M',
            'path': 'gemma-3-270m-it-qat-Q8_0.gguf'
        }
    ]
    results = []
    for model in models:
        if os.path.exists(model['path']):
            result = test_model_comprehensive(model['path'], model['name'])
            results.append(result)
        else:
            print(f"❌ File not found: {model['path']}")
    # Comparative analysis
    print("\n" + "=" * 60)
    print("📊 FINAL RESULTS")
    print("=" * 60)

    successful_results = [r for r in results if 'error' not in r and r['avg_score'] > 0]

    if successful_results:
        # Ranking by score
        sorted_by_score = sorted(successful_results, key=lambda x: x['avg_score'], reverse=True)

        print(f"\n🏆 RANKING BY PERFORMANCE:")
        for i, result in enumerate(sorted_by_score, 1):
            file_size = os.path.getsize([m['path'] for m in models if m['name'] == result['name']][0]) / (1024 * 1024)
            print(f"   {i}. {result['name']}: {result['avg_score']:.1f}/10 | {result['avg_time']:.2f}s | {file_size:.0f}MB")

        # Ranking by efficiency (score per second)
        sorted_by_efficiency = sorted(successful_results, key=lambda x: x['efficiency'], reverse=True)

        print(f"\n⚡ RANKING BY EFFICIENCY:")
        for i, result in enumerate(sorted_by_efficiency, 1):
            file_size = os.path.getsize([m['path'] for m in models if m['name'] == result['name']][0]) / (1024 * 1024)
            print(f"   {i}. {result['name']}: {result['efficiency']:.2f} score/s | {file_size:.0f}MB")

        # Best model overall
        best_overall = sorted_by_score[0]
        most_efficient = sorted_by_efficiency[0]

        print(f"\n🎯 BEST MODEL OVERALL: {best_overall['name']}")
        print(f"   Score: {best_overall['avg_score']:.1f}/10")
        print(f"   Time: {best_overall['avg_time']:.2f}s")

        print(f"\n⚡ MOST EFFICIENT MODEL: {most_efficient['name']}")
        print(f"   Efficiency: {most_efficient['efficiency']:.2f} score/s")
        # Detailed analysis
        print(f"\n📈 DETAILED ANALYSIS:")
        for result in successful_results:
            file_size = os.path.getsize([m['path'] for m in models if m['name'] == result['name']][0]) / (1024 * 1024)
            efficiency_per_mb = result['efficiency'] / file_size if file_size > 0 else 0

            print(f"\n🔹 {result['name']}:")
            print(f"   Average score: {result['avg_score']:.1f}/10")
            print(f"   Average time: {result['avg_time']:.2f}s")
            print(f"   Efficiency: {result['efficiency']:.2f} score/s")
            print(f"   Size: {file_size:.0f}MB")
            print(f"   Efficiency/MB: {efficiency_per_mb:.4f}")

            # Individual tests
            for test in result['tests']:
                status = "✅" if test['score'] >= 6 else "⚠️" if test['score'] >= 4 else "❌"
                print(f"   {status} {test['test']}: {test['score']}/10 ({test['time']:.2f}s)")
        # Final recommendations
        print(f"\n💡 FINAL RECOMMENDATIONS:")
        if best_overall['avg_score'] >= 7:
            print(f"✅ {best_overall['name']} is EXCELLENT for MCP production use")
        elif best_overall['avg_score'] >= 5:
            print(f"👍 {best_overall['name']} is GOOD for MCP production use")
        else:
            print(f"⚠️ {best_overall['name']} needs improvement")

        # Usage-based recommendation
        print(f"\n🎯 SPECIFIC RECOMMENDATIONS:")
        if most_efficient['name'] != best_overall['name']:
            print(f"⚡ For fast responses: {most_efficient['name']}")
            print(f"🏆 For the best quality: {best_overall['name']}")
        else:
            print(f"🎉 {best_overall['name']} is the best choice for BOTH speed and quality")

        # Check whether Gemma-3-270M punches above its size
        gemma_270m = next((r for r in successful_results if r['name'] == 'Gemma-3-270M'), None)
        if gemma_270m and gemma_270m['avg_score'] >= 5:
            print(f"🚀 Surprise: Gemma-3-270M offers an excellent size/performance ratio!")
    # Save the full results
    final_results = {
        'all_results': results,
        'successful_models': successful_results,
        'ranking_by_score': sorted_by_score if successful_results else [],
        'ranking_by_efficiency': sorted_by_efficiency if successful_results else [],
        'best_overall': best_overall if successful_results else None,
        'most_efficient': most_efficient if successful_results else None
    }

    with open("final_model_comparison.json", "w", encoding="utf-8") as f:
        json.dump(final_results, f, indent=2, ensure_ascii=False)

    print(f"\n📄 Full results saved to: final_model_comparison.json")
if __name__ == "__main__":
    main()
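# To re-read the saved report later (a minimal sketch; run this script once first so
# final_model_comparison.json exists):
#   import json
#   with open("final_model_comparison.json", encoding="utf-8") as f:
#       report = json.load(f)
#   for entry in report["ranking_by_score"]:
#       print(entry["name"], entry["avg_score"], entry["avg_time"])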