"""
Évaluation finale comparative de tous les modèles pour MCP
Inclut le nouveau Gemma-3-270M plus petit et potentiellement plus rapide
"""
import sys
import os
import json
import time
# Add this script's directory to the import path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
def test_model_comprehensive(model_path, model_name):
"""Test complet d'un modèle pour les tâches MCP"""
try:
from llama_cpp import Llama
print(f"🔄 Test de {model_name}...")
# Initialiser avec des paramètres optimisés
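        # n_ctx=1024 keeps the context window small; n_threads=1 presumably keeps
        # timings comparable across models by forcing single-threaded inference.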
llm = Llama(
model_path=model_path,
n_ctx=1024,
n_threads=1,
verbose=False
)
        # Assorted MCP tests
tests = [
{
"name": "Commande simple",
"prompt": """Tu es un assistant IA pour un jeu RTS via MCP.
Outils: get_game_state(), move_units(unit_ids, target_x, target_y)
Commande: "Montre-moi l'état du jeu"
Réponds avec JSON: {{"tool": "nom_outil", "args": {{}}}}""",
"expected": "get_game_state"
},
{
"name": "Action avec paramètres",
"prompt": """Outils: move_units(unit_ids, target_x, target_y)
Commande: "Déplace l'infanterie vers 100,200"
JSON: {{"tool": "move_units", "args": {{"unit_ids": ["infantry"], "target_x": 100, "target_y": 200}}}}""",
"expected": "move_units"
},
{
"name": "Vitesse de réponse",
"prompt": "Réponds simplement: OK",
"expected": "OK"
}
]
total_score = 0
total_time = 0
results = []
for test in tests:
start_time = time.time()
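            # Query the model; stop on common end-of-turn markers
            # ("</s>" for Llama-style EOS, "<|im_end|>" for ChatML).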
response = llm(
test['prompt'],
max_tokens=80,
temperature=0.1,
stop=["</s>", "<|im_end|>"]
)
response_time = time.time() - start_time
response_text = response['choices'][0]['text'].strip()
            # Score the response
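            # Rubric (capped at 10): up to 3 pts for parseable JSON, 4 pts for the
            # expected content, 2 pts for a "tool" key, 1 pt for topical keywords.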
score = 0
            # Valid JSON is required for the MCP tests
            if test['name'] != "Response speed":
                try:
                    json.loads(response_text)
                    score += 3
                except json.JSONDecodeError:
                    pass
            # Expected content
if test['expected'] in response_text:
score += 4
            # Appropriate format
            if "tool" in response_text and test['name'] != "Response speed":
score += 2
            # Topical coherence
if any(word in response_text.lower() for word in ['game', 'move', 'state']):
score += 1
score = min(score, 10)
total_score += score
total_time += response_time
results.append({
'test': test['name'],
'score': score,
'time': response_time,
'response': response_text[:50] + "..." if len(response_text) > 50 else response_text
})
avg_score = total_score / len(tests)
avg_time = total_time / len(tests)
print(f"✅ {model_name}: {avg_score:.1f}/10 | Temps: {avg_time:.2f}s")
return {
'name': model_name,
'avg_score': avg_score,
'avg_time': avg_time,
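            # "Efficiency" is average score per second of generation time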
'efficiency': avg_score / avg_time if avg_time > 0 else 0,
'tests': results
}
except Exception as e:
print(f"❌ {model_name}: Erreur - {e}")
return {
'name': model_name,
'avg_score': 0,
'avg_time': 0,
'efficiency': 0,
'error': str(e)
}
def main():
"""Évaluation finale comparative"""
print("🏁 ÉVALUATION FINALE COMPARATIVE MCP")
print("=" * 60)
# Tous les modèles à tester
models = [
{
'name': 'Qwen2.5-0.5B',
'path': 'qwen2.5-0.5b-instruct-q4_0.gguf'
},
{
'name': 'Qwen3-0.6B',
'path': 'Qwen3-0.6B-Q8_0.gguf'
},
{
'name': 'Gemma-3-1B',
'path': 'google_gemma-3-1b-it-qat-Q4_0.gguf'
},
{
'name': 'Gemma-3-270M',
'path': 'gemma-3-270m-it-qat-Q8_0.gguf'
}
]
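    # Model paths are relative to the current working directory; missing files
    # are reported and skipped rather than treated as fatal errors.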
results = []
for model in models:
if os.path.exists(model['path']):
result = test_model_comprehensive(model['path'], model['name'])
results.append(result)
else:
print(f"❌ Fichier non trouvé: {model['path']}")
# Analyse comparative
print("\n" + "=" * 60)
print("📊 RÉSULTATS FINAUX")
print("=" * 60)
successful_results = [r for r in results if 'error' not in r and r['avg_score'] > 0]
if successful_results:
        # Ranking by score
        sorted_by_score = sorted(successful_results, key=lambda x: x['avg_score'], reverse=True)
        print("\n🏆 RANKING BY PERFORMANCE:")
for i, result in enumerate(sorted_by_score, 1):
file_size = os.path.getsize([m['path'] for m in models if m['name'] == result['name']][0]) / (1024*1024)
print(f" {i}. {result['name']}: {result['avg_score']:.1f}/10 | {result['avg_time']:.2f}s | {file_size:.0f}MB")
        # Ranking by efficiency (score per second)
        sorted_by_efficiency = sorted(successful_results, key=lambda x: x['efficiency'], reverse=True)
        print("\n⚡ RANKING BY EFFICIENCY:")
for i, result in enumerate(sorted_by_efficiency, 1):
file_size = os.path.getsize([m['path'] for m in models if m['name'] == result['name']][0]) / (1024*1024)
print(f" {i}. {result['name']}: {result['efficiency']:.2f} score/s | {file_size:.0f}MB")
        # Best overall model
best_overall = sorted_by_score[0]
most_efficient = sorted_by_efficiency[0]
print(f"\n🎯 MEILLEUR MODÈLE GLOBAL: {best_overall['name']}")
print(f" Score: {best_overall['avg_score']:.1f}/10")
print(f" Temps: {best_overall['avg_time']:.2f}s")
print(f"\n⚡ MODÈLE LE PLUS EFFICACE: {most_efficient['name']}")
print(f" Efficacité: {most_efficient['efficiency']:.2f} score/s")
# Analyse détaillée
print(f"\n📈 ANALYSE DÉTAILLÉE:")
for result in successful_results:
file_size = os.path.getsize([m['path'] for m in models if m['name'] == result['name']][0]) / (1024*1024)
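            # Normalize efficiency by model file size (score/s per MB)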
efficiency_per_mb = result['efficiency'] / file_size if file_size > 0 else 0
print(f"\n🔹 {result['name']}:")
print(f" Score moyen: {result['avg_score']:.1f}/10")
print(f" Temps moyen: {result['avg_time']:.2f}s")
print(f" Efficacité: {result['efficiency']:.2f} score/s")
print(f" Taille: {file_size:.0f}MB")
print(f" Efficacité/MB: {efficiency_per_mb:.4f}")
# Tests individuels
for test in result['tests']:
status = "✅" if test['score'] >= 6 else "⚠️" if test['score'] >= 4 else "❌"
print(f" {status} {test['test']}: {test['score']}/10 ({test['time']:.2f}s)")
        # Final recommendations
        print("\n💡 FINAL RECOMMENDATIONS:")
        if best_overall['avg_score'] >= 7:
            print(f"✅ {best_overall['name']} is EXCELLENT for MCP production use")
        elif best_overall['avg_score'] >= 5:
            print(f"👍 {best_overall['name']} is GOOD for MCP production use")
        else:
            print(f"⚠️ {best_overall['name']} needs further improvement")
        # Usage-based recommendation
        print("\n🎯 SPECIFIC RECOMMENDATIONS:")
        if most_efficient['name'] != best_overall['name']:
            print(f"⚡ For fast responses: {most_efficient['name']}")
            print(f"🏆 For the best quality: {best_overall['name']}")
        else:
            print(f"🎉 {best_overall['name']} is the best choice for both speed AND quality")
        # Check whether Gemma-3-270M punches above its weight
        gemma_270m = next((r for r in successful_results if r['name'] == 'Gemma-3-270M'), None)
        if gemma_270m and gemma_270m['avg_score'] >= 5:
            print("🚀 Surprise: Gemma-3-270M offers an excellent size/performance ratio!")
    # Save the full results
final_results = {
'all_results': results,
'successful_models': successful_results,
'ranking_by_score': sorted_by_score if successful_results else [],
'ranking_by_efficiency': sorted_by_efficiency if successful_results else [],
'best_overall': best_overall if successful_results else None,
'most_efficient': most_efficient if successful_results else None
}
with open("final_model_comparison.json", "w", encoding="utf-8") as f:
json.dump(final_results, f, indent=2, ensure_ascii=False)
print(f"\n📄 Résultats complets sauvegardés dans: final_model_comparison.json")
if __name__ == "__main__":
main()
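
# Example usage (assuming the GGUF files listed above are in the working directory):
#   python tests/scripts/final_model_comparison.py
# Per-model scores are printed to the console and the full report is written to
# final_model_comparison.json.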