# rts-commander/tests/scripts/comprehensive_mcp_evaluation.py
"""
Évaluation complète MCP avec 9 modèles
Inclut les modèles spécialisés MCP et les modèles généraux
Test réaliste avec commandes RTS typiques
"""
import sys
import os
import json
import time
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
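
# Usage sketch (assumption: the GGUF model files listed in main() sit in the
# current working directory, since the model paths below are bare filenames):
#   python tests/scripts/comprehensive_mcp_evaluation.py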

def test_model_comprehensive(model_path, model_name):
    """Run the full MCP test suite against a single model."""
    try:
        from llama_cpp import Llama

        print(f"🧪 Testing {model_name}...")

        # Check the model file size
        file_size = os.path.getsize(model_path) / (1024 * 1024)
        print(f" 📏 Size: {file_size:.0f} MB")

        # Initialize the model (large models take noticeably longer to load)
        llm = Llama(
            model_path=model_path,
            n_ctx=1024,
            n_threads=1,
            verbose=False,
            n_gpu_layers=0  # Force CPU for a fair comparison
        )

        # Real RTS commands at different levels of complexity
        test_commands = [
            {
                "name": "Simple command",
                "command": "show game state",
                "expected": "get_game_state",
                "difficulty": "easy"
            },
            {
                "name": "Action with coordinates",
                "command": "move infantry to 150,200",
                "expected": "move_units",
                "difficulty": "easy"
            },
            {
                "name": "Specific attack",
                "command": "attack enemy tank at position 300,150",
                "expected": "attack_unit",
                "difficulty": "medium"
            },
            {
                "name": "Construction",
                "command": "build power plant near my base at 100,100",
                "expected": "build_building",
                "difficulty": "medium"
            },
            {
                "name": "Complex command",
                "command": "defend base with all available units",
                "expected": "move_units",
                "difficulty": "hard"
            }
        ]

        results = []
        total_score = 0
        total_time = 0

        for test in test_commands:
            prompt = f"""You are an AI assistant for an RTS game using MCP (Model Context Protocol).
Available tools:
- get_game_state()
- move_units(unit_ids, target_x, target_y)
- attack_unit(attacker_ids, target_id)
- build_building(building_type, position_x, position_y)
User command: "{test['command']}"
Respond with JSON only: {{"tool": "tool_name", "args": {{}}}}"""
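
            # Illustrative target output for "move infantry to 150,200" (a sketch of the
            # intended format; the scorer below only checks substrings, not exact JSON):
            #   {"tool": "move_units", "args": {"unit_ids": ["infantry"], "target_x": 150, "target_y": 200}}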
            start_time = time.time()
            try:
                response = llm(
                    prompt,
                    max_tokens=100,
                    temperature=0.1,
                    stop=["</s>", "<|im_end|>", "```"]
                )
                response_time = time.time() - start_time

                # Extract the generated text
                try:
                    response_text = response['choices'][0]['text'].strip()
                except Exception:
                    # Fallback for other response formats
                    if hasattr(response, 'get'):
                        response_text = response.get('text', str(response))
                    else:
                        response_text = str(response)

                # Score the response
                score = evaluate_mcp_response(response_text, test)
                total_score += score
                total_time += response_time

                print(f" ✅ {test['name']}: {score}/10 ({response_time:.2f}s)")

                results.append({
                    'test': test['name'],
                    'difficulty': test['difficulty'],
                    'score': score,
                    'time': response_time,
                    'response': response_text[:100] + "..." if len(response_text) > 100 else response_text
                })
            except Exception as e:
                print(f" ❌ {test['name']}: Error - {e}")
                results.append({
                    'test': test['name'],
                    'difficulty': test['difficulty'],
                    'score': 0,
                    'time': 0,
                    'error': str(e)
                })

        avg_score = total_score / len(test_commands)
        avg_time = total_time / len(test_commands)

        print(f" 📊 Average: {avg_score:.1f}/10 | Time: {avg_time:.2f}s")

        return {
            'name': model_name,
            'file_size_mb': file_size,
            'avg_score': avg_score,
            'avg_time': avg_time,
            'efficiency': avg_score / avg_time if avg_time > 0 else 0,
            'results': results
        }

    except Exception as e:
        print(f"❌ Critical error with {model_name}: {e}")
        return {
            'name': model_name,
            'error': str(e),
            'avg_score': 0,
            'avg_time': 0,
            'efficiency': 0
        }

def evaluate_mcp_response(response, test):
    """Standardized 0-10 scoring of an MCP tool-call response."""
    if not response or response.strip() == "":
        return 0

    score = 0

    # Valid JSON (3 points)
    try:
        json.loads(response)
        score += 3
    except json.JSONDecodeError:
        # Look for a JSON object embedded in the text (partial credit: 1 point)
        import re
        json_match = re.search(r'\{[^}]*\}', response)
        if json_match:
            try:
                json.loads(json_match.group())
                score += 1
            except json.JSONDecodeError:
                pass

    # Correct tool name (3 points)
    expected_tool = test['expected']
    if expected_tool in response:
        score += 3

    # Appropriate parameters (2 points), checked with simple substring heuristics
    if test['difficulty'] == 'easy':
        if '150,200' in response or 'game state' in response:
            score += 2
    elif test['difficulty'] == 'medium':
        if any(coord in response for coord in ['300,150', '100,100']):
            score += 2
    elif test['difficulty'] == 'hard':
        if 'units' in response and 'defend' in response:
            score += 2

    # Correct format (2 points)
    if 'tool' in response and 'args' in response:
        score += 2

    return min(score, 10)
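
# Quick sanity check of the scorer (illustrative; the second argument mirrors the
# shape of an entry in test_commands):
#   evaluate_mcp_response('{"tool": "get_game_state", "args": {}}',
#                         {"expected": "get_game_state", "difficulty": "easy"})
#   -> 3 (valid JSON) + 3 (expected tool) + 0 (parameters) + 2 ('tool'/'args' present) = 8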

def main():
    """Run the full evaluation across all models."""
    print("🚀 COMPREHENSIVE MCP EVALUATION - 9 MODELS")
    print("=" * 70)
    print("Testing general-purpose and MCP-specialized models")
    print("=" * 70)

    # All models under test
    models = [
        # General-purpose models (tested previously)
        {
            'name': 'Qwen2.5-0.5B',
            'path': 'qwen2.5-0.5b-instruct-q4_0.gguf',
            'type': 'general'
        },
        {
            'name': 'Qwen3-0.6B',
            'path': 'Qwen3-0.6B-Q8_0.gguf',
            'type': 'general'
        },
        {
            'name': 'Gemma-3-270M',
            'path': 'gemma-3-270m-it-qat-Q8_0.gguf',
            'type': 'general'
        },
        {
            'name': 'Qwen3-1.7B',
            'path': 'Qwen3-1.7B-Q4_0.gguf',
            'type': 'general'
        },
        # MCP-specialized models
        {
            'name': 'MCP-Instruct-v1',
            'path': 'mcp-instruct-v1.Q4_K_M.gguf',
            'type': 'mcp_specialized'
        },
        {
            'name': 'MCPRL-3B-Exa',
            'path': 'mcprl-3b-exa.Q2_K.gguf',
            'type': 'mcp_specialized'
        },
        {
            'name': 'Gemma-3n-E2B-it',
            'path': 'gemma-3n-E2B-it-UD-IQ2_XXS.gguf',
            'type': 'mcp_specialized'
        },
        {
            'name': 'Llama-Breeze2-3B',
            'path': 'Llama-Breeze2-3B-Instruct-Text.Q2_K.gguf',
            'type': 'general'
        },
        # Model specialized for code / structured output
        {
            'name': 'Qwen2.5-Coder-0.5B',
            'path': 'qwen2.5-coder-0.5b-instruct-q4_0.gguf',
            'type': 'code_specialized'
        }
    ]

    results = []
    for model in models:
        if os.path.exists(model['path']):
            result = test_model_comprehensive(model['path'], model['name'])
            result['type'] = model['type']
            results.append(result)
            print()
        else:
            print(f"❌ Model not found: {model['path']}")
            print()

    # Full analysis
    print("=" * 70)
    print("📊 FULL RESULTS")
    print("=" * 70)

    successful_results = [r for r in results if 'error' not in r and r['avg_score'] > 0]

    if successful_results:
        # Ranking by performance
        sorted_by_score = sorted(successful_results, key=lambda x: x['avg_score'], reverse=True)

        print(f"\n🏆 RANKING BY PERFORMANCE:")
        for i, result in enumerate(sorted_by_score, 1):
            print(f" {i:2d}. {result['name']:20s} | {result['avg_score']:.1f}/10 | {result['avg_time']:.2f}s | {result['file_size_mb']:.0f}MB | {result['type']}")

        # Ranking by efficiency
        sorted_by_efficiency = sorted(successful_results, key=lambda x: x['efficiency'], reverse=True)

        print(f"\n⚡ RANKING BY EFFICIENCY:")
        for i, result in enumerate(sorted_by_efficiency, 1):
            print(f" {i:2d}. {result['name']:20s} | {result['efficiency']:.2f} score/s | {result['file_size_mb']:.0f}MB")

        # Breakdown by model type
        print(f"\n📈 ANALYSIS BY MODEL TYPE:")
        general_models = [r for r in successful_results if r['type'] == 'general']
        mcp_specialized = [r for r in successful_results if r['type'] == 'mcp_specialized']
        code_specialized = [r for r in successful_results if r['type'] == 'code_specialized']

        if general_models:
            avg_general = sum(r['avg_score'] for r in general_models) / len(general_models)
            print(f" General-purpose models ({len(general_models)}): {avg_general:.1f}/10 average")
        if mcp_specialized:
            avg_mcp = sum(r['avg_score'] for r in mcp_specialized) / len(mcp_specialized)
            print(f" MCP-specialized ({len(mcp_specialized)}): {avg_mcp:.1f}/10 average")
        if code_specialized:
            avg_code = sum(r['avg_score'] for r in code_specialized) / len(code_specialized)
            print(f" Code-specialized ({len(code_specialized)}): {avg_code:.1f}/10 average")

        # Best model overall
        best = sorted_by_score[0]
        most_efficient = sorted_by_efficiency[0]

        print(f"\n🎯 BEST MODEL OVERALL: {best['name']}")
        print(f" Score: {best['avg_score']:.1f}/10")
        print(f" Time: {best['avg_time']:.2f}s")
        print(f" Size: {best['file_size_mb']:.0f}MB")
        print(f" Type: {best['type']}")

        print(f"\n⚡ MOST EFFICIENT MODEL: {most_efficient['name']}")
        print(f" Efficiency: {most_efficient['efficiency']:.2f} score/s")

        # Final recommendations
        print(f"\n💡 FINAL RECOMMENDATIONS:")
        if best['avg_score'] >= 7:
            print(f"✅ {best['name']} is EXCELLENT for MCP production use")
        elif best['avg_score'] >= 5:
            print(f"👍 {best['name']} is GOOD for MCP production use")
        else:
            print(f"⚠️ {best['name']} needs improvement")

        # Specialized vs general-purpose comparison
        if mcp_specialized and general_models:
            best_specialized = max(mcp_specialized, key=lambda x: x['avg_score'])
            best_general = max(general_models, key=lambda x: x['avg_score'])
            print(f"\n🔬 SPECIALIZED VS GENERAL-PURPOSE:")
            print(f" Best MCP-specialized: {best_specialized['name']} ({best_specialized['avg_score']:.1f}/10)")
            print(f" Best general-purpose: {best_general['name']} ({best_general['avg_score']:.1f}/10)")
            if best_specialized['avg_score'] > best_general['avg_score']:
                print(f" ✅ MCP-specialized models come out ahead!")
            else:
                print(f" 🤔 General-purpose models do just as well")

        # Detailed breakdown of the best model
        print(f"\n📋 DETAILS FOR THE BEST MODEL ({best['name']}):")
        for result in best['results']:
            status = "✅" if result['score'] >= 6 else "⚠️" if result['score'] >= 4 else "❌"
            print(f" {status} {result['test']}: {result['score']}/10 ({result['time']:.2f}s)")

    # Save the full results
    comprehensive_results = {
        'evaluation_type': 'comprehensive_mcp_test',
        'total_models_tested': len(models),
        'successful_models': len(successful_results),
        'results': results,
        'ranking_by_score': sorted_by_score if successful_results else [],
        'ranking_by_efficiency': sorted_by_efficiency if successful_results else [],
        'best_overall': best if successful_results else None,
        'most_efficient': most_efficient if successful_results else None
    }

    with open("comprehensive_mcp_evaluation.json", "w", encoding="utf-8") as f:
        json.dump(comprehensive_results, f, indent=2, ensure_ascii=False)

    print(f"\n📄 Full results saved to: comprehensive_mcp_evaluation.json")

if __name__ == "__main__":
    main()