| """ | |
| Test pratique MCP pour évaluer les capacités réelles des modèles | |
| Simule l'usage réel dans un jeu RTS avec des commandes typiques | |
| """ | |
import sys
import os
import json
import re
import time

sys.path.append(os.path.dirname(os.path.abspath(__file__)))


def test_model_practical(model_path, model_name):
    """Practical test with real RTS commands."""
    try:
        from llama_cpp import Llama

        print(f"🎮 Practical test of {model_name}...")
        llm = Llama(
            model_path=model_path,
            n_ctx=1024,
            n_threads=1,
            verbose=False
        )

        # Typical RTS commands a user would type
        real_commands = [
            "show game state",
            "move infantry to 100,200",
            "attack enemy tank at 300,150",
            "build barracks near 250,200",
            "defend base with all units",
            "harvest resources with harvester",
            "scout enemy positions"
        ]

        results = []
        for command in real_commands:
            prompt = f"""You control an RTS game via MCP. Available tools:
- get_game_state()
- move_units(unit_ids, target_x, target_y)
- attack_unit(attacker_ids, target_id)
- build_building(building_type, position_x, position_y)
User command: "{command}"
Respond with JSON: {{"tool": "tool_name", "args": {{}}}}"""
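            # Illustrative full-credit answer for "move infantry to 100,200"
            # (the unit ids are hypothetical):
            #   {"tool": "move_units", "args": {"unit_ids": [1], "target_x": 100, "target_y": 200}}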
            start_time = time.time()
            response = llm(
                prompt,
                max_tokens=100,
                temperature=0.1
            )
            response_time = time.time() - start_time

            # Extract and evaluate the response
            try:
                response_text = response['choices'][0]['text'].strip()
            except (KeyError, IndexError, TypeError):
                response_text = str(response)

            # Simple but realistic evaluation
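            # Rubric: up to 3 points for valid JSON, 3 for picking the expected tool,
            # 2 if the command's coordinates appear verbatim in the response.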
            score = 0

            # Valid JSON?
            try:
                json.loads(response_text)
                score += 3
            except json.JSONDecodeError:
                # Partial credit if a JSON-like object appears anywhere in the text
                if re.search(r'\{.*\}', response_text):
                    score += 1

            # Appropriate tool?
            if "game state" in command and "get_game_state" in response_text:
                score += 3
            elif "move" in command and "move_units" in response_text:
                score += 3
            elif "attack" in command and "attack_unit" in response_text:
                score += 3
            elif "build" in command and "build_building" in response_text:
                score += 3
            elif any(tool in response_text for tool in ['get_game_state', 'move_units', 'attack_unit', 'build_building']):
                score += 1

            # Reasonable parameters?
            if "100,200" in response_text or "300,150" in response_text or "250,200" in response_text:
                score += 2
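
            # Defensive cap: the rubric currently tops out at 8, but keep scores on a 0-10 scale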
            score = min(score, 10)
| print(f" '{command}' → {score}/10 ({response_time:.2f}s)") | |
| results.append({ | |
| 'command': command, | |
| 'score': score, | |
| 'time': response_time, | |
| 'response': response_text[:50] | |
| }) | |
| avg_score = sum(r['score'] for r in results) / len(results) | |
| avg_time = sum(r['time'] for r in results) / len(results) | |
| print(f"📊 {model_name}: {avg_score:.1f}/10 moyen | {avg_time:.2f}s moyen") | |
| return { | |
| 'name': model_name, | |
| 'avg_score': avg_score, | |
| 'avg_time': avg_time, | |
| 'results': results | |
| } | |

    except Exception as e:
        print(f"❌ Error with {model_name}: {e}")
        return None


def main():
    """Comparative practical test."""
    print("🎯 PRACTICAL MCP TEST - REAL RTS COMMANDS")
    print("=" * 50)
    print("Simulating real-world usage with typical commands")
    print("=" * 50)

    models = [
        {'name': 'Qwen2.5-0.5B', 'path': 'qwen2.5-0.5b-instruct-q4_0.gguf'},
        {'name': 'Qwen3-0.6B', 'path': 'Qwen3-0.6B-Q8_0.gguf'},
        {'name': 'Gemma-3-270M', 'path': 'gemma-3-270m-it-qat-Q8_0.gguf'}
    ]
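
    # Only models whose GGUF files are present in the working directory are tested;
    # missing files are skipped silently.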
    results = []
    for model in models:
        if os.path.exists(model['path']):
            result = test_model_practical(model['path'], model['name'])
            if result:
                results.append(result)
            print()

    # Practical analysis
    if results:
        print("📊 PRACTICAL RESULTS:")
        print("-" * 30)

        sorted_results = sorted(results, key=lambda x: x['avg_score'], reverse=True)
        for i, result in enumerate(sorted_results, 1):
            print(f"{i}. {result['name']}: {result['avg_score']:.1f}/10")

        # Practical recommendation
        best = sorted_results[0]
        print("\n🎯 PRACTICAL RECOMMENDATION:")
        if best['avg_score'] >= 7:
            print(f"✅ {best['name']} is EXCELLENT for production")
            print("   → Handles typical RTS commands well")
        elif best['avg_score'] >= 5:
            print(f"👍 {best['name']} is USABLE with validation")
            print("   → Works for simple commands")
        else:
            print(f"⚠️ {best['name']} needs improvement")
            print("   → Consider more specific prompts")

        # Analysis of the best-handled commands
        print("\n📈 BEST-HANDLED COMMANDS:")
        for result in results:
            best_commands = [r for r in result['results'] if r['score'] >= 7]
            if best_commands:
                print(f"\n{result['name']}:")
                for cmd in best_commands[:3]:  # Top 3
                    print(f"  • {cmd['command']}: {cmd['score']}/10")

        # Conclusion on the relevance of the test
        print("\n🔍 TEST RELEVANCE:")
        print("✅ This test is MUCH more representative:")
        print("  • Real user commands")
        print("  • Expected JSON output format")
        print("  • Realistic response times")
        print("  • No artificially complex prompts")

        # Save results
        with open("practical_mcp_results.json", "w") as f:
            json.dump(results, f, indent=2)
        print("\n📄 Results saved to: practical_mcp_results.json")


if __name__ == "__main__":
    main()