[
  { "model": "Qwen3-8B", "mode": "sampling (Temp=0.6)", "open-source": true, "total accuracy": 12.37, "Zebra": 26.92, "Sudoku": 2.33, "Skyscraper": 0.29, "Kakurasu": 36.00, "Crypto": 14.93, "Minesweeper": 0.25, "Navigation": 67.88, "Binario": 7.83, "Hanoi": 9.0, "Hitori": 16.83, "temperature": 0.6, "n_sampling": 4, "n": 50 },
  { "model": "Qwen3-30B-A3B-Thinking-2507", "mode": "sampling (Temp=0.6)", "open-source": true, "total accuracy": 37.33, "Zebra": 64.25, "Sudoku": 24.72, "Skyscraper": 1.25, "Kakurasu": 87.88, "Crypto": 55.86, "Minesweeper": 39.63, "Navigation": 93.75, "Binario": 19.17, "Hanoi": 28.38, "Hitori": 40.00, "temperature": 0.6, "n_sampling": 4, "n": 50 },
  { "model": "Qwen3-32B", "mode": "sampling (Temp=0.6)", "open-source": true, "total accuracy": 20.97, "Zebra": 44.00, "Sudoku": 6.89, "Skyscraper": 1.63, "Kakurasu": 51.13, "Crypto": 43.57, "Minesweeper": 0.54, "Navigation": 83.88, "Binario": 13.83, "Hanoi": 18.96, "Hitori": 21.42, "temperature": 0.6, "n_sampling": 4, "n": 50 },
  { "model": "Qwen3-Next-80B-A3B-Thinking", "mode": "sampling (Temp=0.6)", "open-source": true, "total accuracy": 36.35, "Zebra": 71.0, "Sudoku": 21.78, "Skyscraper": 4.25, "Kakurasu": 83.13, "Crypto": 57.71, "Minesweeper": 27.67, "Navigation": 95.0, "Binario": 27.67, "Hanoi": 27.38, "Hitori": 36.5, "temperature": 0.6, "n_sampling": 4, "n": 50 },
  { "model": "Qwen3-235B-A22B-Thinking-2507", "mode": "sampling (Temp=0.6)", "open-source": true, "total accuracy": 43.33, "Zebra": 61.0, "Sudoku": 28.94, "Skyscraper": 4.21, "Kakurasu": 89.5, "Crypto": 75.79, "Minesweeper": 30.17, "Navigation": 97.25, "Binario": 35.0, "Hanoi": 40.83, "Hitori": 40.5, "temperature": 0.6, "n_sampling": 4, "n": 50 },
  { "model": "MiniMax-M1-40k", "mode": "sampling (Temp=0.6)", "open-source": true, "total accuracy": 6.44, "Zebra": 16.08, "Sudoku": 0.0, "Skyscraper": 0.13, "Kakurasu": 25.5, "Crypto": 1.5, "Minesweeper": 0.17, "Navigation": 9.5, "Binario": 4.58, "Hanoi": 13.75, "Hitori": 9.92, "temperature": 0.6, "n_sampling": 4, "n": 50 },
  { "model": "DeepSeek-R1-0528-Qwen3-8B", "mode": "sampling (Temp=0.6)", "open-source": true, "total accuracy": 13.83, "Zebra": 39.33, "Sudoku": 0.28, "Skyscraper": 0.04, "Kakurasu": 39.38, "Crypto": 12.00, "Minesweeper": 1.75, "Navigation": 69.88, "Binario": 6.25, "Hanoi": 17.71, "Hitori": 8.00, "temperature": 0.6, "n_sampling": 4, "n": 50 },
  { "model": "DeepSeek-V3.1", "mode": "sampling (Temp=0.6)", "open-source": true, "total accuracy": 41.43, "Zebra": 62.67, "Sudoku": 18.61, "Skyscraper": 1.38, "Kakurasu": 92.0, "Crypto": 75.64, "Minesweeper": 35.17, "Navigation": 92.75, "Binario": 23.42, "Hanoi": 46.63, "Hitori": 45.75, "temperature": 0.6, "n_sampling": 4, "n": 50 },
  { "model": "DeepSeek-R1-0528", "mode": "sampling (Temp=0.6)", "open-source": true, "total accuracy": 41.37, "Zebra": 59.08, "Sudoku": 19.39, "Skyscraper": 1.25, "Kakurasu": 89.75, "Crypto": 80.93, "Minesweeper": 36.38, "Navigation": 97.0, "Binario": 35.83, "Hanoi": 43.58, "Hitori": 28.42, "temperature": 0.6, "n_sampling": 4, "n": 50 },
  { "model": "GLM-4.5", "mode": "sampling (Temp=0.6)", "open-source": true, "total accuracy": 21.67, "Zebra": 29.58, "Sudoku": 4.56, "Skyscraper": 1.92, "Kakurasu": 44.25, "Crypto": 24.14, "Minesweeper": 9.46, "Navigation": 93.63, "Binario": 16.92, "Hanoi": 31.17, "Hitori": 23.25, "temperature": 0.6, "n_sampling": 4, "n": 50 },
  { "model": "Kimi-K2-Instruct", "mode": "sampling (Temp=0.6)", "open-source": true, "total accuracy": 15.18, "Zebra": 19.42, "Sudoku": 1.89, "Skyscraper": 0.08, "Kakurasu": 50.75, "Crypto": 20.21, "Minesweeper": 7.00, "Navigation": 63.13, "Binario": 7.58, "Hanoi": 21.08, "Hitori": 11.67, "temperature": 0.6, "n_sampling": 4, "n": 50 },
  { "model": "Seed-OSS-36B-Instruct", "mode": "sampling (Temp=0.6)", "open-source": true, "total accuracy": 38.96, "Zebra": 53.0, "Sudoku": 24.17, "Skyscraper": 4.71, "Kakurasu": 91.38, "Crypto": 52.43, "Minesweeper": 25.25, "Navigation": 96.5, "Binario": 31.67, "Hanoi": 45.17, "Hitori": 48.92, "temperature": 0.6, "n_sampling": 4, "n": 50 },
  { "model": "gpt-oss-120b", "mode": "sampling (Temp=0.6)", "open-source": true, "total accuracy": 51.97, "Zebra": 56.67, "Sudoku": 58.22, "Skyscraper": 9.04, "Kakurasu": 88.5, "Crypto": 79.71, "Minesweeper": 60.79, "Navigation": 95.88, "Binario": 42.67, "Hanoi": 36.13, "Hitori": 61.08, "temperature": 0.6, "n_sampling": 4, "n": 50 },
  { "model": "gpt-5", "mode": "sampling (Temp=0.6)", "open-source": false, "total accuracy": 69.10, "Zebra": 76.67, "Sudoku": 60.56, "Skyscraper": 22.92, "Kakurasu": 100.0, "Crypto": 77.86, "Minesweeper": 88.75, "Navigation": 98.75, "Binario": 85.0, "Hanoi": 65.83, "Hitori": 67.5, "temperature": 0.6, "n_sampling": 4, "n": 5 },
  { "model": "gpt-5-mini", "mode": "sampling (Temp=0.6)", "open-source": false, "total accuracy": 54.49, "Zebra": 67.5, "Sudoku": 49.44, "Skyscraper": 15.0, "Kakurasu": 90.0, "Crypto": 92.86, "Minesweeper": 52.50, "Navigation": 100.0, "Binario": 47.5, "Hanoi": 47.92, "Hitori": 53.33, "temperature": 0.6, "n_sampling": 4, "n": 5 },
  { "model": "o4-mini", "mode": "sampling (Temp=0.6)", "open-source": false, "total accuracy": 50.13, "Zebra": 71.67, "Sudoku": 48.89, "Skyscraper": 8.75, "Kakurasu": 87.5, "Crypto": 81.43, "Minesweeper": 48.75, "Navigation": 98.75, "Binario": 49.17, "Hanoi": 36.25, "Hitori": 50.83, "temperature": 0.6, "n_sampling": 4, "n": 5 },
  { "model": "grok-4", "mode": "sampling (Temp=0.6)", "open-source": false, "total accuracy": 59.55, "Zebra": 87.5, "Sudoku": 35.56, "Skyscraper": 14.17, "Kakurasu": 98.75, "Crypto": 83.57, "Minesweeper": 50.42, "Navigation": 100.0, "Binario": 65.0, "Hanoi": 67.92, "Hitori": 73.33, "temperature": 0.6, "n_sampling": 4, "n": 5 },
  { "model": "gemini-2.5-pro", "mode": "sampling (Temp=0.6)", "open-source": false, "total accuracy": 40.58, "Zebra": 47.5, "Sudoku": 12.22, "Skyscraper": 10.0, "Kakurasu": 90.0, "Crypto": 50.71, "Minesweeper": 37.50, "Navigation": 100.0, "Binario": 42.5, "Hanoi": 46.67, "Hitori": 45.0, "temperature": 0.6, "n_sampling": 4, "n": 5 },
  { "model": "grok-3-mini", "mode": "sampling (Temp=0.6)", "open-source": false, "total accuracy": 42.56, "Zebra": 74.17, "Sudoku": 10.0, "Skyscraper": 0.42, "Kakurasu": 96.25, "Crypto": 59.29, "Minesweeper": 37.08, "Navigation": 97.5, "Binario": 40.83, "Hanoi": 45.0, "Hitori": 60.0, "temperature": 0.6, "n_sampling": 4, "n": 5 },
  { "model": "claude-sonnet-4-thinking", "mode": "sampling (Temp=0.6)", "open-source": false, "total accuracy": 30.51, "Zebra": 30.83, "Sudoku": 19.44, "Skyscraper": 1.67, "Kakurasu": 88.75, "Crypto": 54.29, "Minesweeper": 15.83, "Navigation": 93.75, "Binario": 24.17, "Hanoi": 26.25, "Hitori": 40.0, "temperature": 0.6, "n_sampling": 4, "n": 5 },
  { "model": "gemini-2.5-flash", "mode": "sampling (Temp=0.6)", "open-source": false, "total accuracy": 19.49, "Zebra": 20.0, "Sudoku": 0.56, "Skyscraper": 2.08, "Kakurasu": 43.75, "Crypto": 17.14, "Minesweeper": 12.92, "Navigation": 97.5, "Binario": 29.17, "Hanoi": 18.33, "Hitori": 22.5, "temperature": 0.6, "n_sampling": 4, "n": 5 }
]