Spaces:
Running
Running
| [ | |
| { | |
| "model": "Qwen3-8B", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": true, | |
| "total accuracy": 12.37, | |
| "Zebra": 26.92, | |
| "Sudoku": 2.33, | |
| "Skyscraper": 0.29, | |
| "Kakurasu": 36.00, | |
| "Crypto": 14.93, | |
| "Minesweeper": 0.25, | |
| "Navigation": 67.88, | |
| "Binario": 7.83, | |
| "Hanoi": 9.0, | |
| "Hitori": 16.83, | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 50 | |
| }, | |
| { | |
| "model": "Qwen3-30B-A3B-Thinking-2507", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": true, | |
| "total accuracy": 37.33, | |
| "Zebra": 64.25, | |
| "Sudoku": 24.72, | |
| "Skyscraper": 1.25, | |
| "Kakurasu": 87.88, | |
| "Crypto": 55.86, | |
| "Minesweeper": 39.63, | |
| "Navigation": 93.75, | |
| "Binario": 19.17, | |
| "Hanoi": 28.38, | |
| "Hitori": 40.00, | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 50 | |
| }, | |
| { | |
| "model": "Qwen3-32B", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": true, | |
| "total accuracy": 20.97, | |
| "Zebra": 44.00, | |
| "Sudoku": 6.89, | |
| "Skyscraper": 1.63, | |
| "Kakurasu": "51.13", | |
| "Crypto": "43.57", | |
| "Minesweeper": "0.54", | |
| "Navigation": "83.88", | |
| "Binario": "13.83", | |
| "Hanoi": "18.96", | |
| "Hitori": "21.42", | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 50 | |
| }, | |
| { | |
| "model": "Qwen3-Next-80B-A3B-Thinking", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": true, | |
| "total accuracy": 36.35, | |
| "Zebra": "71.0", | |
| "Sudoku": "21.78", | |
| "Skyscraper": "4.25", | |
| "Kakurasu": "83.13", | |
| "Crypto": "57.71", | |
| "Minesweeper": "27.67", | |
| "Navigation": "95.0", | |
| "Binario": "27.67", | |
| "Hanoi": "27.38", | |
| "Hitori": "36.5", | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 50 | |
| }, | |
| { | |
| "model": "Qwen3-235B-A22B-Thinking-2507", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": true, | |
| "total accuracy": 43.33, | |
| "Zebra": "61.0", | |
| "Sudoku": "28.94", | |
| "Skyscraper": "4.21", | |
| "Kakurasu": "89.5", | |
| "Crypto": "75.79", | |
| "Minesweeper": "30.17", | |
| "Navigation": "97.25", | |
| "Binario": "35.0", | |
| "Hanoi": "40.83", | |
| "Hitori": "40.5", | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 50 | |
| }, | |
| { | |
| "model": "MiniMax-M1-40k", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": true, | |
| "total accuracy": 6.44, | |
| "Zebra": "16.08", | |
| "Sudoku": "0.0", | |
| "Skyscraper": "0.13", | |
| "Kakurasu": "25.5", | |
| "Crypto": "1.5", | |
| "Minesweeper": "0.17", | |
| "Navigation": "9.5", | |
| "Binario": "4.58", | |
| "Hanoi": "13.75", | |
| "Hitori": "9.92", | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 50 | |
| }, | |
| { | |
| "model": "DeepSeek-R1-0528-Qwen3-8B", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": true, | |
| "total accuracy": 13.83, | |
| "Zebra": "39.33", | |
| "Sudoku": "0.28", | |
| "Skyscraper": "0.04", | |
| "Kakurasu": "39.38", | |
| "Crypto": "12.00", | |
| "Minesweeper": "1.75", | |
| "Navigation": "69.88", | |
| "Binario": "6.25", | |
| "Hanoi": "17.71", | |
| "Hitori": "8.00", | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 50 | |
| }, | |
| { | |
| "model": "DeepSeek-V3.1", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": true, | |
| "total accuracy": 41.43, | |
| "Zebra": "62.67", | |
| "Sudoku": "18.61", | |
| "Skyscraper": "1.38", | |
| "Kakurasu": "92.0", | |
| "Crypto": "75.64", | |
| "Minesweeper": "35.17", | |
| "Navigation": "92.75", | |
| "Binario": "23.42", | |
| "Hanoi": "46.63", | |
| "Hitori": "45.75", | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 50 | |
| }, | |
| { | |
| "model": "DeepSeek-R1-0528", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": true, | |
| "total accuracy": 41.37, | |
| "Zebra": "59.08", | |
| "Sudoku": "19.39", | |
| "Skyscraper": "1.25", | |
| "Kakurasu": "89.75", | |
| "Crypto": "80.93", | |
| "Minesweeper": "36.38", | |
| "Navigation": "97.0", | |
| "Binario": "35.83", | |
| "Hanoi": "43.58", | |
| "Hitori": "28.42", | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 50 | |
| }, | |
| { | |
| "model": "GLM-4.5", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": true, | |
| "total accuracy": 21.67, | |
| "Zebra": "29.58", | |
| "Sudoku": "4.56", | |
| "Skyscraper": "1.92", | |
| "Kakurasu": "44.25", | |
| "Crypto": "24.14", | |
| "Minesweeper": "9.46", | |
| "Navigation": "93.63", | |
| "Binario": "16.92", | |
| "Hanoi": "31.17", | |
| "Hitori": "23.25", | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 50 | |
| }, | |
| { | |
| "model": "Kimi-K2-Instruct", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": true, | |
| "total accuracy": 15.18, | |
| "Zebra": "19.42", | |
| "Sudoku": "1.89", | |
| "Skyscraper": "0.08", | |
| "Kakurasu": "50.75", | |
| "Crypto": "20.21", | |
| "Minesweeper": "7.00", | |
| "Navigation": "63.13", | |
| "Binario": "7.58", | |
| "Hanoi": "21.08", | |
| "Hitori": "11.67", | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 50 | |
| }, | |
| { | |
| "model": "Seed-OSS-36B-Instruct", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": true, | |
| "total accuracy": 38.96, | |
| "Zebra": "53.0", | |
| "Sudoku": "24.17", | |
| "Skyscraper": "4.71", | |
| "Kakurasu": "91.38", | |
| "Crypto": "52.43", | |
| "Minesweeper": "25.25", | |
| "Navigation": "96.5", | |
| "Binario": "31.67", | |
| "Hanoi": "45.17", | |
| "Hitori": "48.92", | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 50 | |
| }, | |
| { | |
| "model": "gpt-oss-120b", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": true, | |
| "total accuracy": 51.97, | |
| "Zebra": "56.67", | |
| "Sudoku": "58.22", | |
| "Skyscraper": "9.04", | |
| "Kakurasu": "88.5", | |
| "Crypto": "79.71", | |
| "Minesweeper": "60.79", | |
| "Navigation": "95.88", | |
| "Binario": "42.67", | |
| "Hanoi": "36.13", | |
| "Hitori": "61.08", | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 50 | |
| }, | |
| { | |
| "model": "gpt-5", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": false, | |
| "total accuracy": 69.10, | |
| "Zebra": "76.67", | |
| "Sudoku": "60.56", | |
| "Skyscraper": "22.92", | |
| "Kakurasu": "100.0", | |
| "Crypto": "77.86", | |
| "Minesweeper": "88.75", | |
| "Navigation": "98.75", | |
| "Binario": "85.0", | |
| "Hanoi": "65.83", | |
| "Hitori": "67.5", | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 5 | |
| }, | |
| { | |
| "model": "gpt-5-mini", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": false, | |
| "total accuracy": 54.49, | |
| "Zebra": "67.5", | |
| "Sudoku": "49.44", | |
| "Skyscraper": "15.0", | |
| "Kakurasu": "90.0", | |
| "Crypto": "92.86", | |
| "Minesweeper": "52.50", | |
| "Navigation": "100.0", | |
| "Binario": "47.5", | |
| "Hanoi": "47.92", | |
| "Hitori": "53.33", | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 5 | |
| }, | |
| { | |
| "model": "o4-mini", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": false, | |
| "total accuracy": 50.13, | |
| "Zebra": "71.67", | |
| "Sudoku": "48.89", | |
| "Skyscraper": "8.75", | |
| "Kakurasu": "87.5", | |
| "Crypto": "81.43", | |
| "Minesweeper": "48.75", | |
| "Navigation": "98.75", | |
| "Binario": "49.17", | |
| "Hanoi": "36.25", | |
| "Hitori": "50.83", | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 5 | |
| }, | |
| { | |
| "model": "grok-4", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": false, | |
| "total accuracy": 59.55, | |
| "Zebra": "87.5", | |
| "Sudoku": "35.56", | |
| "Skyscraper": "14.17", | |
| "Kakurasu": "98.75", | |
| "Crypto": "83.57", | |
| "Minesweeper": "50.42", | |
| "Navigation": "100.0", | |
| "Binario": "65.0", | |
| "Hanoi": "67.92", | |
| "Hitori": "73.33", | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 5 | |
| }, | |
| { | |
| "model": "gemini-2.5-pro", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": false, | |
| "total accuracy": 40.58, | |
| "Zebra": "47.5", | |
| "Sudoku": "12.22", | |
| "Skyscraper": "10.0", | |
| "Kakurasu": "90.0", | |
| "Crypto": "50.71", | |
| "Minesweeper": "37.50", | |
| "Navigation": "100.0", | |
| "Binario": "42.5", | |
| "Hanoi": "46.67", | |
| "Hitori": "45.0", | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 5 | |
| }, | |
| { | |
| "model": "grok-3-mini", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": false, | |
| "total accuracy": 42.56, | |
| "Zebra": "74.17", | |
| "Sudoku": "10.0", | |
| "Skyscraper": "0.42", | |
| "Kakurasu": "96.25", | |
| "Crypto": "59.29", | |
| "Minesweeper": "37.08", | |
| "Navigation": "97.5", | |
| "Binario": "40.83", | |
| "Hanoi": "45.0", | |
| "Hitori": "60.0", | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 5 | |
| }, | |
| { | |
| "model": "claude-sonnet-4-thinking", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": false, | |
| "total accuracy": 30.51, | |
| "Zebra": "30.83", | |
| "Sudoku": "19.44", | |
| "Skyscraper": "1.67", | |
| "Kakurasu": "88.75", | |
| "Crypto": "54.29", | |
| "Minesweeper": "15.83", | |
| "Navigation": "93.75", | |
| "Binario": "24.17", | |
| "Hanoi": "26.25", | |
| "Hitori": "40.0", | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 5 | |
| }, | |
| { | |
| "model": "gemini-2.5-flash", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": false, | |
| "total accuracy": 19.49, | |
| "Zebra": "20.0", | |
| "Sudoku": "0.56", | |
| "Skyscraper": "2.08", | |
| "Kakurasu": "43.75", | |
| "Crypto": "17.14", | |
| "Minesweeper": "12.92", | |
| "Navigation": "97.5", | |
| "Binario": "29.17", | |
| "Hanoi": "18.33", | |
| "Hitori": "22.5", | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 5 | |
| } | |
| ] |