HardcoreLogic / hardcorelogic.puzzle.json
JunsWan's picture
Upload 2 files
e738640 verified
[
{
"model": "Qwen3-8B",
"mode": "sampling (Temp=0.6)",
"open-source": true,
"total accuracy": 12.37,
"Zebra": 26.92,
"Sudoku": 2.33,
"Skyscraper": 0.29,
"Kakurasu": 36.00,
"Crypto": 14.93,
"Minesweeper": 0.25,
"Navigation": 67.88,
"Binario": 7.83,
"Hanoi": 9.0,
"Hitori": 16.83,
"temperature": 0.6,
"n_sampling": 4,
"n": 50
},
{
"model": "Qwen3-30B-A3B-Thinking-2507",
"mode": "sampling (Temp=0.6)",
"open-source": true,
"total accuracy": 37.33,
"Zebra": 64.25,
"Sudoku": 24.72,
"Skyscraper": 1.25,
"Kakurasu": 87.88,
"Crypto": 55.86,
"Minesweeper": 39.63,
"Navigation": 93.75,
"Binario": 19.17,
"Hanoi": 28.38,
"Hitori": 40.00,
"temperature": 0.6,
"n_sampling": 4,
"n": 50
},
{
"model": "Qwen3-32B",
"mode": "sampling (Temp=0.6)",
"open-source": true,
"total accuracy": 20.97,
"Zebra": 44.00,
"Sudoku": 6.89,
"Skyscraper": 1.63,
"Kakurasu": 51.13,
"Crypto": 43.57,
"Minesweeper": 0.54,
"Navigation": 83.88,
"Binario": 13.83,
"Hanoi": 18.96,
"Hitori": 21.42,
"temperature": 0.6,
"n_sampling": 4,
"n": 50
},
{
"model": "Qwen3-Next-80B-A3B-Thinking",
"mode": "sampling (Temp=0.6)",
"open-source": true,
"total accuracy": 36.35,
"Zebra": 71.0,
"Sudoku": 21.78,
"Skyscraper": 4.25,
"Kakurasu": 83.13,
"Crypto": 57.71,
"Minesweeper": 27.67,
"Navigation": 95.0,
"Binario": 27.67,
"Hanoi": 27.38,
"Hitori": 36.5,
"temperature": 0.6,
"n_sampling": 4,
"n": 50
},
{
"model": "Qwen3-235B-A22B-Thinking-2507",
"mode": "sampling (Temp=0.6)",
"open-source": true,
"total accuracy": 43.33,
"Zebra": 61.0,
"Sudoku": 28.94,
"Skyscraper": 4.21,
"Kakurasu": 89.5,
"Crypto": 75.79,
"Minesweeper": 30.17,
"Navigation": 97.25,
"Binario": 35.0,
"Hanoi": 40.83,
"Hitori": 40.5,
"temperature": 0.6,
"n_sampling": 4,
"n": 50
},
{
"model": "MiniMax-M1-40k",
"mode": "sampling (Temp=0.6)",
"open-source": true,
"total accuracy": 6.44,
"Zebra": 16.08,
"Sudoku": 0.0,
"Skyscraper": 0.13,
"Kakurasu": 25.5,
"Crypto": 1.5,
"Minesweeper": 0.17,
"Navigation": 9.5,
"Binario": 4.58,
"Hanoi": 13.75,
"Hitori": 9.92,
"temperature": 0.6,
"n_sampling": 4,
"n": 50
},
{
"model": "DeepSeek-R1-0528-Qwen3-8B",
"mode": "sampling (Temp=0.6)",
"open-source": true,
"total accuracy": 13.83,
"Zebra": 39.33,
"Sudoku": 0.28,
"Skyscraper": 0.04,
"Kakurasu": 39.38,
"Crypto": 12.00,
"Minesweeper": 1.75,
"Navigation": 69.88,
"Binario": 6.25,
"Hanoi": 17.71,
"Hitori": 8.00,
"temperature": 0.6,
"n_sampling": 4,
"n": 50
},
{
"model": "DeepSeek-V3.1",
"mode": "sampling (Temp=0.6)",
"open-source": true,
"total accuracy": 41.43,
"Zebra": 62.67,
"Sudoku": 18.61,
"Skyscraper": 1.38,
"Kakurasu": 92.0,
"Crypto": 75.64,
"Minesweeper": 35.17,
"Navigation": 92.75,
"Binario": 23.42,
"Hanoi": 46.63,
"Hitori": 45.75,
"temperature": 0.6,
"n_sampling": 4,
"n": 50
},
{
"model": "DeepSeek-R1-0528",
"mode": "sampling (Temp=0.6)",
"open-source": true,
"total accuracy": 41.37,
"Zebra": 59.08,
"Sudoku": 19.39,
"Skyscraper": 1.25,
"Kakurasu": 89.75,
"Crypto": 80.93,
"Minesweeper": 36.38,
"Navigation": 97.0,
"Binario": 35.83,
"Hanoi": 43.58,
"Hitori": 28.42,
"temperature": 0.6,
"n_sampling": 4,
"n": 50
},
{
"model": "GLM-4.5",
"mode": "sampling (Temp=0.6)",
"open-source": true,
"total accuracy": 21.67,
"Zebra": 29.58,
"Sudoku": 4.56,
"Skyscraper": 1.92,
"Kakurasu": 44.25,
"Crypto": 24.14,
"Minesweeper": 9.46,
"Navigation": 93.63,
"Binario": 16.92,
"Hanoi": 31.17,
"Hitori": 23.25,
"temperature": 0.6,
"n_sampling": 4,
"n": 50
},
{
"model": "Kimi-K2-Instruct",
"mode": "sampling (Temp=0.6)",
"open-source": true,
"total accuracy": 15.18,
"Zebra": 19.42,
"Sudoku": 1.89,
"Skyscraper": 0.08,
"Kakurasu": 50.75,
"Crypto": 20.21,
"Minesweeper": 7.00,
"Navigation": 63.13,
"Binario": 7.58,
"Hanoi": 21.08,
"Hitori": 11.67,
"temperature": 0.6,
"n_sampling": 4,
"n": 50
},
{
"model": "Seed-OSS-36B-Instruct",
"mode": "sampling (Temp=0.6)",
"open-source": true,
"total accuracy": 38.96,
"Zebra": 53.0,
"Sudoku": 24.17,
"Skyscraper": 4.71,
"Kakurasu": 91.38,
"Crypto": 52.43,
"Minesweeper": 25.25,
"Navigation": 96.5,
"Binario": 31.67,
"Hanoi": 45.17,
"Hitori": 48.92,
"temperature": 0.6,
"n_sampling": 4,
"n": 50
},
{
"model": "gpt-oss-120b",
"mode": "sampling (Temp=0.6)",
"open-source": true,
"total accuracy": 51.97,
"Zebra": 56.67,
"Sudoku": 58.22,
"Skyscraper": 9.04,
"Kakurasu": 88.5,
"Crypto": 79.71,
"Minesweeper": 60.79,
"Navigation": 95.88,
"Binario": 42.67,
"Hanoi": 36.13,
"Hitori": 61.08,
"temperature": 0.6,
"n_sampling": 4,
"n": 50
},
{
"model": "gpt-5",
"mode": "sampling (Temp=0.6)",
"open-source": false,
"total accuracy": 69.10,
"Zebra": 76.67,
"Sudoku": 60.56,
"Skyscraper": 22.92,
"Kakurasu": 100.0,
"Crypto": 77.86,
"Minesweeper": 88.75,
"Navigation": 98.75,
"Binario": 85.0,
"Hanoi": 65.83,
"Hitori": 67.5,
"temperature": 0.6,
"n_sampling": 4,
"n": 5
},
{
"model": "gpt-5-mini",
"mode": "sampling (Temp=0.6)",
"open-source": false,
"total accuracy": 54.49,
"Zebra": 67.5,
"Sudoku": 49.44,
"Skyscraper": 15.0,
"Kakurasu": 90.0,
"Crypto": 92.86,
"Minesweeper": 52.50,
"Navigation": 100.0,
"Binario": 47.5,
"Hanoi": 47.92,
"Hitori": 53.33,
"temperature": 0.6,
"n_sampling": 4,
"n": 5
},
{
"model": "o4-mini",
"mode": "sampling (Temp=0.6)",
"open-source": false,
"total accuracy": 50.13,
"Zebra": 71.67,
"Sudoku": 48.89,
"Skyscraper": 8.75,
"Kakurasu": 87.5,
"Crypto": 81.43,
"Minesweeper": 48.75,
"Navigation": 98.75,
"Binario": 49.17,
"Hanoi": 36.25,
"Hitori": 50.83,
"temperature": 0.6,
"n_sampling": 4,
"n": 5
},
{
"model": "grok-4",
"mode": "sampling (Temp=0.6)",
"open-source": false,
"total accuracy": 59.55,
"Zebra": 87.5,
"Sudoku": 35.56,
"Skyscraper": 14.17,
"Kakurasu": 98.75,
"Crypto": 83.57,
"Minesweeper": 50.42,
"Navigation": 100.0,
"Binario": 65.0,
"Hanoi": 67.92,
"Hitori": 73.33,
"temperature": 0.6,
"n_sampling": 4,
"n": 5
},
{
"model": "gemini-2.5-pro",
"mode": "sampling (Temp=0.6)",
"open-source": false,
"total accuracy": 40.58,
"Zebra": 47.5,
"Sudoku": 12.22,
"Skyscraper": 10.0,
"Kakurasu": 90.0,
"Crypto": 50.71,
"Minesweeper": 37.50,
"Navigation": 100.0,
"Binario": 42.5,
"Hanoi": 46.67,
"Hitori": 45.0,
"temperature": 0.6,
"n_sampling": 4,
"n": 5
},
{
"model": "grok-3-mini",
"mode": "sampling (Temp=0.6)",
"open-source": false,
"total accuracy": 42.56,
"Zebra": 74.17,
"Sudoku": 10.0,
"Skyscraper": 0.42,
"Kakurasu": 96.25,
"Crypto": 59.29,
"Minesweeper": 37.08,
"Navigation": 97.5,
"Binario": 40.83,
"Hanoi": 45.0,
"Hitori": 60.0,
"temperature": 0.6,
"n_sampling": 4,
"n": 5
},
{
"model": "claude-sonnet-4-thinking",
"mode": "sampling (Temp=0.6)",
"open-source": false,
"total accuracy": 30.51,
"Zebra": 30.83,
"Sudoku": 19.44,
"Skyscraper": 1.67,
"Kakurasu": 88.75,
"Crypto": 54.29,
"Minesweeper": 15.83,
"Navigation": 93.75,
"Binario": 24.17,
"Hanoi": 26.25,
"Hitori": 40.0,
"temperature": 0.6,
"n_sampling": 4,
"n": 5
},
{
"model": "gemini-2.5-flash",
"mode": "sampling (Temp=0.6)",
"open-source": false,
"total accuracy": 19.49,
"Zebra": 20.0,
"Sudoku": 0.56,
"Skyscraper": 2.08,
"Kakurasu": 43.75,
"Crypto": 17.14,
"Minesweeper": 12.92,
"Navigation": 97.5,
"Binario": 29.17,
"Hanoi": 18.33,
"Hitori": 22.5,
"temperature": 0.6,
"n_sampling": 4,
"n": 5
}
]