JunsWan committed on
Commit
e738640
·
verified ·
1 Parent(s): 8ec6aa5

Upload 2 files

Browse files
hardcorelogic.puzzle.json ADDED
@@ -0,0 +1,401 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "model": "Qwen3-8B",
4
+ "mode": "sampling (Temp=0.6)",
5
+ "open-source": true,
6
+ "total accuracy": 12.37,
7
+ "Zebra": 26.92,
8
+ "Sudoku": 2.33,
9
+ "Skyscraper": 0.29,
10
+ "Kakurasu": 36.00,
11
+ "Crypto": 14.93,
12
+ "Minesweeper": 0.25,
13
+ "Navigation": 67.88,
14
+ "Binario": 7.83,
15
+ "Hanoi": 9.0,
16
+ "Hitori": 16.83,
17
+ "temperature": 0.6,
18
+ "n_sampling": 4,
19
+ "n": 50
20
+ },
21
+ {
22
+ "model": "Qwen3-30B-A3B-Thinking-2507",
23
+ "mode": "sampling (Temp=0.6)",
24
+ "open-source": true,
25
+ "total accuracy": 37.33,
26
+ "Zebra": 64.25,
27
+ "Sudoku": 24.72,
28
+ "Skyscraper": 1.25,
29
+ "Kakurasu": 87.88,
30
+ "Crypto": 55.86,
31
+ "Minesweeper": 39.63,
32
+ "Navigation": 93.75,
33
+ "Binario": 19.17,
34
+ "Hanoi": 28.38,
35
+ "Hitori": 40.00,
36
+ "temperature": 0.6,
37
+ "n_sampling": 4,
38
+ "n": 50
39
+ },
40
+ {
41
+ "model": "Qwen3-32B",
42
+ "mode": "sampling (Temp=0.6)",
43
+ "open-source": true,
44
+ "total accuracy": 20.97,
45
+ "Zebra": 44.00,
46
+ "Sudoku": 6.89,
47
+ "Skyscraper": 1.63,
48
+ "Kakurasu": 51.13,
49
+ "Crypto": 43.57,
50
+ "Minesweeper": 0.54,
51
+ "Navigation": 83.88,
52
+ "Binario": 13.83,
53
+ "Hanoi": 18.96,
54
+ "Hitori": 21.42,
55
+ "temperature": 0.6,
56
+ "n_sampling": 4,
57
+ "n": 50
58
+ },
59
+ {
60
+ "model": "Qwen3-Next-80B-A3B-Thinking",
61
+ "mode": "sampling (Temp=0.6)",
62
+ "open-source": true,
63
+ "total accuracy": 36.35,
64
+ "Zebra": "71.0",
65
+ "Sudoku": "21.78",
66
+ "Skyscraper": "4.25",
67
+ "Kakurasu": "83.13",
68
+ "Crypto": "57.71",
69
+ "Minesweeper": "27.67",
70
+ "Navigation": "95.0",
71
+ "Binario": "27.67",
72
+ "Hanoi": "27.38",
73
+ "Hitori": "36.5",
74
+ "temperature": 0.6,
75
+ "n_sampling": 4,
76
+ "n": 50
77
+ },
78
+ {
79
+ "model": "Qwen3-235B-A22B-Thinking-2507",
80
+ "mode": "sampling (Temp=0.6)",
81
+ "open-source": true,
82
+ "total accuracy": 43.33,
83
+ "Zebra": "61.0",
84
+ "Sudoku": "28.94",
85
+ "Skyscraper": "4.21",
86
+ "Kakurasu": "89.5",
87
+ "Crypto": "75.79",
88
+ "Minesweeper": "30.17",
89
+ "Navigation": "97.25",
90
+ "Binario": "35.0",
91
+ "Hanoi": "40.83",
92
+ "Hitori": "40.5",
93
+ "temperature": 0.6,
94
+ "n_sampling": 4,
95
+ "n": 50
96
+ },
97
+ {
98
+ "model": "MiniMax-M1-40k",
99
+ "mode": "sampling (Temp=0.6)",
100
+ "open-source": true,
101
+ "total accuracy": 6.44,
102
+ "Zebra": "16.08",
103
+ "Sudoku": "0.0",
104
+ "Skyscraper": "0.13",
105
+ "Kakurasu": "25.5",
106
+ "Crypto": "1.5",
107
+ "Minesweeper": "0.17",
108
+ "Navigation": "9.5",
109
+ "Binario": "4.58",
110
+ "Hanoi": "13.75",
111
+ "Hitori": "9.92",
112
+ "temperature": 0.6,
113
+ "n_sampling": 4,
114
+ "n": 50
115
+ },
116
+ {
117
+ "model": "DeepSeek-R1-0528-Qwen3-8B",
118
+ "mode": "sampling (Temp=0.6)",
119
+ "open-source": true,
120
+ "total accuracy": 13.83,
121
+ "Zebra": "39.33",
122
+ "Sudoku": "0.28",
123
+ "Skyscraper": "0.04",
124
+ "Kakurasu": "39.38",
125
+ "Crypto": "12.00",
126
+ "Minesweeper": "1.75",
127
+ "Navigation": "69.88",
128
+ "Binario": "6.25",
129
+ "Hanoi": "17.71",
130
+ "Hitori": "8.00",
131
+ "temperature": 0.6,
132
+ "n_sampling": 4,
133
+ "n": 50
134
+ },
135
+ {
136
+ "model": "DeepSeek-V3.1",
137
+ "mode": "sampling (Temp=0.6)",
138
+ "open-source": true,
139
+ "total accuracy": 41.43,
140
+ "Zebra": "62.67",
141
+ "Sudoku": "18.61",
142
+ "Skyscraper": "1.38",
143
+ "Kakurasu": "92.0",
144
+ "Crypto": "75.64",
145
+ "Minesweeper": "35.17",
146
+ "Navigation": "92.75",
147
+ "Binario": "23.42",
148
+ "Hanoi": "46.63",
149
+ "Hitori": "45.75",
150
+ "temperature": 0.6,
151
+ "n_sampling": 4,
152
+ "n": 50
153
+ },
154
+ {
155
+ "model": "DeepSeek-R1-0528",
156
+ "mode": "sampling (Temp=0.6)",
157
+ "open-source": true,
158
+ "total accuracy": 41.37,
159
+ "Zebra": "59.08",
160
+ "Sudoku": "19.39",
161
+ "Skyscraper": "1.25",
162
+ "Kakurasu": "89.75",
163
+ "Crypto": "80.93",
164
+ "Minesweeper": "36.38",
165
+ "Navigation": "97.0",
166
+ "Binario": "35.83",
167
+ "Hanoi": "43.58",
168
+ "Hitori": "28.42",
169
+ "temperature": 0.6,
170
+ "n_sampling": 4,
171
+ "n": 50
172
+ },
173
+ {
174
+ "model": "GLM-4.5",
175
+ "mode": "sampling (Temp=0.6)",
176
+ "open-source": true,
177
+ "total accuracy": 21.67,
178
+ "Zebra": "29.58",
179
+ "Sudoku": "4.56",
180
+ "Skyscraper": "1.92",
181
+ "Kakurasu": "44.25",
182
+ "Crypto": "24.14",
183
+ "Minesweeper": "9.46",
184
+ "Navigation": "93.63",
185
+ "Binario": "16.92",
186
+ "Hanoi": "31.17",
187
+ "Hitori": "23.25",
188
+ "temperature": 0.6,
189
+ "n_sampling": 4,
190
+ "n": 50
191
+ },
192
+ {
193
+ "model": "Kimi-K2-Instruct",
194
+ "mode": "sampling (Temp=0.6)",
195
+ "open-source": true,
196
+ "total accuracy": 15.18,
197
+ "Zebra": "19.42",
198
+ "Sudoku": "1.89",
199
+ "Skyscraper": "0.08",
200
+ "Kakurasu": "50.75",
201
+ "Crypto": "20.21",
202
+ "Minesweeper": "7.00",
203
+ "Navigation": "63.13",
204
+ "Binario": "7.58",
205
+ "Hanoi": "21.08",
206
+ "Hitori": "11.67",
207
+ "temperature": 0.6,
208
+ "n_sampling": 4,
209
+ "n": 50
210
+ },
211
+ {
212
+ "model": "Seed-OSS-36B-Instruct",
213
+ "mode": "sampling (Temp=0.6)",
214
+ "open-source": true,
215
+ "total accuracy": 38.96,
216
+ "Zebra": "53.0",
217
+ "Sudoku": "24.17",
218
+ "Skyscraper": "4.71",
219
+ "Kakurasu": "91.38",
220
+ "Crypto": "52.43",
221
+ "Minesweeper": "25.25",
222
+ "Navigation": "96.5",
223
+ "Binario": "31.67",
224
+ "Hanoi": "45.17",
225
+ "Hitori": "48.92",
226
+ "temperature": 0.6,
227
+ "n_sampling": 4,
228
+ "n": 50
229
+ },
230
+ {
231
+ "model": "gpt-oss-120b",
232
+ "mode": "sampling (Temp=0.6)",
233
+ "open-source": true,
234
+ "total accuracy": 51.97,
235
+ "Zebra": "56.67",
236
+ "Sudoku": "58.22",
237
+ "Skyscraper": "9.04",
238
+ "Kakurasu": "88.5",
239
+ "Crypto": "79.71",
240
+ "Minesweeper": "60.79",
241
+ "Navigation": "95.88",
242
+ "Binario": "42.67",
243
+ "Hanoi": "36.13",
244
+ "Hitori": "61.08",
245
+ "temperature": 0.6,
246
+ "n_sampling": 4,
247
+ "n": 50
248
+ },
249
+ {
250
+ "model": "gpt-5",
251
+ "mode": "sampling (Temp=0.6)",
252
+ "open-source": false,
253
+ "total accuracy": 69.10,
254
+ "Zebra": "76.67",
255
+ "Sudoku": "60.56",
256
+ "Skyscraper": "22.92",
257
+ "Kakurasu": "100.0",
258
+ "Crypto": "77.86",
259
+ "Minesweeper": "88.75",
260
+ "Navigation": "98.75",
261
+ "Binario": "85.0",
262
+ "Hanoi": "65.83",
263
+ "Hitori": "67.5",
264
+ "temperature": 0.6,
265
+ "n_sampling": 4,
266
+ "n": 5
267
+ },
268
+ {
269
+ "model": "gpt-5-mini",
270
+ "mode": "sampling (Temp=0.6)",
271
+ "open-source": false,
272
+ "total accuracy": 54.49,
273
+ "Zebra": "67.5",
274
+ "Sudoku": "49.44",
275
+ "Skyscraper": "15.0",
276
+ "Kakurasu": "90.0",
277
+ "Crypto": "92.86",
278
+ "Minesweeper": "52.50",
279
+ "Navigation": "100.0",
280
+ "Binario": "47.5",
281
+ "Hanoi": "47.92",
282
+ "Hitori": "53.33",
283
+ "temperature": 0.6,
284
+ "n_sampling": 4,
285
+ "n": 5
286
+ },
287
+ {
288
+ "model": "o4-mini",
289
+ "mode": "sampling (Temp=0.6)",
290
+ "open-source": false,
291
+ "total accuracy": 50.13,
292
+ "Zebra": "71.67",
293
+ "Sudoku": "48.89",
294
+ "Skyscraper": "8.75",
295
+ "Kakurasu": "87.5",
296
+ "Crypto": "81.43",
297
+ "Minesweeper": "48.75",
298
+ "Navigation": "98.75",
299
+ "Binario": "49.17",
300
+ "Hanoi": "36.25",
301
+ "Hitori": "50.83",
302
+ "temperature": 0.6,
303
+ "n_sampling": 4,
304
+ "n": 5
305
+ },
306
+ {
307
+ "model": "grok-4",
308
+ "mode": "sampling (Temp=0.6)",
309
+ "open-source": false,
310
+ "total accuracy": 59.55,
311
+ "Zebra": "87.5",
312
+ "Sudoku": "35.56",
313
+ "Skyscraper": "14.17",
314
+ "Kakurasu": "98.75",
315
+ "Crypto": "83.57",
316
+ "Minesweeper": "50.42",
317
+ "Navigation": "100.0",
318
+ "Binario": "65.0",
319
+ "Hanoi": "67.92",
320
+ "Hitori": "73.33",
321
+ "temperature": 0.6,
322
+ "n_sampling": 4,
323
+ "n": 5
324
+ },
325
+ {
326
+ "model": "gemini-2.5-pro",
327
+ "mode": "sampling (Temp=0.6)",
328
+ "open-source": false,
329
+ "total accuracy": 40.58,
330
+ "Zebra": "47.5",
331
+ "Sudoku": "12.22",
332
+ "Skyscraper": "10.0",
333
+ "Kakurasu": "90.0",
334
+ "Crypto": "50.71",
335
+ "Minesweeper": "37.50",
336
+ "Navigation": "100.0",
337
+ "Binario": "42.5",
338
+ "Hanoi": "46.67",
339
+ "Hitori": "45.0",
340
+ "temperature": 0.6,
341
+ "n_sampling": 4,
342
+ "n": 5
343
+ },
344
+ {
345
+ "model": "grok-3-mini",
346
+ "mode": "sampling (Temp=0.6)",
347
+ "open-source": false,
348
+ "total accuracy": 42.56,
349
+ "Zebra": "74.17",
350
+ "Sudoku": "10.0",
351
+ "Skyscraper": "0.42",
352
+ "Kakurasu": "96.25",
353
+ "Crypto": "59.29",
354
+ "Minesweeper": "37.08",
355
+ "Navigation": "97.5",
356
+ "Binario": "40.83",
357
+ "Hanoi": "45.0",
358
+ "Hitori": "60.0",
359
+ "temperature": 0.6,
360
+ "n_sampling": 4,
361
+ "n": 5
362
+ },
363
+ {
364
+ "model": "claude-sonnet-4-thinking",
365
+ "mode": "sampling (Temp=0.6)",
366
+ "open-source": false,
367
+ "total accuracy": 30.51,
368
+ "Zebra": "30.83",
369
+ "Sudoku": "19.44",
370
+ "Skyscraper": "1.67",
371
+ "Kakurasu": "88.75",
372
+ "Crypto": "54.29",
373
+ "Minesweeper": "15.83",
374
+ "Navigation": "93.75",
375
+ "Binario": "24.17",
376
+ "Hanoi": "26.25",
377
+ "Hitori": "40.0",
378
+ "temperature": 0.6,
379
+ "n_sampling": 4,
380
+ "n": 5
381
+ },
382
+ {
383
+ "model": "gemini-2.5-flash",
384
+ "mode": "sampling (Temp=0.6)",
385
+ "open-source": false,
386
+ "total accuracy": 19.49,
387
+ "Zebra": "20.0",
388
+ "Sudoku": "0.56",
389
+ "Skyscraper": "2.08",
390
+ "Kakurasu": "43.75",
391
+ "Crypto": "17.14",
392
+ "Minesweeper": "12.92",
393
+ "Navigation": "97.5",
394
+ "Binario": "29.17",
395
+ "Hanoi": "18.33",
396
+ "Hitori": "22.5",
397
+ "temperature": 0.6,
398
+ "n_sampling": 4,
399
+ "n": 5
400
+ }
401
+ ]
hardcorelogic.summary.json ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "model": "Qwen3-8B",
4
+ "mode": "sampling (Temp=0.6)",
5
+ "open-source": true,
6
+ "total accuracy": 12.37,
7
+ "increased complexity": 15.08,
8
+ "uncommon elements": 10.58,
9
+ "unsolvable puzzle": 69.54,
10
+ "temperature": 0.6,
11
+ "n_sampling": 4,
12
+ "n": 50
13
+ },
14
+ {
15
+ "model": "Qwen3-30B-A3B-Thinking-2507",
16
+ "mode": "sampling (Temp=0.6)",
17
+ "open-source": true,
18
+ "total accuracy": 37.33,
19
+ "increased complexity": "",
20
+ "uncommon elements": "" ,
21
+ "unsolvable puzzle": 86.09,
22
+ "temperature": 0.6,
23
+ "n_sampling": 4,
24
+ "n": 50
25
+ },
26
+ {
27
+ "model": "Qwen3-32B",
28
+ "mode": "sampling (Temp=0.6)",
29
+ "open-source": true,
30
+ "total accuracy": 20.97,
31
+ "increased complexity": 25.38 ,
32
+ "uncommon elements": 16.93 ,
33
+ "unsolvable puzzle": 65.48,
34
+ "temperature": 0.6,
35
+ "n_sampling": 4,
36
+ "n": 50
37
+ },
38
+ {
39
+ "model": "Qwen3-Next-80B-A3B-Thinking",
40
+ "mode": "sampling (Temp=0.6)",
41
+ "open-source": true,
42
+ "total accuracy": 36.35,
43
+ "increased complexity": 41.97,
44
+ "uncommon elements": 32.13 ,
45
+ "unsolvable puzzle": 83.11,
46
+ "temperature": 0.6,
47
+ "n_sampling": 4,
48
+ "n": 50
49
+ },
50
+ {
51
+ "model": "Qwen3-235B-A22B-Thinking-2507",
52
+ "mode": "sampling (Temp=0.6)",
53
+ "open-source": true,
54
+ "total accuracy": 43.33,
55
+ "increased complexity": 46.93,
56
+ "uncommon elements": 40.94 ,
57
+ "unsolvable puzzle": 84.41,
58
+ "temperature": 0.6,
59
+ "n_sampling": 4,
60
+ "n": 50
61
+ },
62
+ {
63
+ "model": "MiniMax-M1-40k",
64
+ "mode": "sampling (Temp=0.6)",
65
+ "open-source": true,
66
+ "total accuracy": 6.44,
67
+ "increased complexity": 5.27,
68
+ "uncommon elements": 6.88 ,
69
+ "unsolvable puzzle": 51.39,
70
+ "temperature": 0.6,
71
+ "n_sampling": 4,
72
+ "n": 50
73
+ },
74
+ {
75
+ "model": "DeepSeek-R1-0528-Qwen3-8B",
76
+ "mode": "sampling (Temp=0.6)",
77
+ "open-source": true,
78
+ "total accuracy": 13.83,
79
+ "increased complexity": "",
80
+ "uncommon elements": "" ,
81
+ "unsolvable puzzle": 95.19,
82
+ "temperature": 0.6,
83
+ "n_sampling": 4,
84
+ "n": 50
85
+ },
86
+ {
87
+ "model": "DeepSeek-V3.1",
88
+ "mode": "sampling (Temp=0.6)",
89
+ "open-source": true,
90
+ "total accuracy": 41.43,
91
+ "increased complexity": 44.61,
92
+ "uncommon elements": 39.09 ,
93
+ "unsolvable puzzle": 88.76,
94
+ "temperature": 0.6,
95
+ "n_sampling": 4,
96
+ "n": 50
97
+ },
98
+ {
99
+ "model": "DeepSeek-R1-0528",
100
+ "mode": "sampling (Temp=0.6)",
101
+ "open-source": true,
102
+ "total accuracy": 41.37,
103
+ "increased complexity": 45.87,
104
+ "uncommon elements": 37.28 ,
105
+ "unsolvable puzzle": 93.50,
106
+ "temperature": 0.6,
107
+ "n_sampling": 4,
108
+ "n": 50
109
+ },
110
+ {
111
+ "model": "GLM-4.5",
112
+ "mode": "sampling (Temp=0.6)",
113
+ "open-source": true,
114
+ "total accuracy": 21.67,
115
+ "increased complexity": 24.17,
116
+ "uncommon elements": 21.49,
117
+ "unsolvable puzzle": 93.26,
118
+ "temperature": 0.6,
119
+ "n_sampling": 4,
120
+ "n": 50
121
+ },
122
+ {
123
+ "model": "Kimi-K2-Instruct",
124
+ "mode": "sampling (Temp=0.6)",
125
+ "open-source": true,
126
+ "total accuracy": 15.18,
127
+ "increased complexity": 17.33,
128
+ "uncommon elements": 14.71,
129
+ "unsolvable puzzle": 87.46,
130
+ "temperature": 0.6,
131
+ "n_sampling": 4,
132
+ "n": 50
133
+ },
134
+ {
135
+ "model": "Seed-OSS-36B-Instruct",
136
+ "mode": "sampling (Temp=0.6)",
137
+ "open-source": true,
138
+ "total accuracy": 38.96,
139
+ "increased complexity": 41.01,
140
+ "uncommon elements": 38.79 ,
141
+ "unsolvable puzzle": 85.76,
142
+ "temperature": 0.6,
143
+ "n_sampling": 4,
144
+ "n": 50
145
+ },
146
+ {
147
+ "model": "gpt-oss-120b",
148
+ "mode": "sampling (Temp=0.6)",
149
+ "open-source": true,
150
+ "total accuracy": 51.97,
151
+ "increased complexity": 54.08,
152
+ "uncommon elements": 51.11,
153
+ "unsolvable puzzle": 93.35,
154
+ "temperature": 0.6,
155
+ "n_sampling": 4,
156
+ "n": 50
157
+ },
158
+ {
159
+ "model": "gpt-5",
160
+ "mode": "sampling (Temp=0.6)",
161
+ "open-source": false,
162
+ "total accuracy": 69.10,
163
+ "increased complexity": 69.89,
164
+ "uncommon elements": 67.88,
165
+ "unsolvable puzzle": 97.78,
166
+ "temperature": 0.6,
167
+ "n_sampling": 4,
168
+ "n": 5
169
+ },
170
+ {
171
+ "model": "gpt-5-mini",
172
+ "mode": "sampling (Temp=0.6)",
173
+ "open-source": false,
174
+ "total accuracy": 54.49,
175
+ "increased complexity": 55.76,
176
+ "uncommon elements": 52.13 ,
177
+ "unsolvable puzzle": 98.52,
178
+ "temperature": 0.6,
179
+ "n_sampling": 4,
180
+ "n": 5
181
+ },
182
+ {
183
+ "model": "o4-mini",
184
+ "mode": "sampling (Temp=0.6)",
185
+ "open-source": false,
186
+ "total accuracy": 50.13,
187
+ "increased complexity": 55.11,
188
+ "uncommon elements": 47.13 ,
189
+ "unsolvable puzzle": 95.00,
190
+ "temperature": 0.6,
191
+ "n_sampling": 4,
192
+ "n": 5
193
+ },
194
+ {
195
+ "model": "grok-4",
196
+ "mode": "sampling (Temp=0.6)",
197
+ "open-source": false,
198
+ "total accuracy": 59.55,
199
+ "increased complexity": 58.26 ,
200
+ "uncommon elements": 59.62 ,
201
+ "unsolvable puzzle": 97.59,
202
+ "temperature": 0.6,
203
+ "n_sampling": 4,
204
+ "n": 5
205
+ },
206
+ {
207
+ "model": "gemini-2.5-pro",
208
+ "mode": "sampling (Temp=0.6)",
209
+ "open-source": false,
210
+ "total accuracy": 40.58,
211
+ "increased complexity": 43.80,
212
+ "uncommon elements": 39.38 ,
213
+ "unsolvable puzzle": 91.48,
214
+ "temperature": 0.6,
215
+ "n_sampling": 4,
216
+ "n": 5
217
+ },
218
+ {
219
+ "model": "grok-3-mini",
220
+ "mode": "sampling (Temp=0.6)",
221
+ "open-source": false,
222
+ "total accuracy": 42.56,
223
+ "increased complexity": 48.48,
224
+ "uncommon elements": 39.5,
225
+ "unsolvable puzzle": 94.63,
226
+ "temperature": 0.6,
227
+ "n_sampling": 4,
228
+ "n": 5
229
+ },
230
+ {
231
+ "model": "claude-sonnet-4-thinking",
232
+ "mode": "sampling (Temp=0.6)",
233
+ "open-source": false,
234
+ "total accuracy": 30.51,
235
+ "increased complexity": 34.67,
236
+ "uncommon elements": 28.25 ,
237
+ "unsolvable puzzle": 57.96,
238
+ "temperature": 0.6,
239
+ "n_sampling": 4,
240
+ "n": 5
241
+ },
242
+ {
243
+ "model": "gemini-2.5-flash",
244
+ "mode": "sampling (Temp=0.6)",
245
+ "open-source": false,
246
+ "total accuracy": 19.49,
247
+ "increased complexity": 25.11,
248
+ "uncommon elements": 16.00,
249
+ "unsolvable puzzle": 57.78,
250
+ "temperature": 0.6,
251
+ "n_sampling": 4,
252
+ "n": 5
253
+ }
254
+ ]