JunsWan commited on
Commit
8ec6aa5
·
verified ·
1 Parent(s): 291845b

Delete hardcorelogic.summary.json

Browse files
Files changed (1) hide show
  1. hardcorelogic.summary.json +0 -254
hardcorelogic.summary.json DELETED
@@ -1,254 +0,0 @@
1
- [
2
- {
3
- "model": "Qwen3-8B",
4
- "mode": "sampling (Temp=0.6)",
5
- "open-source": true,
6
- "total accuracy": 12.37,
7
- "increased complexity": 15.08,
8
- "uncommon elements": 10.58,
9
- "unsolvable puzzle": 69.54,
10
- "temperature": 0.6,
11
- "n_sampling": 4,
12
- "n": 50
13
- },
14
- {
15
- "model": "Qwen3-30B-A3B-Thinking-2507",
16
- "mode": "sampling (Temp=0.6)",
17
- "open-source": true,
18
- "total accuracy": 37.10,
19
- "increased complexity": "",
20
- "uncommon elements": "" ,
21
- "unsolvable puzzle": 86.09,
22
- "temperature": 0.6,
23
- "n_sampling": 4,
24
- "n": 50
25
- },
26
- {
27
- "model": "Qwen3-32B",
28
- "mode": "sampling (Temp=0.6)",
29
- "open-source": true,
30
- "total accuracy": 20.97,
31
- "increased complexity": 25.38 ,
32
- "uncommon elements": 16.93 ,
33
- "unsolvable puzzle": 65.48,
34
- "temperature": 0.6,
35
- "n_sampling": 4,
36
- "n": 50
37
- },
38
- {
39
- "model": "Qwen3-Next-80B-A3B-Thinking",
40
- "mode": "sampling (Temp=0.6)",
41
- "open-source": true,
42
- "total accuracy": 36.30,
43
- "increased complexity": 41.97,
44
- "uncommon elements": 32.13 ,
45
- "unsolvable puzzle": 83.11,
46
- "temperature": 0.6,
47
- "n_sampling": 4,
48
- "n": 50
49
- },
50
- {
51
- "model": "Qwen3-235B-A22B-Thinking-2507",
52
- "mode": "sampling (Temp=0.6)",
53
- "open-source": true,
54
- "total accuracy": 43.11,
55
- "increased complexity": 46.93,
56
- "uncommon elements": 40.94 ,
57
- "unsolvable puzzle": 84.41,
58
- "temperature": 0.6,
59
- "n_sampling": 4,
60
- "n": 50
61
- },
62
- {
63
- "model": "MiniMax-M1-40k",
64
- "mode": "sampling (Temp=0.6)",
65
- "open-source": true,
66
- "total accuracy": 6.44,
67
- "increased complexity": 5.27,
68
- "uncommon elements": 6.88 ,
69
- "unsolvable puzzle": 51.39,
70
- "temperature": 0.6,
71
- "n_sampling": 4,
72
- "n": 50
73
- },
74
- {
75
- "model": "DeepSeek-R1-0528-Qwen3-8B",
76
- "mode": "sampling (Temp=0.6)",
77
- "open-source": true,
78
- "total accuracy": 13.83,
79
- "increased complexity": "",
80
- "uncommon elements": "" ,
81
- "unsolvable puzzle": 95.19,
82
- "temperature": 0.6,
83
- "n_sampling": 4,
84
- "n": 50
85
- },
86
- {
87
- "model": "DeepSeek-V3.1",
88
- "mode": "sampling (Temp=0.6)",
89
- "open-source": true,
90
- "total accuracy": 41.08,
91
- "increased complexity": 44.61,
92
- "uncommon elements": 39.09 ,
93
- "unsolvable puzzle": 88.76,
94
- "temperature": 0.6,
95
- "n_sampling": 4,
96
- "n": 50
97
- },
98
- {
99
- "model": "DeepSeek-R1-0528",
100
- "mode": "sampling (Temp=0.6)",
101
- "open-source": true,
102
- "total accuracy": 41.10,
103
- "increased complexity": 45.87,
104
- "uncommon elements": 37.28 ,
105
- "unsolvable puzzle": 93.50,
106
- "temperature": 0.6,
107
- "n_sampling": 4,
108
- "n": 50
109
- },
110
- {
111
- "model": "GLM-4.5",
112
- "mode": "sampling (Temp=0.6)",
113
- "open-source": true,
114
- "total accuracy": 21.65,
115
- "increased complexity": 24.17,
116
- "uncommon elements": 21.49,
117
- "unsolvable puzzle": 93.26,
118
- "temperature": 0.6,
119
- "n_sampling": 4,
120
- "n": 50
121
- },
122
- {
123
- "model": "Kimi-K2-Instruct",
124
- "mode": "sampling (Temp=0.6)",
125
- "open-source": true,
126
- "total accuracy": 15.18,
127
- "increased complexity": 17.33,
128
- "uncommon elements": 14.71,
129
- "unsolvable puzzle": 87.46,
130
- "temperature": 0.6,
131
- "n_sampling": 4,
132
- "n": 50
133
- },
134
- {
135
- "model": "Seed-OSS-36B-Instruct",
136
- "mode": "sampling (Temp=0.6)",
137
- "open-source": true,
138
- "total accuracy": 38.87,
139
- "increased complexity": 41.01,
140
- "uncommon elements": 38.79 ,
141
- "unsolvable puzzle": 85.76,
142
- "temperature": 0.6,
143
- "n_sampling": 4,
144
- "n": 50
145
- },
146
- {
147
- "model": "gpt-oss-120b",
148
- "mode": "sampling (Temp=0.6)",
149
- "open-source": true,
150
- "total accuracy": 51.33,
151
- "increased complexity": 54.08,
152
- "uncommon elements": 51.11,
153
- "unsolvable puzzle": 93.35,
154
- "temperature": 0.6,
155
- "n_sampling": 4,
156
- "n": 50
157
- },
158
- {
159
- "model": "gpt-5",
160
- "mode": "sampling (Temp=0.6)",
161
- "open-source": false,
162
- "total accuracy": 67.37,
163
- "increased complexity": 69.89,
164
- "uncommon elements": 67.88,
165
- "unsolvable puzzle": 97.78,
166
- "temperature": 0.6,
167
- "n_sampling": 4,
168
- "n": 5
169
- },
170
- {
171
- "model": "gpt-5-mini",
172
- "mode": "sampling (Temp=0.6)",
173
- "open-source": false,
174
- "total accuracy": 53.40,
175
- "increased complexity": 55.76,
176
- "uncommon elements": 52.13 ,
177
- "unsolvable puzzle": 98.52,
178
- "temperature": 0.6,
179
- "n_sampling": 4,
180
- "n": 5
181
- },
182
- {
183
- "model": "o4-mini",
184
- "mode": "sampling (Temp=0.6)",
185
- "open-source": false,
186
- "total accuracy": 49.81,
187
- "increased complexity": 55.11,
188
- "uncommon elements": 47.13 ,
189
- "unsolvable puzzle": 95.00,
190
- "temperature": 0.6,
191
- "n_sampling": 4,
192
- "n": 5
193
- },
194
- {
195
- "model": "grok-4",
196
- "mode": "sampling (Temp=0.6)",
197
- "open-source": false,
198
- "total accuracy": 58.27,
199
- "increased complexity": 58.26 ,
200
- "uncommon elements": 59.62 ,
201
- "unsolvable puzzle": 97.59,
202
- "temperature": 0.6,
203
- "n_sampling": 4,
204
- "n": 5
205
- },
206
- {
207
- "model": "gemini-2.5-pro",
208
- "mode": "sampling (Temp=0.6)",
209
- "open-source": false,
210
- "total accuracy": 40.19,
211
- "increased complexity": 43.80,
212
- "uncommon elements": 39.38 ,
213
- "unsolvable puzzle": 91.48,
214
- "temperature": 0.6,
215
- "n_sampling": 4,
216
- "n": 5
217
- },
218
- {
219
- "model": "grok-3-mini",
220
- "mode": "sampling (Temp=0.6)",
221
- "open-source": false,
222
- "total accuracy": 42.50,
223
- "increased complexity": 48.48,
224
- "uncommon elements": 39.5,
225
- "unsolvable puzzle": 94.63,
226
- "temperature": 0.6,
227
- "n_sampling": 4,
228
- "n": 5
229
- },
230
- {
231
- "model": "claude-sonnet-4-thinking",
232
- "mode": "sampling (Temp=0.6)",
233
- "open-source": false,
234
- "total accuracy": 30.51,
235
- "increased complexity": 34.67,
236
- "uncommon elements": 28.25 ,
237
- "unsolvable puzzle": 57.96,
238
- "temperature": 0.6,
239
- "n_sampling": 4,
240
- "n": 5
241
- },
242
- {
243
- "model": "gemini-2.5-flash",
244
- "mode": "sampling (Temp=0.6)",
245
- "open-source": false,
246
- "total accuracy": 19.49,
247
- "increased complexity": 25.11,
248
- "uncommon elements": 16.00,
249
- "unsolvable puzzle": 57.78,
250
- "temperature": 0.6,
251
- "n_sampling": 4,
252
- "n": 5
253
- }
254
- ]