JunsWan committed on
Commit
9d527ba
·
verified ·
1 Parent(s): 6e2102e

Upload 8 files

Browse files
Files changed (8) hide show
  1. __init__.py +0 -0
  2. _about_us.md +14 -0
  3. _header.md +51 -0
  4. constants.py +322 -0
  5. data_utils.py +33 -0
  6. model_info.json +23 -0
  7. requirements.txt +5 -0
  8. utils_display.py +32 -0
__init__.py ADDED
File without changes
_about_us.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## About Us
2
+
3
+
4
+ #### Contact:
5
+
6
+ [Jingcong Liang](https://github.com/ljcleo), [Shijun Wan](https://github.com/JunsWan), and [Xuehai Wu]().
7
+
8
+ [Siyuan Wang]()
9
+
10
+
11
+ ### Contact
12
+
13
+ Please contact us in the following ways:
14
+
_header.md ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <br/>
2
+
3
+ # HardcoreLogic: Challenging Large Reasoning Models with Long-tail Logic Puzzle Games
4
+
5
+ <div style="text-align: center; margin-top: 20px; margin-bottom: 10px;">
6
+ <a href="#" class="btn-glass btn-paper">📄 Paper</a>
7
+ <a href="https://github.com/ljcleo/hardcore-logic" class="btn-glass btn-github">💻 GitHub</a>
8
+ <a href="https://huggingface.co/datasets/xhWu-fd/HardcoreLogic" class="btn-glass btn-dataset">🤗 Dataset</a>
9
+ </div>
10
+
11
+ <div style="text-align: center; font-size: 14px; color: #555; margin-top: 8px;">
12
+ Last Updated: <b>{LAST_UPDATED}</b>
13
+ </div>
14
+
15
+ <style>
16
+ .btn-glass {
17
+ display: inline-block;
18
+ padding: 12px 25px;
19
+ margin: 8px;
20
+ font-weight: 500;
21
+ text-decoration: none !important; /* 去掉下划线 */
22
+ border-radius: 12px;
23
+ color: white;
24
+ backdrop-filter: blur(6px); /* 虚化效果 */
25
+ background: rgba(255,255,255,0.85); /* 半透明,alpha 调高 */
26
+ box-shadow: 0 4px 15px rgba(0,0,0,0.2);
27
+ transition: all 0.3s ease;
28
+ }
29
+
30
+ /* 默认按钮颜色 */
31
+ .btn-paper { background-color: rgba(0, 123, 255, 0.85); }
32
+ .btn-github { background-color: rgba(40, 167, 69, 0.85); }
33
+ .btn-dataset { background-color: rgba(255, 183, 3, 0.85); }
34
+
35
+ /* hover 渐变 + 悬浮效果 */
36
+ .btn-paper:hover {
37
+ background: linear-gradient(135deg, rgba(0,123,255,0.95), rgba(0,198,255,0.95));
38
+ box-shadow: 0 10px 20px rgba(0,0,0,0.35);
39
+ transform: translateY(-3px);
40
+ }
41
+ .btn-github:hover {
42
+ background: linear-gradient(135deg, rgba(27,94,32,0.95), rgba(40,167,69,0.95));
43
+ box-shadow: 0 10px 20px rgba(0,0,0,0.35);
44
+ transform: translateY(-3px);
45
+ }
46
+ .btn-dataset:hover {
47
+ background: linear-gradient(135deg, rgba(255,140,0,0.95), rgba(255,183,3,0.95));
48
+ box-shadow: 0 10px 20px rgba(0,0,0,0.35);
49
+ transform: translateY(-3px);
50
+ }
51
+ </style>
constants.py ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import OrderedDict
2
+
3
+ BANNER = '''
4
+ <div style="
5
+ display: flex;
6
+ justify-content: center;
7
+ align-items: center;
8
+ height: 120px;
9
+ background: #ffffff;
10
+ border-bottom: 2px solid #ced4da;
11
+ ">
12
+ <h1 style="
13
+ font-size: 5em;
14
+ font-family: 'Poppins', 'Segoe UI', sans-serif;
15
+ background: linear-gradient(90deg, #007bff, #00c6ff, #0096c7, #0077b6);
16
+ -webkit-background-clip: text;
17
+ color: transparent;
18
+ animation: gradientFlow 4s ease-in-out infinite;
19
+ background-size: 300%;
20
+ letter-spacing: 2px;
21
+ ">
22
+ HardcoreLogic
23
+ </h1>
24
+ </div>
25
+
26
+ <style>
27
+ @keyframes gradientFlow {
28
+ 0% { background-position: 0% 50%; }
29
+ 50% { background-position: 100% 50%; }
30
+ 100% { background-position: 0% 50%; }
31
+ }
32
+ </style>
33
+ '''
34
+
35
+
36
+ CITATION_TEXT = """
37
+
38
+ """
39
+ '''
40
+ @article{hardcorelogic2025,
41
+ title={HardcoreLogic: Challenging Large Reasoning Models with Long-tail Logic Puzzle Games},
42
+ author={},
43
+ year={2025},
44
+ url={https://arxiv.org/},
45
+ }'''
46
+
47
+ column_names = OrderedDict({
48
+ "model": "Model",
49
+ "open-source": "Open Source",
50
+ "total accuracy": "Total Acc",
51
+ "unsolvable puzzle": "Unsolvable Puzzle ACC",
52
+ })
53
+
54
+ column_names_puzzle = OrderedDict({
55
+ "model": "Model",
56
+ "total accuracy": "Total Acc",
57
+ "Zebra": "Zebra",
58
+ "Binario": "Binario",
59
+ "Crypto": "Crypto",
60
+ "Hanoi": "Hanoi",
61
+ "Hitpri": "Hitpri",
62
+ "Kakurasu": "Kakurasu",
63
+ "Minesweeper": "Minesweeper",
64
+ "Navigation": "Navigation",
65
+ "Skyscraper": "Skyscraper",
66
+ "Sudoku": "Sudoku",
67
+ })
68
+
69
+ LEADERBOARD_REMARKS = """**WB Reward**: for each comparison (A vs B), a reward for A is **+/-1** if A is **much better/worse** than B, and **+/-0.5** if A is **slightly better/worse** than B; when there is a **Tie**, the reward is **0**.
70
+ """
71
+
72
+ LEADERBOARD_REMARKS_MAIN = """
73
+ """
74
+
75
+ RANKING_COLUMN = "total accuracy"
76
+
77
+ ORDERED_COLUMN_NAMES = [
78
+ "model",
79
+ "mode",
80
+ "open-source",
81
+ "total accuracy",
82
+ "increased complexity",
83
+ "uncommon elements",
84
+ "unsolvable puzzle",
85
+ "temperature",
86
+ "n_sampling",
87
+ "n"
88
+ ]
89
+
90
+ ORDERED_COLUMN_NAMES_PUZZLE = [
91
+ "model",
92
+ "mode",
93
+ "open-source",
94
+ "total accuracy",
95
+ "Zebra",
96
+ "Binario",
97
+ "Crypto",
98
+ "Hanoi",
99
+ "Hitpri",
100
+ "Kakurasu",
101
+ "Minesweeper",
102
+ "Navigation",
103
+ "Skyscraper",
104
+ "Sudoku",
105
+ "temperature",
106
+ "n_sampling",
107
+ "n"
108
+ ]
109
+
110
+
111
+ js_light = """
112
+ function refresh() {
113
+ const url = new URL(window.location);
114
+
115
+ if (url.searchParams.get('__theme') !== 'light') {
116
+ url.searchParams.set('__theme', 'light');
117
+ window.location.href = url.href;
118
+ }
119
+
120
+ // Find the fieldset with the given id
121
+ const fieldset = document.getElementById("rank-column-radio");
122
+
123
+ // Create a new span element with the text "Decoding Mode:"
124
+ const rankBySpan = document.createElement("span");
125
+ rankBySpan.textContent = "Decoding Mode: ";
126
+ rankBySpan.style.fontWeight = "bold"; // Optional: make the text bold
127
+ rankBySpan.style.fontSize = "19px"; // Larger font size
128
+ rankBySpan.style.paddingRight = "18px"; // Add padding on the right
129
+
130
+ // Wrap the span and the labels in a flex container
131
+ const flexContainer = document.createElement("div");
132
+ flexContainer.style.display = "flex";
133
+ flexContainer.style.alignItems = "center";
134
+
135
+ // Insert the rankBySpan at the beginning of the flex container
136
+ flexContainer.appendChild(rankBySpan);
137
+
138
+ // Move all existing labels into the flex container
139
+ while (fieldset.firstChild) {
140
+ flexContainer.appendChild(fieldset.firstChild);
141
+ }
142
+
143
+ // Append the flex container back to the fieldset
144
+ fieldset.appendChild(flexContainer);
145
+ }
146
+ """
147
+
148
+ js_code = """
149
+ function scroll_top() {
150
+ console.log("Hello from Gradio!");
151
+ const bubbles = document.querySelectorAll('.bubble-wrap');
152
+ bubbles.forEach((bubble, index) => {
153
+ setTimeout(() => {
154
+ bubble.scrollTop = 0;
155
+ }, index * 100); // Delay of 100ms between each iteration
156
+ });
157
+
158
+ }
159
+ """
160
+
161
+
162
+ TASK_TYPE_STR = "**Tasks**: Info seeking (**InfoSek**), Creative Writing (**CrtWrt**), Coding&Debugging (**Code**), Reasoning (**Reason**), Editing (**Edit**), **Math**, Planning (**Plan**), Brainstorming (**Brnstrm**), Role playing (**RolPly**), Advice seeking (**AdvSek**), Data Analysis (**DataAna**)"
163
+
164
+ css = """
165
+ /* ========== 🌟 Global Typography ========== */
166
+ code {
167
+ font-size: large;
168
+ }
169
+ footer {visibility: hidden}
170
+
171
+ .markdown-text{font-size: 14pt}
172
+ .markdown-text-small{font-size: 13pt}
173
+ .markdown-text-tiny{font-size: 12pt}
174
+
175
+ /* ========== 🎓 Fudan Blue Theme Colors ========== */
176
+ :root {
177
+ --fudan-blue: #002D72;
178
+ --fudan-blue-light: #E6EEF8;
179
+ --fudan-gray: #f7f7f7;
180
+ --fudan-border: #c8d6e5;
181
+ --fudan-highlight: #1E56A0;
182
+ }
183
+
184
+ /* ========== 🏅 Leaderboard Table ========== */
185
+ #leaderboard-table th,
186
+ #leaderboard-puzzle-table th {
187
+ background-color: var(--fudan-blue);
188
+ color: white;
189
+ text-align: center;
190
+ padding: 10px;
191
+ font-size: 15px;
192
+ border-bottom: 2px solid var(--fudan-border);
193
+ }
194
+
195
+ #leaderboard-table td,
196
+ #leaderboard-puzzle-table td {
197
+ text-align: center;
198
+ font-size: 14px;
199
+ padding: 8px;
200
+ background-color: white;
201
+ border-bottom: 1px solid var(--fudan-border);
202
+ }
203
+
204
+ /* Hover 行高亮 */
205
+ #leaderboard-table tr:hover,
206
+ #leaderboard-puzzle-table tr:hover {
207
+ background-color: var(--fudan-blue-light);
208
+ transition: background-color 0.2s ease-in-out;
209
+ }
210
+
211
+ /* 表格整体外观 */
212
+ #leaderboard-table,
213
+ #leaderboard-puzzle-table {
214
+ border-collapse: collapse;
215
+ border-radius: 10px;
216
+ overflow: hidden;
217
+ box-shadow: 0 0 10px rgba(0, 45, 114, 0.15);
218
+ }
219
+
220
+ /* ========== 📊 Tabs ========== */
221
+ .tab-buttons button[role="tab"] {
222
+ font-size: 15px;
223
+ font-weight: 600;
224
+ color: var(--fudan-blue);
225
+ border: 1px solid var(--fudan-border);
226
+ border-radius: 8px;
227
+ background-color: white;
228
+ padding: 8px 16px;
229
+ margin-right: 5px;
230
+ transition: all 0.2s ease-in-out;
231
+ }
232
+
233
+ .tab-buttons button[role="tab"]:hover {
234
+ background-color: var(--fudan-blue-light);
235
+ }
236
+
237
+ button.selected[role="tab"][aria-selected="true"] {
238
+ background-color: var(--fudan-blue);
239
+ color: white;
240
+ font-weight: bold;
241
+ font-size: 16px;
242
+ }
243
+
244
+ /* ========== 📦 Accordion & Buttons ========== */
245
+ .accordion-label button span{
246
+ font-size: 14pt;
247
+ font-weight: bold;
248
+ color: var(--fudan-blue);
249
+ }
250
+
251
+ .btn_boderline{
252
+ border: 1px solid var(--fudan-blue);
253
+ border-radius: 5px;
254
+ padding: 6px 12px;
255
+ margin: 5px;
256
+ font-size: 14pt;
257
+ font-weight: bold;
258
+ background-color: var(--fudan-blue-light);
259
+ color: var(--fudan-blue);
260
+ transition: background-color 0.3s;
261
+ }
262
+ .btn_boderline:hover{
263
+ background-color: var(--fudan-blue);
264
+ color: white;
265
+ }
266
+
267
+ /* ========== 🧩 Box & Card ========== */
268
+ .box_md{
269
+ border: 1px solid var(--fudan-border);
270
+ border-radius: 10px;
271
+ padding: 10px;
272
+ font-size: 12pt;
273
+ margin: 8px;
274
+ background-color: white;
275
+ box-shadow: 0 0 6px rgba(0, 45, 114, 0.1);
276
+ }
277
+
278
+ /* ========== 💬 Markdown Text Enhancements ========== */
279
+ .markdown-text-details{
280
+ margin: 10px;
281
+ padding: 10px;
282
+ background-color: var(--fudan-gray);
283
+ border-left: 4px solid var(--fudan-blue);
284
+ border-radius: 6px;
285
+ }
286
+
287
+ /* ========== 📈 Plot & Visualization ========== */
288
+ .plotly-plot{
289
+ height: auto;
290
+ max-height: 600px;
291
+ min-height: 600px;
292
+ border: 1px solid var(--fudan-border);
293
+ border-radius: 10px;
294
+ }
295
+
296
+ /* ========== 🧷 Misc Components ========== */
297
+ .sample_button{
298
+ border: 2px solid var(--fudan-blue);
299
+ border-radius: 10px;
300
+ padding: 10px;
301
+ font-size: 17pt;
302
+ font-weight: bold;
303
+ margin: 5px;
304
+ background-color: var(--fudan-blue-light);
305
+ color: var(--fudan-blue);
306
+ transition: all 0.3s ease-in-out;
307
+ }
308
+ .sample_button:hover {
309
+ background-color: var(--fudan-blue);
310
+ color: white;
311
+ }
312
+
313
+ /* Scrollable Table Containers */
314
+ #leaderboard-table,
315
+ #leaderboard-puzzle-table {
316
+ display: block;
317
+ max-height: 800px;
318
+ overflow-y: auto;
319
+ }
320
+
321
+
322
+ """
data_utils.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets.utils.logging import disable_progress_bar
2
+ from constants import column_names, RANKING_COLUMN, ORDERED_COLUMN_NAMES
3
+ from utils_display import make_clickable_model
4
+ disable_progress_bar()
5
+ import json
6
+ import os
7
+
8
+ summary_file = "HardcoreLogic-Eval/results_dirs/hardcorelogic.summary.json"
9
+ result_dir = "HardcoreLogic-Eval/results_dirs"
10
+ results_by_model = {}
11
+
12
+ # Formats the columns
13
def formatter(x):
    """Format a single table cell for display.

    Strings (including ``str`` subclasses) are returned unchanged. Any other
    value is rounded to two decimal places with the built-in ``round``.
    Values that cannot be rounded (for example ``None`` for a missing score)
    are returned unchanged instead of raising, so one bad cell does not abort
    the whole table.

    Args:
        x: The cell value.

    Returns:
        ``x`` itself for strings and unroundable values, otherwise
        ``round(x, 2)``.
    """
    if isinstance(x, str):
        return x
    try:
        return round(x, 2)
    except TypeError:
        # round() rejects None, lists, dicts, ...; show such cells as they are.
        return x
19
+
20
def post_processing(df, column_names, rank_column=RANKING_COLUMN, ordered_columns=ORDERED_COLUMN_NAMES, click_url=True):
    """Prepare a results DataFrame for display in a leaderboard table.

    Steps, in order:
      1. Keep only the columns that are keys of ``column_names`` and exist
         in ``df`` (working on a copy, so the caller's frame is untouched).
      2. Turn the ``"model"`` column into display markup via
         ``make_clickable_model`` when ``click_url`` is true; pass every
         other column (and ``"model"`` when ``click_url`` is false) through
         ``formatter``.
      3. Reorder the columns to follow ``ordered_columns``; columns not
         listed there are dropped.
      4. Sort rows by ``rank_column`` in descending order, if that column
         is present.
      5. Rename the columns to their display names from ``column_names``.

    Args:
        df: Input pandas DataFrame of raw results.
        column_names: Mapping of raw column name -> display column name.
        rank_column: Raw column name used for descending sort.
        ordered_columns: Raw column names in the desired display order.
        click_url: Whether to convert model names to clickable markup.

    Returns:
        A new DataFrame ready for rendering.
    """
    df = df[[col for col in column_names if col in df.columns]].copy()

    for col in df.columns:
        if col == "model" and click_url:
            # Call the helper directly; the former x.replace(x, ...) detour
            # yielded the same string but failed on non-string cells.
            df[col] = df[col].apply(make_clickable_model)
        else:
            df[col] = df[col].apply(formatter)  # For numerical values

    df = df[[col for col in ordered_columns if col in df.columns]]

    # Reassign instead of mutating in place: df is a column slice here, and
    # in-place edits on a slice can trigger pandas chained-assignment warnings.
    if rank_column in df.columns:
        df = df.sort_values(by=rank_column, ascending=False)
    return df.rename(columns=column_names)
33
+
model_info.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Qwen3-8B": {"pretty_name": "Qwen3-8B","hf_model_id": "Qwen/Qwen3-8B"},
3
+ "Qwen3-30B-A3B-Thinking-2507": {"pretty_name": "Qwen3-30B-A3B-Thinking-2507","hf_model_id": "Qwen/Qwen3-30B-A3B-Thinking-2507"},
4
+ "Qwen3-32B": {"pretty_name": "Qwen3-32B","hf_model_id": "Qwen/Qwen3-32B"},
5
+ "Qwen3-Next-80B-A3B-Thinking": {"pretty_name": "Qwen3-Next-80B-A3B-Thinking","hf_model_id": "Qwen/Qwen3-Next-80B-A3B-Thinking"},
6
+ "Qwen3-235B-A22B-Thinking-2507": {"pretty_name": "Qwen3-235B-A22B-Thinking-2507","hf_model_id": "Qwen/Qwen3-235B-A22B-Thinking-2507"},
7
+ "MiniMax-M1-40k": {"pretty_name": "MiniMax-M1-40k","hf_model_id": "MiniMaxAI/MiniMax-M1-40k"},
8
+ "DeepSeek-R1-0528-Qwen3-8B": {"pretty_name": "DeepSeek-R1-0528-Qwen3-8B","hf_model_id": "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B"},
9
+ "DeepSeek-V3.1": {"pretty_name": "DeepSeek-V3.1","hf_model_id": "deepseek-ai/DeepSeek-V3.1"},
10
+ "DeepSeek-R1-0528": {"pretty_name": "DeepSeek-R1-0528","hf_model_id": "deepseek-ai/DeepSeek-R1-0528"},
11
+ "GLM-4.5": {"pretty_name": "GLM-4.5","hf_model_id": "zai-org/GLM-4.5"},
12
+ "Kimi-K2-Instruct": {"pretty_name": "Kimi-K2-Instruct","hf_model_id": "moonshotai/Kimi-K2-Instruct"},
13
+ "Seed-OSS-36B-Instruct": {"pretty_name": "Seed-OSS-36B-Instruct","hf_model_id": "ByteDance-Seed/Seed-OSS-36B-Instruct"},
14
+ "gpt-oss-120b": {"pretty_name": "gpt-oss-120b","hf_model_id": "openai/gpt-oss-120b"},
15
+ "gpt-5": {"pretty_name": "gpt-5","hf_model_id": false},
16
+ "gpt-5-mini": {"pretty_name": "gpt-5-mini","hf_model_id": false},
17
+ "o4-mini": {"pretty_name": "o4-mini","hf_model_id": false},
18
+ "grok-4": {"pretty_name": "grok-4","hf_model_id": false},
19
+ "gemini-2.5-pro": {"pretty_name": "gemini-2.5-pro","hf_model_id": false},
20
+ "grok-3-mini": {"pretty_name": "grok-3-mini","hf_model_id": false},
21
+ "claude-sonnet-4-thinking": {"pretty_name": "claude-sonnet-4-thinking","hf_model_id": false},
22
+ "gemini-2.5-flash": {"pretty_name": "gemini-2.5-flash","hf_model_id": false}
23
+ }
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio[oauth]==4.19.2
2
+ datasets
3
+ toolz==0.12.1
4
+ plotly
5
+ tabulate
utils_display.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+ with open("model_info.json", "r") as f:
4
+ model_info = json.load(f)
5
+
6
def make_clickable_model(model_name):
    """Return the display markup for a model name.

    Looks ``model_name`` up in the module-level ``model_info`` mapping:
      * unknown model -> ``model_name`` is returned unchanged;
      * known model with a truthy ``hf_model_id`` -> an ``<a>`` tag linking
        to ``https://huggingface.co/<hf_model_id>`` with the pretty name as
        its text;
      * known model without a usable ``hf_model_id`` -> the pretty name as
        plain text.

    The pretty name falls back to ``model_name`` when the entry has no
    ``pretty_name`` key.
    """
    if model_name not in model_info:
        return model_name

    entry = model_info[model_name]
    display = entry.get("pretty_name", model_name)
    repo_id = entry.get("hf_model_id", False)

    if not repo_id:
        # No Hugging Face repo id recorded for this entry: plain text only.
        return f'{display}'

    url = f"https://huggingface.co/{repo_id}"
    return f'<a target="_blank" href="{url}" style="color: var(--link-text-color); text-decoration: underline; text-decoration-style: dotted;">{display}</a>'
22
+
23
+
24
+
25
def styled_error(error):
    """Wrap ``error`` in a centered, red, 20px HTML paragraph."""
    template = "<p style='color: red; font-size: 20px; text-align: center;'>{}</p>"
    return template.format(error)
27
+
28
def styled_warning(warn):
    """Wrap ``warn`` in a centered, orange, 20px HTML paragraph."""
    template = "<p style='color: orange; font-size: 20px; text-align: center;'>{}</p>"
    return template.format(warn)
30
+
31
def styled_message(message):
    """Wrap ``message`` in a centered, green, 20px HTML paragraph."""
    template = "<p style='color: green; font-size: 20px; text-align: center;'>{}</p>"
    return template.format(message)