ggcristian committed on
Commit be0239b · 1 Parent(s): cc21640

Small refactor: Add `config` with model metadata and move constants to their own file

Files changed (5)
  1. app.py +80 -451
  2. config/constants.py +58 -0
  3. config/model_metadata.py +112 -0
  4. results/parse.py +32 -248
  5. utils.py +53 -104
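The net effect of the refactor on call sites can be sketched as follows. This snippet is illustrative only and simply exercises the new `config` package introduced in this commit; all names are taken from the diff below.

# Illustrative only: exercising the new config package added in this commit.
from config import constants as C
from config.model_metadata import MODELS

print(C.DEFAULT_TASK, C.SIMULATORS)                    # Spec-to-RTL ['Icarus', 'Verilator']
meta = MODELS["gpt-oss-120b"]                          # ModelMetadata dataclass (see config/model_metadata.py)
print(meta.params, meta.model_type, meta.model_arch)   # 120 General Reasoning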
app.py CHANGED
@@ -1,269 +1,72 @@
1
  import sys
2
 
3
  import gradio as gr
4
- import pandas as pd
5
- import plotly.express as px
6
  from gradio.themes.utils import colors
7
 
8
- from results.parse import parse_agg, read_data
 
 
 
 
9
  from static.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT
 
 
 
 
 
 
 
 
10
  from style.css_html_js import custom_css
11
- from utils import filter_bench, filter_bench_all, filter_RTLRepo, handle_special_cases
12
-
13
-
14
- def filter_leaderboard(task, benchmark, model_type, search_query, max_params):
15
- subset = df.copy()
16
-
17
- # Filter by task specific benchmarks when 'All' benchmarks is selected
18
- if task == "Spec-to-RTL":
19
- valid_benchmarks = s2r_benchs
20
- if benchmark == "All":
21
- subset = subset[subset["Benchmark"].isin(valid_benchmarks)]
22
- elif task == "Code Completion":
23
- valid_benchmarks = cc_benchs
24
- if benchmark == "All":
25
- subset = subset[subset["Benchmark"].isin(valid_benchmarks)]
26
- elif task == "Line Completion †":
27
- valid_benchmarks = lc_benchs
28
- if benchmark == "All":
29
- subset = subset[subset["Benchmark"].isin(valid_benchmarks)]
30
-
31
- if benchmark != "All":
32
- subset = df[df["Benchmark"] == benchmark]
33
-
34
- if model_type != "All":
35
- # without emojis
36
- subset = subset[subset["Model Type"] == model_type.split(" ")[0]]
37
- if search_query:
38
- subset = subset[
39
- subset["Model"].str.contains(search_query, case=False, na=False)
40
- ]
41
- max_params = float(max_params)
42
- subset = subset[subset["Params"] <= max_params]
43
-
44
- if benchmark == "All":
45
- if task == "Spec-to-RTL":
46
- return filter_bench_all(subset, df_agg, agg_column="Agg S2R")
47
- elif task == "Code Completion":
48
- return filter_bench_all(subset, df_agg, agg_column="Agg MC")
49
- elif task == "Line Completion †":
50
- return filter_RTLRepo(subset)
51
- elif benchmark == "RTL-Repo":
52
- return filter_RTLRepo(subset)
53
- else:
54
- agg_column = None
55
- if benchmark == "VerilogEval S2R":
56
- agg_column = "Agg VerilogEval S2R"
57
- elif benchmark == "VerilogEval MC":
58
- agg_column = "Agg VerilogEval MC"
59
- elif benchmark == "RTLLM":
60
- agg_column = "Agg RTLLM"
61
- elif benchmark == "VeriGen":
62
- agg_column = "Agg VeriGen"
63
-
64
- return filter_bench(subset, df_agg, agg_column)
65
-
66
-
67
- def update_benchmarks_by_task(task):
68
- if task == "Spec-to-RTL":
69
- new_benchmarks = ["All"] + s2r_benchs
70
- elif task == "Code Completion":
71
- new_benchmarks = ["All"] + cc_benchs
72
- elif task == "Line Completion †":
73
- new_benchmarks = lc_benchs
74
- else:
75
- new_benchmarks = ["All"] + benchmarks
76
- benchmark_value = "All" if "All" in new_benchmarks else new_benchmarks[0]
77
- filtered = filter_leaderboard(
78
- task,
79
- benchmark_value,
80
- model_type_dropdown.value,
81
- search_box.value,
82
- params_slider.value,
83
- )
84
- return gr.update(value=benchmark_value, choices=new_benchmarks), filtered
85
-
86
-
87
- def generate_scatter_plot(benchmark, metric):
88
- benchmark, metric = handle_special_cases(benchmark, metric)
89
 
90
- subset = df[df["Benchmark"] == benchmark]
91
- if benchmark == "RTL-Repo":
92
- subset = subset[subset["Metric"].str.contains("EM", case=False, na=False)]
93
- detailed_scores = subset.groupby("Model", as_index=False)["Score"].mean()
94
- detailed_scores.rename(columns={"Score": "Exact Matching (EM)"}, inplace=True)
95
- else:
96
- detailed_scores = subset.pivot_table(
97
- index="Model", columns="Metric", values="Score"
98
- ).reset_index()
99
 
100
- details = df[["Model", "Params", "Model Type"]].drop_duplicates("Model")
101
- scatter_data = pd.merge(detailed_scores, details, on="Model", how="left").dropna(
102
- subset=["Params", metric]
103
- )
104
-
105
- scatter_data["x"] = scatter_data["Params"]
106
- scatter_data["y"] = scatter_data[metric]
107
- scatter_data["size"] = (scatter_data["x"] ** 0.3) * 40
108
-
109
- type_colors = {"General": "green", "Coding": "yellow", "RTL-Specific": "blue"}
110
- scatter_data["color"] = scatter_data["Model Type"].map(type_colors).fillna("gray")
111
-
112
- y_axis_limits = {
113
- "Functionality (FNC)": [5, 90],
114
- "Syntax (STX)": [20, 100],
115
- "Synthesis (SYN)": [5, 90],
116
- "Power": [0, 50],
117
- "Performance": [0, 50],
118
- "Area": [0, 50],
119
- "Exact Matching (EM)": [0, 50],
120
- }
121
- y_range = y_axis_limits.get(metric, [0, 80])
122
 
123
- fig = px.scatter(
124
- scatter_data,
125
- x="x",
126
- y="y",
127
- log_x=True,
128
- size="size",
129
- color="Model Type",
130
- text="Model",
131
- hover_data={metric: ":.2f"},
132
- title=f"Params vs. {metric} for {benchmark}",
133
- labels={"x": "# Params (Log Scale)", "y": metric},
134
- template="plotly_white",
135
- height=600,
136
- width=1200,
137
- )
138
-
139
- fig.update_traces(
140
- textposition="top center",
141
- textfont_size=10,
142
- marker=dict(opacity=0.8, line=dict(width=0.5, color="black")),
143
- )
144
- fig.update_layout(
145
- xaxis=dict(
146
- showgrid=True,
147
- type="log",
148
- tickmode="array",
149
- tickvals=[8, 14, 32, 72, 200, 700],
150
- ticktext=["8", "14", "32", "72", "200", "700"],
151
- ),
152
- showlegend=False,
153
- yaxis=dict(range=y_range),
154
- margin=dict(l=50, r=50, t=50, b=50),
155
- plot_bgcolor="white",
156
- )
157
-
158
- return fig
159
-
160
-
161
- with gr.Blocks(
162
- css=custom_css, theme=gr.themes.Default(primary_hue=colors.emerald)
163
- ) as app:
164
- df_icarus, benchmarks, metrics, default_metric = read_data(
165
- "results/results_icarus.json"
166
- )
167
- df_agg_icarus = parse_agg("results/aggregated_scores_icarus.csv")
168
- df_verilator, _, _, _ = read_data("results/results_verilator.json")
169
- df_agg_verilator = parse_agg("results/aggregated_scores_verilator.csv")
170
- df = df_icarus
171
- df_agg = df_agg_icarus
172
- tasks = ["Spec-to-RTL", "Code Completion", "Line Completion †"]
173
- s2r_benchs = ["VerilogEval S2R", "RTLLM"]
174
- cc_benchs = ["VerilogEval MC", "VeriGen"]
175
- lc_benchs = ["RTL-Repo"]
176
- non_rtl_metrics = [
177
- "Syntax (STX)",
178
- "Functionality (FNC)",
179
- "Synthesis (SYN)",
180
- "Power",
181
- "Performance",
182
- "Area",
183
- ]
184
- rtl_metrics = ["Exact Matching (EM)"]
185
- model_types = ["All", "General 🟢", "Coding 🔵", "RTL-Specific 🔴"]
186
 
187
- gr.HTML(
188
- """
189
- <div align="center">
190
- <img src='/gradio_api/file=logo_new.png' alt='TuRTLe Logo' width='220'/>
191
- </div>
192
- """
193
  )
194
- gr.HTML(
195
- """
196
- <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
197
- <script defer src="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/js/all.min.js"></script>
198
- <div style="text-align: center; margin-bottom: 0px; margin-top: 0px;">
199
- <a href="https://github.com/HPAI-BSC/TuRTLe" target="_blank" style="text-decoration: none; margin-right: 10px;">
200
- <button style="background: #333; color: white; padding: 10px 14px; border-radius: 8px; border: none; font-size: 16px; cursor: pointer;">
201
- GitHub Repo
202
- </button>
203
- </a>
204
 
205
- <a href="http://arxiv.org/abs/2504.01986" target="_blank" style="text-decoration: none; margin-right: 10px;">
206
- <button style="background: #b31b1b; color: white; padding: 10px 14px; border-radius: 8px; border: none; font-size: 16px; cursor: pointer;">
207
- arXiv MLCAD 2025
208
- </button>
209
- </a>
210
 
211
- <a href="mailto:hpai@bsc.es?subject=TuRTLe%20leaderboard%20new%20entry&body=Link%20to%20HuggingFace%20Model:" style="text-decoration: none;">
212
- <button style="background: #00674F; color: white; padding: 10px 14px; border-radius: 8px; border: none; font-size: 16px; cursor: pointer;">
213
- How to submit
214
- </button>
215
- </a>
216
- <p style="margin-top: 15px;">If you have any inquiries or wish to collaborate:
217
- <a href="mailto:hpai@bsc.es">hpai@bsc.es</a>
218
- </p>
219
- </div>
220
- """
221
- )
222
- gr.HTML(
223
- """
224
- <div style=" margin-top:-10px !important;">
225
- <p style="margin-bottom: 15px; text-align: start !important;">
226
- Welcome to the TuRTLe Model Leaderboard! TuRTLe is a
227
- <b>unified evaluation framework designed to systematically assess Large Language Models (LLMs) in RTL (Register-Transfer Level) generation</b>
228
- for hardware design.
229
- Evaluation criteria include <b>syntax correctness, functional accuracy, synthesizability, and post-synthesis quality</b>
230
- (PPA: Power, Performance, Area). TuRTLe integrates multiple benchmarks to highlight strengths and weaknesses of available LLMs.
231
- Use the filters below to explore different RTL benchmarks, simulators and models.
232
- </p>
233
- <p style="margin-top:10px; text-align:start !important;">
234
- <span style="font-variant:small-caps; font-weight:bold;">UPDATE (SEPT 2025):</span> Added <span>gpt-oss-20b</span> and <span>gpt-oss-120b</span> to the leaderboard
235
- </p>
236
- <p style="margin-top:-6px; text-align:start !important;">
237
- <span style="font-variant:small-caps; font-weight:bold;">UPDATE (JULY 2025):</span> Our TuRTLe paper was accepted to
238
- <a href="https://mlcad.org/symposium/2025/" target="_blank">MLCAD 2025</a> in September (Santa Cruz, CA), plus we've added Verilator as a new simulator alongside Icarus Verilog
239
- </p>
240
- <p style="margin-top: -6px; text-align: start !important;">
241
- <span style="font-variant: small-caps; font-weight: bold;">UPDATE (JUNE 2025):</span> We make our framework open-source on GitHub and we add 7 new recent models! For a total of 40 base and instruct models and 5 RTL benchmarks
242
- </p>
243
- </div>
244
- """
245
- )
246
- with gr.Tabs():
247
  with gr.Tab("Leaderboard"):
 
248
  with gr.Row(equal_height=True):
249
  with gr.Column(scale=4):
250
- task_radio = gr.Radio(
251
- choices=tasks, label="Select Task", value="Spec-to-RTL"
252
- )
253
  with gr.Column(scale=3):
254
  benchmark_radio = gr.Radio(
255
- choices=["All"] + s2r_benchs,
256
  label="Select Benchmark",
257
- value="All",
258
  )
259
  with gr.Column(scale=2, min_width=180):
260
  simulator_radio = gr.Radio(
261
- choices=["Icarus", "Verilator"],
262
- value="Icarus",
263
  label="Select Simulator",
264
  scale=1,
265
  )
266
 
 
267
  with gr.Row(equal_height=True):
268
  search_box = gr.Textbox(
269
  label="Search Model",
@@ -271,74 +74,61 @@ with gr.Blocks(
271
  scale=2,
272
  )
273
  model_type_dropdown = gr.Radio(
274
- choices=model_types,
275
  label="Select Model Type",
276
- value="All",
277
  scale=3,
278
  )
279
  params_slider = gr.Slider(
280
- minimum=df["Params"].min(),
281
- maximum=700,
282
- value=700,
283
  label="Max Params",
284
  step=1,
285
  scale=2,
286
  )
287
 
 
288
  leaderboard = gr.DataFrame(
289
- value=filter_leaderboard("Spec-to-RTL", "All", "All", "", 700),
 
 
290
  headers="first row",
291
  show_row_numbers=True,
292
  wrap=True,
293
- datatype=[
294
- "html",
295
- "html",
296
- ],
297
  interactive=False,
298
- column_widths=[
299
- "7%",
300
- "28%",
301
- "13%",
302
- "10%",
303
- "13%",
304
- "10%",
305
- "14%",
306
- ],
307
  elem_classes="dataframe-leaderboard",
308
  )
309
 
310
- gr.HTML(
311
- """
312
- <div id="lc-footnote" style="font-size: 13px; opacity: 0.6; margin-top: -5px; z-index:999; text-align: left;">
313
- <span style="font-weight: 600; opacity: 1;">†</span>
314
- <em>Line Completion</em> excludes “reasoning” models since this task targets quick auto-completion<br/>
315
- Additionally, for <em>Line Completion</em> and <em>Code Completion</em> benchmarks we use <b>Base</b> model variant (if available), and for <em>Spec-to-RTL</em> we use <b>Instruct</b> model variant
316
- </div>
317
- """
318
- )
319
 
 
320
  with gr.Tab("Plot View"):
321
  with gr.Row(equal_height=True):
322
- default_benchmark = s2r_benchs[0]
323
  bubble_benchmark = gr.Dropdown(
324
  choices=benchmarks,
325
  label="Select Benchmark",
326
  value=default_benchmark,
327
  elem_classes="gr-dropdown",
328
  )
329
- default_metric = non_rtl_metrics[0]
330
  bubble_metric = gr.Dropdown(
331
- choices=non_rtl_metrics,
332
  label="Select Metric",
333
  value=default_metric,
334
  )
335
  with gr.Row(equal_height=True):
336
  scatter_plot = gr.Plot(
337
- value=generate_scatter_plot(default_benchmark, default_metric),
338
  label="Bubble Chart",
339
  elem_id="full-width-plot",
340
  )
341
 
 
342
  with gr.Tab("Metrics Information"):
343
  with open("./static/metrics.md", "r") as file:
344
  gr.Markdown(
@@ -349,52 +139,12 @@ with gr.Blocks(
349
  ],
350
  elem_classes="metrics-page",
351
  )
352
- with gr.Tab("About Us"):
353
- gr.HTML(
354
- """
355
- <div style="max-width: 800px; margin: auto; padding: 20px; border: 1px solid #ccc; border-radius: 10px;">
356
- <div style="display: flex; justify-content: center; align-items: center; gap: 5%; margin-bottom: 20px;">
357
- <img src='/gradio_api/file=hpai_logo_grad.png' alt='HPAI Group Logo' style="width: 45%;"/>
358
- <img src='/gradio_api/file=bsc-logo.png' alt='BSC Logo' style="width: 25%;"/>
359
- </div>
360
-
361
- <p style="font-size: 16px; text-align: start;">
362
- The <b>High-Performance Artificial Intelligence (HPAI)</b> group is part of the
363
- <a href="https://bsc.es/" target="_blank">Barcelona Supercomputing Center (BSC)</a>.
364
- This leaderboard is maintained by HPAI as part of our commitment to <b>open science</b>.
365
- </p>
366
-
367
- <ul style="font-size: 16px; margin-bottom: 20px; margin-top: 20px;">
368
- <li><a href="https://hpai.bsc.es/" target="_blank">HPAI Website</a></li>
369
- <li><a href="https://github.com/HPAI-BSC/" target="_blank">HPAI GitHub Organization Page</a></li>
370
- <li><a href="https://huggingface.co/HPAI-BSC/" target="_blank">HPAI Hugging Face Organization Page</a></li>
371
- </ul>
372
 
373
- <p style="font-size: 16px; margin-top: 15px;">
374
- Feel free to contact us:
375
- </p>
376
 
377
- <p style="font-size: 16px;">Email: <a href="mailto:hpai@bsc.es"><b>hpai@bsc.es</b></a></p>
378
- </div>
379
- """
380
- )
381
  with gr.Tab("References"):
382
- gr.HTML(
383
- """
384
- <div style="max-width: 800px; margin: auto; padding: 20px; border: 1px solid #ccc; border-radius: 10px;">
385
- <ul style="font-size: 16px; margin-bottom: 20px; margin-top: 20px;">
386
- <li><a href="https://github.com/bigcode-project/bigcode-evaluation-harness" target="_blank">Code Generation LM Evaluation Harness</a></li>
387
- <li>Williams, S. Icarus Verilog [Computer software]. <a href="https://github.com/steveicarus/iverilog" target="_blank">https://github.com/steveicarus/iverilog</a></li>
388
- <li>Snyder, W., Wasson, P., Galbi, D., & et al. Verilator [Computer software]. <a href="https://github.com/verilator/verilator" target="_blank">https://github.com/verilator/verilator</a></li>
389
- <li>RTL-Repo: Allam and M. Shalan, “Rtl-repo: A benchmark for evaluating llms on large-scale rtl design projects,” in 2024 IEEE LLM Aided Design Workshop (LAD). IEEE, 2024, pp. 1–5.</li>
390
- <li>VeriGen: S. Thakur, B. Ahmad, H. Pearce, B. Tan, B. Dolan-Gavitt, R. Karri, and S. Garg, “Verigen: A large language model for verilog code generation,” ACM Transactions on Design Automation of Electronic Systems, vol. 29, no. 3, pp. 1–31, 2024. </li>
391
- <li>VerilogEval (I): M. Liu, N. Pinckney, B. Khailany, and H. Ren, “Verilogeval: Evaluating large language models for verilog code generation,” in 2023 IEEE/ACM International Conference on Computer Aided Design (ICCAD). IEEE, 2023, pp. 1–8.</li>
392
- <li>VerilogEval (II): N. Pinckney, C. Batten, M. Liu, H. Ren, and B. Khailany, “Revisiting VerilogEval: A Year of Improvements in Large-Language Models for Hardware Code Generation,” ACM Trans. Des. Autom. Electron. Syst., feb 2025. https://doi.org/10.1145/3718088</li>
393
- <li>RTLLM: Y. Lu, S. Liu, Q. Zhang, and Z. Xie, “Rtllm: An open-source benchmark for design rtl generation with large language model,” in 2024 29th Asia and South Pacific Design Automation Conference (ASP-DAC). IEEE, 2024, pp. 722–727.</li>
394
- </ul>
395
- </div>
396
- """
397
- )
398
 
399
  with gr.Row():
400
  with gr.Accordion("📙 Citation", open=False):
@@ -406,143 +156,22 @@ with gr.Blocks(
406
  show_copy_button=True,
407
  )
408
 
409
- # event handlers, ugly way but it works
410
- task_radio.change(
411
- fn=update_benchmarks_by_task,
412
- inputs=[task_radio],
413
- outputs=[benchmark_radio, leaderboard],
414
- )
415
- benchmark_radio.change(
416
- fn=filter_leaderboard,
417
- inputs=[
418
- task_radio,
419
- benchmark_radio,
420
- model_type_dropdown,
421
- search_box,
422
- params_slider,
423
- ],
424
- outputs=leaderboard,
425
- )
426
- model_type_dropdown.change(
427
- fn=filter_leaderboard,
428
- inputs=[
429
- task_radio,
430
- benchmark_radio,
431
- model_type_dropdown,
432
- search_box,
433
- params_slider,
434
- ],
435
- outputs=leaderboard,
436
- )
437
- search_box.change(
438
- fn=filter_leaderboard,
439
- inputs=[
440
- task_radio,
441
- benchmark_radio,
442
- model_type_dropdown,
443
- search_box,
444
- params_slider,
445
- ],
446
- outputs=leaderboard,
447
- )
448
- params_slider.change(
449
- fn=filter_leaderboard,
450
- inputs=[
451
- task_radio,
452
- benchmark_radio,
453
- model_type_dropdown,
454
- search_box,
455
- params_slider,
456
- ],
457
- outputs=leaderboard,
458
- )
459
-
460
- def on_benchmark_change(benchmark, _):
461
- if benchmark == "RTL-Repo":
462
- metric = "Exact Matching (EM)"
463
- return gr.update(choices=rtl_metrics, value=metric), generate_scatter_plot(
464
- benchmark, metric
465
- )
466
- else:
467
- metric = non_rtl_metrics[0]
468
- return gr.update(
469
- choices=non_rtl_metrics[:-1], value=metric
470
- ), generate_scatter_plot(benchmark, metric)
471
-
472
- def on_metric_change(benchmark, metric):
473
- benchmark, metric = handle_special_cases(benchmark, metric)
474
- fig = generate_scatter_plot(benchmark, metric)
475
- return gr.update(value=benchmark), fig
476
-
477
- def on_simulator_change(
478
- simulator,
479
- task,
480
- benchmark,
481
- model_type,
482
- search,
483
- max_params,
484
- plot_bench,
485
- plot_metric,
486
- ):
487
- global df, df_agg
488
- if simulator == "Icarus":
489
- df, df_agg = df_icarus, df_agg_icarus
490
- else:
491
- df, df_agg = df_verilator, df_agg_verilator
492
-
493
- leaderboard_df = filter_leaderboard(
494
- task, benchmark, model_type, search, max_params
495
- )
496
- fig = generate_scatter_plot(plot_bench, plot_metric)
497
- return leaderboard_df, fig
498
-
499
- bubble_benchmark.change(
500
- fn=on_benchmark_change,
501
- inputs=[bubble_benchmark, bubble_metric],
502
- outputs=[bubble_metric, scatter_plot],
503
- js=""" // this is to avoid resetting user scroll each time a plot is re-generated
504
- (benchmark, metric) => {
505
- let scrollY = window.scrollY;
506
- const observer = new MutationObserver(() => {
507
- window.scrollTo(0, scrollY);
508
- observer.disconnect();
509
- });
510
- observer.observe(document.getElementById('full-width-plot'), { childList: true });
511
- return [benchmark, metric];
512
- }
513
- """,
514
- )
515
-
516
- bubble_metric.change(
517
- fn=on_metric_change,
518
- inputs=[bubble_benchmark, bubble_metric],
519
- outputs=[bubble_benchmark, scatter_plot],
520
- js=""" // this is to avoid resetting user scroll each time a plot is re-generated
521
- (benchmark, metric) => {
522
- let scrollY = window.scrollY;
523
- const observer = new MutationObserver(() => {
524
- window.scrollTo(0, scrollY);
525
- observer.disconnect();
526
- });
527
- observer.observe(document.getElementById('full-width-plot'), { childList: true });
528
- return [benchmark, metric];
529
- }
530
- """,
531
- )
532
-
533
- simulator_radio.change(
534
- fn=on_simulator_change,
535
- inputs=[
536
- simulator_radio,
537
- task_radio,
538
- benchmark_radio,
539
- model_type_dropdown,
540
- search_box,
541
- params_slider,
542
- bubble_benchmark,
543
- bubble_metric,
544
- ],
545
- outputs=[leaderboard, scatter_plot],
546
  )
547
 
548
 
 
1
  import sys
2
 
3
  import gradio as gr
 
 
4
  from gradio.themes.utils import colors
5
 
6
+ from config import constants as C
7
+ from handlers.leaderboard_handlers import create_leaderboard_handlers
8
+ from results.parse import get_metadata, parse_agg, read_dataframe
9
+ from src.data_processing import filter_leaderboard, generate_scatter_plot
10
+ from src.models import Simulator
11
  from static.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT
12
+ from static.html_content import (
13
+ ABOUT_US_HTML,
14
+ HEADER_HTML,
15
+ INTRO_HTML,
16
+ LC_FOOTNOTE_HTML,
17
+ NAV_BUTTONS_HTML,
18
+ REFERENCES_HTML,
19
+ )
20
  from style.css_html_js import custom_css
21
 
22
+ with gr.Blocks(css=custom_css, theme=gr.themes.Default(primary_hue=colors.emerald)) as app:
23
+ # Load csv results
24
+ df_icarus = read_dataframe(C.ICARUS_RESULTS)
25
+ df_verilator = read_dataframe(C.VERILATOR_RESULTS)
 
 
 
 
 
26
 
27
+ # Load aggregated scores
28
+ df_agg_icarus = parse_agg(C.ICARUS_AGG)
29
+ df_agg_verilator = parse_agg(C.VERILATOR_AGG)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
+ # columns of the csvs
32
+ benchmarks, metrics, default_metric = get_metadata(df_icarus)
33
 
34
+ # Each time we select a simulator, we need to use that sim's dataframe
35
+ state = Simulator(
36
+ icarus_df=df_icarus,
37
+ icarus_agg=df_agg_icarus,
38
+ verilator_df=df_verilator,
39
+ verilator_agg=df_agg_verilator,
40
  )
 
 
 
 
 
 
 
 
 
 
41
 
42
+ # Header view
43
+ gr.HTML(HEADER_HTML)
44
+ gr.HTML(NAV_BUTTONS_HTML)
45
+ gr.HTML(INTRO_HTML)
 
46
 
47
+ # Main view
48
+ with gr.Tabs() as tabs:
49
+ # Leaderboard
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  with gr.Tab("Leaderboard"):
51
+ # 1st row filters (select task, benchmark and sim)
52
  with gr.Row(equal_height=True):
53
  with gr.Column(scale=4):
54
+ task_radio = gr.Radio(choices=C.TASKS, label="Select Task", value=C.DEFAULT_TASK)
 
 
55
  with gr.Column(scale=3):
56
  benchmark_radio = gr.Radio(
57
+ choices=[C.DEFAULT_BENCHMARK] + C.S2R_BENCHMARKS,
58
  label="Select Benchmark",
59
+ value=C.DEFAULT_BENCHMARK,
60
  )
61
  with gr.Column(scale=2, min_width=180):
62
  simulator_radio = gr.Radio(
63
+ choices=C.SIMULATORS,
64
+ value=C.SIMULATORS[0],
65
  label="Select Simulator",
66
  scale=1,
67
  )
68
 
69
+ # 2nd row filters (search, model type, params)
70
  with gr.Row(equal_height=True):
71
  search_box = gr.Textbox(
72
  label="Search Model",
 
74
  scale=2,
75
  )
76
  model_type_dropdown = gr.Radio(
77
+ choices=C.MODEL_TYPES,
78
  label="Select Model Type",
79
+ value=C.DEFAULT_MODEL_TYPE,
80
  scale=3,
81
  )
82
  params_slider = gr.Slider(
83
+ minimum=state.get_current_df()["Params"].min(),
84
+ maximum=C.DEFAULT_MAX_PARAMS,
85
+ value=C.DEFAULT_MAX_PARAMS,
86
  label="Max Params",
87
  step=1,
88
  scale=2,
89
  )
90
 
91
+ # main leaderboard content
92
  leaderboard = gr.DataFrame(
93
+ value=filter_leaderboard(
94
+ C.DEFAULT_TASK, C.DEFAULT_BENCHMARK, C.DEFAULT_MODEL_TYPE, "", C.DEFAULT_MAX_PARAMS, state
95
+ ),
96
  headers="first row",
97
  show_row_numbers=True,
98
  wrap=True,
99
+ datatype=["html", "html"],
 
 
 
100
  interactive=False,
101
+ column_widths=["7%", "28%", "13%", "10%", "13%", "10%", "14%"],
 
 
 
 
 
 
 
 
102
  elem_classes="dataframe-leaderboard",
103
  )
104
 
105
+ # caption for the Base vs Instruct models
106
+ gr.HTML(LC_FOOTNOTE_HTML)
 
 
 
 
 
 
 
107
 
108
+ # all plots using Plotly
109
  with gr.Tab("Plot View"):
110
  with gr.Row(equal_height=True):
111
+ default_benchmark = C.S2R_BENCHMARKS[0]
112
  bubble_benchmark = gr.Dropdown(
113
  choices=benchmarks,
114
  label="Select Benchmark",
115
  value=default_benchmark,
116
  elem_classes="gr-dropdown",
117
  )
118
+ default_metric = C.NON_RTL_METRICS[0]
119
  bubble_metric = gr.Dropdown(
120
+ choices=C.NON_RTL_METRICS,
121
  label="Select Metric",
122
  value=default_metric,
123
  )
124
  with gr.Row(equal_height=True):
125
  scatter_plot = gr.Plot(
126
+ value=generate_scatter_plot(default_benchmark, default_metric, state),
127
  label="Bubble Chart",
128
  elem_id="full-width-plot",
129
  )
130
 
131
+ # Markdown / Latex explaining our metrics
132
  with gr.Tab("Metrics Information"):
133
  with open("./static/metrics.md", "r") as file:
134
  gr.Markdown(
 
139
  ],
140
  elem_classes="metrics-page",
141
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
 
143
+ with gr.Tab("About Us"):
144
+ gr.HTML(ABOUT_US_HTML)
 
145
 
 
 
 
 
146
  with gr.Tab("References"):
147
+ gr.HTML(REFERENCES_HTML)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
149
  with gr.Row():
150
  with gr.Accordion("📙 Citation", open=False):
 
156
  show_copy_button=True,
157
  )
158
 
159
+ # all event handlers are declared at /handlers/
160
+ # if you need to add interactivity, then you'll need to add one!
161
+ create_leaderboard_handlers(
162
+ filter_leaderboard_fn=filter_leaderboard,
163
+ generate_scatter_plot_fn=generate_scatter_plot,
164
+ task_radio=task_radio,
165
+ benchmark_radio=benchmark_radio,
166
+ model_type_dropdown=model_type_dropdown,
167
+ search_box=search_box,
168
+ params_slider=params_slider,
169
+ bubble_benchmark=bubble_benchmark,
170
+ bubble_metric=bubble_metric,
171
+ scatter_plot=scatter_plot,
172
+ leaderboard=leaderboard,
173
+ simulator_radio=simulator_radio,
174
+ state=state,
175
  )
176
 
177
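app.py now threads a `Simulator` state object (from `src/models.py`, which is not part of this commit) through the filter and plot functions instead of the old module-level `df` / `df_agg` globals. A minimal sketch consistent with how app.py uses it — the constructor kwargs and `get_current_df()` are taken from the diff above; the `current` field and `get_current_agg()` are assumptions — could look like:

# Hypothetical sketch of src/models.Simulator; the real class is not shown in this commit.
from dataclasses import dataclass
import pandas as pd

@dataclass
class Simulator:
    icarus_df: pd.DataFrame
    icarus_agg: pd.DataFrame
    verilator_df: pd.DataFrame
    verilator_agg: pd.DataFrame
    current: str = "Icarus"          # assumed: tracks the simulator selected in the UI

    def get_current_df(self) -> pd.DataFrame:
        # app.py calls this to seed the "Max Params" slider
        return self.icarus_df if self.current == "Icarus" else self.verilator_df

    def get_current_agg(self) -> pd.DataFrame:
        # assumed counterpart for the aggregated-score tables
        return self.icarus_agg if self.current == "Icarus" else self.verilator_agg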
 
config/constants.py ADDED
@@ -0,0 +1,58 @@
1
+ RESULTS_DIR = "results"
2
+ ICARUS_RESULTS = f"{RESULTS_DIR}/results_icarus.json"
3
+ VERILATOR_RESULTS = f"{RESULTS_DIR}/results_verilator.json"
4
+ ICARUS_AGG = f"{RESULTS_DIR}/aggregated_scores_icarus.csv"
5
+ VERILATOR_AGG = f"{RESULTS_DIR}/aggregated_scores_verilator.csv"
6
+
7
+ TASKS = ["Spec-to-RTL", "Code Completion", "Line Completion †"]
8
+ S2R_BENCHMARKS = ["VerilogEval S2R", "RTLLM"]
9
+ CC_BENCHMARKS = ["VerilogEval MC", "VeriGen"]
10
+ LC_BENCHMARKS = ["RTL-Repo"]
11
+
12
+ MODEL_TYPES = ["All", "General 🟢", "Coding 🔵", "RTL-Specific 🔴"]
13
+ TYPE_EMOJI = {"RTL-Specific": "🔴", "General": "🟢", "Coding": "🔵"}
14
+
15
+ NON_RTL_METRICS = ["Syntax (STX)", "Functionality (FNC)", "Synthesis (SYN)", "Power", "Performance", "Area"]
16
+ RTL_METRICS = ["Exact Matching (EM)"]
17
+
18
+ COLUMN_MAPPINGS = {
19
+ "Params": "Parameters (B)",
20
+ "Syntax (STX)": "Syntax",
21
+ "Functionality (FNC)": "Functionality",
22
+ "Synthesis (SYN)": "Synthesis",
23
+ "Post-Synthesis (PSQ)": "Post-Synthesis",
24
+ }
25
+
26
+ COLUMN_ORDER = [
27
+ "Type",
28
+ "Model",
29
+ "Parameters (B)",
30
+ "Syntax",
31
+ "Functionality",
32
+ "Synthesis",
33
+ "Post-Synthesis",
34
+ ]
35
+
36
+ TYPE_COLORS = {"General": "green", "Coding": "yellow", "RTL-Specific": "blue"}
37
+
38
+ Y_AXIS_LIMITS = {
39
+ "Functionality (FNC)": [5, 90],
40
+ "Syntax (STX)": [20, 100],
41
+ "Synthesis (SYN)": [5, 90],
42
+ "Power": [0, 50],
43
+ "Performance": [0, 50],
44
+ "Area": [0, 50],
45
+ "Exact Matching (EM)": [0, 50],
46
+ }
47
+
48
+ SCATTER_PLOT_X_TICKS = {
49
+ "tickvals": [8, 14, 32, 72, 200, 700],
50
+ "ticktext": ["8", "14", "32", "72", "200", "700"],
51
+ }
52
+
53
+ DEFAULT_MAX_PARAMS = 700
54
+ DEFAULT_TASK = "Spec-to-RTL"
55
+ DEFAULT_BENCHMARK = "All"
56
+ DEFAULT_MODEL_TYPE = "All"
57
+
58
+ SIMULATORS = ["Icarus", "Verilator"]
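For reference, these values map one-to-one onto literals that previously lived inline in app.py and utils.py; the plotting code resolves axis limits and colors roughly like this (illustrative snippet, with the `[0, 80]` fallback copied from the old generate_scatter_plot):

# Illustrative lookup pattern against the new constants module.
from config.constants import Y_AXIS_LIMITS, SCATTER_PLOT_X_TICKS, TYPE_COLORS

metric = "Power"
y_range = Y_AXIS_LIMITS.get(metric, [0, 80])        # -> [0, 50]
tickvals = SCATTER_PLOT_X_TICKS["tickvals"]         # -> [8, 14, 32, 72, 200, 700]
color = TYPE_COLORS.get("Coding", "gray")           # -> "yellow"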
config/model_metadata.py ADDED
@@ -0,0 +1,112 @@
1
+ from dataclasses import dataclass
2
+ from typing import Literal
3
+
4
+
5
+ @dataclass
6
+ class ModelMetadata:
7
+ url: str # HF model card
8
+ params: float # in B
9
+ model_type: Literal["General", "Coding", "RTL-Specific"]
10
+ release: Literal["V1", "V2", "V3"] # release of the leaderboard for which the model was included
11
+ model_arch: Literal["Dense", "Reasoning"] # to distinguish between reasoners and non-reasoners
12
+
13
+
14
+ # fmt: off
15
+ MODELS = {
16
+ "DeepSeek R1-0528": ModelMetadata(
17
+ "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528", 685, "General", "V2", "Reasoning"
18
+ ),
19
+ "DeepSeek R1": ModelMetadata(
20
+ "https://huggingface.co/deepseek-ai/DeepSeek-R1", 685, "General", "V1", "Reasoning"
21
+ ),
22
+ "Llama 3.1 405B": ModelMetadata(
23
+ "https://huggingface.co/RedHatAI/Meta-Llama-3.1-405B-FP8", 406, "General", "V1", "Dense"
24
+ ),
25
+ "Qwen3 236B A22B": ModelMetadata(
26
+ "https://huggingface.co/Qwen/Qwen3-235B-A22B", 235, "General", "V2", "Reasoning"
27
+ ),
28
+ "Llama 3.(1-3) 70B": ModelMetadata(
29
+ "https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct", 70.6, "General", "V1", "Dense"
30
+ ),
31
+ "Qwen2.5 72B": ModelMetadata(
32
+ "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct", 72.7, "General", "V1", "Dense"
33
+ ),
34
+ "QwQ 32B": ModelMetadata(
35
+ "https://huggingface.co/Qwen/QwQ-32B", 32.8, "General", "V2", "Reasoning"
36
+ ),
37
+ "Qwen2.5 32B": ModelMetadata(
38
+ "https://huggingface.co/Qwen/Qwen2.5-32B", 32.5, "General", "V1", "Dense"
39
+ ),
40
+ "StarChat2 15B v0.1": ModelMetadata(
41
+ "https://huggingface.co/HuggingFaceH4/starchat2-15b-v0.1", 16, "General", "V1", "Dense"
42
+ ),
43
+ "DeepSeek R1 Distill Qwen 14B": ModelMetadata(
44
+ "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B", 14.8, "General", "V1", "Reasoning"
45
+ ),
46
+ "CodeLlama 70B": ModelMetadata(
47
+ "https://huggingface.co/codellama/CodeLlama-70b-hf", 69, "Coding", "V1", "Dense"
48
+ ),
49
+ "QwenCoder 2.5 32B": ModelMetadata(
50
+ "https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct", 32.5, "Coding", "V1", "Dense"
51
+ ),
52
+ "DeepSeek Coder 33B": ModelMetadata(
53
+ "https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct", 33.3, "Coding", "V1", "Dense"
54
+ ),
55
+ "QwenCoder 2.5 14B": ModelMetadata(
56
+ "https://huggingface.co/Qwen/Qwen2.5-Coder-14B-Instruct", 14.7, "Coding", "V1", "Dense"
57
+ ),
58
+ "DeepCoder 14B": ModelMetadata(
59
+ "https://huggingface.co/agentica-org/DeepCoder-14B-Preview", 14.8, "Coding", "V2", "Reasoning"
60
+ ),
61
+ "OpenCoder 8B": ModelMetadata(
62
+ "https://huggingface.co/infly/OpenCoder-8B-Instruct", 7.77, "Coding", "V1", "Dense"
63
+ ),
64
+ "SeedCoder 8B": ModelMetadata(
65
+ "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Instruct", 8.25, "Coding", "V2", "Dense"
66
+ ),
67
+ "SeedCoder 8B Reasoning": ModelMetadata(
68
+ "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Reasoning-bf16", 8.25, "Coding", "V2", "Reasoning"
69
+ ),
70
+ "QwenCoder 2.5 7B": ModelMetadata(
71
+ "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct", 7.61, "Coding", "V1", "Dense"
72
+ ),
73
+ "DeepSeek Coder 6.7B": ModelMetadata(
74
+ "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct", 6.74, "Coding", "V1", "Dense"
75
+ ),
76
+ "HaVen-CodeQwen": ModelMetadata(
77
+ "https://huggingface.co/yangyiyao/HaVen-CodeQwen", 7.25, "RTL-Specific", "V1", "Dense"
78
+ ),
79
+ "CodeV R1 Distill Qwen 7B": ModelMetadata(
80
+ "https://huggingface.co/zhuyaoyu/CodeV-R1-Distill-Qwen-7B", 7.62, "RTL-Specific", "V2", "Reasoning"
81
+ ),
82
+ "CodeV-CL-7B": ModelMetadata(
83
+ "https://huggingface.co/yang-z/CodeV-CL-7B", 6.74, "RTL-Specific", "V1", "Dense"
84
+ ),
85
+ "CodeV-QW-7B": ModelMetadata(
86
+ "https://huggingface.co/yang-z/CodeV-QW-7B", 7.25, "RTL-Specific", "V1", "Dense"
87
+ ),
88
+ "CodeV-DS-6.7B": ModelMetadata(
89
+ "https://huggingface.co/yang-z/CodeV-DS-6.7B", 6.74, "RTL-Specific", "V1", "Dense"
90
+ ),
91
+ "RTLCoder Mistral": ModelMetadata(
92
+ "https://huggingface.co/ishorn5/RTLCoder-v1.1", 7.24, "RTL-Specific", "V1", "Dense"
93
+ ),
94
+ "RTLCoder DeepSeek": ModelMetadata(
95
+ "https://huggingface.co/ishorn5/RTLCoder-Deepseek-v1.1", 6.74, "RTL-Specific", "V1", "Dense"
96
+ ),
97
+ "OriGen": ModelMetadata(
98
+ "https://huggingface.co/henryen/OriGen", 6.74, "RTL-Specific", "V1", "Dense"
99
+ ),
100
+ "Qwen3 Coder 480B A35B": ModelMetadata(
101
+ "https://huggingface.co/Qwen/Qwen3-Coder-480B-A35B-Instruct", 480, "Coding", "V2", "Dense"
102
+ ),
103
+ "Magistral Small 2506": ModelMetadata(
104
+ "https://huggingface.co/mistralai/Magistral-Small-2506", 23.6, "General", "V2", "Reasoning"
105
+ ),
106
+ "gpt-oss-20b": ModelMetadata(
107
+ "https://huggingface.co/openai/gpt-oss-20b", 21.5, "General", "V2", "Reasoning"
108
+ ),
109
+ "gpt-oss-120b": ModelMetadata(
110
+ "https://huggingface.co/openai/gpt-oss-120b", 120, "General", "V2", "Reasoning"
111
+ ),
112
+ }
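Entries are keyed by display name, which must match the model names appearing in the results files; `get_model_metadata` in results/parse.py (below) unpacks them into the tuple the parser previously built by hand. A lookup looks like:

# Example lookup; keys must match the model names used in the results files.
from config.model_metadata import MODELS

meta = MODELS["CodeV R1 Distill Qwen 7B"]
print(meta.url)                        # https://huggingface.co/zhuyaoyu/CodeV-R1-Distill-Qwen-7B
print(meta.params)                     # 7.62
print(meta.release, meta.model_arch)   # V2 Reasoning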
results/parse.py CHANGED
@@ -1,238 +1,11 @@
1
- import csv
2
- import json
3
- import locale
4
  import os
5
  import sys
6
- from typing import Dict, Union
7
-
8
  import pandas as pd
9
 
10
- model_details = {
11
- "DeepSeek R1-0528": (
12
- "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528",
13
- 685,
14
- "General",
15
- "V2",
16
- "Reasoning", # "Dense" or "Reasoning"
17
- ),
18
- "DeepSeek R1": (
19
- "https://huggingface.co/deepseek-ai/DeepSeek-R1",
20
- 685,
21
- "General",
22
- "V1",
23
- "Reasoning",
24
- ),
25
- "Llama 3.1 405B": (
26
- "https://huggingface.co/RedHatAI/Meta-Llama-3.1-405B-FP8",
27
- 406,
28
- "General",
29
- "V1",
30
- "Dense",
31
- ),
32
- "Qwen3 236B A22B": (
33
- "https://huggingface.co/Qwen/Qwen3-235B-A22B",
34
- 235,
35
- "General",
36
- "V2",
37
- "Reasoning",
38
- ),
39
- "Llama 3.(1-3) 70B": (
40
- "https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct",
41
- 70.6,
42
- "General",
43
- "V1",
44
- "Dense",
45
- ),
46
- "Qwen2.5 72B": (
47
- "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",
48
- 72.7,
49
- "General",
50
- "V1",
51
- "Dense",
52
- ),
53
- "QwQ 32B": (
54
- "https://huggingface.co/Qwen/QwQ-32B",
55
- 32.8,
56
- "General",
57
- "V2",
58
- "Reasoning",
59
- ),
60
- "Qwen2.5 32B": (
61
- "https://huggingface.co/Qwen/Qwen2.5-32B",
62
- 32.5,
63
- "General",
64
- "V1",
65
- "Dense",
66
- ),
67
- "StarChat2 15B v0.1": (
68
- "https://huggingface.co/HuggingFaceH4/starchat2-15b-v0.1",
69
- 16,
70
- "General",
71
- "V1",
72
- "Dense",
73
- ),
74
- "DeepSeek R1 Distill Qwen 14B": (
75
- "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
76
- 14.8,
77
- "General",
78
- "V1",
79
- "Reasoning",
80
- ),
81
- "CodeLlama 70B": (
82
- "https://huggingface.co/codellama/CodeLlama-70b-hf",
83
- 69,
84
- "Coding",
85
- "V1",
86
- "Dense",
87
- ),
88
- "QwenCoder 2.5 32B": (
89
- "https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct",
90
- 32.5,
91
- "Coding",
92
- "V1",
93
- "Dense",
94
- ),
95
- "DeepSeek Coder 33B": (
96
- "https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct",
97
- 33.3,
98
- "Coding",
99
- "V1",
100
- "Dense",
101
- ),
102
- "QwenCoder 2.5 14B": (
103
- "https://huggingface.co/Qwen/Qwen2.5-Coder-14B-Instruct",
104
- 14.7,
105
- "Coding",
106
- "V1",
107
- "Dense",
108
- ),
109
- "DeepCoder 14B": (
110
- "https://huggingface.co/agentica-org/DeepCoder-14B-Preview",
111
- 14.8,
112
- "Coding",
113
- "V2",
114
- "Reasoning",
115
- ),
116
- "OpenCoder 8B": (
117
- "https://huggingface.co/infly/OpenCoder-8B-Instruct",
118
- 7.77,
119
- "Coding",
120
- "V1",
121
- "Dense",
122
- ),
123
- "SeedCoder 8B": (
124
- "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Instruct",
125
- 8.25,
126
- "Coding",
127
- "V2",
128
- "Dense",
129
- ),
130
- "SeedCoder 8B Reasoning": (
131
- "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Reasoning-bf16",
132
- 8.25,
133
- "Coding",
134
- "V2",
135
- "Reasoning",
136
- ),
137
- "QwenCoder 2.5 7B": (
138
- "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct",
139
- 7.61,
140
- "Coding",
141
- "V1",
142
- "Dense",
143
- ),
144
- "DeepSeek Coder 6.7B": (
145
- "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct",
146
- 6.74,
147
- "Coding",
148
- "V1",
149
- "Dense",
150
- ),
151
- "HaVen-CodeQwen": (
152
- "https://huggingface.co/yangyiyao/HaVen-CodeQwen",
153
- 7.25,
154
- "RTL-Specific",
155
- "V1",
156
- "Dense",
157
- ),
158
- "CodeV R1 Distill Qwen 7B": (
159
- "https://huggingface.co/zhuyaoyu/CodeV-R1-Distill-Qwen-7B",
160
- 7.62,
161
- "RTL-Specific",
162
- "V2",
163
- "Reasoning",
164
- ),
165
- "CodeV-CL-7B": (
166
- "https://huggingface.co/yang-z/CodeV-CL-7B",
167
- 6.74,
168
- "RTL-Specific",
169
- "V1",
170
- "Dense",
171
- ),
172
- "CodeV-QW-7B": (
173
- "https://huggingface.co/yang-z/CodeV-QW-7B",
174
- 7.25,
175
- "RTL-Specific",
176
- "V1",
177
- "Dense",
178
- ),
179
- "CodeV-DS-6.7B": (
180
- "https://huggingface.co/yang-z/CodeV-DS-6.7B",
181
- 6.74,
182
- "RTL-Specific",
183
- "V1",
184
- "Dense",
185
- ),
186
- "RTLCoder Mistral": (
187
- "https://huggingface.co/ishorn5/RTLCoder-v1.1",
188
- 7.24,
189
- "RTL-Specific",
190
- "V1",
191
- "Dense",
192
- ),
193
- "RTLCoder DeepSeek": (
194
- "https://huggingface.co/ishorn5/RTLCoder-Deepseek-v1.1",
195
- 6.74,
196
- "RTL-Specific",
197
- "V1",
198
- "Dense",
199
- ),
200
- "OriGen": (
201
- "https://huggingface.co/henryen/OriGen",
202
- 6.74,
203
- "RTL-Specific",
204
- "V1",
205
- "Dense",
206
- ),
207
- "Qwen3 Coder 480B A35B": (
208
- "https://huggingface.co/Qwen/Qwen3-Coder-480B-A35B-Instruct",
209
- 480,
210
- "Coding",
211
- "V2",
212
- "Dense",
213
- ),
214
- "Magistral Small 2506": (
215
- "https://huggingface.co/mistralai/Magistral-Small-2506",
216
- 23.6,
217
- "General",
218
- "V2",
219
- "Reasoning",
220
- ),
221
- "gpt-oss-20b": (
222
- "https://huggingface.co/openai/gpt-oss-20b",
223
- 21.5,
224
- "General",
225
- "V2",
226
- "Reasoning",
227
- ),
228
- "gpt-oss-120b": (
229
- "https://huggingface.co/openai/gpt-oss-120b",
230
- 120,
231
- "General",
232
- "V2",
233
- "Reasoning",
234
- ),
235
- }
236
 
237
 
238
  def get_headers(reader, agg=False) -> Union[list, list]:
@@ -248,15 +21,19 @@ def get_headers(reader, agg=False) -> Union[list, list]:
248
  return metrics, benchs
249
 
250
 
251
- def get_model_params_and_url(model) -> Union[str, str, float, str, str]:
252
- if model not in model_details:
253
- return "-", 0.0, "-", "-", "-"
254
- url = model_details[model][0]
255
- params = model_details[model][1]
256
- type = model_details[model][2]
257
- release = model_details[model][3]
258
- reasoning = model_details[model][4]
259
- return url, params, type, release, reasoning
 
 
 
 
260
 
261
 
262
  def parse_results(csv_path: str) -> list[dict]:
@@ -275,7 +52,7 @@ def parse_results(csv_path: str) -> list[dict]:
275
  model = row[0]
276
  if not model:
277
  continue
278
- url, params, type, release, reasoning = get_model_params_and_url(model)
279
  models.append(model)
280
  row = row[1:]
281
  ctr = 0
@@ -294,7 +71,6 @@ def parse_results(csv_path: str) -> list[dict]:
294
  record["Thinking"] = reasoning
295
  dataset.append(record)
296
  ctr += 1
297
- print(models)
298
  return dataset
299
 
300
 
@@ -318,9 +94,7 @@ def read_json(json_path: str = "results/results_icarus.json"):
318
  return data
319
 
320
 
321
- def read_data(
322
- json_path: str = "results/results_icarus.json",
323
- ) -> tuple[pd.DataFrame, list, list, str]:
324
  data = read_json(json_path)
325
  df = pd.DataFrame(data)
326
  df.rename(
@@ -334,11 +108,21 @@ def read_data(
334
  inplace=True,
335
  )
336
  df["Params"] = pd.to_numeric(df["Params"], errors="coerce")
 
 
 
 
337
  benchmarks = sorted(df["Benchmark"].unique().tolist(), reverse=True)
338
  metrics = df["Metric"].unique().tolist()
339
- default_metric = (
340
- "Functionality (FNC)" if "Functionality (FNC)" in metrics else metrics[0]
341
- )
 
 
 
 
 
 
342
  return df, benchmarks, metrics, default_metric
343
 
344
 
 
 
 
 
1
  import os
2
  import sys
3
+ import csv
4
+ import json
5
  import pandas as pd
6
 
7
+ from typing import Dict, Union
8
+ from config.model_metadata import MODELS
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
 
11
  def get_headers(reader, agg=False) -> Union[list, list]:
 
21
  return metrics, benchs
22
 
23
 
24
+ def get_model_metadata(model_key: str) -> tuple[str, float, str, str, str]:
25
+ try:
26
+ model_metadata = MODELS[model_key]
27
+ except KeyError:
28
+ raise KeyError(f"Unknown model: {model_key}")
29
+
30
+ return (
31
+ model_metadata.url,
32
+ model_metadata.params,
33
+ model_metadata.model_type,
34
+ model_metadata.release,
35
+ model_metadata.model_arch,
36
+ )
37
 
38
 
39
  def parse_results(csv_path: str) -> list[dict]:
 
52
  model = row[0]
53
  if not model:
54
  continue
55
+ url, params, type, release, reasoning = get_model_metadata(model)
56
  models.append(model)
57
  row = row[1:]
58
  ctr = 0
 
71
  record["Thinking"] = reasoning
72
  dataset.append(record)
73
  ctr += 1
 
74
  return dataset
75
 
76
 
 
94
  return data
95
 
96
 
97
+ def read_dataframe(json_path: str) -> pd.DataFrame:
 
 
98
  data = read_json(json_path)
99
  df = pd.DataFrame(data)
100
  df.rename(
 
108
  inplace=True,
109
  )
110
  df["Params"] = pd.to_numeric(df["Params"], errors="coerce")
111
+ return df
112
+
113
+
114
+ def get_metadata(df: pd.DataFrame) -> tuple[list, list, str]:
115
  benchmarks = sorted(df["Benchmark"].unique().tolist(), reverse=True)
116
  metrics = df["Metric"].unique().tolist()
117
+ default_metric = "Functionality (FNC)" if "Functionality (FNC)" in metrics else metrics[0]
118
+ return benchmarks, metrics, default_metric
119
+
120
+
121
+ def read_data(
122
+ json_path: str = "results/results_icarus.json",
123
+ ) -> tuple[pd.DataFrame, list, list, str]:
124
+ df = read_dataframe(json_path)
125
+ benchmarks, metrics, default_metric = get_metadata(df)
126
  return df, benchmarks, metrics, default_metric
127
 
128
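The old read_data() is split into read_dataframe() (JSON to DataFrame) and get_metadata() (benchmark and metric lists plus the default metric), with read_data() kept as a thin wrapper. The refactored app.py calls the two pieces separately so both simulators share one metadata pass:

# Loading flow as used by the refactored app.py (paths from config/constants.py).
from config import constants as C
from results.parse import read_dataframe, get_metadata, parse_agg

df_icarus = read_dataframe(C.ICARUS_RESULTS)        # results/results_icarus.json
df_agg_icarus = parse_agg(C.ICARUS_AGG)             # results/aggregated_scores_icarus.csv
benchmarks, metrics, default_metric = get_metadata(df_icarus)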
 
utils.py CHANGED
@@ -6,13 +6,7 @@ import pandas as pd
6
  import plotly.express as px
7
  import plotly.graph_objects as go
8
 
9
- # fmt: off
10
- type_emoji = {
11
- "RTL-Specific": "🔴",
12
- "General": "🟢",
13
- "Coding": "🔵"
14
- }
15
- # fmt: on
16
 
17
 
18
  def model_hyperlink(link, model_name, release, thinking=False):
@@ -23,11 +17,7 @@ def model_hyperlink(link, model_name, release, thinking=False):
23
  if release == "V1":
24
  return ret + reasoning_badge if thinking == "Reasoning" else ret
25
  else:
26
- return (
27
- ret + reasoning_badge + new_badge
28
- if thinking == "Reasoning"
29
- else ret + new_badge
30
- )
31
 
32
 
33
  def handle_special_cases(benchmark, metric):
@@ -39,13 +29,19 @@ def handle_special_cases(benchmark, metric):
39
 
40
 
41
  def filter_RTLRepo(subset: pd.DataFrame) -> pd.DataFrame:
 
 
 
42
  subset = subset.drop(subset[subset.Score < 0.0].index)
43
- details = subset[
44
- ["Model", "Model URL", "Model Type", "Params", "Release", "Thinking"]
45
- ].drop_duplicates("Model")
46
- filtered_df = subset[["Model", "Score"]].rename(
47
- columns={"Score": "Exact Matching (EM)"}
 
 
48
  )
 
49
  filtered_df = pd.merge(filtered_df, details, on="Model", how="left")
50
  filtered_df["Model"] = filtered_df.apply(
51
  lambda row: model_hyperlink(
@@ -55,31 +51,28 @@ def filter_RTLRepo(subset: pd.DataFrame) -> pd.DataFrame:
55
  ),
56
  axis=1,
57
  )
58
- filtered_df["Type"] = filtered_df["Model Type"].map(lambda x: type_emoji.get(x, ""))
59
  filtered_df = filtered_df[["Type", "Model", "Params", "Exact Matching (EM)"]]
60
- filtered_df = filtered_df.sort_values(
61
- by="Exact Matching (EM)", ascending=False
62
- ).reset_index(drop=True)
63
  return filtered_df
64
 
65
 
66
  def filter_bench(subset: pd.DataFrame, df_agg=None, agg_column=None) -> pd.DataFrame:
67
- details = subset[
68
- ["Model", "Model URL", "Model Type", "Params", "Release", "Thinking"]
69
- ].drop_duplicates("Model")
 
 
 
70
  if "RTLLM" in subset["Benchmark"].unique():
71
  pivot_df = (
72
- subset.pivot_table(
73
- index="Model", columns="Metric", values="Score", aggfunc=custom_agg_s2r
74
- )
75
  .reset_index()
76
  .round(2)
77
  )
78
  else:
79
  pivot_df = (
80
- subset.pivot_table(
81
- index="Model", columns="Metric", values="Score", aggfunc=custom_agg_cc
82
- )
83
  .reset_index()
84
  .round(2)
85
  )
@@ -94,39 +87,20 @@ def filter_bench(subset: pd.DataFrame, df_agg=None, agg_column=None) -> pd.DataF
94
 
95
  pivot_df = pd.merge(pivot_df, details, on="Model", how="left")
96
  pivot_df["Model"] = pivot_df.apply(
97
- lambda row: model_hyperlink(
98
- row["Model URL"], row["Model"], row["Release"], row["Thinking"]
99
- ),
100
  axis=1,
101
  )
102
- pivot_df["Type"] = pivot_df["Model Type"].map(lambda x: type_emoji.get(x, ""))
103
- pivot_df["Post-Synthesis (PSQ)"] = (
104
- pivot_df[["Power", "Performance", "Area"]].mean(axis=1).round(2)
105
- )
 
 
 
 
 
 
106
 
107
- pivot_df.rename(
108
- columns={
109
- "Params": "Parameters (B)",
110
- "Syntax (STX)": "Syntax",
111
- "Functionality (FNC)": "Functionality",
112
- "Synthesis (SYN)": "Synthesis",
113
- "Post-Synthesis (PSQ)": "Post-Synthesis",
114
- },
115
- inplace=True,
116
- )
117
- columns_order = [
118
- "Type",
119
- "Model",
120
- "Parameters (B)",
121
- "Syntax",
122
- "Functionality",
123
- "Synthesis",
124
- "Post-Synthesis",
125
- ]
126
- pivot_df = pivot_df[[col for col in columns_order if col in pivot_df.columns]]
127
- pivot_df = pivot_df.sort_values(by="Functionality", ascending=False).reset_index(
128
- drop=True
129
- )
130
  return pivot_df
131
 
132
 
@@ -154,65 +128,40 @@ def custom_agg_cc(vals):
154
  return round(result, 2)
155
 
156
 
157
- def filter_bench_all(
158
- subset: pd.DataFrame, df_agg=None, agg_column=None
159
- ) -> pd.DataFrame:
160
- details = subset[
161
- ["Model", "Model URL", "Model Type", "Params", "Release", "Thinking"]
162
- ].drop_duplicates("Model")
 
163
  if "RTLLM" in subset["Benchmark"].unique():
164
  pivot_df = (
165
- subset.pivot_table(
166
- index="Model", columns="Metric", values="Score", aggfunc=custom_agg_s2r
167
- )
168
  .reset_index()
169
  .round(2)
170
  )
171
  else:
172
  pivot_df = (
173
- subset.pivot_table(
174
- index="Model", columns="Metric", values="Score", aggfunc=custom_agg_cc
175
- )
176
  .reset_index()
177
  .round(2)
178
  )
179
 
180
  pivot_df = pd.merge(pivot_df, details, on="Model", how="left")
181
- print(pivot_df.columns)
182
  pivot_df["Model"] = pivot_df.apply(
183
- lambda row: model_hyperlink(
184
- row["Model URL"], row["Model"], row["Release"], row["Thinking"]
185
- ),
186
  axis=1,
187
  )
188
- pivot_df["Type"] = pivot_df["Model Type"].map(lambda x: type_emoji.get(x, ""))
189
- pivot_df["Post-Synthesis Quality"] = (
190
- pivot_df[["Power", "Performance", "Area"]].mean(axis=1).round(2)
191
- )
192
 
193
- pivot_df.rename(
194
- columns={
195
- "Params": "Parameters (B)",
196
- "Exact Matching (EM)": "EM",
197
- "Syntax (STX)": "Syntax",
198
- "Functionality (FNC)": "Functionality",
199
- "Synthesis (SYN)": "Synthesis",
200
- "Post-Synthesis Quality": "Post-Synthesis",
201
- },
202
- inplace=True,
203
- )
204
 
205
- columns_order = [
206
- "Type",
207
- "Model",
208
- "Parameters (B)",
209
- "Syntax",
210
- "Functionality",
211
- "Synthesis",
212
- "Post-Synthesis",
213
- ]
214
- pivot_df = pivot_df[[col for col in columns_order if col in pivot_df.columns]]
215
- pivot_df = pivot_df.sort_values(by="Functionality", ascending=False).reset_index(
216
- drop=True
217
- )
218
  return pivot_df
 
6
  import plotly.express as px
7
  import plotly.graph_objects as go
8
 
9
+ from config.constants import COLUMN_MAPPINGS, COLUMN_ORDER, TYPE_EMOJI
 
 
 
 
 
 
10
 
11
 
12
  def model_hyperlink(link, model_name, release, thinking=False):
 
17
  if release == "V1":
18
  return ret + reasoning_badge if thinking == "Reasoning" else ret
19
  else:
20
+ return ret + reasoning_badge + new_badge if thinking == "Reasoning" else ret + new_badge
 
 
 
 
21
 
22
 
23
  def handle_special_cases(benchmark, metric):
 
29
 
30
 
31
  def filter_RTLRepo(subset: pd.DataFrame) -> pd.DataFrame:
32
+ if subset.empty:
33
+ return pd.DataFrame(columns=["Type", "Model", "Params", "Exact Matching (EM)"])
34
+
35
  subset = subset.drop(subset[subset.Score < 0.0].index)
36
+
37
+ # Check again if empty after filtering
38
+ if subset.empty:
39
+ return pd.DataFrame(columns=["Type", "Model", "Params", "Exact Matching (EM)"])
40
+
41
+ details = subset[["Model", "Model URL", "Model Type", "Params", "Release", "Thinking"]].drop_duplicates(
42
+ "Model"
43
  )
44
+ filtered_df = subset[["Model", "Score"]].rename(columns={"Score": "Exact Matching (EM)"})
45
  filtered_df = pd.merge(filtered_df, details, on="Model", how="left")
46
  filtered_df["Model"] = filtered_df.apply(
47
  lambda row: model_hyperlink(
 
51
  ),
52
  axis=1,
53
  )
54
+ filtered_df["Type"] = filtered_df["Model Type"].map(lambda x: TYPE_EMOJI.get(x, ""))
55
  filtered_df = filtered_df[["Type", "Model", "Params", "Exact Matching (EM)"]]
56
+ filtered_df = filtered_df.sort_values(by="Exact Matching (EM)", ascending=False).reset_index(drop=True)
 
 
57
  return filtered_df
58
 
59
 
60
  def filter_bench(subset: pd.DataFrame, df_agg=None, agg_column=None) -> pd.DataFrame:
61
+ if subset.empty:
62
+ return pd.DataFrame(columns=COLUMN_ORDER)
63
+
64
+ details = subset[["Model", "Model URL", "Model Type", "Params", "Release", "Thinking"]].drop_duplicates(
65
+ "Model"
66
+ )
67
  if "RTLLM" in subset["Benchmark"].unique():
68
  pivot_df = (
69
+ subset.pivot_table(index="Model", columns="Metric", values="Score", aggfunc=custom_agg_s2r)
 
 
70
  .reset_index()
71
  .round(2)
72
  )
73
  else:
74
  pivot_df = (
75
+ subset.pivot_table(index="Model", columns="Metric", values="Score", aggfunc=custom_agg_cc)
 
 
76
  .reset_index()
77
  .round(2)
78
  )
 
87
 
88
  pivot_df = pd.merge(pivot_df, details, on="Model", how="left")
89
  pivot_df["Model"] = pivot_df.apply(
90
+ lambda row: model_hyperlink(row["Model URL"], row["Model"], row["Release"], row["Thinking"]),
 
 
91
  axis=1,
92
  )
93
+ pivot_df["Type"] = pivot_df["Model Type"].map(lambda x: TYPE_EMOJI.get(x, ""))
94
+
95
+ if all(col in pivot_df.columns for col in ["Power", "Performance", "Area"]):
96
+ pivot_df["Post-Synthesis (PSQ)"] = pivot_df[["Power", "Performance", "Area"]].mean(axis=1).round(2)
97
+
98
+ pivot_df.rename(columns=COLUMN_MAPPINGS, inplace=True)
99
+ pivot_df = pivot_df[[col for col in COLUMN_ORDER if col in pivot_df.columns]]
100
+
101
+ if "Functionality" in pivot_df.columns:
102
+ pivot_df = pivot_df.sort_values(by="Functionality", ascending=False).reset_index(drop=True)
103
104
  return pivot_df
105
 
106
 
 
128
  return round(result, 2)
129
 
130
 
131
+ def filter_bench_all(subset: pd.DataFrame, df_agg=None, agg_column=None) -> pd.DataFrame:
132
+ if subset.empty:
133
+ return pd.DataFrame(columns=COLUMN_ORDER)
134
+
135
+ details = subset[["Model", "Model URL", "Model Type", "Params", "Release", "Thinking"]].drop_duplicates(
136
+ "Model"
137
+ )
138
  if "RTLLM" in subset["Benchmark"].unique():
139
  pivot_df = (
140
+ subset.pivot_table(index="Model", columns="Metric", values="Score", aggfunc=custom_agg_s2r)
 
 
141
  .reset_index()
142
  .round(2)
143
  )
144
  else:
145
  pivot_df = (
146
+ subset.pivot_table(index="Model", columns="Metric", values="Score", aggfunc=custom_agg_cc)
 
 
147
  .reset_index()
148
  .round(2)
149
  )
150
 
151
  pivot_df = pd.merge(pivot_df, details, on="Model", how="left")
 
152
  pivot_df["Model"] = pivot_df.apply(
153
+ lambda row: model_hyperlink(row["Model URL"], row["Model"], row["Release"], row["Thinking"]),
 
 
154
  axis=1,
155
  )
156
+ pivot_df["Type"] = pivot_df["Model Type"].map(lambda x: TYPE_EMOJI.get(x, ""))
 
 
 
157
 
158
+ if all(col in pivot_df.columns for col in ["Power", "Performance", "Area"]):
159
+ pivot_df["Post-Synthesis Quality"] = pivot_df[["Power", "Performance", "Area"]].mean(axis=1).round(2)
160
+
161
+ pivot_df.rename(columns=COLUMN_MAPPINGS, inplace=True)
162
+ pivot_df = pivot_df[[col for col in COLUMN_ORDER if col in pivot_df.columns]]
163
+
164
+ if "Functionality" in pivot_df.columns:
165
+ pivot_df = pivot_df.sort_values(by="Functionality", ascending=False).reset_index(drop=True)
 
 
 
166
167
  return pivot_df
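The new empty-subset guards in filter_RTLRepo, filter_bench, and filter_bench_all mean the helpers now return an empty frame with the expected headers when no rows survive the UI filters, rather than failing on downstream column accesses. A minimal illustration of the pattern:

# Minimal illustration of the new guard in filter_bench / filter_bench_all.
import pandas as pd
from config.constants import COLUMN_ORDER

subset = pd.DataFrame(columns=["Model", "Benchmark", "Metric", "Score"])   # e.g. nothing matched the filters
if subset.empty:
    leaderboard = pd.DataFrame(columns=COLUMN_ORDER)    # empty table, but the headers still render in Gradio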