Spaces: Running

Commit be0239b · Parent(s): cc21640

Small refactor: Add `config` with model metadata and move constants to its own file

Browse files:
- app.py                    +80  -451
- config/constants.py       +58  -0
- config/model_metadata.py  +112 -0
- results/parse.py          +32  -248
- utils.py                  +53  -104
app.py  CHANGED

--- a/app.py (old; removed lines marked "-", some removed lines and line tails were lost in the page extraction and are left blank)

@@ -1,269 +1,72 @@
 import sys
 
 import gradio as gr
-import pandas as pd
-import plotly.express as px
 from gradio.themes.utils import colors
 
-from
 from static.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT
 from style.css_html_js import custom_css
-from utils import filter_bench, filter_bench_all, filter_RTLRepo, handle_special_cases
-
-
-def filter_leaderboard(task, benchmark, model_type, search_query, max_params):
-    subset = df.copy()
-
-    # Filter by task specific benchmarks when 'All' benchmarks is selected
-    if task == "Spec-to-RTL":
-        valid_benchmarks = s2r_benchs
-        if benchmark == "All":
-            subset = subset[subset["Benchmark"].isin(valid_benchmarks)]
-    elif task == "Code Completion":
-        valid_benchmarks = cc_benchs
-        if benchmark == "All":
-            subset = subset[subset["Benchmark"].isin(valid_benchmarks)]
-    elif task == "Line Completion †":
-        valid_benchmarks = lc_benchs
-        if benchmark == "All":
-            subset = subset[subset["Benchmark"].isin(valid_benchmarks)]
-
-    if benchmark != "All":
-        subset = df[df["Benchmark"] == benchmark]
-
-    if model_type != "All":
-        # without emojis
-        subset = subset[subset["Model Type"] == model_type.split(" ")[0]]
-    if search_query:
-        subset = subset[
-            subset["Model"].str.contains(search_query, case=False, na=False)
-        ]
-    max_params = float(max_params)
-    subset = subset[subset["Params"] <= max_params]
-
-    if benchmark == "All":
-        if task == "Spec-to-RTL":
-            return filter_bench_all(subset, df_agg, agg_column="Agg S2R")
-        elif task == "Code Completion":
-            return filter_bench_all(subset, df_agg, agg_column="Agg MC")
-        elif task == "Line Completion †":
-            return filter_RTLRepo(subset)
-    elif benchmark == "RTL-Repo":
-        return filter_RTLRepo(subset)
-    else:
-        agg_column = None
-        if benchmark == "VerilogEval S2R":
-            agg_column = "Agg VerilogEval S2R"
-        elif benchmark == "VerilogEval MC":
-            agg_column = "Agg VerilogEval MC"
-        elif benchmark == "RTLLM":
-            agg_column = "Agg RTLLM"
-        elif benchmark == "VeriGen":
-            agg_column = "Agg VeriGen"
-
-        return filter_bench(subset, df_agg, agg_column)
-
-
-def update_benchmarks_by_task(task):
-    if task == "Spec-to-RTL":
-        new_benchmarks = ["All"] + s2r_benchs
-    elif task == "Code Completion":
-        new_benchmarks = ["All"] + cc_benchs
-    elif task == "Line Completion †":
-        new_benchmarks = lc_benchs
-    else:
-        new_benchmarks = ["All"] + benchmarks
-    benchmark_value = "All" if "All" in new_benchmarks else new_benchmarks[0]
-    filtered = filter_leaderboard(
-        task,
-        benchmark_value,
-        model_type_dropdown.value,
-        search_box.value,
-        params_slider.value,
-    )
-    return gr.update(value=benchmark_value, choices=new_benchmarks), filtered
-
-
-def generate_scatter_plot(benchmark, metric):
-    benchmark, metric = handle_special_cases(benchmark, metric)
 
-
-
-
-
-        detailed_scores.rename(columns={"Score": "Exact Matching (EM)"}, inplace=True)
-    else:
-        detailed_scores = subset.pivot_table(
-            index="Model", columns="Metric", values="Score"
-        ).reset_index()
 
-
-
-
-    )
-
-    scatter_data["x"] = scatter_data["Params"]
-    scatter_data["y"] = scatter_data[metric]
-    scatter_data["size"] = (scatter_data["x"] ** 0.3) * 40
-
-    type_colors = {"General": "green", "Coding": "yellow", "RTL-Specific": "blue"}
-    scatter_data["color"] = scatter_data["Model Type"].map(type_colors).fillna("gray")
-
-    y_axis_limits = {
-        "Functionality (FNC)": [5, 90],
-        "Syntax (STX)": [20, 100],
-        "Synthesis (SYN)": [5, 90],
-        "Power": [0, 50],
-        "Performance": [0, 50],
-        "Area": [0, 50],
-        "Exact Matching (EM)": [0, 50],
-    }
-    y_range = y_axis_limits.get(metric, [0, 80])
 
-
-
-        x="x",
-        y="y",
-        log_x=True,
-        size="size",
-        color="Model Type",
-        text="Model",
-        hover_data={metric: ":.2f"},
-        title=f"Params vs. {metric} for {benchmark}",
-        labels={"x": "# Params (Log Scale)", "y": metric},
-        template="plotly_white",
-        height=600,
-        width=1200,
-    )
-
-    fig.update_traces(
-        textposition="top center",
-        textfont_size=10,
-        marker=dict(opacity=0.8, line=dict(width=0.5, color="black")),
-    )
-    fig.update_layout(
-        xaxis=dict(
-            showgrid=True,
-            type="log",
-            tickmode="array",
-            tickvals=[8, 14, 32, 72, 200, 700],
-            ticktext=["8", "14", "32", "72", "200", "700"],
-        ),
-        showlegend=False,
-        yaxis=dict(range=y_range),
-        margin=dict(l=50, r=50, t=50, b=50),
-        plot_bgcolor="white",
-    )
-
-    return fig
-
-
-with gr.Blocks(
-    css=custom_css, theme=gr.themes.Default(primary_hue=colors.emerald)
-) as app:
-    df_icarus, benchmarks, metrics, default_metric = read_data(
-        "results/results_icarus.json"
-    )
-    df_agg_icarus = parse_agg("results/aggregated_scores_icarus.csv")
-    df_verilator, _, _, _ = read_data("results/results_verilator.json")
-    df_agg_verilator = parse_agg("results/aggregated_scores_verilator.csv")
-    df = df_icarus
-    df_agg = df_agg_icarus
-    tasks = ["Spec-to-RTL", "Code Completion", "Line Completion †"]
-    s2r_benchs = ["VerilogEval S2R", "RTLLM"]
-    cc_benchs = ["VerilogEval MC", "VeriGen"]
-    lc_benchs = ["RTL-Repo"]
-    non_rtl_metrics = [
-        "Syntax (STX)",
-        "Functionality (FNC)",
-        "Synthesis (SYN)",
-        "Power",
-        "Performance",
-        "Area",
-    ]
-    rtl_metrics = ["Exact Matching (EM)"]
-    model_types = ["All", "General 🟢", "Coding 🔵", "RTL-Specific 🔴"]
 
-
-
-
-
-
-
    )
-    gr.HTML(
-        """
-        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
-        <script defer src="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/js/all.min.js"></script>
-        <div style="text-align: center; margin-bottom: 0px; margin-top: 0px;">
-            <a href="https://github.com/HPAI-BSC/TuRTLe" target="_blank" style="text-decoration: none; margin-right: 10px;">
-                <button style="background: #333; color: white; padding: 10px 14px; border-radius: 8px; border: none; font-size: 16px; cursor: pointer;">
-                    GitHub Repo
-                </button>
-            </a>
 
-
-
-
-
-            </a>
 
-
-
-
-                </button>
-            </a>
-            <p style="margin-top: 15px;">If you have any inquiries or wish to collaborate:
-                <a href="mailto:hpai@bsc.es">hpai@bsc.es</a>
-            </p>
-        </div>
-        """
-    )
-    gr.HTML(
-        """
-        <div style=" margin-top:-10px !important;">
-            <p style="margin-bottom: 15px; text-align: start !important;">
-                Welcome to the TuRTLe Model Leaderboard! TuRTLe is a
-                <b>unified evaluation framework designed to systematically assess Large Language Models (LLMs) in RTL (Register-Transfer Level) generation</b>
-                for hardware design.
-                Evaluation criteria include <b>syntax correctness, functional accuracy, synthesizability, and post-synthesis quality</b>
-                (PPA: Power, Performance, Area). TuRTLe integrates multiple benchmarks to highlight strengths and weaknesses of available LLMs.
-                Use the filters below to explore different RTL benchmarks, simulators and models.
-            </p>
-            <p style="margin-top:10px; text-align:start !important;">
-                <span style="font-variant:small-caps; font-weight:bold;">UPDATE (SEPT 2025):</span> Added <span>gpt-oss-20b</span> and <span>gpt-oss-120b</span> to the leaderboard
-            </p>
-            <p style="margin-top:-6px; text-align:start !important;">
-                <span style="font-variant:small-caps; font-weight:bold;">UPDATE (JULY 2025):</span> Our TuRTLe paper was accepted to
-                <a href="https://mlcad.org/symposium/2025/" target="_blank">MLCAD 2025</a> in September (Santa Cruz, CA), plus we've added Verilator as a new simulator alongside Icarus Verilog
-            </p>
-            <p style="margin-top: -6px; text-align: start !important;">
-                <span style="font-variant: small-caps; font-weight: bold;">UPDATE (JUNE 2025):</span> We make our framework open-source on GitHub and we add 7 new recent models! For a total of 40 base and instruct models and 5 RTL benchmarks
-            </p>
-        </div>
-        """
-    )
-    with gr.Tabs():
        with gr.Tab("Leaderboard"):
            with gr.Row(equal_height=True):
                with gr.Column(scale=4):
-                    task_radio = gr.Radio(
-                        choices=tasks, label="Select Task", value="Spec-to-RTL"
-                    )
                with gr.Column(scale=3):
                    benchmark_radio = gr.Radio(
-                        choices=[
                        label="Select Benchmark",
-                        value=
                    )
                with gr.Column(scale=2, min_width=180):
                    simulator_radio = gr.Radio(
-                        choices=
-                        value=
                        label="Select Simulator",
                        scale=1,
                    )
 
            with gr.Row(equal_height=True):
                search_box = gr.Textbox(
                    label="Search Model",

@@ -271,74 +74,61 @@ with gr.Blocks(
                    scale=2,
                )
                model_type_dropdown = gr.Radio(
-                    choices=
                    label="Select Model Type",
-                    value=
                    scale=3,
                )
                params_slider = gr.Slider(
-                    minimum=
-                    maximum=
-                    value=
                    label="Max Params",
                    step=1,
                    scale=2,
                )
 
            leaderboard = gr.DataFrame(
-                value=filter_leaderboard(
                headers="first row",
                show_row_numbers=True,
                wrap=True,
-                datatype=[
-                    "html",
-                    "html",
-                ],
                interactive=False,
-                column_widths=[
-                    "7%",
-                    "28%",
-                    "13%",
-                    "10%",
-                    "13%",
-                    "10%",
-                    "14%",
-                ],
                elem_classes="dataframe-leaderboard",
            )
 
-
-
-            <div id="lc-footnote" style="font-size: 13px; opacity: 0.6; margin-top: -5px; z-index:999; text-align: left;">
-                <span style="font-weight: 600; opacity: 1;">†</span>
-                <em>Line Completion</em> excludes “reasoning” models since this task targets quick auto-completion<br/>
-                Additionally, for <em>Line Completion</em> and <em>Code Completion</em> benchmarks we use <b>Base</b> model variant (if available), and for <em>Spec-to-RTL</em> we use <b>Instruct</b> model variant
-            </div>
-            """
-        )
 
        with gr.Tab("Plot View"):
            with gr.Row(equal_height=True):
-                default_benchmark =
                bubble_benchmark = gr.Dropdown(
                    choices=benchmarks,
                    label="Select Benchmark",
                    value=default_benchmark,
                    elem_classes="gr-dropdown",
                )
-                default_metric =
                bubble_metric = gr.Dropdown(
-                    choices=
                    label="Select Metric",
                    value=default_metric,
                )
            with gr.Row(equal_height=True):
                scatter_plot = gr.Plot(
-                    value=generate_scatter_plot(default_benchmark, default_metric),
                    label="Bubble Chart",
                    elem_id="full-width-plot",
                )
 
        with gr.Tab("Metrics Information"):
            with open("./static/metrics.md", "r") as file:
                gr.Markdown(

@@ -349,52 +139,12 @@ with gr.Blocks(
                ],
                elem_classes="metrics-page",
            )
-        with gr.Tab("About Us"):
-            gr.HTML(
-                """
-                <div style="max-width: 800px; margin: auto; padding: 20px; border: 1px solid #ccc; border-radius: 10px;">
-                    <div style="display: flex; justify-content: center; align-items: center; gap: 5%; margin-bottom: 20px;">
-                        <img src='/gradio_api/file=hpai_logo_grad.png' alt='HPAI Group Logo' style="width: 45%;"/>
-                        <img src='/gradio_api/file=bsc-logo.png' alt='BSC Logo' style="width: 25%;"/>
-                    </div>
-
-                    <p style="font-size: 16px; text-align: start;">
-                        The <b>High-Performance Artificial Intelligence (HPAI)</b> group is part of the
-                        <a href="https://bsc.es/" target="_blank">Barcelona Supercomputing Center (BSC)</a>.
-                        This leaderboard is maintained by HPAI as part of our commitment to <b>open science</b>.
-                    </p>
-
-                    <ul style="font-size: 16px; margin-bottom: 20px; margin-top: 20px;">
-                        <li><a href="https://hpai.bsc.es/" target="_blank">HPAI Website</a></li>
-                        <li><a href="https://github.com/HPAI-BSC/" target="_blank">HPAI GitHub Organization Page</a></li>
-                        <li><a href="https://huggingface.co/HPAI-BSC/" target="_blank">HPAI Hugging Face Organization Page</a></li>
-                    </ul>
 
-
-
-                    </p>
 
-                    <p style="font-size: 16px;">Email: <a href="mailto:hpai@bsc.es"><b>hpai@bsc.es</b></a></p>
-                </div>
-                """
-            )
        with gr.Tab("References"):
-            gr.HTML(
-                """
-                <div style="max-width: 800px; margin: auto; padding: 20px; border: 1px solid #ccc; border-radius: 10px;">
-                    <ul style="font-size: 16px; margin-bottom: 20px; margin-top: 20px;">
-                        <li><a href="https://github.com/bigcode-project/bigcode-evaluation-harness" target="_blank">Code Generation LM Evaluation Harness</a></li>
-                        <li>Williams, S. Icarus Verilog [Computer software]. <a href="https://github.com/steveicarus/iverilog" target="_blank">https://github.com/steveicarus/iverilog</a></li>
-                        <li>Snyder, W., Wasson, P., Galbi, D., & et al. Verilator [Computer software]. <a href="https://github.com/verilator/verilator" target="_blank">https://github.com/verilator/verilator</a></li>
-                        <li>RTL-Repo: Allam and M. Shalan, “Rtl-repo: A benchmark for evaluating llms on large-scale rtl design projects,” in 2024 IEEE LLM Aided Design Workshop (LAD). IEEE, 2024, pp. 1–5.</li>
-                        <li>VeriGen: S. Thakur, B. Ahmad, H. Pearce, B. Tan, B. Dolan-Gavitt, R. Karri, and S. Garg, “Verigen: A large language model for verilog code generation,” ACM Transactions on Design Automation of Electronic Systems, vol. 29, no. 3, pp. 1–31, 2024.</li>
-                        <li>VerilogEval (I): M. Liu, N. Pinckney, B. Khailany, and H. Ren, “Verilogeval: Evaluating large language models for verilog code generation,” in 2023 IEEE/ACM International Conference on Computer Aided Design (ICCAD). IEEE, 2023, pp. 1–8.</li>
-                        <li>VerilogEval (II): N. Pinckney, C. Batten, M. Liu, H. Ren, and B. Khailany, “Revisiting VerilogEval: A Year of Improvements in Large-Language Models for Hardware Code Generation,” ACM Trans. Des. Autom. Electron. Syst., feb 2025. https://doi.org/10.1145/3718088</li>
-                        <li>RTLLM: Y. Lu, S. Liu, Q. Zhang, and Z. Xie, “Rtllm: An open-source benchmark for design rtl generation with large language model,” in 2024 29th Asia and South Pacific Design Automation Conference (ASP-DAC). IEEE, 2024, pp. 722–727.</li>
-                    </ul>
-                </div>
-                """
-            )
 
    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):

@@ -406,143 +156,22 @@ with gr.Blocks(
                show_copy_button=True,
            )
 
-    # event handlers
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    )
-    model_type_dropdown.change(
-        fn=filter_leaderboard,
-        inputs=[
-            task_radio,
-            benchmark_radio,
-            model_type_dropdown,
-            search_box,
-            params_slider,
-        ],
-        outputs=leaderboard,
-    )
-    search_box.change(
-        fn=filter_leaderboard,
-        inputs=[
-            task_radio,
-            benchmark_radio,
-            model_type_dropdown,
-            search_box,
-            params_slider,
-        ],
-        outputs=leaderboard,
-    )
-    params_slider.change(
-        fn=filter_leaderboard,
-        inputs=[
-            task_radio,
-            benchmark_radio,
-            model_type_dropdown,
-            search_box,
-            params_slider,
-        ],
-        outputs=leaderboard,
-    )
-
-    def on_benchmark_change(benchmark, _):
-        if benchmark == "RTL-Repo":
-            metric = "Exact Matching (EM)"
-            return gr.update(choices=rtl_metrics, value=metric), generate_scatter_plot(
-                benchmark, metric
-            )
-        else:
-            metric = non_rtl_metrics[0]
-            return gr.update(
-                choices=non_rtl_metrics[:-1], value=metric
-            ), generate_scatter_plot(benchmark, metric)
-
-    def on_metric_change(benchmark, metric):
-        benchmark, metric = handle_special_cases(benchmark, metric)
-        fig = generate_scatter_plot(benchmark, metric)
-        return gr.update(value=benchmark), fig
-
-    def on_simulator_change(
-        simulator,
-        task,
-        benchmark,
-        model_type,
-        search,
-        max_params,
-        plot_bench,
-        plot_metric,
-    ):
-        global df, df_agg
-        if simulator == "Icarus":
-            df, df_agg = df_icarus, df_agg_icarus
-        else:
-            df, df_agg = df_verilator, df_agg_verilator
-
-        leaderboard_df = filter_leaderboard(
-            task, benchmark, model_type, search, max_params
-        )
-        fig = generate_scatter_plot(plot_bench, plot_metric)
-        return leaderboard_df, fig
-
-    bubble_benchmark.change(
-        fn=on_benchmark_change,
-        inputs=[bubble_benchmark, bubble_metric],
-        outputs=[bubble_metric, scatter_plot],
-        js=""" // this is to avoid resetting user scroll each time a plot is re-generated
-        (benchmark, metric) => {
-            let scrollY = window.scrollY;
-            const observer = new MutationObserver(() => {
-                window.scrollTo(0, scrollY);
-                observer.disconnect();
-            });
-            observer.observe(document.getElementById('full-width-plot'), { childList: true });
-            return [benchmark, metric];
-        }
-        """,
-    )
-
-    bubble_metric.change(
-        fn=on_metric_change,
-        inputs=[bubble_benchmark, bubble_metric],
-        outputs=[bubble_benchmark, scatter_plot],
-        js=""" // this is to avoid resetting user scroll each time a plot is re-generated
-        (benchmark, metric) => {
-            let scrollY = window.scrollY;
-            const observer = new MutationObserver(() => {
-                window.scrollTo(0, scrollY);
-                observer.disconnect();
-            });
-            observer.observe(document.getElementById('full-width-plot'), { childList: true });
-            return [benchmark, metric];
-        }
-        """,
-    )
-
-    simulator_radio.change(
-        fn=on_simulator_change,
-        inputs=[
-            simulator_radio,
-            task_radio,
-            benchmark_radio,
-            model_type_dropdown,
-            search_box,
-            params_slider,
-            bubble_benchmark,
-            bubble_metric,
-        ],
-        outputs=[leaderboard, scatter_plot],
    )
 

+++ b/app.py (new; added lines marked "+", "..." marks unchanged context elided between hunks)

 import sys
 
 import gradio as gr
 from gradio.themes.utils import colors
 
+from config import constants as C
+from handlers.leaderboard_handlers import create_leaderboard_handlers
+from results.parse import get_metadata, parse_agg, read_dataframe
+from src.data_processing import filter_leaderboard, generate_scatter_plot
+from src.models import Simulator
 from static.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT
+from static.html_content import (
+    ABOUT_US_HTML,
+    HEADER_HTML,
+    INTRO_HTML,
+    LC_FOOTNOTE_HTML,
+    NAV_BUTTONS_HTML,
+    REFERENCES_HTML,
+)
 from style.css_html_js import custom_css
 
+with gr.Blocks(css=custom_css, theme=gr.themes.Default(primary_hue=colors.emerald)) as app:
+    # Load csv results
+    df_icarus = read_dataframe(C.ICARUS_RESULTS)
+    df_verilator = read_dataframe(C.VERILATOR_RESULTS)
 
+    # Load aggregated scores
+    df_agg_icarus = parse_agg(C.ICARUS_AGG)
+    df_agg_verilator = parse_agg(C.VERILATOR_AGG)
 
+    # columns of the csvs
+    benchmarks, metrics, default_metric = get_metadata(df_icarus)
 
+    # Each time we select a simulator, we need to use that sim's dataframe
+    state = Simulator(
+        icarus_df=df_icarus,
+        icarus_agg=df_agg_icarus,
+        verilator_df=df_verilator,
+        verilator_agg=df_agg_verilator,
    )
 
+    # Header view
+    gr.HTML(HEADER_HTML)
+    gr.HTML(NAV_BUTTONS_HTML)
+    gr.HTML(INTRO_HTML)
 
+    # Main view
+    with gr.Tabs() as tabs:
+        # Leaderboard
        with gr.Tab("Leaderboard"):
+            # 1st row filters (select task, benchmark and sim)
            with gr.Row(equal_height=True):
                with gr.Column(scale=4):
+                    task_radio = gr.Radio(choices=C.TASKS, label="Select Task", value=C.DEFAULT_TASK)
                with gr.Column(scale=3):
                    benchmark_radio = gr.Radio(
+                        choices=[C.DEFAULT_BENCHMARK] + C.S2R_BENCHMARKS,
                        label="Select Benchmark",
+                        value=C.DEFAULT_BENCHMARK,
                    )
                with gr.Column(scale=2, min_width=180):
                    simulator_radio = gr.Radio(
+                        choices=C.SIMULATORS,
+                        value=C.SIMULATORS[0],
                        label="Select Simulator",
                        scale=1,
                    )
 
+            # 2nd row filters (search, model type, params)
            with gr.Row(equal_height=True):
                search_box = gr.Textbox(
                    label="Search Model",
...
                    scale=2,
                )
                model_type_dropdown = gr.Radio(
+                    choices=C.MODEL_TYPES,
                    label="Select Model Type",
+                    value=C.DEFAULT_MODEL_TYPE,
                    scale=3,
                )
                params_slider = gr.Slider(
+                    minimum=state.get_current_df()["Params"].min(),
+                    maximum=C.DEFAULT_MAX_PARAMS,
+                    value=C.DEFAULT_MAX_PARAMS,
                    label="Max Params",
                    step=1,
                    scale=2,
                )
 
+            # main leaderboard content
            leaderboard = gr.DataFrame(
+                value=filter_leaderboard(
+                    C.DEFAULT_TASK, C.DEFAULT_BENCHMARK, C.DEFAULT_MODEL_TYPE, "", C.DEFAULT_MAX_PARAMS, state
+                ),
                headers="first row",
                show_row_numbers=True,
                wrap=True,
+                datatype=["html", "html"],
                interactive=False,
+                column_widths=["7%", "28%", "13%", "10%", "13%", "10%", "14%"],
                elem_classes="dataframe-leaderboard",
            )
 
+            # caption for the Base vs Instruct models
+            gr.HTML(LC_FOOTNOTE_HTML)
 
+        # all plots using Plotly
        with gr.Tab("Plot View"):
            with gr.Row(equal_height=True):
+                default_benchmark = C.S2R_BENCHMARKS[0]
                bubble_benchmark = gr.Dropdown(
                    choices=benchmarks,
                    label="Select Benchmark",
                    value=default_benchmark,
                    elem_classes="gr-dropdown",
                )
+                default_metric = C.NON_RTL_METRICS[0]
                bubble_metric = gr.Dropdown(
+                    choices=C.NON_RTL_METRICS,
                    label="Select Metric",
                    value=default_metric,
                )
            with gr.Row(equal_height=True):
                scatter_plot = gr.Plot(
+                    value=generate_scatter_plot(default_benchmark, default_metric, state),
                    label="Bubble Chart",
                    elem_id="full-width-plot",
                )
 
+        # Markdown / Latex explaining our metrics
        with gr.Tab("Metrics Information"):
            with open("./static/metrics.md", "r") as file:
                gr.Markdown(
...
                ],
                elem_classes="metrics-page",
            )
 
+        with gr.Tab("About Us"):
+            gr.HTML(ABOUT_US_HTML)
 
        with gr.Tab("References"):
+            gr.HTML(REFERENCES_HTML)
 
    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
...
            show_copy_button=True,
        )
 
+    # all event handlers are declared at /handlers/
+    # if you need to add interactivity, then you'll need to add one!
+    create_leaderboard_handlers(
+        filter_leaderboard_fn=filter_leaderboard,
+        generate_scatter_plot_fn=generate_scatter_plot,
+        task_radio=task_radio,
+        benchmark_radio=benchmark_radio,
+        model_type_dropdown=model_type_dropdown,
+        search_box=search_box,
+        params_slider=params_slider,
+        bubble_benchmark=bubble_benchmark,
+        bubble_metric=bubble_metric,
+        scatter_plot=scatter_plot,
+        leaderboard=leaderboard,
+        simulator_radio=simulator_radio,
+        state=state,
    )
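Note: the new app.py delegates per-simulator state to a Simulator object from src.models, which is not part of this diff. Below is a minimal hypothetical sketch of that class, inferred only from what is visible here (the constructor keywords and the get_current_df() call that sizes params_slider); the current field, switch(), and get_current_agg() are assumptions about how the simulator_radio handler flips between the Icarus and Verilator frames, not the actual implementation:

# Hypothetical sketch of src/models.py; this file is not shown in the commit.
from dataclasses import dataclass

import pandas as pd


@dataclass
class Simulator:
    icarus_df: pd.DataFrame
    icarus_agg: pd.DataFrame
    verilator_df: pd.DataFrame
    verilator_agg: pd.DataFrame
    current: str = "Icarus"  # default simulator shown on load (assumed)

    def switch(self, simulator: str) -> None:
        # assumed to be called by the simulator_radio change handler
        self.current = simulator

    def get_current_df(self) -> pd.DataFrame:
        # the one method app.py visibly uses (to size the params slider)
        return self.icarus_df if self.current == "Icarus" else self.verilator_df

    def get_current_agg(self) -> pd.DataFrame:
        return self.icarus_agg if self.current == "Icarus" else self.verilator_agg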
config/constants.py  ADDED

@@ -0,0 +1,58 @@
+RESULTS_DIR = "results"
+ICARUS_RESULTS = f"{RESULTS_DIR}/results_icarus.json"
+VERILATOR_RESULTS = f"{RESULTS_DIR}/results_verilator.json"
+ICARUS_AGG = f"{RESULTS_DIR}/aggregated_scores_icarus.csv"
+VERILATOR_AGG = f"{RESULTS_DIR}/aggregated_scores_verilator.csv"
+
+TASKS = ["Spec-to-RTL", "Code Completion", "Line Completion †"]
+S2R_BENCHMARKS = ["VerilogEval S2R", "RTLLM"]
+CC_BENCHMARKS = ["VerilogEval MC", "VeriGen"]
+LC_BENCHMARKS = ["RTL-Repo"]
+
+MODEL_TYPES = ["All", "General 🟢", "Coding 🔵", "RTL-Specific 🔴"]
+TYPE_EMOJI = {"RTL-Specific": "🔴", "General": "🟢", "Coding": "🔵"}
+
+NON_RTL_METRICS = ["Syntax (STX)", "Functionality (FNC)", "Synthesis (SYN)", "Power", "Performance", "Area"]
+RTL_METRICS = ["Exact Matching (EM)"]
+
+COLUMN_MAPPINGS = {
+    "Params": "Parameters (B)",
+    "Syntax (STX)": "Syntax",
+    "Functionality (FNC)": "Functionality",
+    "Synthesis (SYN)": "Synthesis",
+    "Post-Synthesis (PSQ)": "Post-Synthesis",
+}
+
+COLUMN_ORDER = [
+    "Type",
+    "Model",
+    "Parameters (B)",
+    "Syntax",
+    "Functionality",
+    "Synthesis",
+    "Post-Synthesis",
+]
+
+TYPE_COLORS = {"General": "green", "Coding": "yellow", "RTL-Specific": "blue"}
+
+Y_AXIS_LIMITS = {
+    "Functionality (FNC)": [5, 90],
+    "Syntax (STX)": [20, 100],
+    "Synthesis (SYN)": [5, 90],
+    "Power": [0, 50],
+    "Performance": [0, 50],
+    "Area": [0, 50],
+    "Exact Matching (EM)": [0, 50],
+}
+
+SCATTER_PLOT_X_TICKS = {
+    "tickvals": [8, 14, 32, 72, 200, 700],
+    "ticktext": ["8", "14", "32", "72", "200", "700"],
+}
+
+DEFAULT_MAX_PARAMS = 700
+DEFAULT_TASK = "Spec-to-RTL"
+DEFAULT_BENCHMARK = "All"
+DEFAULT_MODEL_TYPE = "All"
+
+SIMULATORS = ["Icarus", "Verilator"]
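Note: every module now reads these values from one place instead of redefining them locally. A quick sanity check of how the constants are consumed, matching the imports this commit adds to app.py and utils.py:

# Usage sketch: consuming the new constants module.
from config import constants as C
from config.constants import TYPE_EMOJI

assert C.ICARUS_RESULTS == "results/results_icarus.json"
assert C.TASKS[0] == C.DEFAULT_TASK == "Spec-to-RTL"
assert TYPE_EMOJI["Coding"] == "🔵"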
config/model_metadata.py  ADDED

@@ -0,0 +1,112 @@
+from dataclasses import dataclass
+from typing import Literal
+
+
+@dataclass
+class ModelMetadata:
+    url: str  # HF model card
+    params: float  # in B
+    model_type: Literal["General", "Coding", "RTL-Specific"]
+    release: Literal["V1", "V2", "V3"]  # release of the leaderboard for which the model was included
+    model_arch: Literal["Dense", "Reasoning"]  # to distinguish between reasoners and non-reasoners
+
+
+# fmt: off
+MODELS = {
+    "DeepSeek R1-0528": ModelMetadata(
+        "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528", 685, "General", "V2", "Reasoning"
+    ),
+    "DeepSeek R1": ModelMetadata(
+        "https://huggingface.co/deepseek-ai/DeepSeek-R1", 685, "General", "V1", "Reasoning"
+    ),
+    "Llama 3.1 405B": ModelMetadata(
+        "https://huggingface.co/RedHatAI/Meta-Llama-3.1-405B-FP8", 406, "General", "V1", "Dense"
+    ),
+    "Qwen3 236B A22B": ModelMetadata(
+        "https://huggingface.co/Qwen/Qwen3-235B-A22B", 235, "General", "V2", "Reasoning"
+    ),
+    "Llama 3.(1-3) 70B": ModelMetadata(
+        "https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct", 70.6, "General", "V1", "Dense"
+    ),
+    "Qwen2.5 72B": ModelMetadata(
+        "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct", 72.7, "General", "V1", "Dense"
+    ),
+    "QwQ 32B": ModelMetadata(
+        "https://huggingface.co/Qwen/QwQ-32B", 32.8, "General", "V2", "Reasoning"
+    ),
+    "Qwen2.5 32B": ModelMetadata(
+        "https://huggingface.co/Qwen/Qwen2.5-32B", 32.5, "General", "V1", "Dense"
+    ),
+    "StarChat2 15B v0.1": ModelMetadata(
+        "https://huggingface.co/HuggingFaceH4/starchat2-15b-v0.1", 16, "General", "V1", "Dense"
+    ),
+    "DeepSeek R1 Distill Qwen 14B": ModelMetadata(
+        "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B", 14.8, "General", "V1", "Reasoning"
+    ),
+    "CodeLlama 70B": ModelMetadata(
+        "https://huggingface.co/codellama/CodeLlama-70b-hf", 69, "Coding", "V1", "Dense"
+    ),
+    "QwenCoder 2.5 32B": ModelMetadata(
+        "https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct", 32.5, "Coding", "V1", "Dense"
+    ),
+    "DeepSeek Coder 33B": ModelMetadata(
+        "https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct", 33.3, "Coding", "V1", "Dense"
+    ),
+    "QwenCoder 2.5 14B": ModelMetadata(
+        "https://huggingface.co/Qwen/Qwen2.5-Coder-14B-Instruct", 14.7, "Coding", "V1", "Dense"
+    ),
+    "DeepCoder 14B": ModelMetadata(
+        "https://huggingface.co/agentica-org/DeepCoder-14B-Preview", 14.8, "Coding", "V2", "Reasoning"
+    ),
+    "OpenCoder 8B": ModelMetadata(
+        "https://huggingface.co/infly/OpenCoder-8B-Instruct", 7.77, "Coding", "V1", "Dense"
+    ),
+    "SeedCoder 8B": ModelMetadata(
+        "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Instruct", 8.25, "Coding", "V2", "Dense"
+    ),
+    "SeedCoder 8B Reasoning": ModelMetadata(
+        "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Reasoning-bf16", 8.25, "Coding", "V2", "Reasoning"
+    ),
+    "QwenCoder 2.5 7B": ModelMetadata(
+        "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct", 7.61, "Coding", "V1", "Dense"
+    ),
+    "DeepSeek Coder 6.7B": ModelMetadata(
+        "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct", 6.74, "Coding", "V1", "Dense"
+    ),
+    "HaVen-CodeQwen": ModelMetadata(
+        "https://huggingface.co/yangyiyao/HaVen-CodeQwen", 7.25, "RTL-Specific", "V1", "Dense"
+    ),
+    "CodeV R1 Distill Qwen 7B": ModelMetadata(
+        "https://huggingface.co/zhuyaoyu/CodeV-R1-Distill-Qwen-7B", 7.62, "RTL-Specific", "V2", "Reasoning"
+    ),
+    "CodeV-CL-7B": ModelMetadata(
+        "https://huggingface.co/yang-z/CodeV-CL-7B", 6.74, "RTL-Specific", "V1", "Dense"
+    ),
+    "CodeV-QW-7B": ModelMetadata(
+        "https://huggingface.co/yang-z/CodeV-QW-7B", 7.25, "RTL-Specific", "V1", "Dense"
+    ),
+    "CodeV-DS-6.7B": ModelMetadata(
+        "https://huggingface.co/yang-z/CodeV-DS-6.7B", 6.74, "RTL-Specific", "V1", "Dense"
+    ),
+    "RTLCoder Mistral": ModelMetadata(
+        "https://huggingface.co/ishorn5/RTLCoder-v1.1", 7.24, "RTL-Specific", "V1", "Dense"
+    ),
+    "RTLCoder DeepSeek": ModelMetadata(
+        "https://huggingface.co/ishorn5/RTLCoder-Deepseek-v1.1", 6.74, "RTL-Specific", "V1", "Dense"
+    ),
+    "OriGen": ModelMetadata(
+        "https://huggingface.co/henryen/OriGen", 6.74, "RTL-Specific", "V1", "Dense"
+    ),
+    "Qwen3 Coder 480B A35B": ModelMetadata(
+        "https://huggingface.co/Qwen/Qwen3-Coder-480B-A35B-Instruct", 480, "Coding", "V2", "Dense"
+    ),
+    "Magistral Small 2506": ModelMetadata(
+        "https://huggingface.co/mistralai/Magistral-Small-2506", 23.6, "General", "V2", "Reasoning"
+    ),
+    "gpt-oss-20b": ModelMetadata(
+        "https://huggingface.co/openai/gpt-oss-20b", 21.5, "General", "V2", "Reasoning"
+    ),
+    "gpt-oss-120b": ModelMetadata(
+        "https://huggingface.co/openai/gpt-oss-120b", 120, "General", "V2", "Reasoning"
+    ),
+}
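Note: the dataclass replaces the anonymous 5-tuples that previously lived in results/parse.py, so each field is now accessed by name rather than by position. For example:

# Usage sketch: looking up a model in the new registry.
from config.model_metadata import MODELS

meta = MODELS["DeepSeek R1"]
print(meta.url)         # https://huggingface.co/deepseek-ai/DeepSeek-R1
print(meta.params)      # 685
print(meta.model_arch)  # "Reasoning"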
results/parse.py  CHANGED

--- a/results/parse.py (old; removed lines marked "-", some removed lines and line tails were lost in the page extraction and are left blank)

@@ -1,238 +1,11 @@
-import csv
-import json
-import locale
 import os
 import sys
-
-
 import pandas as pd
 
-
-
-        "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528",
-        685,
-        "General",
-        "V2",
-        "Reasoning",  # "Dense" or "Reasoning"
-    ),
-    "DeepSeek R1": (
-        "https://huggingface.co/deepseek-ai/DeepSeek-R1",
-        685,
-        "General",
-        "V1",
-        "Reasoning",
-    ),
-    "Llama 3.1 405B": (
-        "https://huggingface.co/RedHatAI/Meta-Llama-3.1-405B-FP8",
-        406,
-        "General",
-        "V1",
-        "Dense",
-    ),
-    "Qwen3 236B A22B": (
-        "https://huggingface.co/Qwen/Qwen3-235B-A22B",
-        235,
-        "General",
-        "V2",
-        "Reasoning",
-    ),
-    "Llama 3.(1-3) 70B": (
-        "https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct",
-        70.6,
-        "General",
-        "V1",
-        "Dense",
-    ),
-    "Qwen2.5 72B": (
-        "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",
-        72.7,
-        "General",
-        "V1",
-        "Dense",
-    ),
-    "QwQ 32B": (
-        "https://huggingface.co/Qwen/QwQ-32B",
-        32.8,
-        "General",
-        "V2",
-        "Reasoning",
-    ),
-    "Qwen2.5 32B": (
-        "https://huggingface.co/Qwen/Qwen2.5-32B",
-        32.5,
-        "General",
-        "V1",
-        "Dense",
-    ),
-    "StarChat2 15B v0.1": (
-        "https://huggingface.co/HuggingFaceH4/starchat2-15b-v0.1",
-        16,
-        "General",
-        "V1",
-        "Dense",
-    ),
-    "DeepSeek R1 Distill Qwen 14B": (
-        "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
-        14.8,
-        "General",
-        "V1",
-        "Reasoning",
-    ),
-    "CodeLlama 70B": (
-        "https://huggingface.co/codellama/CodeLlama-70b-hf",
-        69,
-        "Coding",
-        "V1",
-        "Dense",
-    ),
-    "QwenCoder 2.5 32B": (
-        "https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct",
-        32.5,
-        "Coding",
-        "V1",
-        "Dense",
-    ),
-    "DeepSeek Coder 33B": (
-        "https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct",
-        33.3,
-        "Coding",
-        "V1",
-        "Dense",
-    ),
-    "QwenCoder 2.5 14B": (
-        "https://huggingface.co/Qwen/Qwen2.5-Coder-14B-Instruct",
-        14.7,
-        "Coding",
-        "V1",
-        "Dense",
-    ),
-    "DeepCoder 14B": (
-        "https://huggingface.co/agentica-org/DeepCoder-14B-Preview",
-        14.8,
-        "Coding",
-        "V2",
-        "Reasoning",
-    ),
-    "OpenCoder 8B": (
-        "https://huggingface.co/infly/OpenCoder-8B-Instruct",
-        7.77,
-        "Coding",
-        "V1",
-        "Dense",
-    ),
-    "SeedCoder 8B": (
-        "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Instruct",
-        8.25,
-        "Coding",
-        "V2",
-        "Dense",
-    ),
-    "SeedCoder 8B Reasoning": (
-        "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Reasoning-bf16",
-        8.25,
-        "Coding",
-        "V2",
-        "Reasoning",
-    ),
-    "QwenCoder 2.5 7B": (
-        "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct",
-        7.61,
-        "Coding",
-        "V1",
-        "Dense",
-    ),
-    "DeepSeek Coder 6.7B": (
-        "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct",
-        6.74,
-        "Coding",
-        "V1",
-        "Dense",
-    ),
-    "HaVen-CodeQwen": (
-        "https://huggingface.co/yangyiyao/HaVen-CodeQwen",
-        7.25,
-        "RTL-Specific",
-        "V1",
-        "Dense",
-    ),
-    "CodeV R1 Distill Qwen 7B": (
-        "https://huggingface.co/zhuyaoyu/CodeV-R1-Distill-Qwen-7B",
-        7.62,
-        "RTL-Specific",
-        "V2",
-        "Reasoning",
-    ),
-    "CodeV-CL-7B": (
-        "https://huggingface.co/yang-z/CodeV-CL-7B",
-        6.74,
-        "RTL-Specific",
-        "V1",
-        "Dense",
-    ),
-    "CodeV-QW-7B": (
-        "https://huggingface.co/yang-z/CodeV-QW-7B",
-        7.25,
-        "RTL-Specific",
-        "V1",
-        "Dense",
-    ),
-    "CodeV-DS-6.7B": (
-        "https://huggingface.co/yang-z/CodeV-DS-6.7B",
-        6.74,
-        "RTL-Specific",
-        "V1",
-        "Dense",
-    ),
-    "RTLCoder Mistral": (
-        "https://huggingface.co/ishorn5/RTLCoder-v1.1",
-        7.24,
-        "RTL-Specific",
-        "V1",
-        "Dense",
-    ),
-    "RTLCoder DeepSeek": (
-        "https://huggingface.co/ishorn5/RTLCoder-Deepseek-v1.1",
-        6.74,
-        "RTL-Specific",
-        "V1",
-        "Dense",
-    ),
-    "OriGen": (
-        "https://huggingface.co/henryen/OriGen",
-        6.74,
-        "RTL-Specific",
-        "V1",
-        "Dense",
-    ),
-    "Qwen3 Coder 480B A35B": (
-        "https://huggingface.co/Qwen/Qwen3-Coder-480B-A35B-Instruct",
-        480,
-        "Coding",
-        "V2",
-        "Dense",
-    ),
-    "Magistral Small 2506": (
-        "https://huggingface.co/mistralai/Magistral-Small-2506",
-        23.6,
-        "General",
-        "V2",
-        "Reasoning",
-    ),
-    "gpt-oss-20b": (
-        "https://huggingface.co/openai/gpt-oss-20b",
-        21.5,
-        "General",
-        "V2",
-        "Reasoning",
-    ),
-    "gpt-oss-120b": (
-        "https://huggingface.co/openai/gpt-oss-120b",
-        120,
-        "General",
-        "V2",
-        "Reasoning",
-    ),
-}
 
 
 def get_headers(reader, agg=False) -> Union[list, list]:

@@ -248,15 +21,19 @@ def get_headers(reader, agg=False) -> Union[list, list]:
     return metrics, benchs
 
 
-def
-
-
-
-
-
-
-
-
 
 
 def parse_results(csv_path: str) -> list[dict]:

@@ -275,7 +52,7 @@ def parse_results(csv_path: str) -> list[dict]:
         model = row[0]
         if not model:
             continue
-        url, params, type, release, reasoning =
         models.append(model)
         row = row[1:]
         ctr = 0

@@ -294,7 +71,6 @@ def parse_results(csv_path: str) -> list[dict]:
             record["Thinking"] = reasoning
             dataset.append(record)
             ctr += 1
-    print(models)
     return dataset
 
 

@@ -318,9 +94,7 @@ def read_json(json_path: str = "results/results_icarus.json"):
     return data
 
 
-def
-    json_path: str = "results/results_icarus.json",
-) -> tuple[pd.DataFrame, list, list, str]:
     data = read_json(json_path)
     df = pd.DataFrame(data)
     df.rename(

@@ -334,11 +108,21 @@ def read_data(
         inplace=True,
     )
     df["Params"] = pd.to_numeric(df["Params"], errors="coerce")
     benchmarks = sorted(df["Benchmark"].unique().tolist(), reverse=True)
     metrics = df["Metric"].unique().tolist()
-    default_metric = (
-
-
     return df, benchmarks, metrics, default_metric
 
 

+++ b/results/parse.py (new; added lines marked "+", "..." marks unchanged context elided between hunks)

 import os
 import sys
+import csv
+import json
 import pandas as pd
 
+from typing import Dict, Union
+from config.model_metadata import MODELS
 
 
 def get_headers(reader, agg=False) -> Union[list, list]:
...
     return metrics, benchs
 
 
+def get_model_metadata(model_key: str) -> tuple[str, float, str, str, str]:
+    try:
+        model_metadata = MODELS[model_key]
+    except KeyError:
+        raise KeyError(f"Unknown model: {model_key}")
+
+    return (
+        model_metadata.url,
+        model_metadata.params,
+        model_metadata.model_type,
+        model_metadata.release,
+        model_metadata.model_arch,
+    )
 
 
 def parse_results(csv_path: str) -> list[dict]:
...
         model = row[0]
         if not model:
             continue
+        url, params, type, release, reasoning = get_model_metadata(model)
         models.append(model)
         row = row[1:]
         ctr = 0
...
             record["Thinking"] = reasoning
             dataset.append(record)
             ctr += 1
     return dataset
 
 
...
     return data
 
 
+def read_dataframe(json_path: str) -> pd.DataFrame:
     data = read_json(json_path)
     df = pd.DataFrame(data)
     df.rename(
...
         inplace=True,
     )
     df["Params"] = pd.to_numeric(df["Params"], errors="coerce")
+    return df
+
+
+def get_metadata(df: pd.DataFrame) -> tuple[list, list, str]:
     benchmarks = sorted(df["Benchmark"].unique().tolist(), reverse=True)
     metrics = df["Metric"].unique().tolist()
+    default_metric = "Functionality (FNC)" if "Functionality (FNC)" in metrics else metrics[0]
+    return benchmarks, metrics, default_metric
+
+
+def read_data(
+    json_path: str = "results/results_icarus.json",
+) -> tuple[pd.DataFrame, list, list, str]:
+    df = read_dataframe(json_path)
+    benchmarks, metrics, default_metric = get_metadata(df)
     return df, benchmarks, metrics, default_metric
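Note: read_data() is kept as a thin wrapper over the two new helpers, so existing callers still get the same 4-tuple. A usage sketch (the JSON path is the repo's own default):

# Usage sketch: the split helpers vs. the backwards-compatible wrapper.
from results.parse import get_metadata, read_data, read_dataframe

df = read_dataframe("results/results_icarus.json")
benchmarks, metrics, default_metric = get_metadata(df)

# equivalent, via the wrapper kept for existing callers:
df, benchmarks, metrics, default_metric = read_data("results/results_icarus.json")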
utils.py  CHANGED

--- a/utils.py (old; removed lines marked "-", some removed lines and line tails were lost in the page extraction and are left blank)

@@ -6,13 +6,7 @@ import pandas as pd
 import plotly.express as px
 import plotly.graph_objects as go
 
-
-type_emoji = {
-    "RTL-Specific": "🔴",
-    "General": "🟢",
-    "Coding": "🔵"
-}
-# fmt: on
 
 
 def model_hyperlink(link, model_name, release, thinking=False):

@@ -23,11 +17,7 @@ def model_hyperlink(link, model_name, release, thinking=False):
     if release == "V1":
         return ret + reasoning_badge if thinking == "Reasoning" else ret
     else:
-        return
-            ret + reasoning_badge + new_badge
-            if thinking == "Reasoning"
-            else ret + new_badge
-        )
 
 
 def handle_special_cases(benchmark, metric):

@@ -39,13 +29,19 @@ def handle_special_cases(benchmark, metric):
 
 
 def filter_RTLRepo(subset: pd.DataFrame) -> pd.DataFrame:
     subset = subset.drop(subset[subset.Score < 0.0].index)
-
-
-
-
-
     )
     filtered_df = pd.merge(filtered_df, details, on="Model", how="left")
     filtered_df["Model"] = filtered_df.apply(
         lambda row: model_hyperlink(

@@ -55,31 +51,28 @@ def filter_RTLRepo(subset: pd.DataFrame) -> pd.DataFrame:
         ),
         axis=1,
     )
-    filtered_df["Type"] = filtered_df["Model Type"].map(lambda x:
     filtered_df = filtered_df[["Type", "Model", "Params", "Exact Matching (EM)"]]
-    filtered_df = filtered_df.sort_values(
-        by="Exact Matching (EM)", ascending=False
-    ).reset_index(drop=True)
     return filtered_df
 
 
 def filter_bench(subset: pd.DataFrame, df_agg=None, agg_column=None) -> pd.DataFrame:
-
-
-
     if "RTLLM" in subset["Benchmark"].unique():
         pivot_df = (
-            subset.pivot_table(
-                index="Model", columns="Metric", values="Score", aggfunc=custom_agg_s2r
-            )
             .reset_index()
             .round(2)
         )
     else:
         pivot_df = (
-            subset.pivot_table(
-                index="Model", columns="Metric", values="Score", aggfunc=custom_agg_cc
-            )
             .reset_index()
             .round(2)
         )

@@ -94,39 +87,20 @@ def filter_bench(subset: pd.DataFrame, df_agg=None, agg_column=None) -> pd.DataFrame:
 
     pivot_df = pd.merge(pivot_df, details, on="Model", how="left")
     pivot_df["Model"] = pivot_df.apply(
-        lambda row: model_hyperlink(
-            row["Model URL"], row["Model"], row["Release"], row["Thinking"]
-        ),
         axis=1,
     )
-    pivot_df["Type"] = pivot_df["Model Type"].map(lambda x:
-
-
-
 
-    pivot_df.rename(
-        columns={
-            "Params": "Parameters (B)",
-            "Syntax (STX)": "Syntax",
-            "Functionality (FNC)": "Functionality",
-            "Synthesis (SYN)": "Synthesis",
-            "Post-Synthesis (PSQ)": "Post-Synthesis",
-        },
-        inplace=True,
-    )
-    columns_order = [
-        "Type",
-        "Model",
-        "Parameters (B)",
-        "Syntax",
-        "Functionality",
-        "Synthesis",
-        "Post-Synthesis",
-    ]
-    pivot_df = pivot_df[[col for col in columns_order if col in pivot_df.columns]]
-    pivot_df = pivot_df.sort_values(by="Functionality", ascending=False).reset_index(
-        drop=True
-    )
     return pivot_df
 
 

@@ -154,65 +128,40 @@ def custom_agg_cc(vals):
     return round(result, 2)
 
 
-def filter_bench_all(
-    subset:
-
-
-
-
     if "RTLLM" in subset["Benchmark"].unique():
         pivot_df = (
-            subset.pivot_table(
-                index="Model", columns="Metric", values="Score", aggfunc=custom_agg_s2r
-            )
             .reset_index()
             .round(2)
         )
     else:
         pivot_df = (
-            subset.pivot_table(
-                index="Model", columns="Metric", values="Score", aggfunc=custom_agg_cc
-            )
             .reset_index()
             .round(2)
         )
 
     pivot_df = pd.merge(pivot_df, details, on="Model", how="left")
-    print(pivot_df.columns)
     pivot_df["Model"] = pivot_df.apply(
-        lambda row: model_hyperlink(
-            row["Model URL"], row["Model"], row["Release"], row["Thinking"]
-        ),
         axis=1,
     )
-    pivot_df["Type"] = pivot_df["Model Type"].map(lambda x:
-    pivot_df["Post-Synthesis Quality"] = (
-        pivot_df[["Power", "Performance", "Area"]].mean(axis=1).round(2)
-    )
 
-    pivot_df.
-
-
-
-
-
-
-
-        },
-        inplace=True,
-    )
 
-    columns_order = [
-        "Type",
-        "Model",
-        "Parameters (B)",
-        "Syntax",
-        "Functionality",
-        "Synthesis",
-        "Post-Synthesis",
-    ]
-    pivot_df = pivot_df[[col for col in columns_order if col in pivot_df.columns]]
-    pivot_df = pivot_df.sort_values(by="Functionality", ascending=False).reset_index(
-        drop=True
-    )
     return pivot_df

+++ b/utils.py (new; added lines marked "+", "..." marks unchanged context elided between hunks)

 import plotly.express as px
 import plotly.graph_objects as go
 
+from config.constants import COLUMN_MAPPINGS, COLUMN_ORDER, TYPE_EMOJI
 
 
 def model_hyperlink(link, model_name, release, thinking=False):
...
     if release == "V1":
         return ret + reasoning_badge if thinking == "Reasoning" else ret
     else:
+        return ret + reasoning_badge + new_badge if thinking == "Reasoning" else ret + new_badge
 
 
 def handle_special_cases(benchmark, metric):
...
 
 
 def filter_RTLRepo(subset: pd.DataFrame) -> pd.DataFrame:
+    if subset.empty:
+        return pd.DataFrame(columns=["Type", "Model", "Params", "Exact Matching (EM)"])
+
     subset = subset.drop(subset[subset.Score < 0.0].index)
+
+    # Check again if empty after filtering
+    if subset.empty:
+        return pd.DataFrame(columns=["Type", "Model", "Params", "Exact Matching (EM)"])
+
+    details = subset[["Model", "Model URL", "Model Type", "Params", "Release", "Thinking"]].drop_duplicates(
+        "Model"
     )
+    filtered_df = subset[["Model", "Score"]].rename(columns={"Score": "Exact Matching (EM)"})
     filtered_df = pd.merge(filtered_df, details, on="Model", how="left")
     filtered_df["Model"] = filtered_df.apply(
         lambda row: model_hyperlink(
...
         ),
         axis=1,
     )
+    filtered_df["Type"] = filtered_df["Model Type"].map(lambda x: TYPE_EMOJI.get(x, ""))
     filtered_df = filtered_df[["Type", "Model", "Params", "Exact Matching (EM)"]]
+    filtered_df = filtered_df.sort_values(by="Exact Matching (EM)", ascending=False).reset_index(drop=True)
     return filtered_df
 
 
 def filter_bench(subset: pd.DataFrame, df_agg=None, agg_column=None) -> pd.DataFrame:
+    if subset.empty:
+        return pd.DataFrame(columns=COLUMN_ORDER)
+
+    details = subset[["Model", "Model URL", "Model Type", "Params", "Release", "Thinking"]].drop_duplicates(
+        "Model"
+    )
     if "RTLLM" in subset["Benchmark"].unique():
         pivot_df = (
+            subset.pivot_table(index="Model", columns="Metric", values="Score", aggfunc=custom_agg_s2r)
             .reset_index()
             .round(2)
         )
     else:
         pivot_df = (
+            subset.pivot_table(index="Model", columns="Metric", values="Score", aggfunc=custom_agg_cc)
             .reset_index()
             .round(2)
         )
...
 
     pivot_df = pd.merge(pivot_df, details, on="Model", how="left")
     pivot_df["Model"] = pivot_df.apply(
+        lambda row: model_hyperlink(row["Model URL"], row["Model"], row["Release"], row["Thinking"]),
         axis=1,
     )
+    pivot_df["Type"] = pivot_df["Model Type"].map(lambda x: TYPE_EMOJI.get(x, ""))
+
+    if all(col in pivot_df.columns for col in ["Power", "Performance", "Area"]):
+        pivot_df["Post-Synthesis (PSQ)"] = pivot_df[["Power", "Performance", "Area"]].mean(axis=1).round(2)
+
+    pivot_df.rename(columns=COLUMN_MAPPINGS, inplace=True)
+    pivot_df = pivot_df[[col for col in COLUMN_ORDER if col in pivot_df.columns]]
+
+    if "Functionality" in pivot_df.columns:
+        pivot_df = pivot_df.sort_values(by="Functionality", ascending=False).reset_index(drop=True)
 
     return pivot_df
 
 
...
     return round(result, 2)
 
 
+def filter_bench_all(subset: pd.DataFrame, df_agg=None, agg_column=None) -> pd.DataFrame:
+    if subset.empty:
+        return pd.DataFrame(columns=COLUMN_ORDER)
+
+    details = subset[["Model", "Model URL", "Model Type", "Params", "Release", "Thinking"]].drop_duplicates(
+        "Model"
+    )
     if "RTLLM" in subset["Benchmark"].unique():
         pivot_df = (
+            subset.pivot_table(index="Model", columns="Metric", values="Score", aggfunc=custom_agg_s2r)
             .reset_index()
             .round(2)
         )
     else:
         pivot_df = (
+            subset.pivot_table(index="Model", columns="Metric", values="Score", aggfunc=custom_agg_cc)
             .reset_index()
             .round(2)
         )
 
     pivot_df = pd.merge(pivot_df, details, on="Model", how="left")
     pivot_df["Model"] = pivot_df.apply(
+        lambda row: model_hyperlink(row["Model URL"], row["Model"], row["Release"], row["Thinking"]),
         axis=1,
     )
+    pivot_df["Type"] = pivot_df["Model Type"].map(lambda x: TYPE_EMOJI.get(x, ""))
 
+    if all(col in pivot_df.columns for col in ["Power", "Performance", "Area"]):
+        pivot_df["Post-Synthesis Quality"] = pivot_df[["Power", "Performance", "Area"]].mean(axis=1).round(2)
+
+    pivot_df.rename(columns=COLUMN_MAPPINGS, inplace=True)
+    pivot_df = pivot_df[[col for col in COLUMN_ORDER if col in pivot_df.columns]]
+
+    if "Functionality" in pivot_df.columns:
+        pivot_df = pivot_df.sort_values(by="Functionality", ascending=False).reset_index(drop=True)
 
     return pivot_df
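Note: the Type column lookup now goes through TYPE_EMOJI.get(x, ""), which falls back to an empty cell for an unknown model type, and the new empty-subset guards return an empty frame early, so a filter that matches nothing degrades gracefully instead of raising. A small illustration of the lookup behavior:

# Usage sketch: the defensive emoji lookup used for the Type column.
from config.constants import TYPE_EMOJI

print(TYPE_EMOJI.get("General", ""))  # 🟢
print(TYPE_EMOJI.get("Unknown", ""))  # empty string, no KeyError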