ggcristian committed on
Commit be0239b · 1 Parent(s): cc21640

Small refactor: Add `config` with model metadata and move constants to their own file

Files changed (5)
  1. app.py +80 -451
  2. config/constants.py +58 -0
  3. config/model_metadata.py +112 -0
  4. results/parse.py +32 -248
  5. utils.py +53 -104
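The net effect of the refactor on call sites can be sketched as follows. This snippet is illustrative only and simply exercises the new `config` package introduced in this commit; all names are taken from the diff below.

# Illustrative only: exercising the new config package added in this commit.
from config import constants as C
from config.model_metadata import MODELS

print(C.DEFAULT_TASK, C.SIMULATORS)                    # Spec-to-RTL ['Icarus', 'Verilator']
meta = MODELS["gpt-oss-120b"]                          # ModelMetadata dataclass (see config/model_metadata.py)
print(meta.params, meta.model_type, meta.model_arch)   # 120 General Reasoning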
app.py CHANGED
@@ -1,269 +1,72 @@
1
  import sys
2
 
3
  import gradio as gr
4
- import pandas as pd
5
- import plotly.express as px
6
  from gradio.themes.utils import colors
7
 
8
- from results.parse import parse_agg, read_data
 
 
 
 
9
  from static.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT
 
 
 
 
 
 
 
 
10
  from style.css_html_js import custom_css
11
- from utils import filter_bench, filter_bench_all, filter_RTLRepo, handle_special_cases
12
-
13
-
14
- def filter_leaderboard(task, benchmark, model_type, search_query, max_params):
15
- subset = df.copy()
16
-
17
- # Filter by task specific benchmarks when 'All' benchmarks is selected
18
- if task == "Spec-to-RTL":
19
- valid_benchmarks = s2r_benchs
20
- if benchmark == "All":
21
- subset = subset[subset["Benchmark"].isin(valid_benchmarks)]
22
- elif task == "Code Completion":
23
- valid_benchmarks = cc_benchs
24
- if benchmark == "All":
25
- subset = subset[subset["Benchmark"].isin(valid_benchmarks)]
26
- elif task == "Line Completion †":
27
- valid_benchmarks = lc_benchs
28
- if benchmark == "All":
29
- subset = subset[subset["Benchmark"].isin(valid_benchmarks)]
30
-
31
- if benchmark != "All":
32
- subset = df[df["Benchmark"] == benchmark]
33
-
34
- if model_type != "All":
35
- # without emojis
36
- subset = subset[subset["Model Type"] == model_type.split(" ")[0]]
37
- if search_query:
38
- subset = subset[
39
- subset["Model"].str.contains(search_query, case=False, na=False)
40
- ]
41
- max_params = float(max_params)
42
- subset = subset[subset["Params"] <= max_params]
43
-
44
- if benchmark == "All":
45
- if task == "Spec-to-RTL":
46
- return filter_bench_all(subset, df_agg, agg_column="Agg S2R")
47
- elif task == "Code Completion":
48
- return filter_bench_all(subset, df_agg, agg_column="Agg MC")
49
- elif task == "Line Completion †":
50
- return filter_RTLRepo(subset)
51
- elif benchmark == "RTL-Repo":
52
- return filter_RTLRepo(subset)
53
- else:
54
- agg_column = None
55
- if benchmark == "VerilogEval S2R":
56
- agg_column = "Agg VerilogEval S2R"
57
- elif benchmark == "VerilogEval MC":
58
- agg_column = "Agg VerilogEval MC"
59
- elif benchmark == "RTLLM":
60
- agg_column = "Agg RTLLM"
61
- elif benchmark == "VeriGen":
62
- agg_column = "Agg VeriGen"
63
-
64
- return filter_bench(subset, df_agg, agg_column)
65
-
66
-
67
- def update_benchmarks_by_task(task):
68
- if task == "Spec-to-RTL":
69
- new_benchmarks = ["All"] + s2r_benchs
70
- elif task == "Code Completion":
71
- new_benchmarks = ["All"] + cc_benchs
72
- elif task == "Line Completion †":
73
- new_benchmarks = lc_benchs
74
- else:
75
- new_benchmarks = ["All"] + benchmarks
76
- benchmark_value = "All" if "All" in new_benchmarks else new_benchmarks[0]
77
- filtered = filter_leaderboard(
78
- task,
79
- benchmark_value,
80
- model_type_dropdown.value,
81
- search_box.value,
82
- params_slider.value,
83
- )
84
- return gr.update(value=benchmark_value, choices=new_benchmarks), filtered
85
-
86
-
87
- def generate_scatter_plot(benchmark, metric):
88
- benchmark, metric = handle_special_cases(benchmark, metric)
89
 
90
- subset = df[df["Benchmark"] == benchmark]
91
- if benchmark == "RTL-Repo":
92
- subset = subset[subset["Metric"].str.contains("EM", case=False, na=False)]
93
- detailed_scores = subset.groupby("Model", as_index=False)["Score"].mean()
94
- detailed_scores.rename(columns={"Score": "Exact Matching (EM)"}, inplace=True)
95
- else:
96
- detailed_scores = subset.pivot_table(
97
- index="Model", columns="Metric", values="Score"
98
- ).reset_index()
99
 
100
- details = df[["Model", "Params", "Model Type"]].drop_duplicates("Model")
101
- scatter_data = pd.merge(detailed_scores, details, on="Model", how="left").dropna(
102
- subset=["Params", metric]
103
- )
104
-
105
- scatter_data["x"] = scatter_data["Params"]
106
- scatter_data["y"] = scatter_data[metric]
107
- scatter_data["size"] = (scatter_data["x"] ** 0.3) * 40
108
-
109
- type_colors = {"General": "green", "Coding": "yellow", "RTL-Specific": "blue"}
110
- scatter_data["color"] = scatter_data["Model Type"].map(type_colors).fillna("gray")
111
-
112
- y_axis_limits = {
113
- "Functionality (FNC)": [5, 90],
114
- "Syntax (STX)": [20, 100],
115
- "Synthesis (SYN)": [5, 90],
116
- "Power": [0, 50],
117
- "Performance": [0, 50],
118
- "Area": [0, 50],
119
- "Exact Matching (EM)": [0, 50],
120
- }
121
- y_range = y_axis_limits.get(metric, [0, 80])
122
 
123
- fig = px.scatter(
124
- scatter_data,
125
- x="x",
126
- y="y",
127
- log_x=True,
128
- size="size",
129
- color="Model Type",
130
- text="Model",
131
- hover_data={metric: ":.2f"},
132
- title=f"Params vs. {metric} for {benchmark}",
133
- labels={"x": "# Params (Log Scale)", "y": metric},
134
- template="plotly_white",
135
- height=600,
136
- width=1200,
137
- )
138
-
139
- fig.update_traces(
140
- textposition="top center",
141
- textfont_size=10,
142
- marker=dict(opacity=0.8, line=dict(width=0.5, color="black")),
143
- )
144
- fig.update_layout(
145
- xaxis=dict(
146
- showgrid=True,
147
- type="log",
148
- tickmode="array",
149
- tickvals=[8, 14, 32, 72, 200, 700],
150
- ticktext=["8", "14", "32", "72", "200", "700"],
151
- ),
152
- showlegend=False,
153
- yaxis=dict(range=y_range),
154
- margin=dict(l=50, r=50, t=50, b=50),
155
- plot_bgcolor="white",
156
- )
157
-
158
- return fig
159
-
160
-
161
- with gr.Blocks(
162
- css=custom_css, theme=gr.themes.Default(primary_hue=colors.emerald)
163
- ) as app:
164
- df_icarus, benchmarks, metrics, default_metric = read_data(
165
- "results/results_icarus.json"
166
- )
167
- df_agg_icarus = parse_agg("results/aggregated_scores_icarus.csv")
168
- df_verilator, _, _, _ = read_data("results/results_verilator.json")
169
- df_agg_verilator = parse_agg("results/aggregated_scores_verilator.csv")
170
- df = df_icarus
171
- df_agg = df_agg_icarus
172
- tasks = ["Spec-to-RTL", "Code Completion", "Line Completion †"]
173
- s2r_benchs = ["VerilogEval S2R", "RTLLM"]
174
- cc_benchs = ["VerilogEval MC", "VeriGen"]
175
- lc_benchs = ["RTL-Repo"]
176
- non_rtl_metrics = [
177
- "Syntax (STX)",
178
- "Functionality (FNC)",
179
- "Synthesis (SYN)",
180
- "Power",
181
- "Performance",
182
- "Area",
183
- ]
184
- rtl_metrics = ["Exact Matching (EM)"]
185
- model_types = ["All", "General 🟢", "Coding 🔵", "RTL-Specific 🔴"]
186
 
187
- gr.HTML(
188
- """
189
- <div align="center">
190
- <img src='/gradio_api/file=logo_new.png' alt='TuRTLe Logo' width='220'/>
191
- </div>
192
- """
193
  )
194
- gr.HTML(
195
- """
196
- <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
197
- <script defer src="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/js/all.min.js"></script>
198
- <div style="text-align: center; margin-bottom: 0px; margin-top: 0px;">
199
- <a href="https://github.com/HPAI-BSC/TuRTLe" target="_blank" style="text-decoration: none; margin-right: 10px;">
200
- <button style="background: #333; color: white; padding: 10px 14px; border-radius: 8px; border: none; font-size: 16px; cursor: pointer;">
201
- GitHub Repo
202
- </button>
203
- </a>
204
 
205
- <a href="http://arxiv.org/abs/2504.01986" target="_blank" style="text-decoration: none; margin-right: 10px;">
206
- <button style="background: #b31b1b; color: white; padding: 10px 14px; border-radius: 8px; border: none; font-size: 16px; cursor: pointer;">
207
- arXiv MLCAD 2025
208
- </button>
209
- </a>
210
 
211
- <a href="mailto:hpai@bsc.es?subject=TuRTLe%20leaderboard%20new%20entry&body=Link%20to%20HuggingFace%20Model:" style="text-decoration: none;">
212
- <button style="background: #00674F; color: white; padding: 10px 14px; border-radius: 8px; border: none; font-size: 16px; cursor: pointer;">
213
- How to submit
214
- </button>
215
- </a>
216
- <p style="margin-top: 15px;">If you have any inquiries or wish to collaborate:
217
- <a href="mailto:hpai@bsc.es">hpai@bsc.es</a>
218
- </p>
219
- </div>
220
- """
221
- )
222
- gr.HTML(
223
- """
224
- <div style=" margin-top:-10px !important;">
225
- <p style="margin-bottom: 15px; text-align: start !important;">
226
- Welcome to the TuRTLe Model Leaderboard! TuRTLe is a
227
- <b>unified evaluation framework designed to systematically assess Large Language Models (LLMs) in RTL (Register-Transfer Level) generation</b>
228
- for hardware design.
229
- Evaluation criteria include <b>syntax correctness, functional accuracy, synthesizability, and post-synthesis quality</b>
230
- (PPA: Power, Performance, Area). TuRTLe integrates multiple benchmarks to highlight strengths and weaknesses of available LLMs.
231
- Use the filters below to explore different RTL benchmarks, simulators and models.
232
- </p>
233
- <p style="margin-top:10px; text-align:start !important;">
234
- <span style="font-variant:small-caps; font-weight:bold;">UPDATE (SEPT 2025):</span> Added <span>gpt-oss-20b</span> and <span>gpt-oss-120b</span> to the leaderboard
235
- </p>
236
- <p style="margin-top:-6px; text-align:start !important;">
237
- <span style="font-variant:small-caps; font-weight:bold;">UPDATE (JULY 2025):</span> Our TuRTLe paper was accepted to
238
- <a href="https://mlcad.org/symposium/2025/" target="_blank">MLCAD 2025</a> in September (Santa Cruz, CA), plus we've added Verilator as a new simulator alongside Icarus Verilog
239
- </p>
240
- <p style="margin-top: -6px; text-align: start !important;">
241
- <span style="font-variant: small-caps; font-weight: bold;">UPDATE (JUNE 2025):</span> We make our framework open-source on GitHub and we add 7 new recent models! For a total of 40 base and instruct models and 5 RTL benchmarks
242
- </p>
243
- </div>
244
- """
245
- )
246
- with gr.Tabs():
247
  with gr.Tab("Leaderboard"):
 
248
  with gr.Row(equal_height=True):
249
  with gr.Column(scale=4):
250
- task_radio = gr.Radio(
251
- choices=tasks, label="Select Task", value="Spec-to-RTL"
252
- )
253
  with gr.Column(scale=3):
254
  benchmark_radio = gr.Radio(
255
- choices=["All"] + s2r_benchs,
256
  label="Select Benchmark",
257
- value="All",
258
  )
259
  with gr.Column(scale=2, min_width=180):
260
  simulator_radio = gr.Radio(
261
- choices=["Icarus", "Verilator"],
262
- value="Icarus",
263
  label="Select Simulator",
264
  scale=1,
265
  )
266
 
 
267
  with gr.Row(equal_height=True):
268
  search_box = gr.Textbox(
269
  label="Search Model",
@@ -271,74 +74,61 @@ with gr.Blocks(
271
  scale=2,
272
  )
273
  model_type_dropdown = gr.Radio(
274
- choices=model_types,
275
  label="Select Model Type",
276
- value="All",
277
  scale=3,
278
  )
279
  params_slider = gr.Slider(
280
- minimum=df["Params"].min(),
281
- maximum=700,
282
- value=700,
283
  label="Max Params",
284
  step=1,
285
  scale=2,
286
  )
287
 
 
288
  leaderboard = gr.DataFrame(
289
- value=filter_leaderboard("Spec-to-RTL", "All", "All", "", 700),
 
 
290
  headers="first row",
291
  show_row_numbers=True,
292
  wrap=True,
293
- datatype=[
294
- "html",
295
- "html",
296
- ],
297
  interactive=False,
298
- column_widths=[
299
- "7%",
300
- "28%",
301
- "13%",
302
- "10%",
303
- "13%",
304
- "10%",
305
- "14%",
306
- ],
307
  elem_classes="dataframe-leaderboard",
308
  )
309
 
310
- gr.HTML(
311
- """
312
- <div id="lc-footnote" style="font-size: 13px; opacity: 0.6; margin-top: -5px; z-index:999; text-align: left;">
313
- <span style="font-weight: 600; opacity: 1;">†</span>
314
- <em>Line Completion</em> excludes “reasoning” models since this task targets quick auto-completion<br/>
315
- Additionally, for <em>Line Completion</em> and <em>Code Completion</em> benchmarks we use <b>Base</b> model variant (if available), and for <em>Spec-to-RTL</em> we use <b>Instruct</b> model variant
316
- </div>
317
- """
318
- )
319
 
 
320
  with gr.Tab("Plot View"):
321
  with gr.Row(equal_height=True):
322
- default_benchmark = s2r_benchs[0]
323
  bubble_benchmark = gr.Dropdown(
324
  choices=benchmarks,
325
  label="Select Benchmark",
326
  value=default_benchmark,
327
  elem_classes="gr-dropdown",
328
  )
329
- default_metric = non_rtl_metrics[0]
330
  bubble_metric = gr.Dropdown(
331
- choices=non_rtl_metrics,
332
  label="Select Metric",
333
  value=default_metric,
334
  )
335
  with gr.Row(equal_height=True):
336
  scatter_plot = gr.Plot(
337
- value=generate_scatter_plot(default_benchmark, default_metric),
338
  label="Bubble Chart",
339
  elem_id="full-width-plot",
340
  )
341
 
 
342
  with gr.Tab("Metrics Information"):
343
  with open("./static/metrics.md", "r") as file:
344
  gr.Markdown(
@@ -349,52 +139,12 @@ with gr.Blocks(
349
  ],
350
  elem_classes="metrics-page",
351
  )
352
- with gr.Tab("About Us"):
353
- gr.HTML(
354
- """
355
- <div style="max-width: 800px; margin: auto; padding: 20px; border: 1px solid #ccc; border-radius: 10px;">
356
- <div style="display: flex; justify-content: center; align-items: center; gap: 5%; margin-bottom: 20px;">
357
- <img src='/gradio_api/file=hpai_logo_grad.png' alt='HPAI Group Logo' style="width: 45%;"/>
358
- <img src='/gradio_api/file=bsc-logo.png' alt='BSC Logo' style="width: 25%;"/>
359
- </div>
360
-
361
- <p style="font-size: 16px; text-align: start;">
362
- The <b>High-Performance Artificial Intelligence (HPAI)</b> group is part of the
363
- <a href="https://bsc.es/" target="_blank">Barcelona Supercomputing Center (BSC)</a>.
364
- This leaderboard is maintained by HPAI as part of our commitment to <b>open science</b>.
365
- </p>
366
-
367
- <ul style="font-size: 16px; margin-bottom: 20px; margin-top: 20px;">
368
- <li><a href="https://hpai.bsc.es/" target="_blank">HPAI Website</a></li>
369
- <li><a href="https://github.com/HPAI-BSC/" target="_blank">HPAI GitHub Organization Page</a></li>
370
- <li><a href="https://huggingface.co/HPAI-BSC/" target="_blank">HPAI Hugging Face Organization Page</a></li>
371
- </ul>
372
 
373
- <p style="font-size: 16px; margin-top: 15px;">
374
- Feel free to contact us:
375
- </p>
376
 
377
- <p style="font-size: 16px;">Email: <a href="mailto:hpai@bsc.es"><b>hpai@bsc.es</b></a></p>
378
- </div>
379
- """
380
- )
381
  with gr.Tab("References"):
382
- gr.HTML(
383
- """
384
- <div style="max-width: 800px; margin: auto; padding: 20px; border: 1px solid #ccc; border-radius: 10px;">
385
- <ul style="font-size: 16px; margin-bottom: 20px; margin-top: 20px;">
386
- <li><a href="https://github.com/bigcode-project/bigcode-evaluation-harness" target="_blank">Code Generation LM Evaluation Harness</a></li>
387
- <li>Williams, S. Icarus Verilog [Computer software]. <a href="https://github.com/steveicarus/iverilog" target="_blank">https://github.com/steveicarus/iverilog</a></li>
388
- <li>Snyder, W., Wasson, P., Galbi, D., & et al. Verilator [Computer software]. <a href="https://github.com/verilator/verilator" target="_blank">https://github.com/verilator/verilator</a></li>
389
- <li>RTL-Repo: Allam and M. Shalan, “Rtl-repo: A benchmark for evaluating llms on large-scale rtl design projects,” in 2024 IEEE LLM Aided Design Workshop (LAD). IEEE, 2024, pp. 1–5.</li>
390
- <li>VeriGen: S. Thakur, B. Ahmad, H. Pearce, B. Tan, B. Dolan-Gavitt, R. Karri, and S. Garg, “Verigen: A large language model for verilog code generation,” ACM Transactions on Design Automation of Electronic Systems, vol. 29, no. 3, pp. 1–31, 2024. </li>
391
- <li>VerilogEval (I): M. Liu, N. Pinckney, B. Khailany, and H. Ren, “Verilogeval: Evaluating large language models for verilog code generation,” in 2023 IEEE/ACM International Conference on Computer Aided Design (ICCAD). IEEE, 2023, pp. 1–8.</li>
392
- <li>VerilogEval (II): N. Pinckney, C. Batten, M. Liu, H. Ren, and B. Khailany, “Revisiting VerilogEval: A Year of Improvements in Large-Language Models for Hardware Code Generation,” ACM Trans. Des. Autom. Electron. Syst., feb 2025. https://doi.org/10.1145/3718088</li>
393
- <li>RTLLM: Y. Lu, S. Liu, Q. Zhang, and Z. Xie, “Rtllm: An open-source benchmark for design rtl generation with large language model,” in 2024 29th Asia and South Pacific Design Automation Conference (ASP-DAC). IEEE, 2024, pp. 722–727.</li>
394
- </ul>
395
- </div>
396
- """
397
- )
398
 
399
  with gr.Row():
400
  with gr.Accordion("📙 Citation", open=False):
@@ -406,143 +156,22 @@ with gr.Blocks(
406
  show_copy_button=True,
407
  )
408
 
409
- # event handlers, ugly way but it works
410
- task_radio.change(
411
- fn=update_benchmarks_by_task,
412
- inputs=[task_radio],
413
- outputs=[benchmark_radio, leaderboard],
414
- )
415
- benchmark_radio.change(
416
- fn=filter_leaderboard,
417
- inputs=[
418
- task_radio,
419
- benchmark_radio,
420
- model_type_dropdown,
421
- search_box,
422
- params_slider,
423
- ],
424
- outputs=leaderboard,
425
- )
426
- model_type_dropdown.change(
427
- fn=filter_leaderboard,
428
- inputs=[
429
- task_radio,
430
- benchmark_radio,
431
- model_type_dropdown,
432
- search_box,
433
- params_slider,
434
- ],
435
- outputs=leaderboard,
436
- )
437
- search_box.change(
438
- fn=filter_leaderboard,
439
- inputs=[
440
- task_radio,
441
- benchmark_radio,
442
- model_type_dropdown,
443
- search_box,
444
- params_slider,
445
- ],
446
- outputs=leaderboard,
447
- )
448
- params_slider.change(
449
- fn=filter_leaderboard,
450
- inputs=[
451
- task_radio,
452
- benchmark_radio,
453
- model_type_dropdown,
454
- search_box,
455
- params_slider,
456
- ],
457
- outputs=leaderboard,
458
- )
459
-
460
- def on_benchmark_change(benchmark, _):
461
- if benchmark == "RTL-Repo":
462
- metric = "Exact Matching (EM)"
463
- return gr.update(choices=rtl_metrics, value=metric), generate_scatter_plot(
464
- benchmark, metric
465
- )
466
- else:
467
- metric = non_rtl_metrics[0]
468
- return gr.update(
469
- choices=non_rtl_metrics[:-1], value=metric
470
- ), generate_scatter_plot(benchmark, metric)
471
-
472
- def on_metric_change(benchmark, metric):
473
- benchmark, metric = handle_special_cases(benchmark, metric)
474
- fig = generate_scatter_plot(benchmark, metric)
475
- return gr.update(value=benchmark), fig
476
-
477
- def on_simulator_change(
478
- simulator,
479
- task,
480
- benchmark,
481
- model_type,
482
- search,
483
- max_params,
484
- plot_bench,
485
- plot_metric,
486
- ):
487
- global df, df_agg
488
- if simulator == "Icarus":
489
- df, df_agg = df_icarus, df_agg_icarus
490
- else:
491
- df, df_agg = df_verilator, df_agg_verilator
492
-
493
- leaderboard_df = filter_leaderboard(
494
- task, benchmark, model_type, search, max_params
495
- )
496
- fig = generate_scatter_plot(plot_bench, plot_metric)
497
- return leaderboard_df, fig
498
-
499
- bubble_benchmark.change(
500
- fn=on_benchmark_change,
501
- inputs=[bubble_benchmark, bubble_metric],
502
- outputs=[bubble_metric, scatter_plot],
503
- js=""" // this is to avoid resetting user scroll each time a plot is re-generated
504
- (benchmark, metric) => {
505
- let scrollY = window.scrollY;
506
- const observer = new MutationObserver(() => {
507
- window.scrollTo(0, scrollY);
508
- observer.disconnect();
509
- });
510
- observer.observe(document.getElementById('full-width-plot'), { childList: true });
511
- return [benchmark, metric];
512
- }
513
- """,
514
- )
515
-
516
- bubble_metric.change(
517
- fn=on_metric_change,
518
- inputs=[bubble_benchmark, bubble_metric],
519
- outputs=[bubble_benchmark, scatter_plot],
520
- js=""" // this is to avoid resetting user scroll each time a plot is re-generated
521
- (benchmark, metric) => {
522
- let scrollY = window.scrollY;
523
- const observer = new MutationObserver(() => {
524
- window.scrollTo(0, scrollY);
525
- observer.disconnect();
526
- });
527
- observer.observe(document.getElementById('full-width-plot'), { childList: true });
528
- return [benchmark, metric];
529
- }
530
- """,
531
- )
532
-
533
- simulator_radio.change(
534
- fn=on_simulator_change,
535
- inputs=[
536
- simulator_radio,
537
- task_radio,
538
- benchmark_radio,
539
- model_type_dropdown,
540
- search_box,
541
- params_slider,
542
- bubble_benchmark,
543
- bubble_metric,
544
- ],
545
- outputs=[leaderboard, scatter_plot],
546
  )
547
 
548
 
 
1
  import sys
2
 
3
  import gradio as gr
 
 
4
  from gradio.themes.utils import colors
5
 
6
+ from config import constants as C
7
+ from handlers.leaderboard_handlers import create_leaderboard_handlers
8
+ from results.parse import get_metadata, parse_agg, read_dataframe
9
+ from src.data_processing import filter_leaderboard, generate_scatter_plot
10
+ from src.models import Simulator
11
  from static.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT
12
+ from static.html_content import (
13
+ ABOUT_US_HTML,
14
+ HEADER_HTML,
15
+ INTRO_HTML,
16
+ LC_FOOTNOTE_HTML,
17
+ NAV_BUTTONS_HTML,
18
+ REFERENCES_HTML,
19
+ )
20
  from style.css_html_js import custom_css
21
 
22
+ with gr.Blocks(css=custom_css, theme=gr.themes.Default(primary_hue=colors.emerald)) as app:
23
+ # Load csv results
24
+ df_icarus = read_dataframe(C.ICARUS_RESULTS)
25
+ df_verilator = read_dataframe(C.VERILATOR_RESULTS)
 
 
 
 
 
26
 
27
+ # Load aggregated scores
28
+ df_agg_icarus = parse_agg(C.ICARUS_AGG)
29
+ df_agg_verilator = parse_agg(C.VERILATOR_AGG)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
+ # columns of the csvs
32
+ benchmarks, metrics, default_metric = get_metadata(df_icarus)
33
 
34
+ # Each time we select a simulator, we need to use that sim's dataframe
35
+ state = Simulator(
36
+ icarus_df=df_icarus,
37
+ icarus_agg=df_agg_icarus,
38
+ verilator_df=df_verilator,
39
+ verilator_agg=df_agg_verilator,
40
  )
 
 
 
 
 
 
 
 
 
 
41
 
42
+ # Header view
43
+ gr.HTML(HEADER_HTML)
44
+ gr.HTML(NAV_BUTTONS_HTML)
45
+ gr.HTML(INTRO_HTML)
 
46
 
47
+ # Main view
48
+ with gr.Tabs() as tabs:
49
+ # Leaderboard
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  with gr.Tab("Leaderboard"):
51
+ # 1st row filters (select task, benchmark and sim)
52
  with gr.Row(equal_height=True):
53
  with gr.Column(scale=4):
54
+ task_radio = gr.Radio(choices=C.TASKS, label="Select Task", value=C.DEFAULT_TASK)
 
 
55
  with gr.Column(scale=3):
56
  benchmark_radio = gr.Radio(
57
+ choices=[C.DEFAULT_BENCHMARK] + C.S2R_BENCHMARKS,
58
  label="Select Benchmark",
59
+ value=C.DEFAULT_BENCHMARK,
60
  )
61
  with gr.Column(scale=2, min_width=180):
62
  simulator_radio = gr.Radio(
63
+ choices=C.SIMULATORS,
64
+ value=C.SIMULATORS[0],
65
  label="Select Simulator",
66
  scale=1,
67
  )
68
 
69
+ # 2nd row filters (search, model type, params)
70
  with gr.Row(equal_height=True):
71
  search_box = gr.Textbox(
72
  label="Search Model",
 
74
  scale=2,
75
  )
76
  model_type_dropdown = gr.Radio(
77
+ choices=C.MODEL_TYPES,
78
  label="Select Model Type",
79
+ value=C.DEFAULT_MODEL_TYPE,
80
  scale=3,
81
  )
82
  params_slider = gr.Slider(
83
+ minimum=state.get_current_df()["Params"].min(),
84
+ maximum=C.DEFAULT_MAX_PARAMS,
85
+ value=C.DEFAULT_MAX_PARAMS,
86
  label="Max Params",
87
  step=1,
88
  scale=2,
89
  )
90
 
91
+ # main leaderboard content
92
  leaderboard = gr.DataFrame(
93
+ value=filter_leaderboard(
94
+ C.DEFAULT_TASK, C.DEFAULT_BENCHMARK, C.DEFAULT_MODEL_TYPE, "", C.DEFAULT_MAX_PARAMS, state
95
+ ),
96
  headers="first row",
97
  show_row_numbers=True,
98
  wrap=True,
99
+ datatype=["html", "html"],
 
 
 
100
  interactive=False,
101
+ column_widths=["7%", "28%", "13%", "10%", "13%", "10%", "14%"],
 
 
 
 
 
 
 
 
102
  elem_classes="dataframe-leaderboard",
103
  )
104
 
105
+ # caption for the Base vs Instruct models
106
+ gr.HTML(LC_FOOTNOTE_HTML)
 
 
 
 
 
 
 
107
 
108
+ # all plots using Plotly
109
  with gr.Tab("Plot View"):
110
  with gr.Row(equal_height=True):
111
+ default_benchmark = C.S2R_BENCHMARKS[0]
112
  bubble_benchmark = gr.Dropdown(
113
  choices=benchmarks,
114
  label="Select Benchmark",
115
  value=default_benchmark,
116
  elem_classes="gr-dropdown",
117
  )
118
+ default_metric = C.NON_RTL_METRICS[0]
119
  bubble_metric = gr.Dropdown(
120
+ choices=C.NON_RTL_METRICS,
121
  label="Select Metric",
122
  value=default_metric,
123
  )
124
  with gr.Row(equal_height=True):
125
  scatter_plot = gr.Plot(
126
+ value=generate_scatter_plot(default_benchmark, default_metric, state),
127
  label="Bubble Chart",
128
  elem_id="full-width-plot",
129
  )
130
 
131
+ # Markdown / Latex explaining our metrics
132
  with gr.Tab("Metrics Information"):
133
  with open("./static/metrics.md", "r") as file:
134
  gr.Markdown(
 
139
  ],
140
  elem_classes="metrics-page",
141
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
 
143
+ with gr.Tab("About Us"):
144
+ gr.HTML(ABOUT_US_HTML)
 
145
 
 
 
 
 
146
  with gr.Tab("References"):
147
+ gr.HTML(REFERENCES_HTML)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
149
  with gr.Row():
150
  with gr.Accordion("📙 Citation", open=False):
 
156
  show_copy_button=True,
157
  )
158
 
159
+ # all event handlers are declared at /handlers/
160
+ # if you need to add interactivity, then you'll need to add one!
161
+ create_leaderboard_handlers(
162
+ filter_leaderboard_fn=filter_leaderboard,
163
+ generate_scatter_plot_fn=generate_scatter_plot,
164
+ task_radio=task_radio,
165
+ benchmark_radio=benchmark_radio,
166
+ model_type_dropdown=model_type_dropdown,
167
+ search_box=search_box,
168
+ params_slider=params_slider,
169
+ bubble_benchmark=bubble_benchmark,
170
+ bubble_metric=bubble_metric,
171
+ scatter_plot=scatter_plot,
172
+ leaderboard=leaderboard,
173
+ simulator_radio=simulator_radio,
174
+ state=state,
175
  )
176
 
177
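app.py now threads a `Simulator` state object (from `src/models.py`, which is not part of this commit) through the filter and plot functions instead of the old module-level `df` / `df_agg` globals. A minimal sketch consistent with how app.py uses it — the constructor kwargs and `get_current_df()` are taken from the diff above; the `current` field and `get_current_agg()` are assumptions — could look like:

# Hypothetical sketch of src/models.Simulator; the real class is not shown in this commit.
from dataclasses import dataclass
import pandas as pd

@dataclass
class Simulator:
    icarus_df: pd.DataFrame
    icarus_agg: pd.DataFrame
    verilator_df: pd.DataFrame
    verilator_agg: pd.DataFrame
    current: str = "Icarus"          # assumed: tracks the simulator selected in the UI

    def get_current_df(self) -> pd.DataFrame:
        # app.py calls this to seed the "Max Params" slider
        return self.icarus_df if self.current == "Icarus" else self.verilator_df

    def get_current_agg(self) -> pd.DataFrame:
        # assumed counterpart for the aggregated-score tables
        return self.icarus_agg if self.current == "Icarus" else self.verilator_agg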
 
config/constants.py ADDED
@@ -0,0 +1,58 @@
1
+ RESULTS_DIR = "results"
2
+ ICARUS_RESULTS = f"{RESULTS_DIR}/results_icarus.json"
3
+ VERILATOR_RESULTS = f"{RESULTS_DIR}/results_verilator.json"
4
+ ICARUS_AGG = f"{RESULTS_DIR}/aggregated_scores_icarus.csv"
5
+ VERILATOR_AGG = f"{RESULTS_DIR}/aggregated_scores_verilator.csv"
6
+
7
+ TASKS = ["Spec-to-RTL", "Code Completion", "Line Completion †"]
8
+ S2R_BENCHMARKS = ["VerilogEval S2R", "RTLLM"]
9
+ CC_BENCHMARKS = ["VerilogEval MC", "VeriGen"]
10
+ LC_BENCHMARKS = ["RTL-Repo"]
11
+
12
+ MODEL_TYPES = ["All", "General 🟢", "Coding 🔵", "RTL-Specific 🔴"]
13
+ TYPE_EMOJI = {"RTL-Specific": "🔴", "General": "🟢", "Coding": "🔵"}
14
+
15
+ NON_RTL_METRICS = ["Syntax (STX)", "Functionality (FNC)", "Synthesis (SYN)", "Power", "Performance", "Area"]
16
+ RTL_METRICS = ["Exact Matching (EM)"]
17
+
18
+ COLUMN_MAPPINGS = {
19
+ "Params": "Parameters (B)",
20
+ "Syntax (STX)": "Syntax",
21
+ "Functionality (FNC)": "Functionality",
22
+ "Synthesis (SYN)": "Synthesis",
23
+ "Post-Synthesis (PSQ)": "Post-Synthesis",
24
+ }
25
+
26
+ COLUMN_ORDER = [
27
+ "Type",
28
+ "Model",
29
+ "Parameters (B)",
30
+ "Syntax",
31
+ "Functionality",
32
+ "Synthesis",
33
+ "Post-Synthesis",
34
+ ]
35
+
36
+ TYPE_COLORS = {"General": "green", "Coding": "yellow", "RTL-Specific": "blue"}
37
+
38
+ Y_AXIS_LIMITS = {
39
+ "Functionality (FNC)": [5, 90],
40
+ "Syntax (STX)": [20, 100],
41
+ "Synthesis (SYN)": [5, 90],
42
+ "Power": [0, 50],
43
+ "Performance": [0, 50],
44
+ "Area": [0, 50],
45
+ "Exact Matching (EM)": [0, 50],
46
+ }
47
+
48
+ SCATTER_PLOT_X_TICKS = {
49
+ "tickvals": [8, 14, 32, 72, 200, 700],
50
+ "ticktext": ["8", "14", "32", "72", "200", "700"],
51
+ }
52
+
53
+ DEFAULT_MAX_PARAMS = 700
54
+ DEFAULT_TASK = "Spec-to-RTL"
55
+ DEFAULT_BENCHMARK = "All"
56
+ DEFAULT_MODEL_TYPE = "All"
57
+
58
+ SIMULATORS = ["Icarus", "Verilator"]
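For reference, these values map one-to-one onto literals that previously lived inline in app.py and utils.py; the plotting code resolves axis limits and colors roughly like this (illustrative snippet, with the `[0, 80]` fallback copied from the old generate_scatter_plot):

# Illustrative lookup pattern against the new constants module.
from config.constants import Y_AXIS_LIMITS, SCATTER_PLOT_X_TICKS, TYPE_COLORS

metric = "Power"
y_range = Y_AXIS_LIMITS.get(metric, [0, 80])        # -> [0, 50]
tickvals = SCATTER_PLOT_X_TICKS["tickvals"]         # -> [8, 14, 32, 72, 200, 700]
color = TYPE_COLORS.get("Coding", "gray")           # -> "yellow"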
config/model_metadata.py ADDED
@@ -0,0 +1,112 @@
1
+ from dataclasses import dataclass
2
+ from typing import Literal
3
+
4
+
5
+ @dataclass
6
+ class ModelMetadata:
7
+ url: str # HF model card
8
+ params: float # in B
9
+ model_type: Literal["General", "Coding", "RTL-Specific"]
10
+ release: Literal["V1", "V2", "V3"] # release of the leaderboard for which the model was included
11
+ model_arch: Literal["Dense", "Reasoning"] # to distinguish between reasoners and non-reasoners
12
+
13
+
14
+ # fmt: off
15
+ MODELS = {
16
+ "DeepSeek R1-0528": ModelMetadata(
17
+ "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528", 685, "General", "V2", "Reasoning"
18
+ ),
19
+ "DeepSeek R1": ModelMetadata(
20
+ "https://huggingface.co/deepseek-ai/DeepSeek-R1", 685, "General", "V1", "Reasoning"
21
+ ),
22
+ "Llama 3.1 405B": ModelMetadata(
23
+ "https://huggingface.co/RedHatAI/Meta-Llama-3.1-405B-FP8", 406, "General", "V1", "Dense"
24
+ ),
25
+ "Qwen3 236B A22B": ModelMetadata(
26
+ "https://huggingface.co/Qwen/Qwen3-235B-A22B", 235, "General", "V2", "Reasoning"
27
+ ),
28
+ "Llama 3.(1-3) 70B": ModelMetadata(
29
+ "https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct", 70.6, "General", "V1", "Dense"
30
+ ),
31
+ "Qwen2.5 72B": ModelMetadata(
32
+ "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct", 72.7, "General", "V1", "Dense"
33
+ ),
34
+ "QwQ 32B": ModelMetadata(
35
+ "https://huggingface.co/Qwen/QwQ-32B", 32.8, "General", "V2", "Reasoning"
36
+ ),
37
+ "Qwen2.5 32B": ModelMetadata(
38
+ "https://huggingface.co/Qwen/Qwen2.5-32B", 32.5, "General", "V1", "Dense"
39
+ ),
40
+ "StarChat2 15B v0.1": ModelMetadata(
41
+ "https://huggingface.co/HuggingFaceH4/starchat2-15b-v0.1", 16, "General", "V1", "Dense"
42
+ ),
43
+ "DeepSeek R1 Distill Qwen 14B": ModelMetadata(
44
+ "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B", 14.8, "General", "V1", "Reasoning"
45
+ ),
46
+ "CodeLlama 70B": ModelMetadata(
47
+ "https://huggingface.co/codellama/CodeLlama-70b-hf", 69, "Coding", "V1", "Dense"
48
+ ),
49
+ "QwenCoder 2.5 32B": ModelMetadata(
50
+ "https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct", 32.5, "Coding", "V1", "Dense"
51
+ ),
52
+ "DeepSeek Coder 33B": ModelMetadata(
53
+ "https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct", 33.3, "Coding", "V1", "Dense"
54
+ ),
55
+ "QwenCoder 2.5 14B": ModelMetadata(
56
+ "https://huggingface.co/Qwen/Qwen2.5-Coder-14B-Instruct", 14.7, "Coding", "V1", "Dense"
57
+ ),
58
+ "DeepCoder 14B": ModelMetadata(
59
+ "https://huggingface.co/agentica-org/DeepCoder-14B-Preview", 14.8, "Coding", "V2", "Reasoning"
60
+ ),
61
+ "OpenCoder 8B": ModelMetadata(
62
+ "https://huggingface.co/infly/OpenCoder-8B-Instruct", 7.77, "Coding", "V1", "Dense"
63
+ ),
64
+ "SeedCoder 8B": ModelMetadata(
65
+ "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Instruct", 8.25, "Coding", "V2", "Dense"
66
+ ),
67
+ "SeedCoder 8B Reasoning": ModelMetadata(
68
+ "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Reasoning-bf16", 8.25, "Coding", "V2", "Reasoning"
69
+ ),
70
+ "QwenCoder 2.5 7B": ModelMetadata(
71
+ "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct", 7.61, "Coding", "V1", "Dense"
72
+ ),
73
+ "DeepSeek Coder 6.7B": ModelMetadata(
74
+ "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct", 6.74, "Coding", "V1", "Dense"
75
+ ),
76
+ "HaVen-CodeQwen": ModelMetadata(
77
+ "https://huggingface.co/yangyiyao/HaVen-CodeQwen", 7.25, "RTL-Specific", "V1", "Dense"
78
+ ),
79
+ "CodeV R1 Distill Qwen 7B": ModelMetadata(
80
+ "https://huggingface.co/zhuyaoyu/CodeV-R1-Distill-Qwen-7B", 7.62, "RTL-Specific", "V2", "Reasoning"
81
+ ),
82
+ "CodeV-CL-7B": ModelMetadata(
83
+ "https://huggingface.co/yang-z/CodeV-CL-7B", 6.74, "RTL-Specific", "V1", "Dense"
84
+ ),
85
+ "CodeV-QW-7B": ModelMetadata(
86
+ "https://huggingface.co/yang-z/CodeV-QW-7B", 7.25, "RTL-Specific", "V1", "Dense"
87
+ ),
88
+ "CodeV-DS-6.7B": ModelMetadata(
89
+ "https://huggingface.co/yang-z/CodeV-DS-6.7B", 6.74, "RTL-Specific", "V1", "Dense"
90
+ ),
91
+ "RTLCoder Mistral": ModelMetadata(
92
+ "https://huggingface.co/ishorn5/RTLCoder-v1.1", 7.24, "RTL-Specific", "V1", "Dense"
93
+ ),
94
+ "RTLCoder DeepSeek": ModelMetadata(
95
+ "https://huggingface.co/ishorn5/RTLCoder-Deepseek-v1.1", 6.74, "RTL-Specific", "V1", "Dense"
96
+ ),
97
+ "OriGen": ModelMetadata(
98
+ "https://huggingface.co/henryen/OriGen", 6.74, "RTL-Specific", "V1", "Dense"
99
+ ),
100
+ "Qwen3 Coder 480B A35B": ModelMetadata(
101
+ "https://huggingface.co/Qwen/Qwen3-Coder-480B-A35B-Instruct", 480, "Coding", "V2", "Dense"
102
+ ),
103
+ "Magistral Small 2506": ModelMetadata(
104
+ "https://huggingface.co/mistralai/Magistral-Small-2506", 23.6, "General", "V2", "Reasoning"
105
+ ),
106
+ "gpt-oss-20b": ModelMetadata(
107
+ "https://huggingface.co/openai/gpt-oss-20b", 21.5, "General", "V2", "Reasoning"
108
+ ),
109
+ "gpt-oss-120b": ModelMetadata(
110
+ "https://huggingface.co/openai/gpt-oss-120b", 120, "General", "V2", "Reasoning"
111
+ ),
112
+ }
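Entries are keyed by display name, which must match the model names appearing in the results files; `get_model_metadata` in results/parse.py (below) unpacks them into the tuple the parser previously built by hand. A lookup looks like:

# Example lookup; keys must match the model names used in the results files.
from config.model_metadata import MODELS

meta = MODELS["CodeV R1 Distill Qwen 7B"]
print(meta.url)                        # https://huggingface.co/zhuyaoyu/CodeV-R1-Distill-Qwen-7B
print(meta.params)                     # 7.62
print(meta.release, meta.model_arch)   # V2 Reasoning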
results/parse.py CHANGED
@@ -1,238 +1,11 @@
1
- import csv
2
- import json
3
- import locale
4
  import os
5
  import sys
6
- from typing import Dict, Union
7
-
8
  import pandas as pd
9
 
10
- model_details = {
11
- "DeepSeek R1-0528": (
12
- "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528",
13
- 685,
14
- "General",
15
- "V2",
16
- "Reasoning", # "Dense" or "Reasoning"
17
- ),
18
- "DeepSeek R1": (
19
- "https://huggingface.co/deepseek-ai/DeepSeek-R1",
20
- 685,
21
- "General",
22
- "V1",
23
- "Reasoning",
24
- ),
25
- "Llama 3.1 405B": (
26
- "https://huggingface.co/RedHatAI/Meta-Llama-3.1-405B-FP8",
27
- 406,
28
- "General",
29
- "V1",
30
- "Dense",
31
- ),
32
- "Qwen3 236B A22B": (
33
- "https://huggingface.co/Qwen/Qwen3-235B-A22B",
34
- 235,
35
- "General",
36
- "V2",
37
- "Reasoning",
38
- ),
39
- "Llama 3.(1-3) 70B": (
40
- "https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct",
41
- 70.6,
42
- "General",
43
- "V1",
44
- "Dense",
45
- ),
46
- "Qwen2.5 72B": (
47
- "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",
48
- 72.7,
49
- "General",
50
- "V1",
51
- "Dense",
52
- ),
53
- "QwQ 32B": (
54
- "https://huggingface.co/Qwen/QwQ-32B",
55
- 32.8,
56
- "General",
57
- "V2",
58
- "Reasoning",
59
- ),
60
- "Qwen2.5 32B": (
61
- "https://huggingface.co/Qwen/Qwen2.5-32B",
62
- 32.5,
63
- "General",
64
- "V1",
65
- "Dense",
66
- ),
67
- "StarChat2 15B v0.1": (
68
- "https://huggingface.co/HuggingFaceH4/starchat2-15b-v0.1",
69
- 16,
70
- "General",
71
- "V1",
72
- "Dense",
73
- ),
74
- "DeepSeek R1 Distill Qwen 14B": (
75
- "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
76
- 14.8,
77
- "General",
78
- "V1",
79
- "Reasoning",
80
- ),
81
- "CodeLlama 70B": (
82
- "https://huggingface.co/codellama/CodeLlama-70b-hf",
83
- 69,
84
- "Coding",
85
- "V1",
86
- "Dense",
87
- ),
88
- "QwenCoder 2.5 32B": (
89
- "https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct",
90
- 32.5,
91
- "Coding",
92
- "V1",
93
- "Dense",
94
- ),
95
- "DeepSeek Coder 33B": (
96
- "https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct",
97
- 33.3,
98
- "Coding",
99
- "V1",
100
- "Dense",
101
- ),
102
- "QwenCoder 2.5 14B": (
103
- "https://huggingface.co/Qwen/Qwen2.5-Coder-14B-Instruct",
104
- 14.7,
105
- "Coding",
106
- "V1",
107
- "Dense",
108
- ),
109
- "DeepCoder 14B": (
110
- "https://huggingface.co/agentica-org/DeepCoder-14B-Preview",
111
- 14.8,
112
- "Coding",
113
- "V2",
114
- "Reasoning",
115
- ),
116
- "OpenCoder 8B": (
117
- "https://huggingface.co/infly/OpenCoder-8B-Instruct",
118
- 7.77,
119
- "Coding",
120
- "V1",
121
- "Dense",
122
- ),
123
- "SeedCoder 8B": (
124
- "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Instruct",
125
- 8.25,
126
- "Coding",
127
- "V2",
128
- "Dense",
129
- ),
130
- "SeedCoder 8B Reasoning": (
131
- "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Reasoning-bf16",
132
- 8.25,
133
- "Coding",
134
- "V2",
135
- "Reasoning",
136
- ),
137
- "QwenCoder 2.5 7B": (
138
- "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct",
139
- 7.61,
140
- "Coding",
141
- "V1",
142
- "Dense",
143
- ),
144
- "DeepSeek Coder 6.7B": (
145
- "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct",
146
- 6.74,
147
- "Coding",
148
- "V1",
149
- "Dense",
150
- ),
151
- "HaVen-CodeQwen": (
152
- "https://huggingface.co/yangyiyao/HaVen-CodeQwen",
153
- 7.25,
154
- "RTL-Specific",
155
- "V1",
156
- "Dense",
157
- ),
158
- "CodeV R1 Distill Qwen 7B": (
159
- "https://huggingface.co/zhuyaoyu/CodeV-R1-Distill-Qwen-7B",
160
- 7.62,
161
- "RTL-Specific",
162
- "V2",
163
- "Reasoning",
164
- ),
165
- "CodeV-CL-7B": (
166
- "https://huggingface.co/yang-z/CodeV-CL-7B",
167
- 6.74,
168
- "RTL-Specific",
169
- "V1",
170
- "Dense",
171
- ),
172
- "CodeV-QW-7B": (
173
- "https://huggingface.co/yang-z/CodeV-QW-7B",
174
- 7.25,
175
- "RTL-Specific",
176
- "V1",
177
- "Dense",
178
- ),
179
- "CodeV-DS-6.7B": (
180
- "https://huggingface.co/yang-z/CodeV-DS-6.7B",
181
- 6.74,
182
- "RTL-Specific",
183
- "V1",
184
- "Dense",
185
- ),
186
- "RTLCoder Mistral": (
187
- "https://huggingface.co/ishorn5/RTLCoder-v1.1",
188
- 7.24,
189
- "RTL-Specific",
190
- "V1",
191
- "Dense",
192
- ),
193
- "RTLCoder DeepSeek": (
194
- "https://huggingface.co/ishorn5/RTLCoder-Deepseek-v1.1",
195
- 6.74,
196
- "RTL-Specific",
197
- "V1",
198
- "Dense",
199
- ),
200
- "OriGen": (
201
- "https://huggingface.co/henryen/OriGen",
202
- 6.74,
203
- "RTL-Specific",
204
- "V1",
205
- "Dense",
206
- ),
207
- "Qwen3 Coder 480B A35B": (
208
- "https://huggingface.co/Qwen/Qwen3-Coder-480B-A35B-Instruct",
209
- 480,
210
- "Coding",
211
- "V2",
212
- "Dense",
213
- ),
214
- "Magistral Small 2506": (
215
- "https://huggingface.co/mistralai/Magistral-Small-2506",
216
- 23.6,
217
- "General",
218
- "V2",
219
- "Reasoning",
220
- ),
221
- "gpt-oss-20b": (
222
- "https://huggingface.co/openai/gpt-oss-20b",
223
- 21.5,
224
- "General",
225
- "V2",
226
- "Reasoning",
227
- ),
228
- "gpt-oss-120b": (
229
- "https://huggingface.co/openai/gpt-oss-120b",
230
- 120,
231
- "General",
232
- "V2",
233
- "Reasoning",
234
- ),
235
- }
236
 
237
 
238
  def get_headers(reader, agg=False) -> Union[list, list]:
@@ -248,15 +21,19 @@ def get_headers(reader, agg=False) -> Union[list, list]:
248
  return metrics, benchs
249
 
250
 
251
- def get_model_params_and_url(model) -> Union[str, str, float, str, str]:
252
- if model not in model_details:
253
- return "-", 0.0, "-", "-", "-"
254
- url = model_details[model][0]
255
- params = model_details[model][1]
256
- type = model_details[model][2]
257
- release = model_details[model][3]
258
- reasoning = model_details[model][4]
259
- return url, params, type, release, reasoning
 
 
 
 
260
 
261
 
262
  def parse_results(csv_path: str) -> list[dict]:
@@ -275,7 +52,7 @@ def parse_results(csv_path: str) -> list[dict]:
275
  model = row[0]
276
  if not model:
277
  continue
278
- url, params, type, release, reasoning = get_model_params_and_url(model)
279
  models.append(model)
280
  row = row[1:]
281
  ctr = 0
@@ -294,7 +71,6 @@ def parse_results(csv_path: str) -> list[dict]:
294
  record["Thinking"] = reasoning
295
  dataset.append(record)
296
  ctr += 1
297
- print(models)
298
  return dataset
299
 
300
 
@@ -318,9 +94,7 @@ def read_json(json_path: str = "results/results_icarus.json"):
318
  return data
319
 
320
 
321
- def read_data(
322
- json_path: str = "results/results_icarus.json",
323
- ) -> tuple[pd.DataFrame, list, list, str]:
324
  data = read_json(json_path)
325
  df = pd.DataFrame(data)
326
  df.rename(
@@ -334,11 +108,21 @@ def read_data(
334
  inplace=True,
335
  )
336
  df["Params"] = pd.to_numeric(df["Params"], errors="coerce")
 
 
 
 
337
  benchmarks = sorted(df["Benchmark"].unique().tolist(), reverse=True)
338
  metrics = df["Metric"].unique().tolist()
339
- default_metric = (
340
- "Functionality (FNC)" if "Functionality (FNC)" in metrics else metrics[0]
341
- )
 
 
 
 
 
 
342
  return df, benchmarks, metrics, default_metric
343
 
344
 
 
 
 
 
1
  import os
2
  import sys
3
+ import csv
4
+ import json
5
  import pandas as pd
6
 
7
+ from typing import Dict, Union
8
+ from config.model_metadata import MODELS
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
 
11
  def get_headers(reader, agg=False) -> Union[list, list]:
 
21
  return metrics, benchs
22
 
23
 
24
+ def get_model_metadata(model_key: str) -> tuple[str, float, str, str, str]:
25
+ try:
26
+ model_metadata = MODELS[model_key]
27
+ except KeyError:
28
+ raise KeyError(f"Unknown model: {model_key}")
29
+
30
+ return (
31
+ model_metadata.url,
32
+ model_metadata.params,
33
+ model_metadata.model_type,
34
+ model_metadata.release,
35
+ model_metadata.model_arch,
36
+ )
37
 
38
 
39
  def parse_results(csv_path: str) -> list[dict]:
 
52
  model = row[0]
53
  if not model:
54
  continue
55
+ url, params, type, release, reasoning = get_model_metadata(model)
56
  models.append(model)
57
  row = row[1:]
58
  ctr = 0
 
71
  record["Thinking"] = reasoning
72
  dataset.append(record)
73
  ctr += 1
 
74
  return dataset
75
 
76
 
 
94
  return data
95
 
96
 
97
+ def read_dataframe(json_path: str) -> pd.DataFrame:
 
 
98
  data = read_json(json_path)
99
  df = pd.DataFrame(data)
100
  df.rename(
 
108
  inplace=True,
109
  )
110
  df["Params"] = pd.to_numeric(df["Params"], errors="coerce")
111
+ return df
112
+
113
+
114
+ def get_metadata(df: pd.DataFrame) -> tuple[list, list, str]:
115
  benchmarks = sorted(df["Benchmark"].unique().tolist(), reverse=True)
116
  metrics = df["Metric"].unique().tolist()
117
+ default_metric = "Functionality (FNC)" if "Functionality (FNC)" in metrics else metrics[0]
118
+ return benchmarks, metrics, default_metric
119
+
120
+
121
+ def read_data(
122
+ json_path: str = "results/results_icarus.json",
123
+ ) -> tuple[pd.DataFrame, list, list, str]:
124
+ df = read_dataframe(json_path)
125
+ benchmarks, metrics, default_metric = get_metadata(df)
126
  return df, benchmarks, metrics, default_metric
127
 
128
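The old read_data() is split into read_dataframe() (JSON to DataFrame) and get_metadata() (benchmark and metric lists plus the default metric), with read_data() kept as a thin wrapper. The refactored app.py calls the two pieces separately so both simulators share one metadata pass:

# Loading flow as used by the refactored app.py (paths from config/constants.py).
from config import constants as C
from results.parse import read_dataframe, get_metadata, parse_agg

df_icarus = read_dataframe(C.ICARUS_RESULTS)        # results/results_icarus.json
df_agg_icarus = parse_agg(C.ICARUS_AGG)             # results/aggregated_scores_icarus.csv
benchmarks, metrics, default_metric = get_metadata(df_icarus)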
 
utils.py CHANGED
@@ -6,13 +6,7 @@ import pandas as pd
6
  import plotly.express as px
7
  import plotly.graph_objects as go
8
 
9
- # fmt: off
10
- type_emoji = {
11
- "RTL-Specific": "🔴",
12
- "General": "🟢",
13
- "Coding": "🔵"
14
- }
15
- # fmt: on
16
 
17
 
18
  def model_hyperlink(link, model_name, release, thinking=False):
@@ -23,11 +17,7 @@ def model_hyperlink(link, model_name, release, thinking=False):
23
  if release == "V1":
24
  return ret + reasoning_badge if thinking == "Reasoning" else ret
25
  else:
26
- return (
27
- ret + reasoning_badge + new_badge
28
- if thinking == "Reasoning"
29
- else ret + new_badge
30
- )
31
 
32
 
33
  def handle_special_cases(benchmark, metric):
@@ -39,13 +29,19 @@ def handle_special_cases(benchmark, metric):
39
 
40
 
41
  def filter_RTLRepo(subset: pd.DataFrame) -> pd.DataFrame:
 
 
 
42
  subset = subset.drop(subset[subset.Score < 0.0].index)
43
- details = subset[
44
- ["Model", "Model URL", "Model Type", "Params", "Release", "Thinking"]
45
- ].drop_duplicates("Model")
46
- filtered_df = subset[["Model", "Score"]].rename(
47
- columns={"Score": "Exact Matching (EM)"}
 
 
48
  )
 
49
  filtered_df = pd.merge(filtered_df, details, on="Model", how="left")
50
  filtered_df["Model"] = filtered_df.apply(
51
  lambda row: model_hyperlink(
@@ -55,31 +51,28 @@ def filter_RTLRepo(subset: pd.DataFrame) -> pd.DataFrame:
55
  ),
56
  axis=1,
57
  )
58
- filtered_df["Type"] = filtered_df["Model Type"].map(lambda x: type_emoji.get(x, ""))
59
  filtered_df = filtered_df[["Type", "Model", "Params", "Exact Matching (EM)"]]
60
- filtered_df = filtered_df.sort_values(
61
- by="Exact Matching (EM)", ascending=False
62
- ).reset_index(drop=True)
63
  return filtered_df
64
 
65
 
66
  def filter_bench(subset: pd.DataFrame, df_agg=None, agg_column=None) -> pd.DataFrame:
67
- details = subset[
68
- ["Model", "Model URL", "Model Type", "Params", "Release", "Thinking"]
69
- ].drop_duplicates("Model")
 
 
 
70
  if "RTLLM" in subset["Benchmark"].unique():
71
  pivot_df = (
72
- subset.pivot_table(
73
- index="Model", columns="Metric", values="Score", aggfunc=custom_agg_s2r
74
- )
75
  .reset_index()
76
  .round(2)
77
  )
78
  else:
79
  pivot_df = (
80
- subset.pivot_table(
81
- index="Model", columns="Metric", values="Score", aggfunc=custom_agg_cc
82
- )
83
  .reset_index()
84
  .round(2)
85
  )
@@ -94,39 +87,20 @@ def filter_bench(subset: pd.DataFrame, df_agg=None, agg_column=None) -> pd.DataF
94
 
95
  pivot_df = pd.merge(pivot_df, details, on="Model", how="left")
96
  pivot_df["Model"] = pivot_df.apply(
97
- lambda row: model_hyperlink(
98
- row["Model URL"], row["Model"], row["Release"], row["Thinking"]
99
- ),
100
  axis=1,
101
  )
102
- pivot_df["Type"] = pivot_df["Model Type"].map(lambda x: type_emoji.get(x, ""))
103
- pivot_df["Post-Synthesis (PSQ)"] = (
104
- pivot_df[["Power", "Performance", "Area"]].mean(axis=1).round(2)
105
- )
 
 
 
 
 
 
106
 
107
- pivot_df.rename(
108
- columns={
109
- "Params": "Parameters (B)",
110
- "Syntax (STX)": "Syntax",
111
- "Functionality (FNC)": "Functionality",
112
- "Synthesis (SYN)": "Synthesis",
113
- "Post-Synthesis (PSQ)": "Post-Synthesis",
114
- },
115
- inplace=True,
116
- )
117
- columns_order = [
118
- "Type",
119
- "Model",
120
- "Parameters (B)",
121
- "Syntax",
122
- "Functionality",
123
- "Synthesis",
124
- "Post-Synthesis",
125
- ]
126
- pivot_df = pivot_df[[col for col in columns_order if col in pivot_df.columns]]
127
- pivot_df = pivot_df.sort_values(by="Functionality", ascending=False).reset_index(
128
- drop=True
129
- )
130
  return pivot_df
131
 
132
 
@@ -154,65 +128,40 @@ def custom_agg_cc(vals):
154
  return round(result, 2)
155
 
156
 
157
- def filter_bench_all(
158
- subset: pd.DataFrame, df_agg=None, agg_column=None
159
- ) -> pd.DataFrame:
160
- details = subset[
161
- ["Model", "Model URL", "Model Type", "Params", "Release", "Thinking"]
162
- ].drop_duplicates("Model")
 
163
  if "RTLLM" in subset["Benchmark"].unique():
164
  pivot_df = (
165
- subset.pivot_table(
166
- index="Model", columns="Metric", values="Score", aggfunc=custom_agg_s2r
167
- )
168
  .reset_index()
169
  .round(2)
170
  )
171
  else:
172
  pivot_df = (
173
- subset.pivot_table(
174
- index="Model", columns="Metric", values="Score", aggfunc=custom_agg_cc
175
- )
176
  .reset_index()
177
  .round(2)
178
  )
179
 
180
  pivot_df = pd.merge(pivot_df, details, on="Model", how="left")
181
- print(pivot_df.columns)
182
  pivot_df["Model"] = pivot_df.apply(
183
- lambda row: model_hyperlink(
184
- row["Model URL"], row["Model"], row["Release"], row["Thinking"]
185
- ),
186
  axis=1,
187
  )
188
- pivot_df["Type"] = pivot_df["Model Type"].map(lambda x: type_emoji.get(x, ""))
189
- pivot_df["Post-Synthesis Quality"] = (
190
- pivot_df[["Power", "Performance", "Area"]].mean(axis=1).round(2)
191
- )
192
 
193
- pivot_df.rename(
194
- columns={
195
- "Params": "Parameters (B)",
196
- "Exact Matching (EM)": "EM",
197
- "Syntax (STX)": "Syntax",
198
- "Functionality (FNC)": "Functionality",
199
- "Synthesis (SYN)": "Synthesis",
200
- "Post-Synthesis Quality": "Post-Synthesis",
201
- },
202
- inplace=True,
203
- )
204
 
205
- columns_order = [
206
- "Type",
207
- "Model",
208
- "Parameters (B)",
209
- "Syntax",
210
- "Functionality",
211
- "Synthesis",
212
- "Post-Synthesis",
213
- ]
214
- pivot_df = pivot_df[[col for col in columns_order if col in pivot_df.columns]]
215
- pivot_df = pivot_df.sort_values(by="Functionality", ascending=False).reset_index(
216
- drop=True
217
- )
218
  return pivot_df
 
6
  import plotly.express as px
7
  import plotly.graph_objects as go
8
 
9
+ from config.constants import COLUMN_MAPPINGS, COLUMN_ORDER, TYPE_EMOJI
 
 
 
 
 
 
10
 
11
 
12
  def model_hyperlink(link, model_name, release, thinking=False):
 
17
  if release == "V1":
18
  return ret + reasoning_badge if thinking == "Reasoning" else ret
19
  else:
20
+ return ret + reasoning_badge + new_badge if thinking == "Reasoning" else ret + new_badge
 
 
 
 
21
 
22
 
23
  def handle_special_cases(benchmark, metric):
 
29
 
30
 
31
  def filter_RTLRepo(subset: pd.DataFrame) -> pd.DataFrame:
32
+ if subset.empty:
33
+ return pd.DataFrame(columns=["Type", "Model", "Params", "Exact Matching (EM)"])
34
+
35
  subset = subset.drop(subset[subset.Score < 0.0].index)
36
+
37
+ # Check again if empty after filtering
38
+ if subset.empty:
39
+ return pd.DataFrame(columns=["Type", "Model", "Params", "Exact Matching (EM)"])
40
+
41
+ details = subset[["Model", "Model URL", "Model Type", "Params", "Release", "Thinking"]].drop_duplicates(
42
+ "Model"
43
  )
44
+ filtered_df = subset[["Model", "Score"]].rename(columns={"Score": "Exact Matching (EM)"})
45
  filtered_df = pd.merge(filtered_df, details, on="Model", how="left")
46
  filtered_df["Model"] = filtered_df.apply(
47
  lambda row: model_hyperlink(
 
51
  ),
52
  axis=1,
53
  )
54
+ filtered_df["Type"] = filtered_df["Model Type"].map(lambda x: TYPE_EMOJI.get(x, ""))
55
  filtered_df = filtered_df[["Type", "Model", "Params", "Exact Matching (EM)"]]
56
+ filtered_df = filtered_df.sort_values(by="Exact Matching (EM)", ascending=False).reset_index(drop=True)
 
 
57
  return filtered_df
58
 
59
 
60
  def filter_bench(subset: pd.DataFrame, df_agg=None, agg_column=None) -> pd.DataFrame:
61
+ if subset.empty:
62
+ return pd.DataFrame(columns=COLUMN_ORDER)
63
+
64
+ details = subset[["Model", "Model URL", "Model Type", "Params", "Release", "Thinking"]].drop_duplicates(
65
+ "Model"
66
+ )
67
  if "RTLLM" in subset["Benchmark"].unique():
68
  pivot_df = (
69
+ subset.pivot_table(index="Model", columns="Metric", values="Score", aggfunc=custom_agg_s2r)
 
 
70
  .reset_index()
71
  .round(2)
72
  )
73
  else:
74
  pivot_df = (
75
+ subset.pivot_table(index="Model", columns="Metric", values="Score", aggfunc=custom_agg_cc)
 
 
76
  .reset_index()
77
  .round(2)
78
  )
 
87
 
88
  pivot_df = pd.merge(pivot_df, details, on="Model", how="left")
89
  pivot_df["Model"] = pivot_df.apply(
90
+ lambda row: model_hyperlink(row["Model URL"], row["Model"], row["Release"], row["Thinking"]),
 
 
91
  axis=1,
92
  )
93
+ pivot_df["Type"] = pivot_df["Model Type"].map(lambda x: TYPE_EMOJI.get(x, ""))
94
+
95
+ if all(col in pivot_df.columns for col in ["Power", "Performance", "Area"]):
96
+ pivot_df["Post-Synthesis (PSQ)"] = pivot_df[["Power", "Performance", "Area"]].mean(axis=1).round(2)
97
+
98
+ pivot_df.rename(columns=COLUMN_MAPPINGS, inplace=True)
99
+ pivot_df = pivot_df[[col for col in COLUMN_ORDER if col in pivot_df.columns]]
100
+
101
+ if "Functionality" in pivot_df.columns:
102
+ pivot_df = pivot_df.sort_values(by="Functionality", ascending=False).reset_index(drop=True)
103
104
  return pivot_df
105
 
106
 
 
128
  return round(result, 2)
129
 
130
 
131
+ def filter_bench_all(subset: pd.DataFrame, df_agg=None, agg_column=None) -> pd.DataFrame:
132
+ if subset.empty:
133
+ return pd.DataFrame(columns=COLUMN_ORDER)
134
+
135
+ details = subset[["Model", "Model URL", "Model Type", "Params", "Release", "Thinking"]].drop_duplicates(
136
+ "Model"
137
+ )
138
  if "RTLLM" in subset["Benchmark"].unique():
139
  pivot_df = (
140
+ subset.pivot_table(index="Model", columns="Metric", values="Score", aggfunc=custom_agg_s2r)
 
 
141
  .reset_index()
142
  .round(2)
143
  )
144
  else:
145
  pivot_df = (
146
+ subset.pivot_table(index="Model", columns="Metric", values="Score", aggfunc=custom_agg_cc)
 
 
147
  .reset_index()
148
  .round(2)
149
  )
150
 
151
  pivot_df = pd.merge(pivot_df, details, on="Model", how="left")
 
152
  pivot_df["Model"] = pivot_df.apply(
153
+ lambda row: model_hyperlink(row["Model URL"], row["Model"], row["Release"], row["Thinking"]),
 
 
154
  axis=1,
155
  )
156
+ pivot_df["Type"] = pivot_df["Model Type"].map(lambda x: TYPE_EMOJI.get(x, ""))
 
 
 
157
 
158
+ if all(col in pivot_df.columns for col in ["Power", "Performance", "Area"]):
159
+ pivot_df["Post-Synthesis Quality"] = pivot_df[["Power", "Performance", "Area"]].mean(axis=1).round(2)
160
+
161
+ pivot_df.rename(columns=COLUMN_MAPPINGS, inplace=True)
162
+ pivot_df = pivot_df[[col for col in COLUMN_ORDER if col in pivot_df.columns]]
163
+
164
+ if "Functionality" in pivot_df.columns:
165
+ pivot_df = pivot_df.sort_values(by="Functionality", ascending=False).reset_index(drop=True)
 
 
 
166
167
  return pivot_df
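The new empty-subset guards in filter_RTLRepo, filter_bench, and filter_bench_all mean the helpers now return an empty frame with the expected headers when no rows survive the UI filters, rather than failing on downstream column accesses. A minimal illustration of the pattern:

# Minimal illustration of the new guard in filter_bench / filter_bench_all.
import pandas as pd
from config.constants import COLUMN_ORDER

subset = pd.DataFrame(columns=["Model", "Benchmark", "Metric", "Score"])   # e.g. nothing matched the filters
if subset.empty:
    leaderboard = pd.DataFrame(columns=COLUMN_ORDER)    # empty table, but the headers still render in Gradio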