TuRTLe-Leaderboard

Running

App Files Community

ggcristian committed on May 27

Commit

711a69b

1 Parent(s): 2c02057

Re-organize code

Browse files

Files changed (8) hide show

app.py +263 -127
aggregated_scores.csv → results/aggregated_scores.csv +0 -0
parse.py → results/parse.py +115 -34
results.csv → results/results.csv +0 -0
results.json → results/results.json +0 -0
about.py → static/about.py +0 -0
metrics.md → static/metrics.md +0 -0
css_html_js.py → style/css_html_js.py +0 -0

app.py CHANGED Viewed

@@ -1,71 +1,69 @@
-import json
-from typing import Union
 import gradio as gr
-import numpy as np
 import pandas as pd
 import plotly.express as px
-import plotly.graph_objects as go
 from gradio.themes.utils import colors
-from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
-from about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT
-from css_html_js import custom_css, trigger_plot
-from parse import parse_agg, read_data, read_json
-from utils import (filter_bench, filter_bench_all, filter_RTLRepo,
-                   handle_special_cases, model_hyperlink, type_emoji)
 def filter_leaderboard(task, benchmark, model_type, search_query, max_params):
     subset = df.copy()
     # Filter by task specific benchmarks when 'All' benchmarks is selected
     if task == "Spec-to-RTL":
         valid_benchmarks = s2r_benchs
-        if benchmark == 'All':
-            subset = subset[subset['Benchmark'].isin(valid_benchmarks)]
     elif task == "Code Completion":
         valid_benchmarks = cc_benchs
-        if benchmark == 'All':
-            subset = subset[subset['Benchmark'].isin(valid_benchmarks)]
     elif task == "Line Completion":
         valid_benchmarks = lc_benchs
-        if benchmark == 'All':
-            subset = subset[subset['Benchmark'].isin(valid_benchmarks)]
-    if benchmark != 'All':
-        subset = df[df['Benchmark'] == benchmark]
-    if model_type != 'All':
         # without emojis
-        subset = subset[subset['Model Type'] == model_type.split(" ")[0]]
     if search_query:
-        subset = subset[subset['Model'].str.contains(search_query, case=False, na=False)]
     max_params = float(max_params)
-    subset = subset[subset['Params'] <= max_params]
-    if benchmark == 'All':
-        if task == 'Spec-to-RTL':
-            return filter_bench_all(subset, df_agg, agg_column='Agg S2R')
-        elif task == 'Code Completion':
-            return filter_bench_all(subset, df_agg, agg_column='Agg MC')
-        elif task == 'Line Completion':
             return filter_RTLRepo(subset)
-    elif benchmark == 'RTL-Repo':
         return filter_RTLRepo(subset)
     else:
         agg_column = None
-        if benchmark == 'VerilogEval S2R':
-            agg_column = 'Agg VerilogEval S2R'
-        elif benchmark == 'VerilogEval MC':
-            agg_column = 'Agg VerilogEval MC'
-        elif benchmark == 'RTLLM':
-            agg_column = 'Agg RTLLM'
-        elif benchmark == 'VeriGen':
-            agg_column = 'Agg VeriGen'
         return filter_bench(subset, df_agg, agg_column)
 def update_benchmarks_by_task(task):
     if task == "Spec-to-RTL":
         new_benchmarks = ["All"] + s2r_benchs
@@ -76,59 +74,90 @@ def update_benchmarks_by_task(task):
     else:
         new_benchmarks = ["All"] + benchmarks
     benchmark_value = "All" if "All" in new_benchmarks else new_benchmarks[0]
-    filtered = filter_leaderboard(task, benchmark_value, model_type_dropdown.value, search_box.value, params_slider.value)
     return gr.update(value=benchmark_value, choices=new_benchmarks), filtered
 def generate_scatter_plot(benchmark, metric):
     benchmark, metric = handle_special_cases(benchmark, metric)
-    subset = df[df['Benchmark'] == benchmark]
     if benchmark == "RTL-Repo":
-        subset = subset[subset['Metric'].str.contains('EM', case=False, na=False)]
-        detailed_scores = subset.groupby('Model', as_index=False)['Score'].mean()
-        detailed_scores.rename(columns={'Score': 'Exact Matching (EM)'}, inplace=True)
     else:
-        detailed_scores = subset.pivot_table(index='Model', columns='Metric', values='Score').reset_index()
-    details = df[['Model', 'Params', 'Model Type']].drop_duplicates('Model')
-    scatter_data = pd.merge(detailed_scores, details, on='Model', how='left').dropna(subset=['Params', metric])
-    scatter_data['x'] = scatter_data['Params']
-    scatter_data['y'] = scatter_data[metric]
-    scatter_data['size'] = (scatter_data['x'] ** 0.3) * 40
     type_colors = {"General": "green", "Coding": "yellow", "RTL-Specific": "blue"}
-    scatter_data['color'] = scatter_data['Model Type'].map(type_colors).fillna('gray')
     y_axis_limits = {
-        'Functionality (FNC)': [5, 90], 'Syntax (STX)': [20, 100], 'Synthesis (SYN)': [5, 90],
-        'Power': [0, 50], 'Performance': [0, 50], 'Area': [0, 50], 'Exact Matching (EM)': [0, 50]
     }
     y_range = y_axis_limits.get(metric, [0, 80])
     fig = px.scatter(
-        scatter_data, x='x', y='y', log_x=True, size='size', color='Model Type', text='Model',
-        hover_data={metric: ':.2f'}, title=f'Params vs. {metric} for {benchmark}',
-        labels={'x': '# Params (Log Scale)', 'y': metric}, template="plotly_white",
-        height=600, width=1200
     )
     fig.update_traces(
-        textposition='top center', textfont_size=10,
-        marker=dict(opacity=0.8, line=dict(width=0.5, color='black'))
     )
     fig.update_layout(
         xaxis=dict(
-            showgrid=True, type='log', tickmode='array',
             tickvals=[8, 14, 32, 72, 200, 700],
-            ticktext=['8', '14', '32', '72', '200', '700']
         ),
-        showlegend=False, yaxis=dict(range=y_range),
-        margin=dict(l=50, r=50, t=50, b=50), plot_bgcolor='white'
     )
     return fig
 js_func = """
 function refresh() {
     const url = new URL(window.location);
@@ -139,24 +168,36 @@ function refresh() {
     }
 }
 """
-with gr.Blocks(css=custom_css, js=js_func, theme=gr.themes.Default(primary_hue=colors.emerald)) as app:
     df, benchmarks, metrics, default_metric = read_data()
-    df_agg = parse_agg("./aggregated_scores.csv")
     tasks = ["Spec-to-RTL", "Code Completion", "Line Completion"]
     s2r_benchs = ["VerilogEval S2R", "RTLLM"]
     cc_benchs = ["VerilogEval MC", "VeriGen"]
     lc_benchs = ["RTL-Repo"]
-    non_rtl_metrics = ["Syntax (STX)", "Functionality (FNC)", "Synthesis (SYN)", "Power", "Performance", "Area"]
     rtl_metrics = ["Exact Matching (EM)"]
-    model_types = ['All', 'General 🟢', 'Coding 🔵', 'RTL-Specific 🔴']
-    gr.HTML("""
     <p align="center" style="margin-bottom: -10px;">
         <img src='/gradio_api/file=logo.png' alt='TuRTLe Logo' width='220'/> <br/>
     </p>
-    """)
-    gr.HTML("""
     <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
     <script defer src="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/js/all.min.js"></script>
     <div style="text-align: center; margin-bottom: 15px;">
@@ -184,60 +225,99 @@ with gr.Blocks(css=custom_css, js=js_func, theme=gr.themes.Default(primary_hue=c
         <a href="mailto:hpai@bsc.es">hpai@bsc.es</a>
     </p>
     </div>
-    """)
     with gr.Tabs():
         with gr.Tab("Leaderboard"):
             with gr.Row(equal_height=True):
                 with gr.Column():
-                    task_radio = gr.Radio(choices=tasks, label="Select Task", value='Spec-to-RTL')
-                with gr.Column():
-                    benchmark_radio = gr.Radio(choices=["All"] + s2r_benchs, label="Select Benchmark", value='All')
-            with gr.Row(equal_height=True):
-                    search_box = gr.Textbox(
-                        label="Search Model",
-                        placeholder="Type model name...",
-                        scale=2,
-                    )
-                    model_type_dropdown = gr.Radio(
-                        choices=model_types,
-                        label="Select Model Type",
-                        value='All',
-                        scale=3,
                     )
-                    params_slider = gr.Slider(
-                        minimum=df['Params'].min(),
-                        maximum=700,
-                        value=700,
-                        label="Max Params",
-                        step=1,
-                        scale=2,
                     )
             leaderboard = gr.DataFrame(
-                value=filter_leaderboard('Spec-to-RTL', 'All', 'All', "", 700),
                 headers="first row",
                 show_row_numbers=True,
                 wrap=True,
-                datatype=["markdown", "html",],
                 interactive=False,
-                column_widths=["7%", "25%", "10%", "17%", "6%", "6%", "6%", "6%", "6%", "7%"])
         with gr.Tab("Plot View"):
             with gr.Row(equal_height=True):
                 default_benchmark = s2r_benchs[0]
-                bubble_benchmark = gr.Dropdown(choices=benchmarks, label="Select Benchmark", value=default_benchmark, elem_classes="gr-dropdown")
                 default_metric = non_rtl_metrics[0]
-                bubble_metric = gr.Dropdown(choices=non_rtl_metrics[:-1], label="Select Metric", value=default_metric)
             with gr.Row(equal_height=True):
-                scatter_plot = gr.Plot(value=generate_scatter_plot(default_benchmark, default_metric), label="Bubble Chart", elem_id="full-width-plot")
         with gr.Tab("Metrics Information"):
-            with open("metrics.md", "r") as file:
-                gr.Markdown(file.read(), latex_delimiters=[
-                    {"left": "$$", "right": "$$", "display": True},
-                    {"left": "$", "right": "$", "display": False}
-                ], elem_classes="metrics-page")
         with gr.Tab("About Us"):
             gr.HTML(
                 """
@@ -267,7 +347,7 @@ with gr.Blocks(css=custom_css, js=js_func, theme=gr.themes.Default(primary_hue=c
                 </div>
                 """
             )
         with gr.Row():
             with gr.Accordion("📙 Citation", open=False):
                 citation_button = gr.Textbox(
@@ -277,21 +357,69 @@ with gr.Blocks(css=custom_css, js=js_func, theme=gr.themes.Default(primary_hue=c
                     elem_id="citation-button",
                     show_copy_button=True,
                 )
     # event handlers, ugly way but it works
-    task_radio.change(fn=update_benchmarks_by_task, inputs=[task_radio], outputs=[benchmark_radio, leaderboard])
-    benchmark_radio.change(fn=filter_leaderboard, inputs=[task_radio, benchmark_radio, model_type_dropdown, search_box, params_slider], outputs=leaderboard)
-    model_type_dropdown.change(fn=filter_leaderboard, inputs=[task_radio, benchmark_radio, model_type_dropdown, search_box, params_slider], outputs=leaderboard)
-    search_box.change(fn=filter_leaderboard, inputs=[task_radio, benchmark_radio, model_type_dropdown, search_box, params_slider], outputs=leaderboard)
-    params_slider.change(fn=filter_leaderboard, inputs=[task_radio, benchmark_radio, model_type_dropdown, search_box, params_slider], outputs=leaderboard)
     def on_benchmark_change(benchmark, _):
         if benchmark == "RTL-Repo":
             metric = "Exact Matching (EM)"
-            return gr.update(choices=rtl_metrics, value=metric), generate_scatter_plot(benchmark, metric)
         else:
             metric = non_rtl_metrics[0]
-            return gr.update(choices=non_rtl_metrics[:-1], value=metric), generate_scatter_plot(benchmark, metric)
     def on_metric_change(benchmark, metric):
         benchmark, metric = handle_special_cases(benchmark, metric)
@@ -299,7 +427,7 @@ with gr.Blocks(css=custom_css, js=js_func, theme=gr.themes.Default(primary_hue=c
         return gr.update(value=benchmark), fig
     bubble_benchmark.change(
-        fn=on_benchmark_change,
         inputs=[bubble_benchmark, bubble_metric],
         outputs=[bubble_metric, scatter_plot],
         js=""" // this is to avoid resetting user scroll each time a plot is re-generated
@@ -312,7 +440,8 @@ with gr.Blocks(css=custom_css, js=js_func, theme=gr.themes.Default(primary_hue=c
             observer.observe(document.getElementById('full-width-plot'), { childList: true });
             return [benchmark, metric];
         }
-        """)
     bubble_metric.change(
         fn=on_metric_change,
@@ -328,7 +457,14 @@ with gr.Blocks(css=custom_css, js=js_func, theme=gr.themes.Default(primary_hue=c
             observer.observe(document.getElementById('full-width-plot'), { childList: true });
             return [benchmark, metric];
         }
-        """)
-app.launch(allowed_paths=["logo.png", "hpai_logo_grad.png", "bsc-logo.png"])

+import sys
 import gradio as gr
 import pandas as pd
 import plotly.express as px
 from gradio.themes.utils import colors
+from results.parse import parse_agg, read_data
+from static.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT
+from style.css_html_js import custom_css
+from utils import filter_bench, filter_bench_all, filter_RTLRepo, handle_special_cases
 def filter_leaderboard(task, benchmark, model_type, search_query, max_params):
     subset = df.copy()
     # Filter by task specific benchmarks when 'All' benchmarks is selected
     if task == "Spec-to-RTL":
         valid_benchmarks = s2r_benchs
+        if benchmark == "All":
+            subset = subset[subset["Benchmark"].isin(valid_benchmarks)]
     elif task == "Code Completion":
         valid_benchmarks = cc_benchs
+        if benchmark == "All":
+            subset = subset[subset["Benchmark"].isin(valid_benchmarks)]
     elif task == "Line Completion":
         valid_benchmarks = lc_benchs
+        if benchmark == "All":
+            subset = subset[subset["Benchmark"].isin(valid_benchmarks)]
+    if benchmark != "All":
+        subset = df[df["Benchmark"] == benchmark]
+    if model_type != "All":
         # without emojis
+        subset = subset[subset["Model Type"] == model_type.split(" ")[0]]
     if search_query:
+        subset = subset[
+            subset["Model"].str.contains(search_query, case=False, na=False)
+        ]
     max_params = float(max_params)
+    subset = subset[subset["Params"] <= max_params]
+    if benchmark == "All":
+        if task == "Spec-to-RTL":
+            return filter_bench_all(subset, df_agg, agg_column="Agg S2R")
+        elif task == "Code Completion":
+            return filter_bench_all(subset, df_agg, agg_column="Agg MC")
+        elif task == "Line Completion":
             return filter_RTLRepo(subset)
+    elif benchmark == "RTL-Repo":
         return filter_RTLRepo(subset)
     else:
         agg_column = None
+        if benchmark == "VerilogEval S2R":
+            agg_column = "Agg VerilogEval S2R"
+        elif benchmark == "VerilogEval MC":
+            agg_column = "Agg VerilogEval MC"
+        elif benchmark == "RTLLM":
+            agg_column = "Agg RTLLM"
+        elif benchmark == "VeriGen":
+            agg_column = "Agg VeriGen"
         return filter_bench(subset, df_agg, agg_column)
 def update_benchmarks_by_task(task):
     if task == "Spec-to-RTL":
         new_benchmarks = ["All"] + s2r_benchs
     else:
         new_benchmarks = ["All"] + benchmarks
     benchmark_value = "All" if "All" in new_benchmarks else new_benchmarks[0]
+    filtered = filter_leaderboard(
+        task,
+        benchmark_value,
+        model_type_dropdown.value,
+        search_box.value,
+        params_slider.value,
+    )
     return gr.update(value=benchmark_value, choices=new_benchmarks), filtered
 def generate_scatter_plot(benchmark, metric):
     benchmark, metric = handle_special_cases(benchmark, metric)
+    subset = df[df["Benchmark"] == benchmark]
     if benchmark == "RTL-Repo":
+        subset = subset[subset["Metric"].str.contains("EM", case=False, na=False)]
+        detailed_scores = subset.groupby("Model", as_index=False)["Score"].mean()
+        detailed_scores.rename(columns={"Score": "Exact Matching (EM)"}, inplace=True)
     else:
+        detailed_scores = subset.pivot_table(
+            index="Model", columns="Metric", values="Score"
+        ).reset_index()
+    details = df[["Model", "Params", "Model Type"]].drop_duplicates("Model")
+    scatter_data = pd.merge(detailed_scores, details, on="Model", how="left").dropna(
+        subset=["Params", metric]
+    )
+    scatter_data["x"] = scatter_data["Params"]
+    scatter_data["y"] = scatter_data[metric]
+    scatter_data["size"] = (scatter_data["x"] ** 0.3) * 40
     type_colors = {"General": "green", "Coding": "yellow", "RTL-Specific": "blue"}
+    scatter_data["color"] = scatter_data["Model Type"].map(type_colors).fillna("gray")
     y_axis_limits = {
+        "Functionality (FNC)": [5, 90],
+        "Syntax (STX)": [20, 100],
+        "Synthesis (SYN)": [5, 90],
+        "Power": [0, 50],
+        "Performance": [0, 50],
+        "Area": [0, 50],
+        "Exact Matching (EM)": [0, 50],
     }
     y_range = y_axis_limits.get(metric, [0, 80])
     fig = px.scatter(
+        scatter_data,
+        x="x",
+        y="y",
+        log_x=True,
+        size="size",
+        color="Model Type",
+        text="Model",
+        hover_data={metric: ":.2f"},
+        title=f"Params vs. {metric} for {benchmark}",
+        labels={"x": "# Params (Log Scale)", "y": metric},
+        template="plotly_white",
+        height=600,
+        width=1200,
     )
     fig.update_traces(
+        textposition="top center",
+        textfont_size=10,
+        marker=dict(opacity=0.8, line=dict(width=0.5, color="black")),
     )
     fig.update_layout(
         xaxis=dict(
+            showgrid=True,
+            type="log",
+            tickmode="array",
             tickvals=[8, 14, 32, 72, 200, 700],
+            ticktext=["8", "14", "32", "72", "200", "700"],
         ),
+        showlegend=False,
+        yaxis=dict(range=y_range),
+        margin=dict(l=50, r=50, t=50, b=50),
+        plot_bgcolor="white",
     )
     return fig
 js_func = """
 function refresh() {
     const url = new URL(window.location);
     }
 }
 """
+with gr.Blocks(
+    css=custom_css, js=js_func, theme=gr.themes.Default(primary_hue=colors.emerald)
+) as app:
     df, benchmarks, metrics, default_metric = read_data()
+    df_agg = parse_agg("./results/aggregated_scores.csv")
     tasks = ["Spec-to-RTL", "Code Completion", "Line Completion"]
     s2r_benchs = ["VerilogEval S2R", "RTLLM"]
     cc_benchs = ["VerilogEval MC", "VeriGen"]
     lc_benchs = ["RTL-Repo"]
+    non_rtl_metrics = [
+        "Syntax (STX)",
+        "Functionality (FNC)",
+        "Synthesis (SYN)",
+        "Power",
+        "Performance",
+        "Area",
+    ]
     rtl_metrics = ["Exact Matching (EM)"]
+    model_types = ["All", "General 🟢", "Coding 🔵", "RTL-Specific 🔴"]
+    gr.HTML(
+        """
     <p align="center" style="margin-bottom: -10px;">
         <img src='/gradio_api/file=logo.png' alt='TuRTLe Logo' width='220'/> <br/>
     </p>
+    """
+    )
+    gr.HTML(
+        """
     <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
     <script defer src="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/js/all.min.js"></script>
     <div style="text-align: center; margin-bottom: 15px;">
         <a href="mailto:hpai@bsc.es">hpai@bsc.es</a>
     </p>
     </div>
+    """
+    )
     with gr.Tabs():
         with gr.Tab("Leaderboard"):
             with gr.Row(equal_height=True):
                 with gr.Column():
+                    task_radio = gr.Radio(
+                        choices=tasks, label="Select Task", value="Spec-to-RTL"
                     )
+                with gr.Column():
+                    benchmark_radio = gr.Radio(
+                        choices=["All"] + s2r_benchs,
+                        label="Select Benchmark",
+                        value="All",
                     )
+            with gr.Row(equal_height=True):
+                search_box = gr.Textbox(
+                    label="Search Model",
+                    placeholder="Type model name...",
+                    scale=2,
+                )
+                model_type_dropdown = gr.Radio(
+                    choices=model_types,
+                    label="Select Model Type",
+                    value="All",
+                    scale=3,
+                )
+                params_slider = gr.Slider(
+                    minimum=df["Params"].min(),
+                    maximum=700,
+                    value=700,
+                    label="Max Params",
+                    step=1,
+                    scale=2,
+                )
             leaderboard = gr.DataFrame(
+                value=filter_leaderboard("Spec-to-RTL", "All", "All", "", 700),
                 headers="first row",
                 show_row_numbers=True,
                 wrap=True,
+                datatype=[
+                    "markdown",
+                    "html",
+                ],
                 interactive=False,
+                column_widths=[
+                    "7%",
+                    "25%",
+                    "10%",
+                    "17%",
+                    "6%",
+                    "6%",
+                    "6%",
+                    "6%",
+                    "6%",
+                    "7%",
+                ],
+            )
         with gr.Tab("Plot View"):
             with gr.Row(equal_height=True):
                 default_benchmark = s2r_benchs[0]
+                bubble_benchmark = gr.Dropdown(
+                    choices=benchmarks,
+                    label="Select Benchmark",
+                    value=default_benchmark,
+                    elem_classes="gr-dropdown",
+                )
                 default_metric = non_rtl_metrics[0]
+                bubble_metric = gr.Dropdown(
+                    choices=non_rtl_metrics,
+                    label="Select Metric",
+                    value=default_metric,
+                )
             with gr.Row(equal_height=True):
+                scatter_plot = gr.Plot(
+                    value=generate_scatter_plot(default_benchmark, default_metric),
+                    label="Bubble Chart",
+                    elem_id="full-width-plot",
+                )
         with gr.Tab("Metrics Information"):
+            with open("./static/metrics.md", "r") as file:
+                gr.Markdown(
+                    file.read(),
+                    latex_delimiters=[
+                        {"left": "$$", "right": "$$", "display": True},
+                        {"left": "$", "right": "$", "display": False},
+                    ],
+                    elem_classes="metrics-page",
+                )
         with gr.Tab("About Us"):
             gr.HTML(
                 """
                 </div>
                 """
             )
         with gr.Row():
             with gr.Accordion("📙 Citation", open=False):
                 citation_button = gr.Textbox(
                     elem_id="citation-button",
                     show_copy_button=True,
                 )
     # event handlers, ugly way but it works
+    task_radio.change(
+        fn=update_benchmarks_by_task,
+        inputs=[task_radio],
+        outputs=[benchmark_radio, leaderboard],
+    )
+    benchmark_radio.change(
+        fn=filter_leaderboard,
+        inputs=[
+            task_radio,
+            benchmark_radio,
+            model_type_dropdown,
+            search_box,
+            params_slider,
+        ],
+        outputs=leaderboard,
+    )
+    model_type_dropdown.change(
+        fn=filter_leaderboard,
+        inputs=[
+            task_radio,
+            benchmark_radio,
+            model_type_dropdown,
+            search_box,
+            params_slider,
+        ],
+        outputs=leaderboard,
+    )
+    search_box.change(
+        fn=filter_leaderboard,
+        inputs=[
+            task_radio,
+            benchmark_radio,
+            model_type_dropdown,
+            search_box,
+            params_slider,
+        ],
+        outputs=leaderboard,
+    )
+    params_slider.change(
+        fn=filter_leaderboard,
+        inputs=[
+            task_radio,
+            benchmark_radio,
+            model_type_dropdown,
+            search_box,
+            params_slider,
+        ],
+        outputs=leaderboard,
+    )
     def on_benchmark_change(benchmark, _):
         if benchmark == "RTL-Repo":
             metric = "Exact Matching (EM)"
+            return gr.update(choices=rtl_metrics, value=metric), generate_scatter_plot(
+                benchmark, metric
+            )
         else:
             metric = non_rtl_metrics[0]
+            return gr.update(
+                choices=non_rtl_metrics[:-1], value=metric
+            ), generate_scatter_plot(benchmark, metric)
     def on_metric_change(benchmark, metric):
         benchmark, metric = handle_special_cases(benchmark, metric)
         return gr.update(value=benchmark), fig
     bubble_benchmark.change(
+        fn=on_benchmark_change,
         inputs=[bubble_benchmark, bubble_metric],
         outputs=[bubble_metric, scatter_plot],
         js=""" // this is to avoid resetting user scroll each time a plot is re-generated
             observer.observe(document.getElementById('full-width-plot'), { childList: true });
             return [benchmark, metric];
         }
+        """,
+    )
     bubble_metric.change(
         fn=on_metric_change,
             observer.observe(document.getElementById('full-width-plot'), { childList: true });
             return [benchmark, metric];
         }
+        """,
+    )
+app.launch(
+    allowed_paths=[
+        "logo.png",
+        "hpai_logo_grad.png",
+        "bsc-logo.png",
+    ]
+)

aggregated_scores.csv → results/aggregated_scores.csv RENAMED Viewed

File without changes

parse.py → results/parse.py RENAMED Viewed

@@ -1,35 +1,99 @@
-import json
-import pandas as pd
 import csv
-from typing import Dict, Union
 import locale
 model_details = {
     "DeepSeek R1": ("https://huggingface.co/deepseek-ai/DeepSeek-R1", 685, "General"),
-    "Llama 3.1 405B": ("https://huggingface.co/meta-llama/Llama-3.1-405B", 406, "General"),
-    "Llama 3.(1-3) 70B": ("https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct", 70.6, "General"),
-    "Qwen2.5 72B": ("https://huggingface.co/Qwen/Qwen2.5-72B-Instruct", 72.7, "General"),
     "Qwen2.5 32B": ("https://huggingface.co/Qwen/Qwen2.5-32B", 32.5, "General"),
-    "StarChat2 15B v0.1": ("https://huggingface.co/HuggingFaceH4/starchat2-15b-v0.1", 16, "General"),
-    "DeepSeek R1 Distill Qwen 14B": ("https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B", 14.8, "General"),
-    "CodeLlama 70B": ("https://huggingface.co/codellama/CodeLlama-70b-hf", 69, "Coding"),
-    "QwenCoder 2.5 32B": ("https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct", 32.5, "Coding"),
-    "DeepSeek Coder 33B": ("https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct", 33.3, "Coding"),
-    "QwenCoder 2.5 14B": ("https://huggingface.co/Qwen/Qwen2.5-Coder-14B-Instruct", 14.7, "Coding"),
-    "OpenCoder 8B": ("https://huggingface.co/infly/OpenCoder-8B-Instruct", 7.77, "Coding"),
-    "QwenCoder 2.5 7B": ("https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct", 7.61, "Coding"),
-    "DeepSeek Coder 6,7B": ("https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct", 6.74, "Coding"),
-    "HaVen-CodeQwen": ("https://huggingface.co/yangyiyao/HaVen-CodeQwen", 7.25, "RTL-Specific"),
     "CodeV-CL-7B": ("https://huggingface.co/yang-z/CodeV-CL-7B", 6.74, "RTL-Specific"),
     "CodeV-QW-7B": ("https://huggingface.co/yang-z/CodeV-QW-7B", 7.25, "RTL-Specific"),
-    "CodeV-DS-6.7B": ("https://huggingface.co/yang-z/CodeV-DS-6.7B", 6.74, "RTL-Specific"),
-    "RTLCoder Mistral": ("https://huggingface.co/ishorn5/RTLCoder-v1.1", 7.24, "RTL-Specific"),
-    "RTLCoder DeepSeek": ("https://huggingface.co/ishorn5/RTLCoder-Deepseek-v1.1", 6.74, "RTL-Specific"),
-    "OriGen": ("https://huggingface.co/henryen/OriGen_Fix", 6.74, "RTL-Specific")
 }
 def get_headers(reader, agg=False) -> Union[list, list]:
     metrics, benchs = [], []
     for i, row in enumerate(reader):
@@ -42,6 +106,7 @@ def get_headers(reader, agg=False) -> Union[list, list]:
             return metrics
     return metrics, benchs
 def get_model_params_and_url(model) -> Union[str, str, float]:
     if model not in model_details:
         return "-", "-", "-"
@@ -50,6 +115,7 @@ def get_model_params_and_url(model) -> Union[str, str, float]:
     type = model_details[model][2]
     return url, params, type
 def parse_results(csv_path: str) -> list[dict]:
     """
     Each row has the following format:
@@ -57,8 +123,8 @@ def parse_results(csv_path: str) -> list[dict]:
     """
     dataset = []
     models = []
-    with open(csv_path, newline='') as csvfile:
-        reader = csv.reader(csvfile, delimiter=',')
         metrics, benchs = get_headers(reader)
         for i, row in enumerate(reader):
             model = row[0]
@@ -69,12 +135,12 @@ def parse_results(csv_path: str) -> list[dict]:
             for metric, bench in zip(metrics, benchs):
                 if metric == "EM":
                     metric = "Exact Matching (EM)"
-                record = {}
                 record["Model"] = model
                 record["Model Type"] = type
                 record["Benchmark"] = bench
                 record["Task"] = metric
-                record["Result"] = float(row[ctr].replace(',','.'))
                 record["Model URL"] = url
                 record["Params"] = params
                 dataset.append(record)
@@ -82,32 +148,47 @@ def parse_results(csv_path: str) -> list[dict]:
     print(models)
     return dataset
 def parse_agg(csv_path: str) -> list[dict]:
     """
     Each row has the following format:
         MODEL | BENCHMARK | TASK | METRIC | RESULT
     """
-    return pd.read_csv("aggregated_scores.csv")
 def writeJson(data: list):
-    with open('results.json', 'w') as f:
         json.dump(data, f, indent=4, ensure_ascii=False)
     print("Done")
 def read_json():
-    json_path = "./results.json"
     with open(json_path, "r", encoding="utf-8") as file:
         data = json.load(file)
     return data
 def read_data() -> Union[pd.DataFrame, list, list, str]:
     data = read_json()
     df = pd.DataFrame(data)
-    df.rename(columns={'Model': 'Model', 'Benchmark': 'Benchmark', 'Task': 'Metric', 'Result': 'Score', 'EM': 'Exact Matching (EM)'}, inplace=True)
-    df['Params'] = pd.to_numeric(df['Params'], errors='coerce')
-    benchmarks = sorted(df['Benchmark'].unique().tolist(), reverse=True)
-    metrics = df['Metric'].unique().tolist()
-    default_metric = 'Functionality (FNC)' if 'Functionality (FNC)' in metrics else metrics[0]
     return df, benchmarks, metrics, default_metric

 import csv
+import json
 import locale
+from typing import Dict, Union
+import pandas as pd
 # Static per-model metadata, keyed by the model's display name. Each value is
 # a 3-tuple: (model URL, parameter count, model type). The type strings used
 # below are "General", "Coding" and "RTL-Specific".
 # NOTE(review): parameter counts look like billions of parameters - confirm.
 # NOTE(review): the key "DeepSeek Coder 6,7B" uses a comma; presumably it has
 # to match the model name used in the results data - verify before renaming.
 model_details = {
     "DeepSeek R1": ("https://huggingface.co/deepseek-ai/DeepSeek-R1", 685, "General"),
+    "Llama 3.1 405B": (
+        "https://huggingface.co/meta-llama/Llama-3.1-405B",
+        406,
+        "General",
+    ),
+    "Llama 3.(1-3) 70B": (
+        "https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct",
+        70.6,
+        "General",
+    ),
+    "Qwen2.5 72B": (
+        "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",
+        72.7,
+        "General",
+    ),
     "Qwen2.5 32B": ("https://huggingface.co/Qwen/Qwen2.5-32B", 32.5, "General"),
+    "StarChat2 15B v0.1": (
+        "https://huggingface.co/HuggingFaceH4/starchat2-15b-v0.1",
+        16,
+        "General",
+    ),
+    "DeepSeek R1 Distill Qwen 14B": (
+        "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
+        14.8,
+        "General",
+    ),
+    "CodeLlama 70B": (
+        "https://huggingface.co/codellama/CodeLlama-70b-hf",
+        69,
+        "Coding",
+    ),
+    "QwenCoder 2.5 32B": (
+        "https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct",
+        32.5,
+        "Coding",
+    ),
+    "DeepSeek Coder 33B": (
+        "https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct",
+        33.3,
+        "Coding",
+    ),
+    "QwenCoder 2.5 14B": (
+        "https://huggingface.co/Qwen/Qwen2.5-Coder-14B-Instruct",
+        14.7,
+        "Coding",
+    ),
+    "OpenCoder 8B": (
+        "https://huggingface.co/infly/OpenCoder-8B-Instruct",
+        7.77,
+        "Coding",
+    ),
+    "QwenCoder 2.5 7B": (
+        "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct",
+        7.61,
+        "Coding",
+    ),
+    "DeepSeek Coder 6,7B": (
+        "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct",
+        6.74,
+        "Coding",
+    ),
+    "HaVen-CodeQwen": (
+        "https://huggingface.co/yangyiyao/HaVen-CodeQwen",
+        7.25,
+        "RTL-Specific",
+    ),
     "CodeV-CL-7B": ("https://huggingface.co/yang-z/CodeV-CL-7B", 6.74, "RTL-Specific"),
     "CodeV-QW-7B": ("https://huggingface.co/yang-z/CodeV-QW-7B", 7.25, "RTL-Specific"),
+    "CodeV-DS-6.7B": (
+        "https://huggingface.co/yang-z/CodeV-DS-6.7B",
+        6.74,
+        "RTL-Specific",
+    ),
+    "RTLCoder Mistral": (
+        "https://huggingface.co/ishorn5/RTLCoder-v1.1",
+        7.24,
+        "RTL-Specific",
+    ),
+    "RTLCoder DeepSeek": (
+        "https://huggingface.co/ishorn5/RTLCoder-Deepseek-v1.1",
+        6.74,
+        "RTL-Specific",
+    ),
+    "OriGen": ("https://huggingface.co/henryen/OriGen_Fix", 6.74, "RTL-Specific"),
 }
 def get_headers(reader, agg=False) -> Union[list, list]:
     metrics, benchs = [], []
     for i, row in enumerate(reader):
             return metrics
     return metrics, benchs
 def get_model_params_and_url(model) -> Union[str, str, float]:
     if model not in model_details:
         return "-", "-", "-"
     type = model_details[model][2]
     return url, params, type
 def parse_results(csv_path: str) -> list[dict]:
     """
     Each row has the following format:
     """
     dataset = []
     models = []
+    with open(csv_path, newline="") as csvfile:
+        reader = csv.reader(csvfile, delimiter=",")
         metrics, benchs = get_headers(reader)
         for i, row in enumerate(reader):
             model = row[0]
             for metric, bench in zip(metrics, benchs):
                 if metric == "EM":
                     metric = "Exact Matching (EM)"
+                record = {}
                 record["Model"] = model
                 record["Model Type"] = type
                 record["Benchmark"] = bench
                 record["Task"] = metric
+                record["Result"] = float(row[ctr].replace(",", "."))
                 record["Model URL"] = url
                 record["Params"] = params
                 dataset.append(record)
     print(models)
     return dataset
 def parse_agg(csv_path: str = "results/aggregated_scores.csv") -> pd.DataFrame:
     """
     Load the aggregated-scores CSV into a pandas DataFrame.

     Args:
         csv_path: Path of the CSV file to read. Defaults to
             "results/aggregated_scores.csv", the location that used to be
             hard-coded here regardless of the argument.

     Returns:
         The DataFrame produced by ``pd.read_csv`` for that file.

     NOTE(review): an earlier docstring described the rows as
     MODEL | BENCHMARK | TASK | METRIC | RESULT - confirm against the file.
     """
     # Honour the caller-supplied path instead of silently ignoring it.
     return pd.read_csv(csv_path)
 def writeJson(data: list, json_path: str = "results/results.json"):
     """
     Serialize ``data`` to ``json_path`` as indented JSON, then print "Done".

     Args:
         data: JSON-serializable list to dump.
         json_path: Destination file. Defaults to "results/results.json".
     """
     # ensure_ascii=False writes non-ASCII characters verbatim, so the file is
     # opened explicitly as UTF-8 (the encoding read_json uses to load it)
     # rather than with the platform's default encoding.
     with open(json_path, "w", encoding="utf-8") as f:
         json.dump(data, f, indent=4, ensure_ascii=False)
     print("Done")
 def read_json(json_path: str = "results/results.json"):
     """
     Load and return the parsed content of a UTF-8 encoded JSON file.

     Args:
         json_path: File to read. Defaults to "results/results.json", the
             path that was previously fixed inside the function.

     Returns:
         Whatever ``json.load`` produces for the file's content.
     """
     with open(json_path, "r", encoding="utf-8") as file:
         return json.load(file)
 def read_data() -> tuple[pd.DataFrame, list, list, str]:
     """
     Build the results DataFrame from the data returned by ``read_json()``.

     The "Task" column is renamed to "Metric" and "Result" to "Score"; an
     "EM" column, if one exists, becomes "Exact Matching (EM)". "Params" is
     converted to numeric, with non-numeric values coerced to NaN.

     Returns:
         A 4-tuple ``(df, benchmarks, metrics, default_metric)`` where
         ``benchmarks`` holds the unique "Benchmark" values sorted in
         descending order, ``metrics`` the unique "Metric" values, and
         ``default_metric`` is "Functionality (FNC)" when present, otherwise
         the first metric.
     """
     df = pd.DataFrame(read_json())
     # Identity mappings ("Model" -> "Model", ...) are no-ops and are omitted.
     column_names = {
         "Task": "Metric",
         "Result": "Score",
         "EM": "Exact Matching (EM)",
     }
     df.rename(columns=column_names, inplace=True)
     df["Params"] = pd.to_numeric(df["Params"], errors="coerce")
     benchmarks = sorted(df["Benchmark"].unique().tolist(), reverse=True)
     metrics = df["Metric"].unique().tolist()
     preferred = "Functionality (FNC)"
     default_metric = preferred if preferred in metrics else metrics[0]
     return df, benchmarks, metrics, default_metric

results.csv → results/results.csv RENAMED Viewed

File without changes

results.json → results/results.json RENAMED Viewed

File without changes

about.py → static/about.py RENAMED Viewed

File without changes

metrics.md → static/metrics.md RENAMED Viewed

File without changes

css_html_js.py → style/css_html_js.py RENAMED Viewed

File without changes