Spaces:

ServiceNow
/

browsergym-leaderboard

Running

App Files Community

meghsn committed on Dec 3, 2024

Commit

51b9b31

1 Parent(s): f4d95d8

Updated latest results

Browse files

Files changed (32) hide show

app.py +20 -5
results/Bgym-Claude-3.5-Sonnet/assistantbench.json +16 -0
results/Bgym-Claude-3.5-Sonnet/miniwob.json +16 -0
results/Bgym-Claude-3.5-Sonnet/webarena.json +16 -0
results/Bgym-Claude-3.5-Sonnet/weblinx.json +16 -0
results/Bgym-GPT-4o-mini/assistantbench.json +16 -0
results/Bgym-GPT-4o-mini/miniwob.json +2 -2
results/Bgym-GPT-4o-mini/webarena.json +16 -0
results/Bgym-GPT-4o-mini/weblinx.json +16 -0
results/Bgym-GPT-4o-mini/workarena-l3.json +16 -0
results/Bgym-GPT-4o/assistantbench.json +16 -0
results/Bgym-GPT-4o/miniwob.json +1 -1
results/Bgym-GPT-4o/webarena.json +2 -2
results/Bgym-GPT-4o/weblinx.json +16 -0
results/Bgym-GPT-o1-mini/assistantbench.json +16 -0
results/Bgym-GPT-o1-mini/miniwob.json +16 -0
results/Bgym-GPT-o1-mini/webarena.json +16 -0
results/Bgym-GPT-o1-mini/weblinx.json +16 -0
results/Bgym-GPT-o1-mini/workarena-l3.json +16 -0
results/Bgym-Llama-3.1-405b/README.md +1 -0
results/Bgym-Llama-3.1-405b/assistantbench.json +16 -0
results/Bgym-Llama-3.1-405b/miniwob.json +16 -0
results/Bgym-Llama-3.1-405b/webarena.json +16 -0
results/Bgym-Llama-3.1-405b/weblinx.json +16 -0
results/Bgym-Llama-3.1-405b/workarena-l1.json +16 -0
results/Bgym-Llama-3.1-405b/workarena-l2.json +16 -0
results/Bgym-Llama-3.1-405b/workarena-l3.json +16 -0
results/Bgym-Llama-3.1-70b/assistantbench.json +16 -0
results/Bgym-Llama-3.1-70b/miniwob.json +16 -0
results/Bgym-Llama-3.1-70b/webarena.json +16 -0
results/Bgym-Llama-3.1-70b/weblinx.json +16 -0
results/Bgym-Llama-3.1-70b/workarena-l3.json +16 -0

app.py CHANGED Viewed

@@ -44,9 +44,11 @@ def sanitize_column_name(col: str) -> str:
     return html.escape(str(col))
 def sanitize_cell_value(value: Any) -> str:
-    """Sanitize cell values for HTML display"""
     if isinstance(value, (int, float)):
         return str(value)
     return html.escape(str(value))
 def create_html_table_main(df):
@@ -169,8 +171,9 @@ def create_html_table_benchmark(df, benchmark):
                     html += f'<td><details><summary>{summary}</summary>{details}</details></td>'
             elif column == "Reproduced_all":
                 continue
-            # elif column == "Score":
-            #     html += f'<td>{row[column]}</td>'
             else:
                 html += f'<td>{sanitize_cell_value(row[column])}</td>'
         html += '</tr>'
@@ -205,6 +208,19 @@ def check_sanity(agent):
 def main():
     st.set_page_config(page_title="BrowserGym Leaderboard", layout="wide", initial_sidebar_state="expanded")
     st.markdown("""
         <head>
             <meta http-equiv="Content-Security-Policy"
@@ -477,9 +493,8 @@ MIT
             if dfs_to_concat:
                 df_ = pd.concat(dfs_to_concat, ignore_index=True)
             df_['Score'] = df_['Score'].apply(lambda x: f"{x:.2f}" if x != "-" else "-")
             df_['Score'] = df_['Score'].astype(str)
-            df_['Score'] = df_.apply(lambda row: f"{row['Score']} ± {row['std_err']}", axis=1)
-            df_ = df_.drop(columns=['std_err'])
             html_table = create_html_table_benchmark(df_, benchmark)
             st.markdown(html_table, unsafe_allow_html=True)

     return html.escape(str(col))
 def sanitize_cell_value(value: Any) -> str:
     """Return ``value`` as a string that is safe to embed in an HTML table cell.

     Args:
         value: The raw cell content. ``int``/``float`` values are rendered
             with ``str``. A string containing ``±`` is treated as
             ``"<score> ± <std_err>"`` and the error part is wrapped in a
             smaller ``<span>`` coloured with the ``--lighter-color`` CSS
             variable. Anything else is stringified and HTML-escaped.

     Returns:
         An HTML fragment in which every piece of caller-supplied text has
         been passed through ``html.escape``.
     """
     if isinstance(value, (int, float)):
         return str(value)
     if isinstance(value, str) and '±' in value:
         # partition() splits on the first '±' only, so a second '±' in the
         # text cannot raise the ValueError that unpacking split() would.
         score, _, std_err = value.partition('±')
         # Escape both halves: this branch is reachable for any string cell
         # that happens to contain '±', not only for formatted scores, and
         # the result is rendered as raw HTML by the caller.
         score = html.escape(score.strip())
         std_err = html.escape(std_err.strip())
         return f'{score} <span style="font-size: smaller; color: var(--lighter-color);">±{std_err}</span>'
     return html.escape(str(value))
 def create_html_table_main(df):
                     html += f'<td><details><summary>{summary}</summary>{details}</details></td>'
             elif column == "Reproduced_all":
                 continue
+            elif column == "Score":
+                score_with_std_err = f'{row[column]} ± {row["std_err"]}'
+                html += f'<td>{sanitize_cell_value(score_with_std_err)}</td>'
             else:
                 html += f'<td>{sanitize_cell_value(row[column])}</td>'
         html += '</tr>'
 def main():
     st.set_page_config(page_title="BrowserGym Leaderboard", layout="wide", initial_sidebar_state="expanded")
+    st.markdown("""
+        <style>
+        :root {
+            --lighter-color: #888; /* Default for light theme */
+        }
+        @media (prefers-color-scheme: dark) {
+            :root {
+                --lighter-color: #ccc; /* Default for dark theme */
+            }
+        }
+        </style>
+    """, unsafe_allow_html=True)
     st.markdown("""
         <head>
             <meta http-equiv="Content-Security-Policy"
             if dfs_to_concat:
                 df_ = pd.concat(dfs_to_concat, ignore_index=True)
             df_['Score'] = df_['Score'].apply(lambda x: f"{x:.2f}" if x != "-" else "-")
+            df_['std_err'] = df_['std_err'].apply(lambda x: f"{x:.1f}" if x != "-" else "-")
             df_['Score'] = df_['Score'].astype(str)
             html_table = create_html_table_benchmark(df_, benchmark)
             st.markdown(html_table, unsafe_allow_html=True)

results/Bgym-Claude-3.5-Sonnet/assistantbench.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-Claude-3.5-Sonnet",
+        "study_id": "study_id",
+        "benchmark": "AssistantBench",
+        "score": 5.2,
+        "std_err": 1.5,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original",
+        "date_time": "2021-01-01 12:00:00"
+    }
+]

results/Bgym-Claude-3.5-Sonnet/miniwob.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-Claude-3.5-Sonnet",
+        "study_id": "study_id",
+        "benchmark": "MiniWoB",
+        "score": 69.8,
+        "std_err": 1.8,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original",
+        "date_time": "2021-01-01 12:00:00"
+    }
+]

results/Bgym-Claude-3.5-Sonnet/webarena.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-Claude-3.5-Sonnet",
+        "study_id": "study_id",
+        "benchmark": "WebArena",
+        "score": 36.2,
+        "std_err": 1.7,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original",
+        "date_time": "2021-01-01 12:00:00"
+    }
+]

results/Bgym-Claude-3.5-Sonnet/weblinx.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-Claude-3.5-Sonnet",
+        "study_id": "study_id",
+        "benchmark": "WebLINX",
+        "score": 13.7,
+        "std_err": 0.6,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original",
+        "date_time": "2021-01-01 12:00:00"
+    }
+]

results/Bgym-GPT-4o-mini/assistantbench.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-GPT-4o-mini",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "AssistantBench",
+        "score": 2.1,
+        "std_err": 1.0,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-GPT-4o-mini/miniwob.json CHANGED Viewed

@@ -4,8 +4,8 @@
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "MiniWoB",
-        "score": 58.8,
-        "std_err": 1.4,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",

         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "MiniWoB",
+        "score": 56.6,
+        "std_err": 2.0,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",

results/Bgym-GPT-4o-mini/webarena.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-GPT-4o-mini",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WebArena",
+        "score": 17.4,
+        "std_err": 1.3,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-GPT-4o-mini/weblinx.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-GPT-4o-mini",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WebLINX",
+        "score": 11.6,
+        "std_err": 0.6,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-GPT-4o-mini/workarena-l3.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-GPT-4o-mini",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WorkArena-L3",
+        "score": 0.0,
+        "std_err": 0.0,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-GPT-4o/assistantbench.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-GPT-4o",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "AssistantBench",
+        "score": 4.8,
+        "std_err": 2.4,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-GPT-4o/miniwob.json CHANGED Viewed

@@ -4,7 +4,7 @@
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "MiniWoB",
-        "score": 65.6,
         "std_err": 1.9,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",

         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "MiniWoB",
+        "score": 63.8,
         "std_err": 1.9,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",

results/Bgym-GPT-4o/webarena.json CHANGED Viewed

@@ -4,8 +4,8 @@
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WebArena",
-        "score": 23.5,
-        "std_err": 0.4,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",

         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WebArena",
+        "score": 31.4,
+        "std_err": 1.6,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",

results/Bgym-GPT-4o/weblinx.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-GPT-4o",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WebLINX",
+        "score": 12.5,
+        "std_err": 0.6,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-GPT-o1-mini/assistantbench.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-GPT-o1-mini",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "AssistantBench",
+        "score": 6.9,
+        "std_err": 2.2,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-GPT-o1-mini/miniwob.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-GPT-o1-mini",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "MiniWoB",
+        "score": 67.8,
+        "std_err": 1.9,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-GPT-o1-mini/webarena.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-GPT-o1-mini",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WebArena",
+        "score": 28.6,
+        "std_err": 1.6,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-GPT-o1-mini/weblinx.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-GPT-o1-mini",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WebLINX",
+        "score": 12.5,
+        "std_err": 0.6,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-GPT-o1-mini/workarena-l3.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-GPT-o1-mini",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WorkArena-L3",
+        "score": 0.0,
+        "std_err": 0.0,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-Llama-3.1-405b/README.md ADDED Viewed

	@@ -0,0 +1 @@


+### Llama-3.1-405B

results/Bgym-Llama-3.1-405b/assistantbench.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-Llama-3.1-405b",
+        "study_id": "study_id",
+        "benchmark": "AssistantBench",
+        "score": 3.9,
+        "std_err": 1.0,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original",
+        "date_time": "2021-01-01 12:00:00"
+    }
+]

results/Bgym-Llama-3.1-405b/miniwob.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-Llama-3.1-405b",
+        "study_id": "study_id",
+        "benchmark": "MiniWoB",
+        "score": 64.6,
+        "std_err": 1.9,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original",
+        "date_time": "2021-01-01 12:00:00"
+    }
+]

results/Bgym-Llama-3.1-405b/webarena.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-Llama-3.1-405b",
+        "study_id": "study_id",
+        "benchmark": "WebArena",
+        "score": 24.0,
+        "std_err": 1.5,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original",
+        "date_time": "2021-01-01 12:00:00"
+    }
+]

results/Bgym-Llama-3.1-405b/weblinx.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-Llama-3.1-405b",
+        "study_id": "study_id",
+        "benchmark": "WebLINX",
+        "score": 7.9,
+        "std_err": 0.5,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original",
+        "date_time": "2021-01-01 12:00:00"
+    }
+]

results/Bgym-Llama-3.1-405b/workarena-l1.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-Llama-3.1-405b",
+        "study_id": "study_id",
+        "benchmark": "WorkArena-L1",
+        "score": 43.3,
+        "std_err": 2.7,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original",
+        "date_time": "2021-01-01 12:00:00"
+    }
+]

results/Bgym-Llama-3.1-405b/workarena-l2.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-Llama-3.1-405b",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WorkArena-L2",
+        "score": 7.2,
+        "std_err": 1.7,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-Llama-3.1-405b/workarena-l3.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-Llama-3.1-405b",
+        "study_id": "study_id",
+        "benchmark": "WorkArena-L3",
+        "score": 0.0,
+        "std_err": 0.0,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original",
+        "date_time": "2021-01-01 12:00:00"
+    }
+]

results/Bgym-Llama-3.1-70b/assistantbench.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-Llama-3.1-70b",
+        "study_id": "study_id",
+        "benchmark": "AssistantBench",
+        "score": 2.8,
+        "std_err": 1.1,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original",
+        "date_time": "2021-01-01 12:00:00"
+    }
+]

results/Bgym-Llama-3.1-70b/miniwob.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-Llama-3.1-70b",
+        "study_id": "study_id",
+        "benchmark": "MiniWoB",
+        "score": 57.6,
+        "std_err": 2.0,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original",
+        "date_time": "2021-01-01 12:00:00"
+    }
+]

results/Bgym-Llama-3.1-70b/webarena.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-Llama-3.1-70b",
+        "study_id": "study_id",
+        "benchmark": "WebArena",
+        "score": 18.4,
+        "std_err": 1.4,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original",
+        "date_time": "2021-01-01 12:00:00"
+    }
+]

results/Bgym-Llama-3.1-70b/weblinx.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-Llama-3.1-70b",
+        "study_id": "study_id",
+        "benchmark": "WebLINX",
+        "score": 8.9,
+        "std_err": 0.5,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original",
+        "date_time": "2021-01-01 12:00:00"
+    }
+]

results/Bgym-Llama-3.1-70b/workarena-l3.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-Llama-3.1-70b",
+        "study_id": "study_id",
+        "benchmark": "WorkArena-L3",
+        "score": 0.0,
+        "std_err": 0.0,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original",
+        "date_time": "2021-01-01 12:00:00"
+    }
+]