Updated latest results
Browse files- app.py +20 -5
- results/Bgym-Claude-3.5-Sonnet/assistantbench.json +16 -0
- results/Bgym-Claude-3.5-Sonnet/miniwob.json +16 -0
- results/Bgym-Claude-3.5-Sonnet/webarena.json +16 -0
- results/Bgym-Claude-3.5-Sonnet/weblinx.json +16 -0
- results/Bgym-GPT-4o-mini/assistantbench.json +16 -0
- results/Bgym-GPT-4o-mini/miniwob.json +2 -2
- results/Bgym-GPT-4o-mini/webarena.json +16 -0
- results/Bgym-GPT-4o-mini/weblinx.json +16 -0
- results/Bgym-GPT-4o-mini/workarena-l3.json +16 -0
- results/Bgym-GPT-4o/assistantbench.json +16 -0
- results/Bgym-GPT-4o/miniwob.json +1 -1
- results/Bgym-GPT-4o/webarena.json +2 -2
- results/Bgym-GPT-4o/weblinx.json +16 -0
- results/Bgym-GPT-o1-mini/assistantbench.json +16 -0
- results/Bgym-GPT-o1-mini/miniwob.json +16 -0
- results/Bgym-GPT-o1-mini/webarena.json +16 -0
- results/Bgym-GPT-o1-mini/weblinx.json +16 -0
- results/Bgym-GPT-o1-mini/workarena-l3.json +16 -0
- results/Bgym-Llama-3.1-405b/README.md +1 -0
- results/Bgym-Llama-3.1-405b/assistantbench.json +16 -0
- results/Bgym-Llama-3.1-405b/miniwob.json +16 -0
- results/Bgym-Llama-3.1-405b/webarena.json +16 -0
- results/Bgym-Llama-3.1-405b/weblinx.json +16 -0
- results/Bgym-Llama-3.1-405b/workarena-l1.json +16 -0
- results/Bgym-Llama-3.1-405b/workarena-l2.json +16 -0
- results/Bgym-Llama-3.1-405b/workarena-l3.json +16 -0
- results/Bgym-Llama-3.1-70b/assistantbench.json +16 -0
- results/Bgym-Llama-3.1-70b/miniwob.json +16 -0
- results/Bgym-Llama-3.1-70b/webarena.json +16 -0
- results/Bgym-Llama-3.1-70b/weblinx.json +16 -0
- results/Bgym-Llama-3.1-70b/workarena-l3.json +16 -0
app.py
CHANGED
|
@@ -44,9 +44,11 @@ def sanitize_column_name(col: str) -> str:
|
|
| 44 |
return html.escape(str(col))
|
| 45 |
|
| 46 |
def sanitize_cell_value(value: Any) -> str:
|
| 47 |
-
"""Sanitize cell values for HTML display"""
|
| 48 |
if isinstance(value, (int, float)):
|
| 49 |
return str(value)
|
|
|
|
|
|
|
|
|
|
| 50 |
return html.escape(str(value))
|
| 51 |
|
| 52 |
def create_html_table_main(df):
|
|
@@ -169,8 +171,9 @@ def create_html_table_benchmark(df, benchmark):
|
|
| 169 |
html += f'<td><details><summary>{summary}</summary>{details}</details></td>'
|
| 170 |
elif column == "Reproduced_all":
|
| 171 |
continue
|
| 172 |
-
|
| 173 |
-
|
|
|
|
| 174 |
else:
|
| 175 |
html += f'<td>{sanitize_cell_value(row[column])}</td>'
|
| 176 |
html += '</tr>'
|
|
@@ -205,6 +208,19 @@ def check_sanity(agent):
|
|
| 205 |
|
| 206 |
def main():
|
| 207 |
st.set_page_config(page_title="BrowserGym Leaderboard", layout="wide", initial_sidebar_state="expanded")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
st.markdown("""
|
| 209 |
<head>
|
| 210 |
<meta http-equiv="Content-Security-Policy"
|
|
@@ -477,9 +493,8 @@ MIT
|
|
| 477 |
if dfs_to_concat:
|
| 478 |
df_ = pd.concat(dfs_to_concat, ignore_index=True)
|
| 479 |
df_['Score'] = df_['Score'].apply(lambda x: f"{x:.2f}" if x != "-" else "-")
|
|
|
|
| 480 |
df_['Score'] = df_['Score'].astype(str)
|
| 481 |
-
df_['Score'] = df_.apply(lambda row: f"{row['Score']} ± {row['std_err']}", axis=1)
|
| 482 |
-
df_ = df_.drop(columns=['std_err'])
|
| 483 |
html_table = create_html_table_benchmark(df_, benchmark)
|
| 484 |
st.markdown(html_table, unsafe_allow_html=True)
|
| 485 |
|
|
|
|
| 44 |
return html.escape(str(col))
|
| 45 |
|
| 46 |
def sanitize_cell_value(value: Any) -> str:
|
|
|
|
| 47 |
if isinstance(value, (int, float)):
|
| 48 |
return str(value)
|
| 49 |
+
if isinstance(value, str) and '±' in value:
|
| 50 |
+
score, std_err = value.split('±')
|
| 51 |
+
return f'{score.strip()} <span style="font-size: smaller; color: var(--lighter-color);">±{std_err.strip()}</span>'
|
| 52 |
return html.escape(str(value))
|
| 53 |
|
| 54 |
def create_html_table_main(df):
|
|
|
|
| 171 |
html += f'<td><details><summary>{summary}</summary>{details}</details></td>'
|
| 172 |
elif column == "Reproduced_all":
|
| 173 |
continue
|
| 174 |
+
elif column == "Score":
|
| 175 |
+
score_with_std_err = f'{row[column]} ± {row["std_err"]}'
|
| 176 |
+
html += f'<td>{sanitize_cell_value(score_with_std_err)}</td>'
|
| 177 |
else:
|
| 178 |
html += f'<td>{sanitize_cell_value(row[column])}</td>'
|
| 179 |
html += '</tr>'
|
|
|
|
| 208 |
|
| 209 |
def main():
|
| 210 |
st.set_page_config(page_title="BrowserGym Leaderboard", layout="wide", initial_sidebar_state="expanded")
|
| 211 |
+
st.markdown("""
|
| 212 |
+
<style>
|
| 213 |
+
:root {
|
| 214 |
+
--lighter-color: #888; /* Default for light theme */
|
| 215 |
+
}
|
| 216 |
+
@media (prefers-color-scheme: dark) {
|
| 217 |
+
:root {
|
| 218 |
+
--lighter-color: #ccc; /* Default for dark theme */
|
| 219 |
+
}
|
| 220 |
+
}
|
| 221 |
+
</style>
|
| 222 |
+
""", unsafe_allow_html=True)
|
| 223 |
+
|
| 224 |
st.markdown("""
|
| 225 |
<head>
|
| 226 |
<meta http-equiv="Content-Security-Policy"
|
|
|
|
| 493 |
if dfs_to_concat:
|
| 494 |
df_ = pd.concat(dfs_to_concat, ignore_index=True)
|
| 495 |
df_['Score'] = df_['Score'].apply(lambda x: f"{x:.2f}" if x != "-" else "-")
|
| 496 |
+
df_['std_err'] = df_['std_err'].apply(lambda x: f"{x:.1f}" if x != "-" else "-")
|
| 497 |
df_['Score'] = df_['Score'].astype(str)
|
|
|
|
|
|
|
| 498 |
html_table = create_html_table_benchmark(df_, benchmark)
|
| 499 |
st.markdown(html_table, unsafe_allow_html=True)
|
| 500 |
|
results/Bgym-Claude-3.5-Sonnet/assistantbench.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-Claude-3.5-Sonnet",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"benchmark": "AssistantBench",
|
| 6 |
+
"score": 5.2,
|
| 7 |
+
"std_err": 1.5,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2021-01-01 12:00:00"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/Bgym-Claude-3.5-Sonnet/miniwob.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-Claude-3.5-Sonnet",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"benchmark": "MiniWoB",
|
| 6 |
+
"score": 69.8,
|
| 7 |
+
"std_err": 1.8,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2021-01-01 12:00:00"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/Bgym-Claude-3.5-Sonnet/webarena.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-Claude-3.5-Sonnet",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"benchmark": "WebArena",
|
| 6 |
+
"score": 36.2,
|
| 7 |
+
"std_err": 1.7,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2021-01-01 12:00:00"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/Bgym-Claude-3.5-Sonnet/weblinx.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-Claude-3.5-Sonnet",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"benchmark": "WebLINX",
|
| 6 |
+
"score": 13.7,
|
| 7 |
+
"std_err": 0.6,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2021-01-01 12:00:00"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/Bgym-GPT-4o-mini/assistantbench.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-GPT-4o-mini",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"date_time": "2021-01-01 12:00:00",
|
| 6 |
+
"benchmark": "AssistantBench",
|
| 7 |
+
"score": 2.1,
|
| 8 |
+
"std_err": 1.0,
|
| 9 |
+
"benchmark_specific": "No",
|
| 10 |
+
"benchmark_tuned": "No",
|
| 11 |
+
"followed_evaluation_protocol": "Yes",
|
| 12 |
+
"reproducible": "Yes",
|
| 13 |
+
"comments": "NA",
|
| 14 |
+
"original_or_reproduced": "Original"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/Bgym-GPT-4o-mini/miniwob.json
CHANGED
|
@@ -4,8 +4,8 @@
|
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "MiniWoB",
|
| 7 |
-
"score":
|
| 8 |
-
"std_err":
|
| 9 |
"benchmark_specific": "No",
|
| 10 |
"benchmark_tuned": "No",
|
| 11 |
"followed_evaluation_protocol": "Yes",
|
|
|
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "MiniWoB",
|
| 7 |
+
"score": 56.6,
|
| 8 |
+
"std_err": 2.0,
|
| 9 |
"benchmark_specific": "No",
|
| 10 |
"benchmark_tuned": "No",
|
| 11 |
"followed_evaluation_protocol": "Yes",
|
results/Bgym-GPT-4o-mini/webarena.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-GPT-4o-mini",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"date_time": "2021-01-01 12:00:00",
|
| 6 |
+
"benchmark": "WebArena",
|
| 7 |
+
"score": 17.4,
|
| 8 |
+
"std_err": 1.3,
|
| 9 |
+
"benchmark_specific": "No",
|
| 10 |
+
"benchmark_tuned": "No",
|
| 11 |
+
"followed_evaluation_protocol": "Yes",
|
| 12 |
+
"reproducible": "Yes",
|
| 13 |
+
"comments": "NA",
|
| 14 |
+
"original_or_reproduced": "Original"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/Bgym-GPT-4o-mini/weblinx.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-GPT-4o-mini",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"date_time": "2021-01-01 12:00:00",
|
| 6 |
+
"benchmark": "WebLINX",
|
| 7 |
+
"score": 11.6,
|
| 8 |
+
"std_err": 0.6,
|
| 9 |
+
"benchmark_specific": "No",
|
| 10 |
+
"benchmark_tuned": "No",
|
| 11 |
+
"followed_evaluation_protocol": "Yes",
|
| 12 |
+
"reproducible": "Yes",
|
| 13 |
+
"comments": "NA",
|
| 14 |
+
"original_or_reproduced": "Original"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/Bgym-GPT-4o-mini/workarena-l3.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-GPT-4o-mini",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"date_time": "2021-01-01 12:00:00",
|
| 6 |
+
"benchmark": "WorkArena-L3",
|
| 7 |
+
"score": 0.0,
|
| 8 |
+
"std_err": 0.0,
|
| 9 |
+
"benchmark_specific": "No",
|
| 10 |
+
"benchmark_tuned": "No",
|
| 11 |
+
"followed_evaluation_protocol": "Yes",
|
| 12 |
+
"reproducible": "Yes",
|
| 13 |
+
"comments": "NA",
|
| 14 |
+
"original_or_reproduced": "Original"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/Bgym-GPT-4o/assistantbench.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-GPT-4o",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"date_time": "2021-01-01 12:00:00",
|
| 6 |
+
"benchmark": "AssistantBench",
|
| 7 |
+
"score": 4.8,
|
| 8 |
+
"std_err": 2.4,
|
| 9 |
+
"benchmark_specific": "No",
|
| 10 |
+
"benchmark_tuned": "No",
|
| 11 |
+
"followed_evaluation_protocol": "Yes",
|
| 12 |
+
"reproducible": "Yes",
|
| 13 |
+
"comments": "NA",
|
| 14 |
+
"original_or_reproduced": "Original"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/Bgym-GPT-4o/miniwob.json
CHANGED
|
@@ -4,7 +4,7 @@
|
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "MiniWoB",
|
| 7 |
-
"score":
|
| 8 |
"std_err": 1.9,
|
| 9 |
"benchmark_specific": "No",
|
| 10 |
"benchmark_tuned": "No",
|
|
|
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "MiniWoB",
|
| 7 |
+
"score": 63.8,
|
| 8 |
"std_err": 1.9,
|
| 9 |
"benchmark_specific": "No",
|
| 10 |
"benchmark_tuned": "No",
|
results/Bgym-GPT-4o/webarena.json
CHANGED
|
@@ -4,8 +4,8 @@
|
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WebArena",
|
| 7 |
-
"score":
|
| 8 |
-
"std_err":
|
| 9 |
"benchmark_specific": "No",
|
| 10 |
"benchmark_tuned": "No",
|
| 11 |
"followed_evaluation_protocol": "Yes",
|
|
|
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WebArena",
|
| 7 |
+
"score": 31.4,
|
| 8 |
+
"std_err": 1.6,
|
| 9 |
"benchmark_specific": "No",
|
| 10 |
"benchmark_tuned": "No",
|
| 11 |
"followed_evaluation_protocol": "Yes",
|
results/Bgym-GPT-4o/weblinx.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-GPT-4o",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"date_time": "2021-01-01 12:00:00",
|
| 6 |
+
"benchmark": "WebLINX",
|
| 7 |
+
"score": 12.5,
|
| 8 |
+
"std_err": 0.6,
|
| 9 |
+
"benchmark_specific": "No",
|
| 10 |
+
"benchmark_tuned": "No",
|
| 11 |
+
"followed_evaluation_protocol": "Yes",
|
| 12 |
+
"reproducible": "Yes",
|
| 13 |
+
"comments": "NA",
|
| 14 |
+
"original_or_reproduced": "Original"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/Bgym-GPT-o1-mini/assistantbench.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-GPT-o1-mini",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"date_time": "2021-01-01 12:00:00",
|
| 6 |
+
"benchmark": "AssistantBench",
|
| 7 |
+
"score": 6.9,
|
| 8 |
+
"std_err": 2.2,
|
| 9 |
+
"benchmark_specific": "No",
|
| 10 |
+
"benchmark_tuned": "No",
|
| 11 |
+
"followed_evaluation_protocol": "Yes",
|
| 12 |
+
"reproducible": "Yes",
|
| 13 |
+
"comments": "NA",
|
| 14 |
+
"original_or_reproduced": "Original"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/Bgym-GPT-o1-mini/miniwob.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-GPT-o1-mini",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"date_time": "2021-01-01 12:00:00",
|
| 6 |
+
"benchmark": "MiniWoB",
|
| 7 |
+
"score": 67.8,
|
| 8 |
+
"std_err": 1.9,
|
| 9 |
+
"benchmark_specific": "No",
|
| 10 |
+
"benchmark_tuned": "No",
|
| 11 |
+
"followed_evaluation_protocol": "Yes",
|
| 12 |
+
"reproducible": "Yes",
|
| 13 |
+
"comments": "NA",
|
| 14 |
+
"original_or_reproduced": "Original"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/Bgym-GPT-o1-mini/webarena.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-GPT-o1-mini",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"date_time": "2021-01-01 12:00:00",
|
| 6 |
+
"benchmark": "WebArena",
|
| 7 |
+
"score": 28.6,
|
| 8 |
+
"std_err": 1.6,
|
| 9 |
+
"benchmark_specific": "No",
|
| 10 |
+
"benchmark_tuned": "No",
|
| 11 |
+
"followed_evaluation_protocol": "Yes",
|
| 12 |
+
"reproducible": "Yes",
|
| 13 |
+
"comments": "NA",
|
| 14 |
+
"original_or_reproduced": "Original"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/Bgym-GPT-o1-mini/weblinx.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-GPT-o1-mini",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"date_time": "2021-01-01 12:00:00",
|
| 6 |
+
"benchmark": "WebLINX",
|
| 7 |
+
"score": 12.5,
|
| 8 |
+
"std_err": 0.6,
|
| 9 |
+
"benchmark_specific": "No",
|
| 10 |
+
"benchmark_tuned": "No",
|
| 11 |
+
"followed_evaluation_protocol": "Yes",
|
| 12 |
+
"reproducible": "Yes",
|
| 13 |
+
"comments": "NA",
|
| 14 |
+
"original_or_reproduced": "Original"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/Bgym-GPT-o1-mini/workarena-l3.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-GPT-o1-mini",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"date_time": "2021-01-01 12:00:00",
|
| 6 |
+
"benchmark": "WorkArena-L3",
|
| 7 |
+
"score": 0.0,
|
| 8 |
+
"std_err": 0.0,
|
| 9 |
+
"benchmark_specific": "No",
|
| 10 |
+
"benchmark_tuned": "No",
|
| 11 |
+
"followed_evaluation_protocol": "Yes",
|
| 12 |
+
"reproducible": "Yes",
|
| 13 |
+
"comments": "NA",
|
| 14 |
+
"original_or_reproduced": "Original"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/Bgym-Llama-3.1-405b/README.md
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
### Llama-3.1-405B
|
results/Bgym-Llama-3.1-405b/assistantbench.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-Llama-3.1-405b",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"benchmark": "AssistantBench",
|
| 6 |
+
"score": 3.9,
|
| 7 |
+
"std_err": 1.0,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2021-01-01 12:00:00"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/Bgym-Llama-3.1-405b/miniwob.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-Llama-3.1-405b",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"benchmark": "MiniWoB",
|
| 6 |
+
"score": 64.6,
|
| 7 |
+
"std_err": 1.9,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2021-01-01 12:00:00"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/Bgym-Llama-3.1-405b/webarena.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-Llama-3.1-405b",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"benchmark": "WebArena",
|
| 6 |
+
"score": 24.0,
|
| 7 |
+
"std_err": 1.5,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2021-01-01 12:00:00"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/Bgym-Llama-3.1-405b/weblinx.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-Llama-3.1-405b",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"benchmark": "WebLINX",
|
| 6 |
+
"score": 7.9,
|
| 7 |
+
"std_err": 0.5,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2021-01-01 12:00:00"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/Bgym-Llama-3.1-405b/workarena-l1.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-Llama-3.1-405b",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"benchmark": "WorkArena-L1",
|
| 6 |
+
"score": 43.3,
|
| 7 |
+
"std_err": 2.7,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2021-01-01 12:00:00"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/Bgym-Llama-3.1-405b/workarena-l2.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-Llama-3.1-405b",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"date_time": "2021-01-01 12:00:00",
|
| 6 |
+
"benchmark": "WorkArena-L2",
|
| 7 |
+
"score": 7.2,
|
| 8 |
+
"std_err": 1.7,
|
| 9 |
+
"benchmark_specific": "No",
|
| 10 |
+
"benchmark_tuned": "No",
|
| 11 |
+
"followed_evaluation_protocol": "Yes",
|
| 12 |
+
"reproducible": "Yes",
|
| 13 |
+
"comments": "NA",
|
| 14 |
+
"original_or_reproduced": "Original"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/Bgym-Llama-3.1-405b/workarena-l3.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-Llama-3.1-405b",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"benchmark": "WorkArena-L3",
|
| 6 |
+
"score": 0.0,
|
| 7 |
+
"std_err": 0.0,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2021-01-01 12:00:00"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/Bgym-Llama-3.1-70b/assistantbench.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-Llama-3.1-70b",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"benchmark": "AssistantBench",
|
| 6 |
+
"score": 2.8,
|
| 7 |
+
"std_err": 1.1,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2021-01-01 12:00:00"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/Bgym-Llama-3.1-70b/miniwob.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-Llama-3.1-70b",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"benchmark": "MiniWoB",
|
| 6 |
+
"score": 57.6,
|
| 7 |
+
"std_err": 2.0,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2021-01-01 12:00:00"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/Bgym-Llama-3.1-70b/webarena.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-Llama-3.1-70b",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"benchmark": "WebArena",
|
| 6 |
+
"score": 18.4,
|
| 7 |
+
"std_err": 1.4,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2021-01-01 12:00:00"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/Bgym-Llama-3.1-70b/weblinx.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-Llama-3.1-70b",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"benchmark": "WebLINX",
|
| 6 |
+
"score": 8.9,
|
| 7 |
+
"std_err": 0.5,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2021-01-01 12:00:00"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/Bgym-Llama-3.1-70b/workarena-l3.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-Llama-3.1-70b",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"benchmark": "WorkArena-L3",
|
| 6 |
+
"score": 0.0,
|
| 7 |
+
"std_err": 0.0,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2021-01-01 12:00:00"
|
| 15 |
+
}
|
| 16 |
+
]
|