Cosmetic changes, update results
- app.py +61 -23
- results/Bgym-Claude-3.5-Sonnet/README.md +1 -0
- results/Bgym-Claude-3.5-Sonnet/workarena-l1.json +16 -0
- results/Bgym-Claude-3.5-Sonnet/workarena-l2.json +16 -0
- results/{test-agent → Bgym-Claude-3.5-Sonnet}/workarena-l3.json +3 -3
- results/Bgym-GPT-3.5/workarena-l1.json +0 -28
- results/Bgym-GPT-4o-V/config.json +0 -4
- results/Bgym-GPT-4o-mini/README.md +1 -0
- results/{test-agent → Bgym-GPT-4o-mini}/miniwob.json +3 -3
- results/{test-agent → Bgym-GPT-4o-mini}/workarena-l1.json +3 -3
- results/{test-agent → Bgym-GPT-4o-mini}/workarena-l2.json +3 -3
- results/Bgym-GPT-4o/config.json +0 -4
- results/Bgym-GPT-4o/miniwob.json +2 -2
- results/Bgym-GPT-4o/workarena-l1.json +2 -2
- results/Bgym-GPT-4o/workarena-l2.json +2 -2
- results/Bgym-GPT-o1-mini/README.md +1 -0
- results/{test-agent/webarena.json → Bgym-GPT-o1-mini/workarena-l1.json} +4 -4
- results/Bgym-GPT-o1-mini/workarena-l2.json +16 -0
- results/Bgym-Llama-3-70b/config.json +0 -4
- results/Bgym-Llama-3-70b/workarena-l1.json +0 -42
- results/Bgym-Llama-3.1-70b/README.md +1 -0
- results/Bgym-Llama-3.1-70b/workarena-l1.json +16 -0
- results/Bgym-Llama-3.1-70b/workarena-l2.json +16 -0
- results/Bgym-Mixtral-8x22b/config.json +0 -4
- results/Bgym-Mixtral-8x22b/workarena-l1.json +0 -28
- results/test-agent/README.md +0 -1
app.py
CHANGED
@@ -9,6 +9,7 @@ import plotly.graph_objs as go
from huggingface_hub import HfApi
from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
import streamlit.components.v1 as components
+from datetime import datetime

from urllib.parse import quote
from pathlib import Path
@@ -49,6 +50,26 @@ def sanitize_cell_value(value: Any) -> str:
    return html.escape(str(value))

def create_html_table_main(df):
+    col1, col2 = st.columns([2,6])
+    with col1:
+        sort_column = st.selectbox("Sort by", df.columns.tolist(), index=df.columns.tolist().index("WebArena"), key="main_sort_column")
+    with col2:
+        sort_order = st.radio("Order", ["Ascending", "Descending"], index=1, horizontal=True, key="main_sort_order")
+
+    def get_sort_value(row):
+        if row == "-":
+            return float('-inf')
+        else:
+            try:
+                return float(row)
+            except ValueError:
+                return row
+
+    # Sort dataframe
+    if sort_order == "Ascending":
+        df = df.sort_values(by=sort_column, key=lambda x: x.apply(get_sort_value))
+    else:
+        df = df.sort_values(by=sort_column, ascending=False, key=lambda x: x.apply(get_sort_value))
    html = '''
    <style>
    table {
@@ -87,7 +108,28 @@ def create_html_table_main(df):
    html += '</div>'
    return html

-def create_html_table_benchmark(df):
+def create_html_table_benchmark(df, benchmark):
+    col1, col2 = st.columns([2,6])
+    with col1:
+        sort_column = st.selectbox("Sort by", df.columns.tolist(), index=df.columns.tolist().index("Score"), key=f"benchmark_sort_column_{benchmark}")
+    with col2:
+        sort_order = st.radio("Order", ["Ascending", "Descending"], index=1, horizontal=True, key=f"benchmark_sort_order_{benchmark}")
+
+    def get_sort_value(row):
+        if row == "-":
+            return float('-inf')
+        else:
+            try:
+                return float(row)
+            except ValueError:
+                return row
+
+    # Sort dataframe
+    if sort_order == "Ascending":
+        df = df.sort_values(by=sort_column, key=lambda x: x.apply(get_sort_value))
+    else:
+        df = df.sort_values(by=sort_column, ascending=False, key=lambda x: x.apply(get_sort_value))
+
    html = '''
    <style>
    table {
@@ -127,6 +169,8 @@ def create_html_table_benchmark(df):
            html += f'<td><details><summary>{summary}</summary>{details}</details></td>'
        elif column == "Reproduced_all":
            continue
+        # elif column == "Score":
+        #     html += f'<td>{row[column]}</td>'
        else:
            html += f'<td>{sanitize_cell_value(row[column])}</td>'
    html += '</tr>'
@@ -183,7 +227,10 @@ def main():
            continue
        agent_results = []
        for benchmark in BENCHMARKS:
-
+            file_path = safe_path_join(agent, f"{benchmark.lower()}.json")
+            if not file_path.is_file():
+                continue
+            with open(file_path) as f:
                agent_results.extend(json.load(f))
        all_results[agent] = agent_results

@@ -217,11 +264,9 @@ def main():
    if dfs_to_concat:
        df = pd.concat(dfs_to_concat, ignore_index=True)

-
-
-
-        df = df.sort_values(by='WebArena', ascending=False)
-
+        for benchmark in BENCHMARKS:
+            df[benchmark] = df[benchmark].apply(lambda x: f"{x:.2f}" if x != "-" else "-")
+            df[benchmark] = df[benchmark].astype(str)
        # Add a search bar
        search_query = st.text_input("Search agents", "", key="search_main")

@@ -240,14 +285,6 @@ def main():
                return ""

        df['Agent'] = df['Agent'].apply(make_hyperlink)
-        # st.dataframe(
-        #     df[['Agent'] + BENCHMARKS],
-        #     use_container_width=True,
-        #     column_config={benchmark: {'alignment': 'center'} for benchmark in BENCHMARKS},
-        #     hide_index=True,
-        #     # height=int(len(df) * 36.2),
-        # )
-        # st.markdown(df.to_html(escape=False, index=False), unsafe_allow_html=True)
        html_table = create_html_table_main(df)
        st.markdown(html_table, unsafe_allow_html=True)

@@ -395,18 +432,21 @@ MIT
        for value in values:
            if value["benchmark"] == benchmark and value["original_or_reproduced"] == "Original":
                result_dict["Score"] = value["score"]
+                result_dict["std_err"] = value["std_err"]
                result_dict["Benchmark Specific"] = value["benchmark_specific"]
                result_dict["Benchmark Tuned"] = value["benchmark_tuned"]
                result_dict["Followed Evaluation Protocol"] = value["followed_evaluation_protocol"]
                result_dict["Reproducible"] = value["reproducible"]
                result_dict["Comments"] = value["comments"]
                result_dict["Study ID"] = value["study_id"]
+                value["date_time"] = datetime.strptime(value["date_time"], "%Y-%m-%d %H:%M:%S").strftime("%B %d, %Y %I:%M %p")
                result_dict["Date"] = value["date_time"]
                result_dict["Reproduced"] = []
                result_dict["Reproduced_all"] = []
                flag = 1
        if not flag:
            result_dict["Score"] = "-"
+            result_dict["std_err"] = "-"
            result_dict["Benchmark Specific"] = "-"
            result_dict["Benchmark Tuned"] = "-"
            result_dict["Followed Evaluation Protocol"] = "-"
@@ -418,6 +458,7 @@ MIT
            result_dict["Reproduced_all"] = []
            if value["benchmark"] == benchmark and value["original_or_reproduced"] == "Reproduced":
                result_dict["Reproduced"].append(value["score"])
+                value["date_time"] = datetime.strptime(value["date_time"], "%Y-%m-%d %H:%M:%S").strftime("%B %d, %Y %I:%M %p")
                result_dict["Reproduced_all"].append(", ".join([str(value["score"]), str(value["date_time"])]))
        if result_dict["Reproduced"]:
            result_dict["Reproduced"] = str(min(result_dict["Reproduced"])) + " - " + str(max(result_dict["Reproduced"]))
@@ -435,14 +476,11 @@ MIT
    # Concatenate the DataFrames
    if dfs_to_concat:
        df_ = pd.concat(dfs_to_concat, ignore_index=True)
-
-
-
-
-
-        # hide_index=True,
-        # )
-        html_table = create_html_table_benchmark(df_)
+        df_['Score'] = df_['Score'].apply(lambda x: f"{x:.2f}" if x != "-" else "-")
+        df_['Score'] = df_['Score'].astype(str)
+        df_['Score'] = df_.apply(lambda row: f"{row['Score']} ± {row['std_err']}", axis=1)
+        df_ = df_.drop(columns=['std_err'])
+        html_table = create_html_table_benchmark(df_, benchmark)
        st.markdown(html_table, unsafe_allow_html=True)

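For readers skimming the diff: here is a minimal standalone sketch of the sorting behaviour the new get_sort_value helper gives the leaderboard tables. It assumes only pandas; the DataFrame below is an invented example, not data from this repo.

    import pandas as pd

    def get_sort_value(cell):
        # "-" marks a missing score and should sort below every real number.
        if cell == "-":
            return float("-inf")
        try:
            return float(cell)  # numeric strings like "45.50" sort numerically
        except ValueError:
            return cell         # anything non-numeric is returned unchanged

    # Invented example rows, for illustration only.
    df = pd.DataFrame({"Agent": ["a", "b", "c"], "WebArena": ["23.50", "-", "35.80"]})
    df = df.sort_values(by="WebArena", ascending=False,
                        key=lambda col: col.apply(get_sort_value))
    print(df)  # c (35.80), then a (23.50), then b (-)

create_html_table_benchmark applies the same helper to the per-benchmark Score column, which the last hunk above now renders as a "score ± std_err" string.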
results/Bgym-Claude-3.5-Sonnet/README.md
ADDED
@@ -0,0 +1 @@
+### Claude 3.5 Sonnet model
results/Bgym-Claude-3.5-Sonnet/workarena-l1.json
ADDED
@@ -0,0 +1,16 @@
+[
+    {
+        "agent_name": "Bgym-Claude-3.5-Sonnet",
+        "study_id": "study_id",
+        "benchmark": "WorkArena-L1",
+        "score": 56.4,
+        "std_err": 2.7,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original",
+        "date_time": "2021-01-01 12:00:00"
+    }
+]
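The per-agent JSON files in this commit all follow the schema shown above; a small sketch of reading one of them, using plain json/pathlib as a stand-in for the app's safe_path_join helper (whose body is not part of this diff):

    import json
    from pathlib import Path

    # Path mirrors the results/<agent>/<benchmark>.json layout used in this commit.
    path = Path("results") / "Bgym-Claude-3.5-Sonnet" / "workarena-l1.json"
    if path.is_file():
        with open(path) as f:
            entries = json.load(f)  # a list of result dicts
        for entry in entries:
            # Fields as shown in the diff above.
            print(entry["benchmark"], entry["score"], entry["std_err"],
                  entry["original_or_reproduced"])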
results/Bgym-Claude-3.5-Sonnet/workarena-l2.json
ADDED
@@ -0,0 +1,16 @@
+[
+    {
+        "agent_name": "Bgym-Claude-3.5-Sonnet",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WorkArena-L2",
+        "score": 39.1,
+        "std_err": 3.2,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]
results/{test-agent → Bgym-Claude-3.5-Sonnet}/workarena-l3.json
RENAMED
@@ -1,11 +1,11 @@
[
    {
-        "agent_name": "
+        "agent_name": "Bgym-Claude-3.5-Sonnet",
        "study_id": "study_id",
        "date_time": "2021-01-01 12:00:00",
        "benchmark": "WorkArena-L3",
-        "score": 0.
-        "std_err": 0.
+        "score": 0.4,
+        "std_err": 0.4,
        "benchmark_specific": "No",
        "benchmark_tuned": "No",
        "followed_evaluation_protocol": "Yes",
results/Bgym-GPT-3.5/workarena-l1.json
CHANGED
@@ -12,33 +12,5 @@
        "reproducible": "Yes",
        "comments": "NA",
        "original_or_reproduced": "Original"
-    },
-    {
-        "agent_name": "Bgym-GPT-3.5",
-        "study_id": "study_id",
-        "benchmark": "WorkArena-L1",
-        "score": 5.7,
-        "std_err": 0.3,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Reproduced",
-        "date_time": "2021-01-04 12:06:00"
-    },
-    {
-        "benchmark": "WorkArena-L1",
-        "agent_name": "Bgym-GPT-3.5",
-        "study_id": "study_id",
-        "score": 5.1,
-        "std_err": 0.3,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Reproduced",
-        "date_time": "2021-01-04 12:06:00"
    }
]
results/Bgym-GPT-4o-V/config.json
DELETED
@@ -1,4 +0,0 @@
-{
-    "agent_name": "GPT-4o-V",
-    "backend_llm": "GPT-4o-V"
-}
results/Bgym-GPT-4o-mini/README.md
ADDED
@@ -0,0 +1 @@
+## GPT-4o-mini model
results/{test-agent → Bgym-GPT-4o-mini}/miniwob.json
RENAMED
@@ -1,11 +1,11 @@
[
    {
-        "agent_name": "
+        "agent_name": "Bgym-GPT-4o-mini",
        "study_id": "study_id",
        "date_time": "2021-01-01 12:00:00",
        "benchmark": "MiniWoB",
-        "score":
-        "std_err":
+        "score": 58.8,
+        "std_err": 1.4,
        "benchmark_specific": "No",
        "benchmark_tuned": "No",
        "followed_evaluation_protocol": "Yes",
results/{test-agent → Bgym-GPT-4o-mini}/workarena-l1.json
RENAMED
@@ -1,11 +1,11 @@
[
    {
-        "agent_name": "
+        "agent_name": "Bgym-GPT-4o-mini",
        "study_id": "study_id",
        "date_time": "2021-01-01 12:00:00",
        "benchmark": "WorkArena-L1",
-        "score":
-        "std_err":
+        "score": 27,
+        "std_err": 2.4,
        "benchmark_specific": "No",
        "benchmark_tuned": "No",
        "followed_evaluation_protocol": "Yes",
results/{test-agent → Bgym-GPT-4o-mini}/workarena-l2.json
RENAMED
@@ -1,11 +1,11 @@
[
    {
-        "agent_name": "
+        "agent_name": "Bgym-GPT-4o-mini",
        "study_id": "study_id",
        "date_time": "2021-01-01 12:00:00",
        "benchmark": "WorkArena-L2",
-        "score":
-        "std_err": 0.
+        "score": 1.3,
+        "std_err": 0.7,
        "benchmark_specific": "No",
        "benchmark_tuned": "No",
        "followed_evaluation_protocol": "Yes",
results/Bgym-GPT-4o/config.json
DELETED
@@ -1,4 +0,0 @@
-{
-    "agent_name": "GPT-4o",
-    "backend_llm": "GPT-4o"
-}
results/Bgym-GPT-4o/miniwob.json
CHANGED
@@ -4,8 +4,8 @@
        "study_id": "study_id",
        "date_time": "2021-01-01 12:00:00",
        "benchmark": "MiniWoB",
-        "score":
-        "std_err":
+        "score": 65.6,
+        "std_err": 1.9,
        "benchmark_specific": "No",
        "benchmark_tuned": "No",
        "followed_evaluation_protocol": "Yes",
results/Bgym-GPT-4o/workarena-l1.json
CHANGED
@@ -4,8 +4,8 @@
        "study_id": "study_id",
        "date_time": "2021-01-01 12:00:00",
        "benchmark": "WorkArena-L1",
-        "score":
-        "std_err":
+        "score": 45.5,
+        "std_err": 2.7,
        "benchmark_specific": "No",
        "benchmark_tuned": "No",
        "followed_evaluation_protocol": "Yes",
results/Bgym-GPT-4o/workarena-l2.json
CHANGED
@@ -4,8 +4,8 @@
        "study_id": "study_id",
        "date_time": "2021-01-01 12:00:00",
        "benchmark": "WorkArena-L2",
-        "score":
-        "std_err":
+        "score": 8.5,
+        "std_err": 1.8,
        "benchmark_specific": "No",
        "benchmark_tuned": "No",
        "followed_evaluation_protocol": "Yes",
results/Bgym-GPT-o1-mini/README.md
ADDED
@@ -0,0 +1 @@
+## GPT-o1-mini model
results/{test-agent/webarena.json → Bgym-GPT-o1-mini/workarena-l1.json}
RENAMED
@@ -1,11 +1,11 @@
[
    {
-        "agent_name": "
+        "agent_name": "Bgym-GPT-o1-mini",
        "study_id": "study_id",
        "date_time": "2021-01-01 12:00:00",
-        "benchmark": "
-        "score":
-        "std_err":
+        "benchmark": "WorkArena-L1",
+        "score": 56.7,
+        "std_err": 2.7,
        "benchmark_specific": "No",
        "benchmark_tuned": "No",
        "followed_evaluation_protocol": "Yes",
results/Bgym-GPT-o1-mini/workarena-l2.json
ADDED
@@ -0,0 +1,16 @@
+[
+    {
+        "agent_name": "Bgym-GPT-o1-mini",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WorkArena-L2",
+        "score": 14.9,
+        "std_err": 2.3,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]
results/Bgym-Llama-3-70b/config.json
DELETED
@@ -1,4 +0,0 @@
-{
-    "agent_name": "Llama-3-70B",
-    "backend_llm": "Llama-3-70B"
-}
results/Bgym-Llama-3-70b/workarena-l1.json
CHANGED
@@ -12,47 +12,5 @@
        "comments": "NA",
        "original_or_reproduced": "Original",
        "date_time": "2021-01-01 12:00:00"
-    },
-    {
-        "agent_name": "Bgym-Llama-3-70b",
-        "study_id": "study_id",
-        "benchmark": "WorkArena-L1",
-        "score": 15.9,
-        "std_err": 0.6,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Reproduced",
-        "date_time": "2021-01-04 12:06:00"
-    },
-    {
-        "agent_name": "Bgym-Llama-3-70b",
-        "study_id": "study_id",
-        "benchmark": "WorkArena-L1",
-        "score": 19.9,
-        "std_err": 0.6,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Reproduced",
-        "date_time": "2021-01-05 2:07:00"
-    },
-    {
-        "agent_name": "Bgym-Llama-3-70b",
-        "study_id": "study_id",
-        "benchmark": "WorkArena-L1",
-        "score": 17.9,
-        "std_err": 0.6,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Reproduced",
-        "date_time": "2021-01-12 12:00:00"
    }
]
results/Bgym-Llama-3.1-70b/README.md
ADDED
@@ -0,0 +1 @@
+### Llama-3.1-70B
results/Bgym-Llama-3.1-70b/workarena-l1.json
ADDED
@@ -0,0 +1,16 @@
+[
+    {
+        "agent_name": "Bgym-Llama-3.1-70b",
+        "study_id": "study_id",
+        "benchmark": "WorkArena-L1",
+        "score": 27.9,
+        "std_err": 2.5,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original",
+        "date_time": "2021-01-01 12:00:00"
+    }
+]
results/Bgym-Llama-3.1-70b/workarena-l2.json
ADDED
@@ -0,0 +1,16 @@
+[
+    {
+        "agent_name": "Bgym-Llama-3.1-70b",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WorkArena-L2",
+        "score": 2.1,
+        "std_err": 0.9,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]
results/Bgym-Mixtral-8x22b/config.json
DELETED
@@ -1,4 +0,0 @@
-{
-    "agent_name": "Mixtral-8x22B",
-    "backend_llm": "Mixtral-8x22B"
-}
results/Bgym-Mixtral-8x22b/workarena-l1.json
CHANGED
@@ -12,33 +12,5 @@
        "comments": "NA",
        "original_or_reproduced": "Original",
        "date_time": "2021-01-04 12:06:00"
-    },
-    {
-        "agent_name": "Bgym-Mixtral-8x22b",
-        "study_id": "study_id",
-        "benchmark": "WorkArena-L1",
-        "score": 11.4,
-        "std_err": 0.7,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Reproduced",
-        "date_time": "2021-01-04 12:06:00"
-    },
-    {
-        "agent_name": "Bgym-Mixtral-8x22b",
-        "study_id": "study_id",
-        "benchmark": "WorkArena-L1",
-        "score": 13.4,
-        "std_err": 0.7,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Reproduced",
-        "date_time": "2021-01-04 12:06:00"
    }
]
results/test-agent/README.md
DELETED
@@ -1 +0,0 @@
-### Test agent