Security checks
Browse files- .gitignore +2 -0
- app.py +95 -48
.gitignore
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.pyc
|
app.py
CHANGED
|
@@ -10,23 +10,46 @@ from huggingface_hub import HfApi
|
|
| 10 |
from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
|
| 11 |
import streamlit.components.v1 as components
|
| 12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
# BENCHMARKS = ["WorkArena-L1", "WorkArena++-L2", "WorkArena++-L3", "MiniWoB", "WebArena"]
|
| 14 |
BENCHMARKS = ["WebArena", "WorkArena-L1", "WorkArena++-L2", "WorkArena++-L3", "MiniWoB",]
|
| 15 |
|
| 16 |
-
def
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
with col2:
|
| 21 |
-
sort_order = st.radio("Order", ["Ascending", "Descending"], horizontal=True)
|
| 22 |
-
|
| 23 |
-
# Sort dataframe
|
| 24 |
-
if sort_order == "Ascending":
|
| 25 |
-
df = df.sort_values(by=sort_column)
|
| 26 |
-
else:
|
| 27 |
-
df = df.sort_values(by=sort_column, ascending=False)
|
| 28 |
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
html = '''
|
| 31 |
<style>
|
| 32 |
table {
|
|
@@ -50,20 +73,22 @@ def create_html_table_main(df, benchmarks):
|
|
| 50 |
html += '<table>'
|
| 51 |
html += '<thead><tr>'
|
| 52 |
for column in df.columns:
|
| 53 |
-
html += f'<th>{column}</th>'
|
| 54 |
html += '</tr></thead>'
|
| 55 |
html += '<tbody>'
|
| 56 |
for _, row in df.iterrows():
|
| 57 |
html += '<tr>'
|
| 58 |
for col in df.columns:
|
| 59 |
-
|
|
|
|
|
|
|
|
|
|
| 60 |
html += '</tr>'
|
| 61 |
html += '</tbody></table>'
|
| 62 |
html += '</div>'
|
| 63 |
return html
|
| 64 |
|
| 65 |
-
def create_html_table_benchmark(df
|
| 66 |
-
# Create HTML table without JavaScript sorting
|
| 67 |
html = '''
|
| 68 |
<style>
|
| 69 |
table {
|
|
@@ -88,7 +113,7 @@ def create_html_table_benchmark(df, benchmarks):
|
|
| 88 |
html += '<thead><tr>'
|
| 89 |
for column in df.columns:
|
| 90 |
if column != "Reproduced_all":
|
| 91 |
-
html += f'<th>{column}</th>'
|
| 92 |
html += '</tr></thead>'
|
| 93 |
html += '<tbody>'
|
| 94 |
for _, row in df.iterrows():
|
|
@@ -96,41 +121,60 @@ def create_html_table_benchmark(df, benchmarks):
|
|
| 96 |
for column in df.columns:
|
| 97 |
if column == "Reproduced":
|
| 98 |
if row[column] == "-":
|
| 99 |
-
html += f'<td>{row[column]}</td>'
|
| 100 |
else:
|
| 101 |
-
|
|
|
|
|
|
|
| 102 |
elif column == "Reproduced_all":
|
| 103 |
continue
|
| 104 |
else:
|
| 105 |
-
html += f'<td>{row[column]}</td>'
|
| 106 |
html += '</tr>'
|
| 107 |
html += '</tbody></table>'
|
| 108 |
html += '</div>'
|
| 109 |
return html
|
| 110 |
|
| 111 |
def check_sanity(agent):
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
|
| 132 |
def main():
|
| 133 |
-
st.set_page_config(page_title="BrowserGym Leaderboard", layout="wide")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
|
| 135 |
all_agents = os.listdir("results")
|
| 136 |
all_results = {}
|
|
@@ -148,7 +192,7 @@ def main():
|
|
| 148 |
st.markdown("Leaderboard to evaluate LLMs, VLMs, and agents on web navigation tasks.")
|
| 149 |
# content = create_yall()
|
| 150 |
# tab1, tab2, tab3, tab4 = st.tabs(["π WebAgent Leaderboard", "WorkArena++-L2 Leaderboard", "WorkArena++-L3 Leaderboard", "π About"])
|
| 151 |
-
tabs = st.tabs(["π
|
| 152 |
|
| 153 |
with tabs[0]:
|
| 154 |
# Leaderboard tab
|
|
@@ -190,8 +234,13 @@ def main():
|
|
| 190 |
# Display the filtered DataFrame or the entire leaderboard
|
| 191 |
|
| 192 |
def make_hyperlink(agent_name):
|
| 193 |
-
|
| 194 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 195 |
df['Agent'] = df['Agent'].apply(make_hyperlink)
|
| 196 |
# st.dataframe(
|
| 197 |
# df[['Agent'] + BENCHMARKS],
|
|
@@ -201,10 +250,8 @@ def main():
|
|
| 201 |
# # height=int(len(df) * 36.2),
|
| 202 |
# )
|
| 203 |
# st.markdown(df.to_html(escape=False, index=False), unsafe_allow_html=True)
|
| 204 |
-
html_table = create_html_table_main(df
|
| 205 |
-
# print (html_table)
|
| 206 |
st.markdown(html_table, unsafe_allow_html=True)
|
| 207 |
-
# components.html(html_table, height=600, scrolling=True)
|
| 208 |
|
| 209 |
if st.button("Export to CSV", key="export_main"):
|
| 210 |
# Export the DataFrame to CSV
|
|
@@ -280,7 +327,7 @@ def main():
|
|
| 280 |
# column_config={benchmark: {'alignment': 'center'}},
|
| 281 |
# hide_index=True,
|
| 282 |
# )
|
| 283 |
-
html_table = create_html_table_benchmark(df_
|
| 284 |
st.markdown(html_table, unsafe_allow_html=True)
|
| 285 |
|
| 286 |
|
|
|
|
| 10 |
from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
|
| 11 |
import streamlit.components.v1 as components
|
| 12 |
|
| 13 |
+
from urllib.parse import quote
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
import re
|
| 16 |
+
import html
|
| 17 |
+
from typing import Dict, Any
|
| 18 |
+
|
| 19 |
# BENCHMARKS = ["WorkArena-L1", "WorkArena++-L2", "WorkArena++-L3", "MiniWoB", "WebArena"]
|
| 20 |
# Benchmark names in display order. Each name is used as a leaderboard tab
# label, and its lower-cased form is the per-agent results file stem
# ("<name.lower()>.json") looked up by check_sanity.
BENCHMARKS = ["WebArena", "WorkArena-L1", "WorkArena++-L2", "WorkArena++-L3", "MiniWoB",]
|
| 21 |
|
| 22 |
+
def sanitize_agent_name(agent_name):
    """Validate an agent name before it is used as a path component or in a URL.

    A valid name consists solely of ASCII letters, digits, hyphens,
    underscores and dots, and must not begin with a dot.

    Args:
        agent_name: Candidate agent name.

    Returns:
        ``agent_name`` unchanged when it is valid.

    Raises:
        ValueError: If the name is not a string, starts with a dot, is empty,
            or contains any character outside the allowed set.
    """
    # Callers only catch ValueError, so report a wrong type the same way
    # instead of letting ``.startswith`` fail with AttributeError.
    if not isinstance(agent_name, str):
        raise ValueError("Invalid agent name format")
    if agent_name.startswith('.'):
        raise ValueError("Agent name cannot start with a dot")
    # fullmatch rather than match + "$": "$" also matches just before a
    # trailing newline, which would let "name\n" through.
    if not re.fullmatch(r"[a-zA-Z0-9_-][a-zA-Z0-9_.-]*", agent_name):
        raise ValueError("Invalid agent name format")
    return agent_name
|
| 30 |
+
|
| 31 |
+
def safe_path_join(*parts):
    """Join ``parts`` onto the ``results`` directory, refusing to escape it.

    Args:
        *parts: Path components to append to the resolved ``results``
            directory (relative to the current working directory).

    Returns:
        The resolved ``pathlib.Path``, guaranteed to be the ``results``
        directory itself or a location beneath it.

    Raises:
        ValueError: ``"Invalid path"`` if the components cannot be joined or
            resolved; ``"Path traversal detected"`` if the resolved path
            lies outside the ``results`` directory.
    """
    base = Path("results").resolve()
    try:
        path = base.joinpath(*parts).resolve()
    except (TypeError, ValueError, OSError, RuntimeError) as err:
        raise ValueError("Invalid path") from err
    # Compare whole path components: a plain string-prefix test would accept
    # sibling directories such as "results_evil".
    if path != base and base not in path.parents:
        raise ValueError("Path traversal detected")
    return path
|
| 41 |
+
|
| 42 |
+
def sanitize_column_name(col: str) -> str:
    """Return the column label as text with HTML special characters escaped."""
    label = str(col)
    return html.escape(label)
|
| 45 |
+
|
| 46 |
+
def sanitize_cell_value(value: Any) -> str:
    """Render a table cell value as text that is safe to embed in HTML.

    ``int`` and ``float`` instances are returned as their plain ``str()``
    form; every other value is stringified and HTML-escaped.
    """
    text = str(value)
    is_number = isinstance(value, (int, float))
    return text if is_number else html.escape(text)
|
| 51 |
+
|
| 52 |
+
def create_html_table_main(df):
|
| 53 |
html = '''
|
| 54 |
<style>
|
| 55 |
table {
|
|
|
|
| 73 |
html += '<table>'
|
| 74 |
html += '<thead><tr>'
|
| 75 |
for column in df.columns:
|
| 76 |
+
html += f'<th>{sanitize_column_name(column)}</th>'
|
| 77 |
html += '</tr></thead>'
|
| 78 |
html += '<tbody>'
|
| 79 |
for _, row in df.iterrows():
|
| 80 |
html += '<tr>'
|
| 81 |
for col in df.columns:
|
| 82 |
+
if col == "Agent":
|
| 83 |
+
html += f'<td>{row[col]}</td>'
|
| 84 |
+
else:
|
| 85 |
+
html += f'<td>{sanitize_cell_value(row[col])}</td>'
|
| 86 |
html += '</tr>'
|
| 87 |
html += '</tbody></table>'
|
| 88 |
html += '</div>'
|
| 89 |
return html
|
| 90 |
|
| 91 |
+
def create_html_table_benchmark(df):
|
|
|
|
| 92 |
html = '''
|
| 93 |
<style>
|
| 94 |
table {
|
|
|
|
| 113 |
html += '<thead><tr>'
|
| 114 |
for column in df.columns:
|
| 115 |
if column != "Reproduced_all":
|
| 116 |
+
html += f'<th>{sanitize_column_name(column)}</th>'
|
| 117 |
html += '</tr></thead>'
|
| 118 |
html += '<tbody>'
|
| 119 |
for _, row in df.iterrows():
|
|
|
|
| 121 |
for column in df.columns:
|
| 122 |
if column == "Reproduced":
|
| 123 |
if row[column] == "-":
|
| 124 |
+
html += f'<td>{sanitize_cell_value(row[column])}</td>'
|
| 125 |
else:
|
| 126 |
+
summary = sanitize_cell_value(row[column])
|
| 127 |
+
details = "<br>".join(map(sanitize_cell_value, row["Reproduced_all"]))
|
| 128 |
+
html += f'<td><details><summary>{summary}</summary>{details}</details></td>'
|
| 129 |
elif column == "Reproduced_all":
|
| 130 |
continue
|
| 131 |
else:
|
| 132 |
+
html += f'<td>{sanitize_cell_value(row[column])}</td>'
|
| 133 |
html += '</tr>'
|
| 134 |
html += '</tbody></table>'
|
| 135 |
html += '</div>'
|
| 136 |
return html
|
| 137 |
|
| 138 |
def check_sanity(agent):
    """Validate every benchmark results file present for ``agent``.

    For each name in ``BENCHMARKS`` the file
    ``results/<agent>/<benchmark.lower()>.json`` is checked if it exists;
    missing files are skipped. A file passes when it is a JSON list of
    objects that each carry all required keys, name this agent and this
    benchmark, and exactly one of them is marked ``"Original"``.

    Args:
        agent: Agent directory name under ``results``.

    Returns:
        ``True`` if every existing results file passes, ``False`` otherwise
        (including an invalid agent name, an unsafe path, unreadable or
        malformed JSON).
    """
    required_keys = (
        "agent_name", "benchmark", "original_or_reproduced", "score",
        "std_err", "benchmark_specific", "benchmark_tuned",
        "followed_evaluation_protocol", "reproducible", "comments",
        "study_id", "date_time",
    )
    try:
        safe_agent = sanitize_agent_name(agent)
        for benchmark in BENCHMARKS:
            file_path = safe_path_join(safe_agent, f"{benchmark.lower()}.json")
            if not file_path.is_file():
                continue
            # JSON is UTF-8 by specification; do not rely on the locale default.
            with open(file_path, encoding="utf-8") as f:
                results = json.load(f)
            # Reject unexpected shapes up front instead of failing with a
            # TypeError on the key lookups below.
            if not isinstance(results, list):
                return False
            original_count = 0
            for result in results:
                if not isinstance(result, dict):
                    return False
                if any(key not in result for key in required_keys):
                    return False
                if result["agent_name"] != agent:
                    return False
                if result["benchmark"] != benchmark:
                    return False
                if result["original_or_reproduced"] == "Original":
                    original_count += 1
            # NOTE(review): applied per benchmark file, since the counter is
            # reset for each file -- confirm against the intended nesting.
            if original_count != 1:
                return False
        return True
    except (ValueError, OSError):
        # ValueError covers bad agent names, unsafe paths, invalid JSON and
        # undecodable bytes; OSError covers files that cannot be read.
        return False
|
| 162 |
|
| 163 |
def main():
|
| 164 |
+
st.set_page_config(page_title="BrowserGym Leaderboard", layout="wide", initial_sidebar_state="expanded")
|
| 165 |
+
st.markdown("""
|
| 166 |
+
<head>
|
| 167 |
+
<meta http-equiv="Content-Security-Policy"
|
| 168 |
+
content="default-src 'self' https://huggingface.co;
|
| 169 |
+
script-src 'self' 'unsafe-inline';
|
| 170 |
+
style-src 'self' 'unsafe-inline';
|
| 171 |
+
img-src 'self' data: https:;
|
| 172 |
+
frame-ancestors 'none';">
|
| 173 |
+
<meta http-equiv="X-Frame-Options" content="DENY">
|
| 174 |
+
<meta http-equiv="X-Content-Type-Options" content="nosniff">
|
| 175 |
+
<meta http-equiv="Referrer-Policy" content="strict-origin-when-cross-origin">
|
| 176 |
+
</head>
|
| 177 |
+
""", unsafe_allow_html=True)
|
| 178 |
|
| 179 |
all_agents = os.listdir("results")
|
| 180 |
all_results = {}
|
|
|
|
| 192 |
st.markdown("Leaderboard to evaluate LLMs, VLMs, and agents on web navigation tasks.")
|
| 193 |
# content = create_yall()
|
| 194 |
# tab1, tab2, tab3, tab4 = st.tabs(["π WebAgent Leaderboard", "WorkArena++-L2 Leaderboard", "WorkArena++-L3 Leaderboard", "π About"])
|
| 195 |
+
tabs = st.tabs(["π Main Leaderboard",] + BENCHMARKS + ["π About"])
|
| 196 |
|
| 197 |
with tabs[0]:
|
| 198 |
# Leaderboard tab
|
|
|
|
| 234 |
# Display the filtered DataFrame or the entire leaderboard
|
| 235 |
|
| 236 |
def make_hyperlink(agent_name):
    """Build an HTML link to the agent's README in the leaderboard space.

    Args:
        agent_name: Agent name as stored in the ``Agent`` column.

    Returns:
        An ``<a>`` element opening the README in a new tab, or an empty
        string when ``sanitize_agent_name`` rejects the name.
    """
    try:
        safe_name = sanitize_agent_name(agent_name)
    except ValueError:
        return ""
    safe_url = f"https://huggingface.co/spaces/ServiceNow/browsergym-leaderboard/blob/main/results/{quote(safe_name)}/README.md"
    # rel="noopener noreferrer" keeps the newly opened tab from reaching
    # back into this page through window.opener.
    return f'<a href="{html.escape(safe_url)}" target="_blank" rel="noopener noreferrer">{html.escape(safe_name)}</a>'
|
| 243 |
+
|
| 244 |
df['Agent'] = df['Agent'].apply(make_hyperlink)
|
| 245 |
# st.dataframe(
|
| 246 |
# df[['Agent'] + BENCHMARKS],
|
|
|
|
| 250 |
# # height=int(len(df) * 36.2),
|
| 251 |
# )
|
| 252 |
# st.markdown(df.to_html(escape=False, index=False), unsafe_allow_html=True)
|
| 253 |
+
html_table = create_html_table_main(df)
|
|
|
|
| 254 |
st.markdown(html_table, unsafe_allow_html=True)
|
|
|
|
| 255 |
|
| 256 |
if st.button("Export to CSV", key="export_main"):
|
| 257 |
# Export the DataFrame to CSV
|
|
|
|
| 327 |
# column_config={benchmark: {'alignment': 'center'}},
|
| 328 |
# hide_index=True,
|
| 329 |
# )
|
| 330 |
+
html_table = create_html_table_benchmark(df_)
|
| 331 |
st.markdown(html_table, unsafe_allow_html=True)
|
| 332 |
|
| 333 |
|