Spaces:

ServiceNow
/

browsergym-leaderboard

Running

App Files Files Community

meghsn committed on Nov 21, 2024

Commit

b667dc2

1 Parent(s): 2705446

Security checks

Browse files

Files changed (2) hide show

.gitignore +2 -0
app.py +95 -48

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ __pycache__/
2	+ *.pyc

app.py CHANGED Viewed

@@ -10,23 +10,46 @@ from huggingface_hub import HfApi
 from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
 import streamlit.components.v1 as components
 # BENCHMARKS = ["WorkArena-L1", "WorkArena++-L2", "WorkArena++-L3", "MiniWoB", "WebArena"]
 BENCHMARKS = ["WebArena", "WorkArena-L1", "WorkArena++-L2", "WorkArena++-L3", "MiniWoB",]
-def create_html_table_main(df, benchmarks):
-    col1, col2 = st.columns([2,6])
-    with col1:
-        sort_column = st.selectbox("Sort by", df.columns.tolist())
-    with col2:
-        sort_order = st.radio("Order", ["Ascending", "Descending"], horizontal=True)
-    # Sort dataframe
-    if sort_order == "Ascending":
-        df = df.sort_values(by=sort_column)
-    else:
-        df = df.sort_values(by=sort_column, ascending=False)
-    # Create HTML table without JavaScript sorting
     html = '''
     <style>
         table {
@@ -50,20 +73,22 @@ def create_html_table_main(df, benchmarks):
     html += '<table>'
     html += '<thead><tr>'
     for column in df.columns:
-        html += f'<th>{column}</th>'
     html += '</tr></thead>'
     html += '<tbody>'
     for _, row in df.iterrows():
         html += '<tr>'
         for col in df.columns:
-            html += f'<td>{row[col]}</td>'
         html += '</tr>'
     html += '</tbody></table>'
     html += '</div>'
     return html
-def create_html_table_benchmark(df, benchmarks):
-    # Create HTML table without JavaScript sorting
     html = '''
     <style>
         table {
@@ -88,7 +113,7 @@ def create_html_table_benchmark(df, benchmarks):
     html += '<thead><tr>'
     for column in df.columns:
         if column != "Reproduced_all":
-            html += f'<th>{column}</th>'
     html += '</tr></thead>'
     html += '<tbody>'
     for _, row in df.iterrows():
@@ -96,41 +121,60 @@ def create_html_table_benchmark(df, benchmarks):
         for column in df.columns:
             if column == "Reproduced":
                 if row[column] == "-":
-                    html += f'<td>{row[column]}</td>'
                 else:
-                    html += f'<td><details><summary>{row[column]}</summary>{"<br>".join(map(str, row["Reproduced_all"]))}</details></td>'
             elif column == "Reproduced_all":
                 continue
             else:
-                html += f'<td>{row[column]}</td>'
         html += '</tr>'
     html += '</tbody></table>'
     html += '</div>'
     return html
 def check_sanity(agent):
-    for benchmark in BENCHMARKS:
-        file_path = f"results/{agent}/{benchmark.lower()}.json"
-        if not os.path.exists(file_path):
-            continue
-        original_count = 0
-        with open(file_path) as f:
-            results = json.load(f)
-            for result in results:
-                if not all(key in result for key in ["agent_name", "benchmark", "original_or_reproduced", "score", "std_err", "benchmark_specific", "benchmark_tuned", "followed_evaluation_protocol", "reproducible", "comments", "study_id", "date_time"]):
-                    return False
-                if result["agent_name"] != agent:
-                    return False
-                if result["benchmark"] != benchmark:
-                    return False
-                if result["original_or_reproduced"] == "Original":
-                    original_count += 1
-        if original_count != 1:
-            return False
-    return True
 def main():
-    st.set_page_config(page_title="BrowserGym Leaderboard", layout="wide")
     all_agents = os.listdir("results")
     all_results = {}
@@ -148,7 +192,7 @@ def main():
     st.markdown("Leaderboard to evaluate LLMs, VLMs, and agents on web navigation tasks.")
     # content = create_yall()
     # tab1, tab2, tab3, tab4 = st.tabs(["🏆 WebAgent Leaderboard", "WorkArena++-L2 Leaderboard", "WorkArena++-L3 Leaderboard", "📝 About"])
-    tabs = st.tabs(["🏆 WebAgent Leaderboard",] +  BENCHMARKS + ["📝 About"])
     with tabs[0]:
         # Leaderboard tab
@@ -190,8 +234,13 @@ def main():
         # Display the filtered DataFrame or the entire leaderboard
         def make_hyperlink(agent_name):
-            url = f"https://huggingface.co/spaces/meghsn/WebAgent-Leaderboard/blob/main/results/{agent_name}/README.md"
-            return f'<a href="{url}" target="_blank">{agent_name}</a>'
         df['Agent'] = df['Agent'].apply(make_hyperlink)
         # st.dataframe(
         #     df[['Agent'] + BENCHMARKS],
@@ -201,10 +250,8 @@ def main():
         #     # height=int(len(df) * 36.2),
         # )
         # st.markdown(df.to_html(escape=False, index=False), unsafe_allow_html=True)
-        html_table = create_html_table_main(df, BENCHMARKS)
-        # print (html_table)
         st.markdown(html_table, unsafe_allow_html=True)
-        # components.html(html_table, height=600, scrolling=True)
         if st.button("Export to CSV", key="export_main"):
             # Export the DataFrame to CSV
@@ -280,7 +327,7 @@ def main():
             #     column_config={benchmark: {'alignment': 'center'}},
             #     hide_index=True,
             # )
-            html_table = create_html_table_benchmark(df_, BENCHMARKS)
             st.markdown(html_table, unsafe_allow_html=True)

 from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
 import streamlit.components.v1 as components
+from urllib.parse import quote
+from pathlib import Path
+import re
+import html
+from typing import Dict, Any
 # BENCHMARKS = ["WorkArena-L1", "WorkArena++-L2", "WorkArena++-L3", "MiniWoB", "WebArena"]
 BENCHMARKS = ["WebArena", "WorkArena-L1", "WorkArena++-L2", "WorkArena++-L3", "MiniWoB",]
def sanitize_agent_name(agent_name):
    """Validate an agent name and return it unchanged.

    A valid name consists only of ASCII letters, digits, hyphens,
    underscores and dots, and must not start with a dot.

    Args:
        agent_name: Candidate agent (directory) name.

    Returns:
        The same ``agent_name`` when it is valid.

    Raises:
        ValueError: If the name is not a string, starts with a dot, or
            contains any character outside the allowed set.
    """
    # Non-string values (e.g. NaN cells) are rejected with ValueError so that
    # callers catching ValueError handle them like any other invalid name.
    if not isinstance(agent_name, str):
        raise ValueError("Invalid agent name format")
    if agent_name.startswith('.'):
        raise ValueError("Agent name cannot start with a dot")
    # fullmatch is required: with re.match, a trailing "$" also matches just
    # before a final newline, so "name\n" would be accepted.
    if not re.fullmatch(r"[a-zA-Z0-9_-][a-zA-Z0-9_.-]*", agent_name):
        raise ValueError("Invalid agent name format")
    return agent_name
def safe_path_join(*parts):
    """Join ``parts`` under the ``results`` directory, refusing to escape it.

    Args:
        *parts: Path components appended to the resolved ``results`` directory.

    Returns:
        The resolved ``pathlib.Path`` located at or below ``results``.

    Raises:
        ValueError: "Invalid path" if the components cannot be joined or
            resolved; "Path traversal detected" if the resolved path lies
            outside the ``results`` directory.
    """
    base = Path("results").resolve()
    try:
        path = base.joinpath(*parts).resolve()
    except (OSError, RuntimeError, TypeError, ValueError) as err:
        raise ValueError("Invalid path") from err
    # Compare path components, not string prefixes: a plain startswith() check
    # would accept sibling directories such as "results_evil".
    try:
        path.relative_to(base)
    except ValueError:
        raise ValueError("Path traversal detected") from None
    return path
def sanitize_column_name(col: str) -> str:
    """Return the column label converted to text and HTML-escaped."""
    label = str(col)
    return html.escape(label)
def sanitize_cell_value(value: Any) -> str:
    """Render a table cell as text that is safe to embed in HTML.

    ``int`` and ``float`` values are returned as their plain string form;
    every other value is converted to a string and HTML-escaped.
    """
    text = str(value)
    if not isinstance(value, (int, float)):
        text = html.escape(text)
    return text
+def create_html_table_main(df):
     html = '''
     <style>
         table {
     html += '<table>'
     html += '<thead><tr>'
     for column in df.columns:
+        html += f'<th>{sanitize_column_name(column)}</th>'
     html += '</tr></thead>'
     html += '<tbody>'
     for _, row in df.iterrows():
         html += '<tr>'
         for col in df.columns:
+            if col == "Agent":
+                html += f'<td>{row[col]}</td>'
+            else:
+                html += f'<td>{sanitize_cell_value(row[col])}</td>'
         html += '</tr>'
     html += '</tbody></table>'
     html += '</div>'
     return html
+def create_html_table_benchmark(df):
     html = '''
     <style>
         table {
     html += '<thead><tr>'
     for column in df.columns:
         if column != "Reproduced_all":
+            html += f'<th>{sanitize_column_name(column)}</th>'
     html += '</tr></thead>'
     html += '<tbody>'
     for _, row in df.iterrows():
         for column in df.columns:
             if column == "Reproduced":
                 if row[column] == "-":
+                    html += f'<td>{sanitize_cell_value(row[column])}</td>'
                 else:
+                    summary = sanitize_cell_value(row[column])
+                    details = "<br>".join(map(sanitize_cell_value, row["Reproduced_all"]))
+                    html += f'<td><details><summary>{summary}</summary>{details}</details></td>'
             elif column == "Reproduced_all":
                 continue
             else:
+                html += f'<td>{sanitize_cell_value(row[column])}</td>'
         html += '</tr>'
     html += '</tbody></table>'
     html += '</div>'
     return html
 def check_sanity(agent):
     """Check that an agent's result files are well-formed.

     For every benchmark in ``BENCHMARKS`` whose JSON file exists under the
     agent's results directory, each entry must carry all required keys, name
     this agent and this benchmark, and exactly one entry must be marked
     "Original". Benchmarks without a result file are skipped.

     Args:
         agent: Agent name, which is also the results sub-directory name.

     Returns:
         True if every existing result file passes the checks; False on the
         first failed check, on an invalid agent name or path, or when a file
         is not valid UTF-8 JSON holding a list of objects.
     """
     required_keys = (
         "agent_name", "benchmark", "original_or_reproduced", "score",
         "std_err", "benchmark_specific", "benchmark_tuned",
         "followed_evaluation_protocol", "reproducible", "comments",
         "study_id", "date_time",
     )
     try:
         safe_agent = sanitize_agent_name(agent)
         for benchmark in BENCHMARKS:
             file_path = safe_path_join(safe_agent, f"{benchmark.lower()}.json")
             if not file_path.is_file():
                 continue
             # Explicit encoding keeps parsing independent of the host locale.
             with open(file_path, encoding="utf-8") as f:
                 results = json.load(f)
             # Anything other than a list of objects is malformed; reject it
             # here rather than failing with TypeError on the key lookups.
             if not isinstance(results, list):
                 return False
             original_count = 0
             for result in results:
                 if not isinstance(result, dict):
                     return False
                 if not all(key in result for key in required_keys):
                     return False
                 if result["agent_name"] != agent:
                     return False
                 if result["benchmark"] != benchmark:
                     return False
                 if result["original_or_reproduced"] == "Original":
                     original_count += 1
             if original_count != 1:
                 return False
         return True
     except ValueError:
         # Covers invalid names/paths as well as JSON and Unicode decode
         # errors, both of which are ValueError subclasses.
         return False
 def main():
+    st.set_page_config(page_title="BrowserGym Leaderboard", layout="wide", initial_sidebar_state="expanded")
+    st.markdown("""
+        <head>
+            <meta http-equiv="Content-Security-Policy"
+                content="default-src 'self' https://huggingface.co;
+                        script-src 'self' 'unsafe-inline';
+                        style-src 'self' 'unsafe-inline';
+                        img-src 'self' data: https:;
+                        frame-ancestors 'none';">
+            <meta http-equiv="X-Frame-Options" content="DENY">
+            <meta http-equiv="X-Content-Type-Options" content="nosniff">
+            <meta http-equiv="Referrer-Policy" content="strict-origin-when-cross-origin">
+        </head>
+    """, unsafe_allow_html=True)
     all_agents = os.listdir("results")
     all_results = {}
     st.markdown("Leaderboard to evaluate LLMs, VLMs, and agents on web navigation tasks.")
     # content = create_yall()
     # tab1, tab2, tab3, tab4 = st.tabs(["🏆 WebAgent Leaderboard", "WorkArena++-L2 Leaderboard", "WorkArena++-L3 Leaderboard", "📝 About"])
+    tabs = st.tabs(["🏆 Main Leaderboard",] +  BENCHMARKS + ["📝 About"])
     with tabs[0]:
         # Leaderboard tab
         # Display the filtered DataFrame or the entire leaderboard
         def make_hyperlink(agent_name):
+            try:
+                safe_name = sanitize_agent_name(agent_name)
+                safe_url = f"https://huggingface.co/spaces/ServiceNow/browsergym-leaderboard/blob/main/results/{quote(safe_name)}/README.md"
+                return f'<a href="{html.escape(safe_url)}" target="_blank">{html.escape(safe_name)}</a>'
+            except ValueError:
+                return ""
         df['Agent'] = df['Agent'].apply(make_hyperlink)
         # st.dataframe(
         #     df[['Agent'] + BENCHMARKS],
         #     # height=int(len(df) * 36.2),
         # )
         # st.markdown(df.to_html(escape=False, index=False), unsafe_allow_html=True)
+        html_table = create_html_table_main(df)
         st.markdown(html_table, unsafe_allow_html=True)
         if st.button("Export to CSV", key="export_main"):
             # Export the DataFrame to CSV
             #     column_config={benchmark: {'alignment': 'center'}},
             #     hide_index=True,
             # )
+            html_table = create_html_table_benchmark(df_)
             st.markdown(html_table, unsafe_allow_html=True)