Amber Tanaka committed
Commit 1971175 · unverified · 1 Parent(s): ee1b999

Current Updated! State (#4)

app.py CHANGED
@@ -4,9 +4,9 @@ import os
4
 
5
  from apscheduler.schedulers.background import BackgroundScheduler
6
  from huggingface_hub import HfApi
7
- import literature_understanding, main_page, c_and_e, data_analysis, e2e
8
 
9
- from content import TITLE, css
10
 
11
  # --- Constants and Configuration ---
12
  LOCAL_DEBUG = not (os.environ.get("system") == "spaces")
@@ -15,7 +15,7 @@ OWNER = "allenai"
15
  PROJECT_NAME = "asta-bench" + ("-internal" if IS_INTERNAL else "")
16
  LEADERBOARD_PATH = f"{OWNER}/{PROJECT_NAME}-leaderboard"
17
  api = HfApi()
18
- LOGO_PATH = "Ai2_logo_pink_padding_RGB.png"
19
 
20
 
21
 
@@ -50,13 +50,22 @@ theme = gr.themes.Base(
50
  button_primary_background_fill_dark='*primary_900',
51
  button_primary_background_fill_hover='*secondary_600',
52
  button_primary_background_fill_hover_dark='*primary_600',
 
 
 
 
 
53
  button_primary_text_color='*neutral_900',
54
- button_primary_text_color_dark='*neutral_900'
 
 
 
 
 
 
55
  )
56
- # --- Gradio App Definition ---
57
- demo = gr.Blocks(theme=theme, css=css)
58
- with demo:
59
- gr.Image(
60
  value=LOGO_PATH,
61
  show_label=False,
62
  interactive=False,
@@ -65,17 +74,26 @@ with demo:
65
  show_fullscreen_button=False,
66
  elem_id="logo-image"
67
  )
68
- gr.HTML(TITLE)
69
-
 
 
70
  main_page.demo.render()
71
  with demo.route("Literature Understanding"):
 
72
  literature_understanding.demo.render()
73
  with demo.route("Code & Execution"):
 
74
  c_and_e.demo.render()
75
  with demo.route("Data Analysis"):
 
76
  data_analysis.demo.render()
77
  with demo.route("Discovery"):
 
78
  e2e.demo.render()
 
 
 
79
 
80
  # --- Scheduler and Launch
81
  def restart_space_job():
 
4
 
5
  from apscheduler.schedulers.background import BackgroundScheduler
6
  from huggingface_hub import HfApi
7
+ import literature_understanding, main_page, c_and_e, data_analysis, e2e, submission
8
 
9
+ from content import css
10
 
11
  # --- Constants and Configuration ---
12
  LOCAL_DEBUG = not (os.environ.get("system") == "spaces")
 
15
  PROJECT_NAME = "asta-bench" + ("-internal" if IS_INTERNAL else "")
16
  LEADERBOARD_PATH = f"{OWNER}/{PROJECT_NAME}-leaderboard"
17
  api = HfApi()
18
+ LOGO_PATH = "assets/logo.svg"
19
 
20
 
21
 
 
50
  button_primary_background_fill_dark='*primary_900',
51
  button_primary_background_fill_hover='*secondary_600',
52
  button_primary_background_fill_hover_dark='*primary_600',
53
+ button_secondary_background_fill="#9FEAD1",
54
+ button_secondary_background_fill_dark="#9FEAD1",
55
+ button_secondary_text_color="*neutral_900",
56
+ button_secondary_text_color_dark="*neutral_900",
57
+ block_title_text_color="*neutral_900",
58
  button_primary_text_color='*neutral_900',
59
+ block_title_text_color_dark="#ffffff",
60
+ checkbox_label_text_color_dark="#000",
61
+ button_primary_text_color_dark='*neutral_900',
62
+ block_border_color="#032629",
63
+ block_border_color_dark="#9fead1",
64
+ block_background_fill_dark="#032629",
65
+ block_background_fill="#FAF2E9",
66
  )
67
+ def render_logo():
68
+ return gr.Image(
 
 
69
  value=LOGO_PATH,
70
  show_label=False,
71
  interactive=False,
 
74
  show_fullscreen_button=False,
75
  elem_id="logo-image"
76
  )
77
+ # --- Gradio App Definition ---
78
+ demo = gr.Blocks(theme=theme, css=css)
79
+ with demo:
80
+ render_logo()
81
  main_page.demo.render()
82
  with demo.route("Literature Understanding"):
83
+ render_logo()
84
  literature_understanding.demo.render()
85
  with demo.route("Code & Execution"):
86
+ render_logo()
87
  c_and_e.demo.render()
88
  with demo.route("Data Analysis"):
89
+ render_logo()
90
  data_analysis.demo.render()
91
  with demo.route("Discovery"):
92
+ render_logo()
93
  e2e.demo.render()
94
+ with demo.route(" 🚀 Submit an Agent"):
95
+ render_logo()
96
+ submission.demo.render()
97
 
98
  # --- Scheduler and Launch
99
  def restart_space_job():
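For reference, a minimal standalone sketch of the multi-page pattern app.py now uses — Gradio's Blocks routing via demo.route plus a shared render_logo() helper rendered on every route. The stand-in Markdown pages and the logo path are placeholders, not the project's actual modules:

import gradio as gr

LOGO_PATH = "assets/logo.svg"  # placeholder path mirroring the commit's assets

def render_logo():
    # Shared header logo, rendered once per route so every page shows it
    return gr.Image(
        value=LOGO_PATH,
        show_label=False,
        interactive=False,
        show_fullscreen_button=False,
        elem_id="logo-image",
    )

with gr.Blocks() as demo:
    render_logo()
    gr.Markdown("Main leaderboard page")  # stands in for main_page.demo.render()
    with demo.route("Literature Understanding"):
        render_logo()
        gr.Markdown("Category page")  # stands in for literature_understanding.demo.render()

if __name__ == "__main__":
    demo.launch()
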
assets/just-icon.svg ADDED
assets/logo.svg ADDED
c_and_e.py CHANGED
@@ -2,13 +2,19 @@ import gradio as gr
2
  import pandas as pd
3
 
4
  # Import our UI factories and the data loader
5
- from ui_components import create_leaderboard_display, create_benchmark_details_display, get_full_leaderboard_data
6
-
7
  # Define the category for this page
8
  CATEGORY_NAME = "Code Execution"
9
 
10
  with gr.Blocks() as demo:
11
- gr.Markdown(f"## {CATEGORY_NAME} Leaderboard Results")
 
 
 
 
 
 
12
 
13
  # --- This page now has two main sections: Validation and Test ---
14
  with gr.Tabs():
 
2
  import pandas as pd
3
 
4
  # Import our UI factories and the data loader
5
+ from ui_components import create_leaderboard_display, create_benchmark_details_display, get_full_leaderboard_data,create_sub_navigation_bar
6
+ from content import PLACEHOLDER_DESCRIPTION
7
  # Define the category for this page
8
  CATEGORY_NAME = "Code Execution"
9
 
10
  with gr.Blocks() as demo:
11
+ gr.Markdown(f"## {CATEGORY_NAME} Aggregated")
12
+ validation_df, validation_tag_map = get_full_leaderboard_data("validation")
13
+ test_df, test_tag_map = get_full_leaderboard_data("test")
14
+ gr.Markdown(PLACEHOLDER_DESCRIPTION, elem_id="category-intro")
15
+ if validation_tag_map:
16
+ create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
17
+
18
 
19
  # --- This page now has two main sections: Validation and Test ---
20
  with gr.Tabs():
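The category pages import create_sub_navigation_bar from ui_components, whose implementation this commit does not show. A hypothetical sketch of such a helper, assuming it turns the tag-to-task map into in-page anchor links styled by the .sub-nav-bar CSS added in content.py; the markup and signature are guesses based only on how the pages call it:

import gradio as gr

def create_sub_navigation_bar(tag_map: dict[str, list[str]], category_name: str) -> gr.HTML:
    # Hypothetical: render in-page anchor links for each benchmark under the given
    # category, styled by the `.sub-nav-bar` rules added to the css string in content.py.
    tasks = tag_map.get(category_name, [])
    links = " ".join(
        f'<a href="#{task.lower().replace(" ", "-")}">{task}</a>' for task in tasks
    )
    return gr.HTML(f'<nav class="sub-nav-bar">{links}</nav>')
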
content.py CHANGED
@@ -1,8 +1,5 @@
1
- TITLE = """<h1 align="center" id="space-title">AstaBench Leaderboard</h1>"""
2
 
3
- INTRODUCTION_TEXT = """
4
- ## Introduction
5
- """
6
  INTRO_PARAGRAPH = """
7
  AI agents are on the rise, promising everything from travel planning to scientific discovery. But evaluating them—especially for real-world research tasks—remains a messy, inconsistent process. Metrics vary, cost is often ignored, and scientific use cases are rarely the focus. <br>
8
  <br>
@@ -11,9 +8,14 @@ Enter AstaBench, a grand challenge benchmark developed by Ai2 to test how well a
11
  SCATTER_DISCLAIMER = """
12
  Only agents that have cost data available will be shown in the scatter plot. If you don't see your agent, please ensure that you have provided cost data in your submission.
13
  """
14
-
15
- SUBMISSION_TEXT = """
16
- ## Submissions
 
 
 
 
 
17
  """
18
 
19
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
@@ -66,17 +68,18 @@ def hf_uri_to_web_url(uri: str) -> str:
66
  return f"https://huggingface.co/datasets/{namespace}/{repo}/tree/main/{path}"
67
 
68
  css = """
69
- .submission-accordion {
70
- border-style: solid;
71
- border-width: 3px !important;
72
- border-color: #ec4899;
73
  }
74
- .submission-accordion span.svelte-1w6vloh {
75
- font-weight: bold !important;
76
- font-size: 1.2em !important;
77
  }
78
  #logo-image {
79
- margin: auto;
 
 
80
  max-width: 250px;
81
  height: auto;
82
  }
@@ -84,7 +87,6 @@ css = """
84
  height: auto !important;
85
  max-height: none !important;
86
  }
87
-
88
  .table-wrap {
89
  max-height: none !important;
90
  height: auto !important;
@@ -96,24 +98,55 @@ table.gr-table th, table.gr-table td {
96
  width: 1%;
97
  white-space: nowrap;
98
  }
99
-
 
 
100
  table.gr-table {
101
  font-size: 14px !important;
102
  }
103
-
104
- /* Example of making the "Agent" column (the 1st column) a bit wider if needed */
105
- table.gr-table th:nth-child(1),
106
- table.gr-table td:nth-child(1) {
107
- min-width: 150px !important;
108
- white-space: normal !important; /* Allow agent names to wrap if long */
109
- }
110
  .html-container {
111
  padding-top: 0 !important;
112
  }
113
  #scatter-disclaimer {
114
  color: #f0529c !important;
115
  }
 
 
 
116
  thead.svelte-1e98i6s th {
117
  background: white !important;
118
  }
119
  """
 
1
+ TITLE = """<h1 align="left" id="space-title">AstaBench Leaderboard</h1>"""
2
 
 
 
 
3
  INTRO_PARAGRAPH = """
4
  AI agents are on the rise, promising everything from travel planning to scientific discovery. But evaluating them—especially for real-world research tasks—remains a messy, inconsistent process. Metrics vary, cost is often ignored, and scientific use cases are rarely the focus. <br>
5
  <br>
 
8
  SCATTER_DISCLAIMER = """
9
  Only agents that have cost data available will be shown in the scatter plot. If you don't see your agent, please ensure that you have provided cost data in your submission.
10
  """
11
+ PARETO_DISCLAIMER = """
12
+ Agent names that are green are Pareto optimal, meaning they achieve the best performance for their cost.
13
+ """
14
+ LIT_DESCRIPTION = """
15
+ Several of the evaluations in AstaBench probe an AI model's literature understanding skills — that is, its ability to find research papers based on a description, review questions on citation quality, retrieve information from the literature, and so on.
16
+ """
17
+ PLACEHOLDER_DESCRIPTION = """
18
+ THIS IS PLACEHOLDER TEXT. AstaBench is a benchmark suite designed to evaluate AI agents on their ability to perform complex tasks that require reasoning, planning, and execution. It includes a variety of benchmarks that test different aspects of agent performance, such as literature understanding, data analysis, and code execution.
19
  """
20
 
21
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 
68
  return f"https://huggingface.co/datasets/{namespace}/{repo}/tree/main/{path}"
69
 
70
  css = """
71
+ #intro-paragraph {
72
+ font-size: 18px;
73
+ max-width: 60%;
 
74
  }
75
+ #category-intro {
76
+ font-size: 18px;
77
+ max-width: 60%;
78
  }
79
  #logo-image {
80
+ margin: 0;
81
+ margin-bottom: 30px;
82
+ justify-content: flex-start;
83
  max-width: 250px;
84
  height: auto;
85
  }
 
87
  height: auto !important;
88
  max-height: none !important;
89
  }
 
90
  .table-wrap {
91
  max-height: none !important;
92
  height: auto !important;
 
98
  width: 1%;
99
  white-space: nowrap;
100
  }
101
+ table.svelte-1e98i6s td {
102
+ vertical-align: top !important;
103
+ }
104
  table.gr-table {
105
  font-size: 14px !important;
106
  }
 
 
 
 
 
 
 
107
  .html-container {
108
  padding-top: 0 !important;
109
  }
110
  #scatter-disclaimer {
111
  color: #f0529c !important;
112
  }
113
+ #pareto-disclaimer {
114
+ color: #f0529c !important;
115
+ }
116
  thead.svelte-1e98i6s th {
117
  background: white !important;
118
  }
119
+ .dark thead.svelte-1e98i6s th {
120
+ background: #091a1a !important;
121
+ }
122
+ .cell-wrap.svelte-v1pjjd {
123
+ font-family: 'Manrope';
124
+ }
125
+ nav.svelte-ti537g.svelte-ti537g {
126
+ justify-content: flex-start;
127
+ }
128
+ #legend-markdown span {
129
+ margin-right: 15px !important;
130
+ }
131
+ #leaderboard-accordion .label-wrap {
132
+ font-size: 1.4rem !important;
133
+ }
134
+ .dark #leaderboard-accordion .label-wrap {
135
+ color: #0FCB8C !important;
136
+ }
137
+ .dark block.svelte-1svsvh2 {
138
+ background: #032629 !important;
139
+ }
140
+ .sub-nav-bar {
141
+ margin-bottom: 20px; /* The space below the nav bar */
142
+ }
143
+ .sub-nav-bar a {
144
+ font-size: 16px;
145
+ border-radius: 5px;
146
+ transition: background-color 0.2s;
147
+ padding-right: 15px;
148
+ }
149
+ .padding.svelte-phx28p {
150
+ padding: 0 !important;
151
+ }
152
  """
data_analysis.py CHANGED
@@ -2,13 +2,18 @@ import gradio as gr
2
  import pandas as pd
3
 
4
  # Import our UI factories and the data loader
5
- from ui_components import create_leaderboard_display, create_benchmark_details_display, get_full_leaderboard_data
6
-
7
  # Define the category for this page
8
  CATEGORY_NAME = "Data Analysis"
9
 
10
  with gr.Blocks() as demo:
11
- gr.Markdown(f"## {CATEGORY_NAME} Leaderboard Results")
 
 
 
 
 
12
 
13
  # --- This page now has two main sections: Validation and Test ---
14
  with gr.Tabs():
 
2
  import pandas as pd
3
 
4
  # Import our UI factories and the data loader
5
+ from ui_components import create_leaderboard_display, create_benchmark_details_display, get_full_leaderboard_data, create_sub_navigation_bar
6
+ from content import PLACEHOLDER_DESCRIPTION
7
  # Define the category for this page
8
  CATEGORY_NAME = "Data Analysis"
9
 
10
  with gr.Blocks() as demo:
11
+ gr.Markdown(f"## {CATEGORY_NAME} Aggregated")
12
+ validation_df, validation_tag_map = get_full_leaderboard_data("validation")
13
+ test_df, test_tag_map = get_full_leaderboard_data("test")
14
+ gr.Markdown(PLACEHOLDER_DESCRIPTION, elem_id="category-intro")
15
+ if validation_tag_map:
16
+ create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
17
 
18
  # --- This page now has two main sections: Validation and Test ---
19
  with gr.Tabs():
e2e.py CHANGED
@@ -2,13 +2,18 @@ import gradio as gr
2
  import pandas as pd
3
 
4
  # Import our UI factories and the data loader
5
- from ui_components import create_leaderboard_display, create_benchmark_details_display, get_full_leaderboard_data
6
-
7
  # Define the category for this page
8
  CATEGORY_NAME = "Discovery"
9
 
10
  with gr.Blocks() as demo:
11
- gr.Markdown(f"## {CATEGORY_NAME} Leaderboard Results")
 
 
 
 
 
12
 
13
  # --- This page now has two main sections: Validation and Test ---
14
  with gr.Tabs():
 
2
  import pandas as pd
3
 
4
  # Import our UI factories and the data loader
5
+ from ui_components import create_leaderboard_display, create_benchmark_details_display, get_full_leaderboard_data, create_sub_navigation_bar
6
+ from content import PLACEHOLDER_DESCRIPTION
7
  # Define the category for this page
8
  CATEGORY_NAME = "Discovery"
9
 
10
  with gr.Blocks() as demo:
11
+ gr.Markdown(f"## {CATEGORY_NAME} Aggregated")
12
+ validation_df, validation_tag_map = get_full_leaderboard_data("validation")
13
+ test_df, test_tag_map = get_full_leaderboard_data("test")
14
+ gr.Markdown(PLACEHOLDER_DESCRIPTION, elem_id="category-intro")
15
+ if validation_tag_map:
16
+ create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
17
 
18
  # --- This page now has two main sections: Validation and Test ---
19
  with gr.Tabs():
json_leaderboard.py DELETED
@@ -1,485 +0,0 @@
1
- import logging
2
- from typing import Optional, Any, Dict # Added Dict
3
- from zoneinfo import ZoneInfo
4
-
5
- # datasets import might not be strictly needed by LeaderboardViewer itself anymore,
6
- # but _get_dataframe might still use types from it if EvalResult refers to them.
7
- # For now, let's keep it if your EvalResult or SuiteConfig models have dependencies.
8
- # If not, it can be removed from here.
9
- import datasets # Potentially removable from this file
10
- import matplotlib.pyplot as plt
11
- import plotly.express as px
12
- import plotly.graph_objects as go
13
- import numpy as np
14
- import pandas as pd
15
- import seaborn as sns
16
- import json # For loading the local JSON file
17
- import os # For checking file existence
18
-
19
- from agenteval import compute_summary_statistics
20
- from agenteval.config import SuiteConfig
21
- from agenteval.models import EvalResult
22
-
23
- logger = logging.getLogger(__name__)
24
-
25
- import logging
26
- from typing import Optional, Any, Dict, List # Added List
27
- from zoneinfo import ZoneInfo # Assuming this might be used by SuiteConfig/EvalResult or _get_dataframe
28
- import json
29
- import os
30
-
31
- # Assuming these are correctly imported from your project
32
- from agenteval.config import SuiteConfig
33
- from agenteval.models import EvalResult
34
- # from agenteval import compute_summary_statistics # Used by _get_dataframe
35
-
36
-
37
- class DataTransformer:
38
- """
39
- Load and visualize leaderboard from a single, local JSON result file.
40
- """
41
- _INFORMAL_TO_FORMAL_NAME_MAP = {
42
- "lit": "Literature Understanding",
43
- "data": "Data Analysis",
44
- "code": "Code Execution",
45
- "discovery": "Discovery",
46
- "arxivdigestables_validation": "Arxivdigestables Validation",
47
- "sqa_dev": "Sqa Dev",
48
- "litqa2_validation": "Litqa2 Validation",
49
- "paper_finder_validation": "Paper Finder Validation",
50
- "discoverybench_validation": "Discoverybench Validation",
51
- "core_bench_validation": "Core Bench Validation",
52
- "ds1000_validation": "DS1000 Validation",
53
- "e2e_discovery_validation": "E2E Discovery Validation",
54
- "super_validation": "Super Validation",
55
- # Add any other raw names that can appear in task.name or task.tags
56
- }
57
-
58
- def __init__(
59
- self,
60
- json_file_path: str, # Mandatory: path to the local JSON file
61
- split: str, # Still needed for context within the JSON's suite_config
62
- is_internal: bool = False
63
- ):
64
- self._json_file_path = json_file_path
65
- self._split = split
66
- self._internal = is_internal
67
- self._loaded_json_data: Optional[Dict[str, Any]] = None
68
- self._cfg: Optional[SuiteConfig] = None
69
-
70
- logger.info(f"Initializing LeaderboardViewer with local JSON file: {self._json_file_path}")
71
-
72
- # --- Load and Validate JSON data ---
73
- if not os.path.exists(self._json_file_path):
74
- raise FileNotFoundError(f"JSON file not found at path: {self._json_file_path}")
75
- try:
76
- with open(self._json_file_path, 'r', encoding='utf-8') as f:
77
- self._loaded_json_data = json.load(f)
78
- except json.JSONDecodeError as e:
79
- raise ValueError(f"Failed to parse JSON from local file {self._json_file_path}: {e}")
80
- except Exception as e:
81
- raise ValueError(f"Error reading local file {self._json_file_path}: {e}")
82
-
83
- if not self._loaded_json_data:
84
- raise ValueError(f"No data loaded from JSON file {self._json_file_path}.")
85
-
86
- try:
87
- eval_result = EvalResult.model_validate(self._loaded_json_data)
88
- except Exception as e:
89
- raise ValueError(f"Failed to validate JSON data from file '{self._json_file_path}' against EvalResult model: {e}")
90
-
91
- self._cfg = eval_result.suite_config
92
- if not isinstance(self._cfg, SuiteConfig):
93
- raise TypeError(f"self._cfg is not a SuiteConfig object after loading from '{self._json_file_path}', got {type(self._cfg)}.")
94
-
95
- # --- Populate Tag Map (Corrected Placement and Helper Function Access) ---
96
- self.tag_map: dict[str, list[str]] = {}
97
-
98
- # Access tasks from the loaded config
99
- tasks_for_split: List[Any] = self._cfg.get_tasks(self._split) # Assuming get_tasks returns a list of task-like objects
100
-
101
- for task in tasks_for_split:
102
- # Ensure task object has 'name' and 'tags' attributes
103
- if not hasattr(task, 'name') or not hasattr(task, 'tags'):
104
- logger.warning(f"Task object {task} is missing 'name' or 'tags' attribute. Skipping.")
105
- continue
106
-
107
- formal_task_display_name = self._get_formal_display_name_static(task.name) # Use the helper method
108
-
109
- if not (task.tags or []):
110
- continue
111
-
112
- for raw_tag_name in task.tags:
113
- formal_tag_display_name_key = self._get_formal_display_name_static(raw_tag_name)
114
-
115
- self.tag_map.setdefault(formal_tag_display_name_key, []).append(formal_task_display_name)
116
-
117
- for key in self.tag_map:
118
- self.tag_map[key] = sorted(list(set(self.tag_map[key])))
119
-
120
- # --- Helper function defined as a static method or regular method ---
121
- # Option 1: Static method (doesn't need 'self', uses the class attribute)
122
- @staticmethod
123
- def _get_formal_display_name_static(raw_name: str) -> str:
124
- """
125
- Helper function to get the formal display name for a raw tag or task name.
126
- Uses the class's map and provides a fallback.
127
- """
128
- return DataTransformer._INFORMAL_TO_FORMAL_NAME_MAP.get(raw_name, raw_name.replace("_", " ").title())
129
-
130
- def _load(self) -> tuple[pd.DataFrame, dict[str, list[str]]]:
131
- """
132
- Prepares the DataFrame from the loaded JSON data.
133
- The JSON data is already loaded and validated in __init__.
134
- """
135
- if self._loaded_json_data is None or self._cfg is None:
136
- # This should not happen if __init__ completed successfully
137
- raise RuntimeError("LeaderboardViewer2 not properly initialized. JSON data or SuiteConfig is missing.")
138
-
139
- # The _get_dataframe function expects a list of records.
140
- # Since we have a single JSON file representing one result, wrap it in a list.
141
- records_list: list[dict] = [self._loaded_json_data]
142
-
143
- overview_df = _get_dataframe(
144
- records_list=records_list,
145
- split=self._split,
146
- is_internal=self._internal,
147
- suite_config=self._cfg, # Pass the SuiteConfig loaded in __init__
148
- )
149
- return overview_df, self.tag_map
150
-
151
- # --- view method remains the same as your last version ---
152
- def view(
153
- self,
154
- tag: Optional[str] = None,
155
- with_plots: bool = False,
156
- use_plotly: bool = False,
157
- ) -> tuple[pd.DataFrame, dict[str, Any]]:
158
- data, tag_map = self._load() # tag_map is also returned by _load now
159
- print(f"AHAHASHJDBFGASJHDBJAHSDB,AHDB {tag_map}")
160
- print(f"THIS IS THE DATA DATA DTAA {data.columns}")
161
- if data.empty or (len(data) == 1 and data.iloc[0].get("Agent") == "No data"):
162
- logger.warning("No data available to view. Returning empty DataFrame and plots.")
163
- return data, {}
164
-
165
- base_cols = ["Agent", "Submitter", "Date", "Logs"]
166
- existing_cols = [col for col in base_cols if col in data.columns]
167
-
168
- primary_score_col: str
169
- group_metric_names: list[str]
170
-
171
- if tag is None:
172
- primary = "Overall"
173
- group = list(tag_map.keys())
174
- else:
175
- primary = tag
176
- group = tag_map.get(tag, [])
177
-
178
- if f"{primary} Score" in data.columns:
179
- data = data.sort_values(f"{primary} Score", ascending=False)
180
- else:
181
- logger.warning(f"Primary metric '{primary}' for sorting not found. Data will not be sorted by it.")
182
-
183
- metrics_to_display = []
184
- if f"{primary} Cost" in data.columns:
185
- metrics_to_display.append(f"{primary} Cost")
186
- if f"{primary} Score" in data.columns:
187
- metrics_to_display.append(f"{primary} Score")
188
-
189
- for g_item in group:
190
- if g_item in data.columns:
191
- metrics_to_display.append(g_item)
192
- if f"{g_item} Cost" in data.columns:
193
- metrics_to_display.append(f"{g_item} Cost")
194
- if f"{g_item} Score" in data.columns:
195
- metrics_to_display.append(f"{g_item} Score")
196
-
197
-
198
- final_cols_to_display = existing_cols + [m for m in metrics_to_display if m in data.columns]
199
- final_cols_to_display = sorted(list(set(final_cols_to_display)), key=final_cols_to_display.index)
200
-
201
- df_view = data.loc[:, final_cols_to_display].reset_index(drop=True)
202
-
203
- plots: dict[str, Any] = {}
204
- if with_plots:
205
- plot_metric_names = [primary] + [g_item for g_item in group if g_item in data.columns]
206
- for metric_name in plot_metric_names:
207
- score_col = f"{metric_name} Score"
208
- cost_col = f"{metric_name} Cost"
209
- if score_col in df_view.columns and cost_col in df_view.columns:
210
- if use_plotly:
211
- fig = _plot_scatter_plotly(df_view, x=cost_col, y=score_col, agent_col="Agent")
212
- plots[f"scatter_{metric_name}"] = fig
213
- else:
214
- logger.warning(
215
- f"Skipping plot for '{metric_name}': score column '{score_col}' or cost column '{cost_col}' not found."
216
- )
217
- return df_view, plots
218
-
219
-
220
- def _safe_round(value, digits=2):
221
- return round(value, digits) if isinstance(value, (float, int)) and pd.notna(value) else value
222
-
223
- def _get_dataframe(
224
- records_list: list[dict],
225
- split: str,
226
- is_internal: bool,
227
- suite_config: SuiteConfig,
228
- timezone: str = "US/Pacific",
229
- ) -> pd.DataFrame:
230
- # This function remains the same as in the previous version you provided.
231
- # It takes a list of records (which will be a list containing one item
232
- # from the loaded JSON file) and processes it.
233
- if not records_list:
234
- logger.warning(f"No records provided to _get_dataframe for split '{split}'. Returning empty DataFrame with placeholder.")
235
- expected_pretty_cols = ["Agent Name", "Submitter", "Date", "Overall Score", "Logs"]
236
- empty_df = pd.DataFrame({p_col: ["No data"] for p_col in expected_pretty_cols})
237
- return empty_df
238
-
239
- cfg = suite_config
240
-
241
- rows = []
242
- for itm_idx, itm in enumerate(records_list):
243
- if not isinstance(itm, dict):
244
- logger.warning(f"Item {itm_idx} in records_list is not a dict, skipping.")
245
- continue
246
- try:
247
- ev = EvalResult.model_validate(itm)
248
- except Exception as e:
249
- logger.error(f"Failed to validate item {itm_idx} with EvalResult: {itm}. Error: {e}")
250
- continue
251
-
252
- sub = ev.submission
253
- date_str = None
254
- if sub.submit_time is not None:
255
- submit_dt = sub.submit_time
256
- if not isinstance(submit_dt, pd.Timestamp):
257
- if submit_dt.tzinfo is None:
258
- logger.debug(f"Submission time for {sub.agent_name} is timezone-naive, assuming UTC.")
259
- submit_dt = submit_dt.replace(tzinfo=ZoneInfo("UTC"))
260
- date_str = pd.Timestamp(submit_dt).tz_convert(ZoneInfo(timezone)).strftime("%Y-%m-%d")
261
- else:
262
- date_str = None
263
-
264
- if not ev.results:
265
- logger.warning(
266
- f"Skipping submission {sub.agent_name} ({sub.username or 'N/A'}) "
267
- f"({sub.submit_time or 'N/A'}) due to no results."
268
- )
269
- continue
270
- stats = compute_summary_statistics(
271
- suite_config=cfg, split=split, results=ev.results
272
- )
273
- flat = {}
274
- print(f"STATS STATS ASTATAS SD T S T A A {stats}")
275
- for key, s_obj in stats.items():
276
- parts = key.split("/")
277
- if parts[0] == "overall":
278
- flat["overall/score"] = _safe_round(getattr(s_obj, 'score', np.nan))
279
- flat["overall/cost"] = _safe_round(getattr(s_obj, 'cost', np.nan))
280
- elif parts[0] == "tag" and len(parts) > 1:
281
- tag_name = parts[1]
282
- flat[f"tag/{tag_name}/score"] = _safe_round(getattr(s_obj, 'score', np.nan))
283
- flat[f"tag/{tag_name}/cost"] = _safe_round(getattr(s_obj, 'cost', np.nan))
284
- elif parts[0] == "task" and len(parts) > 1:
285
- task_name = parts[1]
286
- score = getattr(s_obj, 'score', np.nan)
287
- cost = getattr(s_obj, 'cost', np.nan)
288
- score_stderr = getattr(s_obj, 'score_stderr', np.nan)
289
- cost_stderr = getattr(s_obj, 'cost_stderr', np.nan)
290
-
291
- flat[f"task/{task_name}/score"] = _safe_round(score)
292
- flat[f"task/{task_name}/score_ci"] = _safe_round(score_stderr * 1.96 if pd.notna(score_stderr) else np.nan)
293
- flat[f"task/{task_name}/cost"] = _safe_round(cost)
294
- flat[f"task/{task_name}/cost_ci"] = _safe_round(cost_stderr * 1.96 if pd.notna(cost_stderr) else np.nan)
295
- else:
296
- logger.debug(f"Uncommon key structure from compute_summary_statistics: '{key}'. Attempting generic add.")
297
- if hasattr(s_obj, 'score'):
298
- flat[f"{key}/score"] = _safe_round(s_obj.score)
299
- if hasattr(s_obj, 'cost'):
300
- flat[f"{key}/cost"] = _safe_round(s_obj.cost)
301
-
302
- current_logs_url = None
303
- if is_internal and sub.logs_url:
304
- current_logs_url = str(sub.logs_url)
305
- elif not is_internal and sub.logs_url_public:
306
- current_logs_url = str(sub.logs_url_public)
307
-
308
- rows.append(
309
- {
310
- "agent_name": sub.agent_name or "N/A",
311
- "username": sub.username or "N/A",
312
- "submit_time": date_str,
313
- **flat,
314
- "logs_url": current_logs_url,
315
- }
316
- )
317
-
318
- if not rows:
319
- logger.warning(f"No valid rows generated from records_list for split '{split}'. Returning empty DataFrame with placeholder.")
320
- expected_pretty_cols = ["Agent", "Submitter", "Date", "Overall Score", "Overall Cost", "Logs"]
321
- empty_df = pd.DataFrame({p_col: ["No data"] for p_col in expected_pretty_cols})
322
- return empty_df
323
-
324
- df = pd.DataFrame(rows)
325
- pretty_cols = {c: _pretty_column_name(c) for c in df.columns if c in df.columns}
326
- overview = df.rename(columns=pretty_cols)
327
- return overview
328
-
329
- def _pretty_column_name(col: str) -> str:
330
- """Map raw column name to display name."""
331
- # --- Step 1: Fixed, direct mappings ---
332
- fixed_mappings = {
333
- "submit_time": "Date",
334
- "agent_name": "Agent",
335
- "username": "Submitter",
336
- "logs_url": "Logs",
337
- "overall/score": "Overall Score",
338
- "overall/cost": "Overall Cost",
339
- }
340
- if col in fixed_mappings:
341
- return fixed_mappings[col]
342
-
343
- # --- Step 2: Define your mapping for informal names to descriptive names ---
344
- informal_map = DataTransformer._INFORMAL_TO_FORMAL_NAME_MAP
345
-
346
- # --- Step 3: Dynamic mappings for task or tag columns using the informal_to_formal_name_map ---
347
- parts = col.split("/")
348
- if len(parts) == 3:
349
- item_type, informal_name, metric_suffix = parts #
350
-
351
- formal_name = informal_map.get(informal_name)
352
- if formal_name is None:
353
- formal_name = informal_name.replace("_", " ").title()
354
- print(f"[DEBUG _pretty_column_name] Informal name '{informal_name}' not in map, using fallback: '{formal_name}'")
355
-
356
- if metric_suffix == "score":
357
- return f"{formal_name} Score"
358
- if metric_suffix == "cost":
359
- return f"{formal_name} Cost"
360
- if metric_suffix == "score_ci":
361
- return f"{formal_name} Score 95% CI"
362
- if metric_suffix == "cost_ci":
363
- return f"{formal_name} Cost 95% CI"
364
-
365
- # --- Step 4: Fallback for columns that don't match the "type/name/metric" pattern ---
366
- if "/" not in col:
367
- return col.replace("_", " ").title()
368
- else:
369
- return parts[-1].replace("_", " ").title()
370
-
371
- DEFAULT_Y_COLUMN = "Overall Score"
372
- DUMMY_X_VALUE_FOR_MISSING_COSTS = 0 # Value to use if x-axis data (costs) is missing
373
-
374
- def _plot_scatter_plotly(
375
- data: pd.DataFrame,
376
- x: Optional[str],
377
- y: str,
378
- agent_col: str = "Agent"
379
- ) -> go.Figure:
380
-
381
- x_col_to_use = x
382
- y_col_to_use = y
383
-
384
- # 1. Check if y-column exists
385
- if y_col_to_use not in data.columns:
386
- logger.error(
387
- f"y-axis column '{y_col_to_use}' MUST exist in DataFrame. "
388
- f"Cannot generate plot. Available columns: {data.columns.tolist()}"
389
- )
390
- return go.Figure()
391
-
392
- # 2. Check if agent_col exists
393
- if agent_col not in data.columns:
394
- logger.warning(
395
- f"Agent column '{agent_col}' not found in DataFrame. "
396
- f"Available columns: {data.columns.tolist()}. Returning empty figure."
397
- )
398
- return go.Figure()
399
-
400
- # 3. Prepare data (make a copy, handle numeric conversion for y)
401
- data_plot = data.copy()
402
- try:
403
- data_plot[y_col_to_use] = pd.to_numeric(data_plot[y_col_to_use], errors='coerce')
404
- except Exception as e:
405
- logger.error(f"Error converting y-column '{y_col_to_use}' to numeric: {e}. Returning empty figure.")
406
- return go.Figure()
407
-
408
- # 4. Handle x-column (costs)
409
- x_axis_label = x_col_to_use if x_col_to_use else "Cost (Data N/A)" # Label for the x-axis
410
- x_data_is_valid = False
411
-
412
- if x_col_to_use and x_col_to_use in data_plot.columns:
413
- try:
414
- data_plot[x_col_to_use] = pd.to_numeric(data_plot[x_col_to_use], errors='coerce')
415
- # Check if there's any non-NaN data after coercion for x
416
- if data_plot[x_col_to_use].notna().any():
417
- x_data_is_valid = True
418
- else:
419
- logger.info(f"x-axis column '{x_col_to_use}' exists but contains all NaN/None values after numeric conversion.")
420
- except Exception as e:
421
- logger.warning(f"Error converting x-column '{x_col_to_use}' to numeric: {e}. Will use dummy x-values.")
422
- # x_data_is_valid remains False
423
- else:
424
- if x_col_to_use: # Name was provided but column doesn't exist
425
- logger.warning(f"x-axis column '{x_col_to_use}' not found in DataFrame.")
426
- else: # x (column name) was None
427
- logger.info("x-axis column name was not provided (is None).")
428
-
429
- if not x_data_is_valid:
430
- logger.info(f"Using dummy x-value '{DUMMY_X_VALUE_FOR_MISSING_COSTS}' for all data points as x-data is missing or invalid.")
431
- # Create a new column with the dummy x-value for all rows
432
- # Use a unique name for this dummy column to avoid potential clashes
433
- dummy_x_col_name = "__dummy_x_for_plotting__"
434
- data_plot[dummy_x_col_name] = DUMMY_X_VALUE_FOR_MISSING_COSTS
435
- x_col_to_use = dummy_x_col_name # Update x_col_to_use to point to our dummy data
436
- x_axis_label = x if x else "Cost (Data N/A)" # Use original x name for label if provided
437
- # or a generic label if x was None.
438
- # Could also be f"Cost (Fixed at {DUMMY_X_VALUE_FOR_MISSING_COSTS})"
439
-
440
-
441
- # 5. Drop rows where y is NaN (x is now guaranteed to have values, either real or dummy)
442
- data_plot.dropna(subset=[y_col_to_use], inplace=True)
443
-
444
- fig = go.Figure()
445
-
446
- if data_plot.empty:
447
- logger.warning(f"No valid data to plot for y='{y_col_to_use}' (and x='{x_col_to_use}') after cleaning NaNs from y.")
448
- # Still return a figure object, but it will be empty. Update layout for clarity.
449
- fig.update_layout(
450
- title=f"{y_col_to_use} vs. {x_axis_label} (No Data)",
451
- xaxis=dict(title=x_axis_label, range=[DUMMY_X_VALUE_FOR_MISSING_COSTS - 1, DUMMY_X_VALUE_FOR_MISSING_COSTS + 1] if not x_data_is_valid else None),
452
- yaxis=dict(title=y_col_to_use)
453
- )
454
- return fig
455
-
456
-
457
- for agent, group in data_plot.groupby(agent_col):
458
- hover_x_display = "%{x:.2f}" if x_data_is_valid else str(DUMMY_X_VALUE_FOR_MISSING_COSTS) + " (fixed)"
459
- fig.add_trace(go.Scatter(
460
- x=group[x_col_to_use],
461
- y=group[y_col_to_use],
462
- mode='markers',
463
- name=str(agent),
464
- hovertemplate=f"{x_axis_label}: {hover_x_display}<br>{y_col_to_use}: %{{y:.2f}}<extra>{str(agent)}</extra>",
465
- marker=dict(size=10)
466
- ))
467
-
468
- # Configure layout
469
- xaxis_config = dict(title=x_axis_label)
470
- if not x_data_is_valid: # If using dummy x, set a tighter, fixed range for x-axis
471
- xaxis_config['range'] = [DUMMY_X_VALUE_FOR_MISSING_COSTS - 1, DUMMY_X_VALUE_FOR_MISSING_COSTS + 1]
472
- xaxis_config['tickvals'] = [DUMMY_X_VALUE_FOR_MISSING_COSTS] # Show only one tick at the dummy value
473
- xaxis_config['ticktext'] = [str(DUMMY_X_VALUE_FOR_MISSING_COSTS)]
474
- else: # Real x-data
475
- xaxis_config['rangemode'] = "tozero"
476
-
477
-
478
- fig.update_layout(
479
- title=f"{y_col_to_use} vs. {x_axis_label}",
480
- xaxis=xaxis_config,
481
- yaxis=dict(title=y_col_to_use, rangemode="tozero"),
482
- legend_title_text=agent_col
483
- )
484
-
485
- return fig
leaderboard_transformer.py CHANGED
@@ -2,10 +2,8 @@ import plotly.graph_objects as go
2
  import numpy as np
3
  import pandas as pd
4
  import logging
5
- from typing import Optional, Any, Dict, List # Added List
6
- from zoneinfo import ZoneInfo # Assuming this might be used by SuiteConfig/EvalResult or _get_dataframe
7
- import json
8
- import os
9
 
10
  logger = logging.getLogger(__name__)
11
 
@@ -46,14 +44,18 @@ def _pretty_column_name(raw_col: str) -> str:
46
  """
47
  # Case 1: Handle fixed, special-case mappings first.
48
  fixed_mappings = {
 
49
  'Agent': 'Agent',
50
  'Agent description': 'Agent Description',
51
  'User/organization': 'Submitter',
52
  'Submission date': 'Date',
53
  'Overall': 'Overall Score',
54
  'Overall cost': 'Overall Cost',
55
- 'Logs': 'Logs'
 
 
56
  }
 
57
  if raw_col in fixed_mappings:
58
  return fixed_mappings[raw_col]
59
 
@@ -146,7 +148,6 @@ class DataTransformer:
146
  def __init__(self, dataframe: pd.DataFrame, tag_map: dict[str, list[str]]):
147
  """
148
  Initializes the viewer.
149
-
150
  Args:
151
  dataframe (pd.DataFrame): The presentation-ready leaderboard data.
152
  tag_map (dict): A map of formal tag names to formal task names.
@@ -188,29 +189,53 @@ class DataTransformer:
188
  if primary_score_col in self.data.columns:
189
  df_sorted = self.data.sort_values(primary_score_col, ascending=False, na_position='last')
190
 
191
- # --- 3. Build the List of Columns to Display ---
192
- base_cols = ["Agent", "Submitter"]
193
- new_cols = ["Openness", "Degree of Control"]
194
- ending_cols = ["Date", "Logs"]
195
 
196
- # Start with the primary metric score and cost
197
- metrics_to_display = [primary_score_col, f"{primary_metric} Cost"]
 
 
198
 
199
- # Add the score and cost for each item in our group
 
 
 
 
 
200
  for item in group_metrics:
201
  metrics_to_display.append(f"{item} Score")
202
  metrics_to_display.append(f"{item} Cost")
203
 
204
- # Combine base columns with metric columns, ensuring uniqueness and order
205
- final_cols_ordered = base_cols + list(dict.fromkeys(metrics_to_display))+ new_cols + ending_cols
206
 
207
- # Filter to only include columns that actually exist in our DataFrame
208
- df_view = df_sorted.copy()
209
  for col in final_cols_ordered:
210
  if col not in df_view.columns:
211
  df_view[col] = pd.NA
212
 
 
213
  df_view = df_view[final_cols_ordered].reset_index(drop=True)
 
214
 
215
  # Calculated and add "Categories Attempted" column
216
  if primary_metric == "Overall":
@@ -220,29 +245,28 @@ class DataTransformer:
220
 
221
  # Return the formatted string with the correct emoji
222
  if count == 4:
223
- return f"4/4"
224
  if count == 0:
225
- return f"0/4 🚫"
226
- return f"{count}/4 ⚠️"
227
 
228
  # Apply the function row-wise to create the new column
229
  attempted_column = df_view.apply(calculate_attempted, axis=1)
230
  # Insert the new column at a nice position (e.g., after "Date")
231
- df_view.insert(2, "Categories Attempted", attempted_column)
232
  else:
233
  total_benchmarks = len(group_metrics)
234
  def calculate_benchmarks_attempted(row):
235
  # Count how many benchmarks in this category have COST data reported
236
  count = sum(1 for benchmark in group_metrics if pd.notna(row.get(f"{benchmark} Cost")))
237
  if count == total_benchmarks:
238
- return f"{count}/{total_benchmarks} "
239
  elif count == 0:
240
- return f"{count}/{total_benchmarks} 🚫"
241
  else:
242
- return f"{count}/{total_benchmarks}⚠️"
243
  # Insert the new column, for example, after "Date"
244
- df_view.insert(2, "Benchmarks Attempted", df_view.apply(calculate_benchmarks_attempted, axis=1))
245
-
246
 
247
  # --- 4. Generate the Scatter Plot for the Primary Metric ---
248
  plots: dict[str, go.Figure] = {}
@@ -254,7 +278,7 @@ class DataTransformer:
254
  data=df_view,
255
  x=primary_cost_col,
256
  y=primary_score_col,
257
- agent_col="Agent"
258
  )
259
  # Use a consistent key for easy retrieval later
260
  plots['scatter_plot'] = fig
@@ -274,24 +298,37 @@ def _plot_scatter_plotly(
274
  data: pd.DataFrame,
275
  x: Optional[str],
276
  y: str,
277
- agent_col: str = "Agent"
278
  ) -> go.Figure:
279
 
280
- # --- Steps 1-4: Data Validation and Preparation ---
281
  x_col_to_use = x
282
  y_col_to_use = y
283
 
284
- if y_col_to_use not in data.columns:
285
- logger.error(f"y-axis column '{y_col_to_use}' not found.")
286
- return go.Figure()
287
- if agent_col not in data.columns:
288
- logger.warning(f"Agent column '{agent_col}' not found.")
289
  return go.Figure()
290
 
291
  data_plot = data.copy()
292
  data_plot[y_col_to_use] = pd.to_numeric(data_plot[y_col_to_use], errors='coerce')
293
 
294
- x_axis_label = x if x else "Cost (Data N/A)"
295
  x_data_is_valid = False
296
  if x and x in data_plot.columns:
297
  try:
@@ -307,30 +344,27 @@ def _plot_scatter_plotly(
307
  x_col_to_use = dummy_x_col_name
308
  logger.info("Using dummy x-values for plotting.")
309
 
310
- # --- Step 5: Clean Data and Initialize Figure ---
311
- data_plot.dropna(subset=[y_col_to_use, x_col_to_use], inplace=True)
 
 
312
  fig = go.Figure()
313
  if data_plot.empty:
314
- logger.warning(f"No valid data to plot for y='{y_col_to_use}' and x='{x_col_to_use}'.")
315
  return fig
316
 
317
- # Step 6 - Calculate and Draw the Efficiency Frontier Line ---
318
  if x_data_is_valid:
319
- # Sort by cost (ascending), then by score (descending) to break ties
320
  sorted_data = data_plot.sort_values(by=[x_col_to_use, y_col_to_use], ascending=[True, False])
321
-
322
  frontier_points = []
323
  max_score_so_far = float('-inf')
324
 
325
- for index, row in sorted_data.iterrows():
326
  score = row[y_col_to_use]
327
- # If this point offers a better score than any we've seen before,
328
- # it's part of the frontier.
329
- if score > max_score_so_far:
330
  frontier_points.append({'x': row[x_col_to_use], 'y': score})
331
  max_score_so_far = score
332
 
333
- # Add the frontier line trace to the plot if we found any points
334
  if frontier_points:
335
  frontier_df = pd.DataFrame(frontier_points)
336
  fig.add_trace(go.Scatter(
@@ -339,22 +373,67 @@ def _plot_scatter_plotly(
339
  mode='lines',
340
  name='Efficiency Frontier',
341
  line=dict(color='firebrick', width=2, dash='dash'),
342
- hoverinfo='skip' # The line doesn't need a hover tooltip
343
  ))
344
 
345
- # --- Step 7: Plot Individual Agent Markers (No changes here) ---
346
- for agent, group in data_plot.groupby(agent_col):
347
- hover_x_display = "%{x:.2f}" if x_data_is_valid else "N/A"
348
  fig.add_trace(go.Scatter(
349
  x=group[x_col_to_use],
350
  y=group[y_col_to_use],
351
  mode='markers',
352
- name=str(agent),
353
- hovertemplate=f"<b>{str(agent)}</b><br>{x_axis_label}: {hover_x_display}<br>{y_col_to_use}: %{{y:.2f}}""<extra></extra>",
354
- marker=dict(size=10, opacity=0.8)
355
  ))
356
 
357
- # --- Step 8: Configure Layout (No changes here) ---
358
  xaxis_config = dict(title=x_axis_label)
359
  if not x_data_is_valid:
360
  xaxis_config['range'] = [DUMMY_X_VALUE_FOR_MISSING_COSTS - 1, DUMMY_X_VALUE_FOR_MISSING_COSTS + 1]
@@ -362,15 +441,32 @@ def _plot_scatter_plotly(
362
  else:
363
  xaxis_config['rangemode'] = "tozero"
364
 
 
 
365
  fig.update_layout(
 
366
  title=f"{y_col_to_use} vs. {x_axis_label}",
367
  xaxis=xaxis_config,
368
  yaxis=dict(title=y_col_to_use, rangemode="tozero"),
369
- legend_title_text=agent_col
370
  )
371
 
372
  return fig
373
 
 
374
  def format_cost_column(df: pd.DataFrame, cost_col_name: str) -> pd.DataFrame:
375
  """
376
  Applies custom formatting to a cost column based on its corresponding score column.
@@ -398,7 +494,7 @@ def format_cost_column(df: pd.DataFrame, cost_col_name: str) -> pd.DataFrame:
398
  if pd.notna(cost_value) and isinstance(cost_value, (int, float)):
399
  return f"${cost_value:.2f}"
400
  elif pd.notna(score_value):
401
- return f'<span style="color: {status_color};">Missing Cost</span>' # Score exists, but cost is missing
402
  else:
403
  return f'<span style="color: {status_color};">Not Attempted</span>' # Neither score nor cost exists
404
 
@@ -434,3 +530,43 @@ def format_score_column(df: pd.DataFrame, score_col_name: str) -> pd.DataFrame:
434
  # Apply the formatting and return the updated DataFrame
435
  return df.assign(**{score_col_name: df[score_col_name].apply(apply_formatting)})
436
2
  import numpy as np
3
  import pandas as pd
4
  import logging
5
+ from typing import Optional
6
+ import base64
 
 
7
 
8
  logger = logging.getLogger(__name__)
9
 
 
44
  """
45
  # Case 1: Handle fixed, special-case mappings first.
46
  fixed_mappings = {
47
+ 'id': 'id',
48
  'Agent': 'Agent',
49
  'Agent description': 'Agent Description',
50
  'User/organization': 'Submitter',
51
  'Submission date': 'Date',
52
  'Overall': 'Overall Score',
53
  'Overall cost': 'Overall Cost',
54
+ 'Logs': 'Logs',
55
+ 'Openness': 'Openness',
56
+ 'Agent tooling': 'Agent Tooling',
57
  }
58
+
59
  if raw_col in fixed_mappings:
60
  return fixed_mappings[raw_col]
61
 
 
148
  def __init__(self, dataframe: pd.DataFrame, tag_map: dict[str, list[str]]):
149
  """
150
  Initializes the viewer.
 
151
  Args:
152
  dataframe (pd.DataFrame): The presentation-ready leaderboard data.
153
  tag_map (dict): A map of formal tag names to formal task names.
 
189
  if primary_score_col in self.data.columns:
190
  df_sorted = self.data.sort_values(primary_score_col, ascending=False, na_position='last')
191
 
192
+ df_view = df_sorted.copy()
193
+ #preserve just agent name for scatterplot hover
194
+ df_view['agent_for_hover'] = df_view['Agent']
195
+ # 3. Combine "Agent" and "Submitter" into a single HTML-formatted column
196
+ # We do this *before* defining the final column list.
197
+ if 'Agent' in df_view.columns and 'Submitter' in df_view.columns:
198
+
199
+ def combine_agent_submitter(row):
200
+ agent = row['Agent']
201
+ submitter = row['Submitter']
202
+
203
+ # Check if submitter exists and is not empty
204
+ if pd.notna(submitter) and submitter.strip() != '':
205
+ # Create a two-line HTML string with styled submitter text
206
+ return (
207
+ f"<div>{agent}<br>"
208
+ f"<span style='font-size: 0.9em; color: #667876;'>{submitter}</span>"
209
+ f"</div>"
210
+ )
211
+ else:
212
+ # If no submitter, just return the agent name
213
+ return agent
214
 
215
+ # Apply the function to create the new combined 'Agent' column
216
+ df_view['Agent'] = df_view.apply(combine_agent_submitter, axis=1)
217
+ # The 'Submitter' column is no longer needed
218
+ df_view = df_view.drop(columns=['Submitter'])
219
 
220
+ # 4. Build the List of Columns to Display (now simplified)
221
+ base_cols = ["id","Agent","agent_for_hover"]
222
+ new_cols = ["Openness", "Agent Tooling"]
223
+ ending_cols = ["Logs"]
224
+
225
+ metrics_to_display = [primary_score_col, f"{primary_metric} Cost"]
226
  for item in group_metrics:
227
  metrics_to_display.append(f"{item} Score")
228
  metrics_to_display.append(f"{item} Cost")
229
 
230
+ final_cols_ordered = new_cols + base_cols + list(dict.fromkeys(metrics_to_display)) + ending_cols
 
231
 
 
 
232
  for col in final_cols_ordered:
233
  if col not in df_view.columns:
234
  df_view[col] = pd.NA
235
 
236
+ # The final selection will now use the new column structure
237
  df_view = df_view[final_cols_ordered].reset_index(drop=True)
238
+ cols = len(final_cols_ordered)
239
 
240
  # Calculated and add "Categories Attempted" column
241
  if primary_metric == "Overall":
 
245
 
246
  # Return the formatted string with the correct emoji
247
  if count == 4:
248
+ return f"4/4"
249
  if count == 0:
250
+ return f"0/4"
251
+ return f"{count}/4"
252
 
253
  # Apply the function row-wise to create the new column
254
  attempted_column = df_view.apply(calculate_attempted, axis=1)
255
  # Insert the new column at a nice position (e.g., after "Date")
256
+ df_view.insert((cols - 1), "Categories Attempted", attempted_column)
257
  else:
258
  total_benchmarks = len(group_metrics)
259
  def calculate_benchmarks_attempted(row):
260
  # Count how many benchmarks in this category have COST data reported
261
  count = sum(1 for benchmark in group_metrics if pd.notna(row.get(f"{benchmark} Cost")))
262
  if count == total_benchmarks:
263
+ return f"{count}/{total_benchmarks} "
264
  elif count == 0:
265
+ return f"{count}/{total_benchmarks} "
266
  else:
267
+ return f"{count}/{total_benchmarks}"
268
  # Insert the new column, for example, after "Date"
269
+ df_view.insert((cols - 1), "Benchmarks Attempted", df_view.apply(calculate_benchmarks_attempted, axis=1))
 
270
 
271
  # --- 4. Generate the Scatter Plot for the Primary Metric ---
272
  plots: dict[str, go.Figure] = {}
 
278
  data=df_view,
279
  x=primary_cost_col,
280
  y=primary_score_col,
281
+ agent_col="agent_for_hover"
282
  )
283
  # Use a consistent key for easy retrieval later
284
  plots['scatter_plot'] = fig
 
298
  data: pd.DataFrame,
299
  x: Optional[str],
300
  y: str,
301
+ agent_col: str = 'agent_for_hover'
302
  ) -> go.Figure:
303
 
304
+ # --- Section 1: Define Mappings ---
305
+ color_map = {
306
+ "Closed": "red",
307
+ "API Available": "orange",
308
+ "Open Source": "green",
309
+ "Open Source + Open Weights": "blue"
310
+ }
311
+ category_order = list(color_map.keys())
312
+
313
+ shape_map = {
314
+ "Standard": "star",
315
+ "Custom with Standard Search": "diamond",
316
+ "Fully Custom": "circle"
317
+ }
318
+ default_shape = 'square'
319
+
320
  x_col_to_use = x
321
  y_col_to_use = y
322
 
323
+ required_cols = [y_col_to_use, agent_col, "Openness", "Agent Tooling"]
324
+ if not all(col in data.columns for col in required_cols):
325
+ logger.error(f"Missing one or more required columns for plotting: {required_cols}")
 
 
326
  return go.Figure()
327
 
328
  data_plot = data.copy()
329
  data_plot[y_col_to_use] = pd.to_numeric(data_plot[y_col_to_use], errors='coerce')
330
 
331
+ x_axis_label = f"{x} (USD)" if x else "Cost (Data N/A)"
332
  x_data_is_valid = False
333
  if x and x in data_plot.columns:
334
  try:
 
344
  x_col_to_use = dummy_x_col_name
345
  logger.info("Using dummy x-values for plotting.")
346
 
347
+ # Clean data based on all necessary columns
348
+ data_plot.dropna(subset=[y_col_to_use, x_col_to_use, "Openness", "Agent Tooling"], inplace=True)
349
+
350
+ # --- Section 3: Initialize Figure ---
351
  fig = go.Figure()
352
  if data_plot.empty:
353
+ logger.warning(f"No valid data to plot after cleaning.")
354
  return fig
355
 
356
+ # --- Section 4: Calculate and Draw Pareto Frontier (Restored from your original code) ---
357
  if x_data_is_valid:
 
358
  sorted_data = data_plot.sort_values(by=[x_col_to_use, y_col_to_use], ascending=[True, False])
 
359
  frontier_points = []
360
  max_score_so_far = float('-inf')
361
 
362
+ for _, row in sorted_data.iterrows():
363
  score = row[y_col_to_use]
364
+ if score >= max_score_so_far:
 
 
365
  frontier_points.append({'x': row[x_col_to_use], 'y': score})
366
  max_score_so_far = score
367
 
 
368
  if frontier_points:
369
  frontier_df = pd.DataFrame(frontier_points)
370
  fig.add_trace(go.Scatter(
 
373
  mode='lines',
374
  name='Efficiency Frontier',
375
  line=dict(color='firebrick', width=2, dash='dash'),
376
+ hoverinfo='skip'
377
  ))
378
 
379
+ # --- Section 5: Prepare for Marker Plotting ---
380
+ # Pre-generate hover text and shapes for each point
381
+ data_plot['hover_text'] = data_plot.apply(
382
+ lambda row: f"<b>{row[agent_col]}</b><br>{x_axis_label}: ${row[x_col_to_use]:.2f}<br>{y_col_to_use}: {row[y_col_to_use]:.2f}",
383
+ axis=1
384
+ )
385
+ data_plot['shape_symbol'] = data_plot['Agent Tooling'].map(shape_map).fillna(default_shape)
386
+
387
+ # --- Section 6: Plot Markers by "Openness" Category ---
388
+ for category in category_order:
389
+ group = data_plot[data_plot['Openness'] == category]
390
+ if group.empty:
391
+ continue
392
+
393
  fig.add_trace(go.Scatter(
394
  x=group[x_col_to_use],
395
  y=group[y_col_to_use],
396
  mode='markers',
397
+ name=category,
398
+ showlegend=False,
399
+ text=group['hover_text'],
400
+ hoverinfo='text',
401
+ marker=dict(
402
+ color=color_map.get(category, 'grey'),
403
+ symbol=group['shape_symbol'],
404
+ size=10,
405
+ opacity=0.8,
406
+ line=dict(width=1, color='DarkSlateGrey')
407
+ )
408
+ ))
409
+ # ---- Add logic for making the legend -----------
410
+ for i, category in enumerate(category_order):
411
+ fig.add_trace(go.Scatter(
412
+ x=[None], y=[None],
413
+ mode='markers',
414
+ name=category,
415
+ legendgroup="openness_group",
416
+ legendgrouptitle_text="Agent Openness" if i == 0 else None,
417
+ marker=dict(
418
+ color=color_map.get(category, 'grey'),
419
+ symbol='circle',
420
+ size=12
421
+ )
422
+ ))
423
+
424
+ # Part B: Dummy traces for the SHAPES ("Agent Tooling")
425
+ shape_items = list(shape_map.items())
426
+ for i, (shape_name, shape_symbol) in enumerate(shape_items):
427
+ fig.add_trace(go.Scatter(
428
+ x=[None], y=[None],
429
+ mode='markers',
430
+ name=shape_name,
431
+ legendgroup="tooling_group",
432
+ legendgrouptitle_text="Agent Tooling" if i == 0 else None,
433
+ marker=dict(color='grey', symbol=shape_symbol, size=12)
434
  ))
435
 
436
+ # --- Section 8: Configure Layout (Restored from your original code) ---
437
  xaxis_config = dict(title=x_axis_label)
438
  if not x_data_is_valid:
439
  xaxis_config['range'] = [DUMMY_X_VALUE_FOR_MISSING_COSTS - 1, DUMMY_X_VALUE_FOR_MISSING_COSTS + 1]
 
441
  else:
442
  xaxis_config['rangemode'] = "tozero"
443
 
444
+ logo_data_uri = svg_to_data_uri("assets/just-icon.svg")
445
+
446
  fig.update_layout(
447
+ template="plotly_white",
448
  title=f"{y_col_to_use} vs. {x_axis_label}",
449
  xaxis=xaxis_config,
450
  yaxis=dict(title=y_col_to_use, rangemode="tozero"),
451
+ legend=dict(
452
+ bgcolor='#FAF2E9',
453
+ )
454
+ )
455
+ fig.add_layout_image(
456
+ dict(
457
+ source=logo_data_uri,
458
+ xref="x domain", yref="y domain",
459
+ x=1.1, y=1.1,
460
+ sizex=0.2, sizey=0.2,
461
+ xanchor="left",
462
+ yanchor="bottom",
463
+ layer="above",
464
+ ),
465
  )
466
 
467
  return fig
468
 
469
+
470
  def format_cost_column(df: pd.DataFrame, cost_col_name: str) -> pd.DataFrame:
471
  """
472
  Applies custom formatting to a cost column based on its corresponding score column.
 
494
  if pd.notna(cost_value) and isinstance(cost_value, (int, float)):
495
  return f"${cost_value:.2f}"
496
  elif pd.notna(score_value):
497
+ return f'<span style="color: {status_color};">Missing</span>' # Score exists, but cost is missing
498
  else:
499
  return f'<span style="color: {status_color};">Not Attempted</span>' # Neither score nor cost exists
500
 
 
530
  # Apply the formatting and return the updated DataFrame
531
  return df.assign(**{score_col_name: df[score_col_name].apply(apply_formatting)})
532
 
533
+
534
+ def get_pareto_df(data):
535
+ # This is a placeholder; use your actual function that handles dynamic column names
536
+ # A robust version might look for any column with "Cost" and "Score"
537
+ cost_cols = [c for c in data.columns if 'Cost' in c]
538
+ score_cols = [c for c in data.columns if 'Score' in c]
539
+ if not cost_cols or not score_cols:
540
+ return pd.DataFrame()
541
+
542
+ x_col, y_col = cost_cols[0], score_cols[0]
543
+
544
+ frontier_data = data.dropna(subset=[x_col, y_col]).copy()
545
+ frontier_data[y_col] = pd.to_numeric(frontier_data[y_col], errors='coerce')
546
+ frontier_data[x_col] = pd.to_numeric(frontier_data[x_col], errors='coerce')
547
+ frontier_data.dropna(subset=[x_col, y_col], inplace=True)
548
+ if frontier_data.empty:
549
+ return pd.DataFrame()
550
+
551
+ frontier_data = frontier_data.sort_values(by=[x_col, y_col], ascending=[True, False])
552
+
553
+ pareto_points = []
554
+ max_score_at_cost = -np.inf
555
+
556
+ for _, row in frontier_data.iterrows():
557
+ if row[y_col] >= max_score_at_cost:
558
+ pareto_points.append(row)
559
+ max_score_at_cost = row[y_col]
560
+
561
+ return pd.DataFrame(pareto_points)
562
+
563
+
564
+ def svg_to_data_uri(path: str) -> str:
565
+ """Reads an SVG file and encodes it as a Data URI for Plotly."""
566
+ try:
567
+ with open(path, "rb") as f:
568
+ encoded_string = base64.b64encode(f.read()).decode()
569
+ return f"data:image/svg+xml;base64,{encoded_string}"
570
+ except FileNotFoundError:
571
+ logger.warning(f"SVG file not found at: {path}")
572
+ return None
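For reference, a rough usage sketch of the two helpers added above, get_pareto_df and svg_to_data_uri. The column names, green highlight, and wiring are assumptions based on the leaderboard's "Overall Score"/"Overall Cost" convention and PARETO_DISCLAIMER, not code from this repository:

import pandas as pd
from leaderboard_transformer import get_pareto_df, svg_to_data_uri

df = pd.DataFrame({
    "Agent": ["A", "B", "C"],
    "Overall Cost": [1.0, 2.0, 3.0],
    "Overall Score": [0.4, 0.7, 0.6],
})

# Rows on the cost/score frontier: here agents A and B (C costs more than B yet scores lower).
pareto_df = get_pareto_df(df)
pareto_agents = set(pareto_df["Agent"])

# Hypothetical wiring: tint Pareto-optimal agent names green in the rendered table.
df["Agent"] = df["Agent"].apply(
    lambda a: f'<span style="color: #0FCB8C;">{a}</span>' if a in pareto_agents else a
)

# The SVG icon is embedded into the Plotly figure as a base64 data URI,
# as done via fig.add_layout_image(source=logo_data_uri, ...) above.
logo_uri = svg_to_data_uri("assets/just-icon.svg")
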
leaderboard_viewer.py DELETED
@@ -1,319 +0,0 @@
1
- """
2
- View and plot leaderboard results.
3
- """
4
-
5
- import logging
6
- from typing import Optional
7
- from zoneinfo import ZoneInfo
8
-
9
- import datasets
10
- import matplotlib.pyplot as plt
11
- import numpy as np
12
- import pandas as pd
13
- import seaborn as sns
14
-
15
- from agenteval import compute_summary_statistics
16
- from agenteval.config import SuiteConfig
17
- from agenteval.models import EvalResult
18
-
19
- logger = logging.getLogger(__name__)
20
-
21
-
22
- class LeaderboardViewer:
23
- """
24
- Load and visualize leaderboard for a given HF dataset split.
25
- """
26
-
27
- def __init__(
28
- self, repo_id: str, config: str, split: str, is_internal: bool = False
29
- ):
30
- self._repo_id = repo_id
31
- self._config = config
32
- self._split = split
33
- self._internal = is_internal
34
-
35
- # build suite_config and mapping from tags to tasks from the first result
36
- # TODO: Verify the sort order
37
- ds = datasets.load_dataset(repo_id, name=config).get(split)
38
- if not ds:
39
- raise ValueError(f"Split '{split}' not found in dataset results")
40
- suite = EvalResult.model_validate(ds[0]).suite_config
41
- self._cfg = suite
42
- self.tag_map: dict[str, list[str]] = {}
43
- for task in suite.get_tasks(split):
44
- for t in task.tags or []:
45
- self.tag_map.setdefault(t, []).append(task.name)
46
-
47
- def _load(self):
48
- results = datasets.load_dataset(self._repo_id, name=self._config)
49
- overview = _get_dataframe(
50
- eval_results=results,
51
- split=self._split,
52
- is_internal=self._internal,
53
- suite_config=self._cfg,
54
- )
55
- return overview, self.tag_map
56
-
57
- def view(
58
- self, tag: Optional[str] = None, with_plots: bool = False
59
- ) -> tuple[pd.DataFrame, dict[str, plt.Figure]]:
60
- """
61
- If tag is None, primary="Overall" and group=all tags.
62
- Otherwise primary=tag and group=tasks under that tag.
63
- """
64
- data, tag_map = self._load()
65
- cols = [
66
- "Agent",
67
- "Submitter",
68
- "Completeness",
69
- "LLM Base",
70
- "Openness" ,
71
- "Date",
72
- "Logs",
73
- ]
74
-
75
- # choose primary metric and its sub‐group
76
- if tag is None:
77
- primary = "Overall"
78
- group = list(tag_map.keys())
79
- else:
80
- primary = tag
81
- group = tag_map.get(tag, [])
82
- data = data.sort_values(primary, ascending=False)
83
-
84
- # build full metric list: primary + its cost + each member and its cost
85
- metrics = [primary, f"{primary} cost"] + [
86
- m for t in group for m in (t, f"{t} cost")
87
- ]
88
-
89
- # filter to relevant columns
90
- ci_cols = [f"{m} 95% CI" for m in metrics if f"{m} 95% CI" in data.columns]
91
- df = data.loc[
92
- :,
93
- cols + [c for c in metrics if c in data.columns] + ci_cols,
94
- ].reset_index(drop=True)
95
-
96
- plots: dict[str, plt.Figure] = {}
97
- if with_plots:
98
- avail = [c for c in metrics if c in df.columns]
99
- for m in [primary] + group:
100
- x, y = f"{m} cost", m
101
- if x in df.columns and y in df.columns:
102
- plots[f"scatter_{m}"] = _plot_scatter(
103
- df, x=x, y=y, agent_col="Agent"
104
- )
105
-
106
- return df, plots
107
-
108
-
109
- def _get_dataframe(
110
- eval_results: datasets.DatasetDict,
111
- split: str,
112
- is_internal: bool,
113
- suite_config: SuiteConfig,
114
- timezone: str = "US/Pacific",
115
- ) -> pd.DataFrame:
116
- """
117
- Load leaderboard results from the given dataset split and return a DataFrame.
118
- """
119
- ds = eval_results.get(split)
120
- if not ds:
121
- cols = ["agent_name", "agent_description", "username", "submit_time"]
122
- pretty = [_pretty_column_name(c) for c in cols]
123
- empty = pd.DataFrame({c: ["No data"] for c in pretty})
124
- return empty
125
-
126
- cfg = suite_config
127
-
128
- rows = []
129
- for itm in ds:
130
- ev = EvalResult.model_validate(itm)
131
- sub = ev.submission
132
- # only format if submit_time present, else leave as None
133
- ts = sub.submit_time
134
- if ts is not None:
135
- date = ts.astimezone(ZoneInfo(timezone)).strftime("%Y-%m-%d")
136
- else:
137
- date = None
138
-
139
- if not ev.results:
140
- logger.warning(
141
- f"Skipping submission {sub.agent_name} ({sub.username}) "
142
- f"({sub.submit_time}) with no results"
143
- )
144
- continue
145
- stats = compute_summary_statistics(
146
- suite_config=cfg, split=split, results=ev.results
147
- )
148
- flat = {}
149
- for key, s in stats.items():
150
- parts = key.split("/")
151
- if parts[0] == "overall":
152
- flat["overall/score"], flat["overall/cost"] = s.score, s.cost
153
- elif parts[0] == "tag":
154
- flat[f"tag/{parts[1]}/score"], flat[f"tag/{parts[1]}/cost"] = (
155
- s.score,
156
- s.cost,
157
- )
158
- else: # task
159
- t0 = parts[1]
160
- # compute 95% CI half-width from stderr
161
- flat.update(
162
- {
163
- f"task/{t0}/score": s.score,
164
- f"task/{t0}/score_ci": (
165
- (s.score_stderr * 1.96)
166
- if s.score_stderr is not None
167
- else np.nan
168
- ),
169
- f"task/{t0}/cost": s.cost,
170
- f"task/{t0}/cost_ci": (
171
- (s.cost_stderr * 1.96)
172
- if s.cost_stderr is not None
173
- else np.nan
174
- ),
175
- }
176
- )
177
-
178
- rows.append(
179
- {
180
- "agent_name": sub.agent_name,
181
- "username": sub.username or "",
182
- "submit_time": date,
183
- **flat,
184
- "logs_url": sub.logs_url if is_internal else sub.logs_url_public,
185
- }
186
- )
187
-
188
- df = pd.DataFrame(rows)
189
-
190
- # prepare pretty column mapping
191
- pretty_cols = {c: _pretty_column_name(c) for c in df.columns}
192
-
193
- # construct overview table with human-friendly names
194
- overview = df.rename(columns=pretty_cols)
195
-
196
- return overview
197
-
198
-
199
- def _pretty_column_name(col: str) -> str:
200
- """Map raw column name to display name."""
201
- # fixed mappings
202
- mapping = {
203
- "submit_time": "Date",
204
- "agent_name": "Agent",
205
- "username": "User/organization",
206
- "logs_url": "Logs",
207
- "overall/score": "Score",
208
- "overall/cost": "Cost (USD)",
209
- }
210
- if col in mapping:
211
- return mapping[col]
212
- # dynamic: task/{name}/{metric} or tag/{name}/{metric}
213
- parts = col.split("/")
214
- if len(parts) == 3:
215
- _, name, metric = parts
216
- if metric == "score":
217
- return name
218
- if metric == "cost":
219
- return f"{name} cost"
220
- if metric == "score_ci":
221
- return f"{name} 95% CI"
222
- if metric == "cost_ci":
223
- return f"{name} cost 95% CI"
224
- # fallback to last segment
225
- return parts[-1]
226
-
227
-
228
-
229
- def _plot_scatter(
230
- data: pd.DataFrame,
231
- x: str, # Cost column name (e.g., "Overall cost")
232
- y: str, # Score column name (e.g., "Overall score")
233
- agent_col: str,
234
- ) -> plt.Figure:
235
- """Scatter plot of agent results, showing score vs cost with Pareto frontier."""
236
- fig, ax = plt.subplots(figsize=(20,7))
237
-
238
- # Make a copy for manipulation to find frontier without affecting original data
239
- plot_data = data.copy()
240
-
241
- # Ensure score (y) and cost (x) are numeric and drop NaNs for frontier calculation
242
- plot_data[y] = pd.to_numeric(plot_data[y], errors='coerce')
243
- plot_data[x] = pd.to_numeric(plot_data[x], errors='coerce')
244
- frontier_data = plot_data.dropna(subset=[y, x])
245
-
246
- if not frontier_data.empty:
247
- # Sort by cost (x) ascending, then by score (y) descending for tie-breaking
248
- frontier_data = frontier_data.sort_values(by=[x, y], ascending=[True, False])
249
-
250
- pareto_points = []
251
- max_score_at_cost = -np.inf # Initialize with negative infinity
252
-
253
- for index, row in frontier_data.iterrows():
254
- current_score = row[y]
255
- current_cost = row[x]
256
- # Only add point if it offers a higher score than any previous point
257
- # on the frontier with less or equal cost (implicit by sorting).
258
- # More strictly, for a point to be on the frontier here, it must improve the score.
259
- if current_score > max_score_at_cost:
260
- # Optional: If allowing same score but lower cost (already handled by sort somewhat)
261
- # you might need to check if a point with same score but lower cost exists
262
- # For this algorithm, we simply take points that strictly increase score.
263
- pareto_points.append(row)
264
- max_score_at_cost = current_score
265
-
266
- if pareto_points:
267
- pareto_df = pd.DataFrame(pareto_points)
268
- # Sort pareto_df by cost again just to be sure for plotting line
269
- pareto_df = pareto_df.sort_values(by=x)
270
- # Plot the Pareto frontier line
271
- ax.plot(pareto_df[x], pareto_df[y], marker='o', linestyle='-', color='red', alpha=0.7, linewidth=2, markersize=5, label='Pareto Frontier')
272
-
273
- # Plot all data points
274
- sns.scatterplot(data=data, x=x, y=y, hue=agent_col, s=100, ax=ax, legend="auto")
275
-
276
- # Error bars (if they exist)
277
- x_ci_col = f"{x} 95% CI"
278
- y_ci_col = f"{y} 95% CI"
279
- if x_ci_col in data.columns or y_ci_col in data.columns:
280
- # Filter data for error bars to only include rows present in the original 'data'
281
- # This is important if 'frontier_data' subset was used for some logic but error bars are for all.
282
- error_bar_data = data.copy() # Use original data for error bars
283
- error_bar_data[x_ci_col] = pd.to_numeric(error_bar_data.get(x_ci_col), errors='coerce')
284
- error_bar_data[y_ci_col] = pd.to_numeric(error_bar_data.get(y_ci_col), errors='coerce')
285
-
286
- ax.errorbar(
287
- x=error_bar_data[x], # Use original data's x
288
- y=error_bar_data[y], # Use original data's y
289
- xerr=error_bar_data.get(x_ci_col),
290
- yerr=error_bar_data.get(y_ci_col),
291
- fmt="none",
292
- ecolor="gray",
293
- alpha=0.5,
294
- capsize=3,
295
- zorder=0 # Draw error bars behind scatter points
296
- )
297
-
298
- ax.set_xlim(left=0)
299
- ax.set_ylim(bottom=0) # Scores and costs are typically non-negative
300
- ax.set_xlabel(x) # x is cost
301
- ax.set_ylabel(y) # y is score
302
-
303
- # Adjust legend: Get handles and labels from seaborn plot, then add frontier's
304
- handles, labels = ax.get_legend_handles_labels()
305
- # Check if "Pareto Frontier" was actually plotted and add its handle/label if so
306
- if pareto_points and "Pareto Frontier" not in labels: # Avoid duplicate legend items
307
- # Find the frontier line object to get its handle
308
- frontier_line = next((line for line in ax.get_lines() if line.get_label() == 'Pareto Frontier'), None)
309
- if frontier_line:
310
- handles.append(frontier_line)
311
- labels.append('Pareto Frontier')
312
-
313
- ax.legend(handles=handles, labels=labels, title=agent_col, bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0.)
314
-
315
- plt.tight_layout(rect=[0, 0, 0.85, 1])
316
- return fig
317
-
318
-
319
- __all__ = ["LeaderboardViewer"]
 
literature_understanding.py CHANGED
@@ -2,13 +2,19 @@ import gradio as gr
2
  import pandas as pd
3
 
4
  # Import our UI factories and the data loader
5
- from ui_components import create_leaderboard_display, create_benchmark_details_display, get_full_leaderboard_data
6
-
7
  # Define the category for this page
8
  CATEGORY_NAME = "Literature Understanding"
9
 
10
  with gr.Blocks() as demo:
11
- gr.Markdown(f"## {CATEGORY_NAME} Leaderboard Results")
 
 
 
 
 
 
12
 
13
  # --- This page now has two main sections: Validation and Test ---
14
  with gr.Tabs():
 
2
  import pandas as pd
3
 
4
  # Import our UI factories and the data loader
5
+ from ui_components import create_leaderboard_display, create_benchmark_details_display, get_full_leaderboard_data, create_sub_navigation_bar
6
+ from content import LIT_DESCRIPTION
7
  # Define the category for this page
8
  CATEGORY_NAME = "Literature Understanding"
9
 
10
  with gr.Blocks() as demo:
11
+ gr.Markdown(f"## {CATEGORY_NAME} Aggregated")
12
+
13
+ validation_df, validation_tag_map = get_full_leaderboard_data("validation")
14
+ test_df, test_tag_map = get_full_leaderboard_data("test")
15
+ gr.Markdown(LIT_DESCRIPTION, elem_id="category-intro")
16
+ if validation_tag_map:
17
+ create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
18
 
19
  # --- This page now has two main sections: Validation and Test ---
20
  with gr.Tabs():
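The sub-navigation bar rendered above links to the per-benchmark headings further down the page. A minimal sketch of the anchor convention (illustrative; "Paper Finder Validation" is the example name from create_gradio_anchor_id()'s docstring, and the import path is assumed):

    from ui_components import create_gradio_anchor_id

    name = "Paper Finder Validation"
    anchor = create_gradio_anchor_id(name)    # -> "h-paper-finder-validation"
    link = f"<a href='#{anchor}'>{name}</a>"  # what create_sub_navigation_bar() emits
    # The matching target is produced further down by
    # gr.Markdown(f"### {name}", header_links=True), which generates the same "h-..." id.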
main_page.py CHANGED
@@ -1,341 +1,26 @@
1
  import matplotlib
2
  matplotlib.use('Agg')
3
-
4
- import os
5
- import shutil
6
- import tarfile
7
- import tempfile
8
- from datetime import datetime, timedelta, timezone
9
- from email.utils import parseaddr
10
- from pathlib import Path
11
- # from zoneinfo import ZoneInfo # LeaderboardViewer uses this, ensure it's available
12
-
13
  import gradio as gr
14
- import requests
15
- from agenteval import (
16
- # compute_summary_statistics, # This will now be used by LeaderboardViewer
17
- process_eval_logs,
18
- upload_folder_to_hf,
19
- upload_summary_to_hf,
20
- )
21
- from agenteval.models import EvalResult # Used by submission and LeaderboardViewer (implicitly)
22
- from agenteval.leaderboard.upload import sanitize_path_component
23
- from datasets import Dataset, DatasetDict, VerificationMode, load_dataset # load_dataset used by LV
24
- from datasets.data_files import EmptyDatasetError
25
- from huggingface_hub import HfApi
26
 
27
  from ui_components import create_leaderboard_display, get_full_leaderboard_data
28
 
29
  from content import (
30
  CITATION_BUTTON_LABEL,
31
  CITATION_BUTTON_TEXT,
32
- INTRODUCTION_TEXT,
33
- SUBMISSION_TEXT,
34
- INTRO_PARAGRAPH,
35
- SCATTER_DISCLAIMER,
36
- format_error,
37
- format_log,
38
- format_warning,
39
  )
40
 
41
- # --- Constants and Configuration ---
42
- LOCAL_DEBUG = not (os.environ.get("system") == "spaces")
43
- CONFIG_NAME = "1.0.0-dev1" # This corresponds to 'config' in LeaderboardViewer
44
- IS_INTERNAL = os.environ.get("IS_INTERNAL", "false").lower() == "true"
45
-
46
- OWNER = "allenai"
47
- PROJECT_NAME = "asta-bench" + ("-internal" if IS_INTERNAL else "")
48
- SUBMISSION_DATASET = f"{OWNER}/{PROJECT_NAME}-submissions"
49
- SUBMISSION_DATASET_PUBLIC = f"{OWNER}/{PROJECT_NAME}-submissions-public"
50
- CONTACT_DATASET = f"{OWNER}/{PROJECT_NAME}-contact-info"
51
- RESULTS_DATASET = f"{OWNER}/{PROJECT_NAME}-results" # This is the repo_id for LeaderboardViewer
52
- LEADERBOARD_PATH = f"{OWNER}/{PROJECT_NAME}-leaderboard"
53
-
54
- if LOCAL_DEBUG:
55
- DATA_DIR = os.path.join(os.path.dirname(__file__), "data", CONFIG_NAME)
56
- else:
57
- DATA_DIR = "/home/user/data/" + CONFIG_NAME
58
- EXTRACTED_DATA_DIR = os.path.join(DATA_DIR, "extracted")
59
-
60
- api = HfApi()
61
- MAX_UPLOAD_BYTES = 100 * 1024**2
62
- AGENTEVAL_MANIFEST_NAME = "agenteval.json"
63
- os.makedirs(EXTRACTED_DATA_DIR, exist_ok=True)
64
-
65
  # --- Global State for Viewers (simple caching) ---
66
  CACHED_VIEWERS = {}
67
  CACHED_TAG_MAPS = {}
68
 
69
- # --- Submission Logic (largely unchanged from original, ensure EvalResult and other deps are fine) ---
70
- def try_load_dataset_submission(*args, **kwargs) -> DatasetDict: # Renamed to avoid conflict if LV has one
71
- try:
72
- return load_dataset(*args, **kwargs)
73
- except EmptyDatasetError:
74
- return DatasetDict()
75
- except ValueError: # Handles cases where dataset is empty or ill-formed
76
- return DatasetDict()
77
-
78
- def checked_upload_folder(
79
- api_hf: HfApi, # Renamed to avoid conflict with global api
80
- folder_path: str,
81
- repo_id: str,
82
- config_name_ul: str, # Renamed
83
- split_ul: str, # Renamed
84
- submission_name_ul: str, # Renamed
85
- ) -> str:
86
- total = 0
87
- for root, _, files in os.walk(folder_path):
88
- for f_ul in files: # Renamed
89
- total += os.path.getsize(os.path.join(root, f_ul))
90
- if total > MAX_UPLOAD_BYTES:
91
- raise ValueError(
92
- f"Upload too large: exceeds {MAX_UPLOAD_BYTES // (1024**2)} MB limit."
93
- )
94
- return upload_folder_to_hf(
95
- api=api_hf, # Use renamed parameter
96
- folder_path=folder_path,
97
- repo_id=repo_id,
98
- config_name=config_name_ul,
99
- split=split_ul,
100
- submission_name=submission_name_ul,
101
- )
102
-
103
- def add_new_eval(
104
- val_or_test: str,
105
- agent_name: str | None,
106
- agent_description: str,
107
- agent_url: str,
108
- openness: str | None,
109
- degree_of_control: str | None,
110
- path_to_file: tempfile._TemporaryFileWrapper | None,
111
- username: str,
112
- mail: str,
113
- profile: gr.OAuthProfile,
114
- # We need global eval_results for checks; this might need rethinking if it's purely display driven now
115
- # For now, let's assume we still load it for submission checks
116
- ):
117
- # Load current eval_results for submission checks
118
- # This is a bit redundant if display part reloads it, but submission needs its own consistent view
119
- current_eval_results_for_submission = try_load_dataset_submission(
120
- RESULTS_DATASET,
121
- CONFIG_NAME,
122
- download_mode="force_redownload", # Or a less aggressive mode
123
- verification_mode=VerificationMode.NO_CHECKS,
124
- trust_remote_code=True,
125
- )
126
- if not agent_name:
127
- return format_warning("Please provide an agent name.")
128
-
129
- submission_time = datetime.now(timezone.utc)
130
- if not username or username.strip() == "":
131
- username = profile.username # Default to HF username
132
-
133
- # User account age check
134
- try:
135
- user_data_resp = requests.get(f"https://huggingface.co/api/users/{profile.username}/overview")
136
- user_data_resp.raise_for_status()
137
- creation_date_str = user_data_resp.json()["createdAt"]
138
- created_at = datetime.strptime(creation_date_str, "%Y-%m-%dT%H:%M:%S.%fZ").replace(tzinfo=timezone.utc)
139
- if submission_time - created_at < timedelta(days=60):
140
- return format_error("This account is not authorized to submit here (account too new).")
141
- except Exception as e:
142
- print(f"Error checking user account age: {e}")
143
- return format_error("Could not verify account age. Please try again later.")
144
-
145
- # Submission frequency check
146
- contact_infos = try_load_dataset_submission(
147
- CONTACT_DATASET, CONFIG_NAME, download_mode="force_redownload",
148
- verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True
149
- )
150
- user_submission_dates = sorted(
151
- datetime.fromisoformat(row["submit_time"])
152
- for row in contact_infos.get(val_or_test, []) if row["username_auth"] == profile.username
153
- )
154
- if user_submission_dates and (submission_time - user_submission_dates[-1] < timedelta(days=1)):
155
- return format_error("You already submitted once in the last 24h for this split; please try again later.")
156
-
157
- # Email validation
158
- _, parsed_mail = parseaddr(mail)
159
- if "@" not in parsed_mail:
160
- return format_warning("Please provide a valid email address.")
161
-
162
- # Duplicate submission check
163
- if val_or_test in current_eval_results_for_submission and len(current_eval_results_for_submission[val_or_test]) > 0:
164
- existing_submissions = current_eval_results_for_submission[val_or_test].to_dict().get("submission", [])
165
- for sub_item in existing_submissions:
166
- if (sub_item.get("agent_name", "").lower() == agent_name.lower() and
167
- sub_item.get("username", "").lower() == username.lower()):
168
- return format_warning("This agent name by this user has already been submitted to this split.")
169
-
170
- if path_to_file is None:
171
- return format_warning("Please attach a .tar.gz file.")
172
-
173
- safe_username = sanitize_path_component(username)
174
- safe_agent_name = sanitize_path_component(agent_name)
175
- extracted_dir = os.path.join(EXTRACTED_DATA_DIR, f"{safe_username}_{safe_agent_name}")
176
-
177
- # File extraction
178
- if not LOCAL_DEBUG:
179
- try:
180
- if os.path.exists(extracted_dir): shutil.rmtree(extracted_dir)
181
- os.makedirs(extracted_dir, exist_ok=True)
182
- with tarfile.open(path_to_file.name, "r:gz") as tar:
183
- members_extracted = 0
184
- for member in tar.getmembers():
185
- if not member.isreg(): continue
186
- fname = os.path.basename(member.name)
187
- if not fname or fname.startswith("."): continue
188
- fobj = tar.extractfile(member)
189
- if not fobj: continue
190
- with open(os.path.join(extracted_dir, fname), "wb") as out:
191
- out.write(fobj.read())
192
- members_extracted +=1
193
- if members_extracted == 0:
194
- return format_error("Submission tarball is empty or contains no valid files.")
195
- except Exception as e:
196
- return format_error(f"Error extracting file: {e}. Ensure it's a valid .tar.gz.")
197
- else: print("mock extracted file", flush=True)
198
-
199
-
200
- submission_name = f"{safe_username}_{safe_agent_name}_{submission_time.strftime('%Y-%m-%d_%H-%M-%S')}"
201
-
202
- # 1. Upload raw (unscored) submission files
203
- if not LOCAL_DEBUG:
204
- try:
205
- checked_upload_folder(api, extracted_dir, SUBMISSION_DATASET, CONFIG_NAME, val_or_test, submission_name)
206
- except ValueError as e: return format_error(str(e))
207
- except Exception as e: return format_error(f"Failed to upload raw submission: {e}")
208
- else: print("mock uploaded raw submission", flush=True)
209
-
210
- # 2. Save contact information
211
- contact_info = {
212
- "agent_name": agent_name, "agent_description": agent_description, "url": agent_url,
213
- "username": username, "username_auth": profile.username, "mail": mail,
214
- "submit_time": submission_time.isoformat(),
215
- }
216
- if val_or_test in contact_infos:
217
- contact_infos[val_or_test] = contact_infos[val_or_test].add_item(contact_info)
218
- else:
219
- contact_infos[val_or_test] = Dataset.from_list([contact_info])
220
-
221
- if not LOCAL_DEBUG:
222
- try:
223
- contact_infos.push_to_hub(CONTACT_DATASET, config_name=CONFIG_NAME)
224
- except Exception as e: return format_warning(f"Submission recorded, but contact info failed to save: {e}")
225
- else: print("mock uploaded contact info", flush=True)
226
-
227
-
228
- # 3. Process and score the submission
229
- eval_result_obj = None # Define to avoid NameError
230
- try:
231
- json_path = Path(extracted_dir) / AGENTEVAL_MANIFEST_NAME
232
- if not json_path.exists():
233
- return format_error(f"Missing manifest {AGENTEVAL_MANIFEST_NAME} in submission.")
234
-
235
- eval_result_obj = EvalResult.model_validate_json(json_path.read_text(encoding="utf-8"))
236
- if eval_result_obj.suite_config.version != CONFIG_NAME:
237
- return format_error(f"Suite version mismatch: expected {CONFIG_NAME}, got {eval_result_obj.suite_config.version}.")
238
- if eval_result_obj.split != val_or_test:
239
- return format_error(f"Split mismatch: expected {val_or_test}, got {eval_result_obj.split}.")
240
-
241
- # Re-compute results from logs for integrity
242
- eval_result_obj.results = process_eval_logs(extracted_dir)[0] # Assuming process_eval_logs returns a tuple/list
243
- eval_result_obj.save_json(str(json_path)) # Save the re-processed manifest
244
-
245
- except Exception as e:
246
- return format_error(f"Error scoring submission: {e}. Check manifest and log files.")
247
-
248
- # 4. Upload scored submission files
249
- logs_url_private_val, logs_url_public_val = None, None
250
- scored_submission_name = f"{submission_name}_scored"
251
- if not LOCAL_DEBUG:
252
- try:
253
- logs_url_private_val = checked_upload_folder(api, extracted_dir, SUBMISSION_DATASET, CONFIG_NAME, val_or_test, scored_submission_name)
254
- if val_or_test == "validation" and not IS_INTERNAL: # Public copy for validation
255
- logs_url_public_val = checked_upload_folder(api, extracted_dir, SUBMISSION_DATASET_PUBLIC, CONFIG_NAME, val_or_test, scored_submission_name)
256
- except ValueError as e: return format_error(str(e))
257
- except Exception as e: return format_error(f"Failed to upload scored submission: {e}")
258
- else: print("mock uploaded scored submission", flush=True)
259
-
260
-
261
- # Update EvalResult with submission details
262
- eval_result_obj.submission.agent_name = agent_name
263
- eval_result_obj.submission.agent_description = agent_description
264
- eval_result_obj.submission.agent_url = agent_url
265
- eval_result_obj.submission.openness = openness
266
- eval_result_obj.submission.degree_of_control = degree_of_control
267
- eval_result_obj.submission.username = username
268
- eval_result_obj.submission.submit_time = submission_time
269
- eval_result_obj.submission.logs_url = logs_url_private_val
270
- eval_result_obj.submission.logs_url_public = logs_url_public_val
271
-
272
- # 5. Upload summary statistics to RESULTS_DATASET (for the leaderboard)
273
- if not LOCAL_DEBUG:
274
- try:
275
- upload_summary_to_hf(api, eval_result_obj, RESULTS_DATASET, CONFIG_NAME, val_or_test, scored_submission_name)
276
- except Exception as e:
277
- return format_error(f"Failed to upload summary results to leaderboard: {e}")
278
- else: print("mock uploaded results to lb", flush=True)
279
-
280
- # Invalidate viewer cache for the split that was updated
281
- if val_or_test in CACHED_VIEWERS:
282
- del CACHED_VIEWERS[val_or_test]
283
- if val_or_test in CACHED_TAG_MAPS:
284
- del CACHED_TAG_MAPS[val_or_test]
285
-
286
-
287
- return format_log(
288
- f"Agent '{agent_name}' submitted successfully by '{username}' to '{val_or_test}' split. "
289
- "Please refresh the leaderboard in a few moments. It may take some time for changes to propagate."
290
- )
291
-
292
- with gr.Blocks() as demo:
293
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
294
- gr.HTML(INTRO_PARAGRAPH, elem_id="intro-paragraph")
295
-
296
- # --- Submission Accordion ---
297
- with gr.Accordion("🚀 Submit a new agent for evaluation", open=False, elem_classes="submission-accordion"):
298
- gr.Markdown(SUBMISSION_TEXT, elem_id="markdown-text")
299
- with gr.Row():
300
- with gr.Column():
301
- level_of_test_radio = gr.Radio(["validation", "test"], value="validation", label="Split")
302
- agent_name_tb = gr.Textbox(label="Agent Name")
303
- agent_desc_tb = gr.Textbox(label="Agent Description")
304
- agent_url_tb = gr.Textbox(label="URL to Agent Information")
305
- openness_radio = gr.Radio(["Open Source", "API", "UI"], value=None, label="Openness of Agent")
306
- degree_of_control_radio = gr.Radio(["Standard", "Custom"], value=None, label="Degree of Control")
307
- with gr.Column():
308
- username_tb = gr.Textbox(label="Organization or User Name (Defaults to HF username)")
309
- mail_tb = gr.Textbox(label="Contact Email (Private, for submission issues)")
310
- file_upload_comp = gr.File(
311
- label="Submission File (.tar.gz ...)", # Shortened for brevity
312
- file_types=[".gz", ".tar.gz"]
313
- )
314
- with gr.Row():
315
- gr.LoginButton()
316
- submit_eval_button = gr.Button("Submit Evaluation")
317
- submission_result = gr.Markdown()
318
-
319
- submit_eval_button.click(
320
- add_new_eval,
321
- [
322
- level_of_test_radio,
323
- agent_name_tb,
324
- agent_desc_tb,
325
- agent_url_tb,
326
- openness_radio,
327
- degree_of_control_radio,
328
- file_upload_comp,
329
- username_tb,
330
- mail_tb
331
- ],
332
- submission_result,
333
- )
334
-
335
  # --- Leaderboard Display Section ---
336
  gr.Markdown("---")
337
  CATEGORY_NAME = "Overall"
338
- gr.Markdown(f"## {CATEGORY_NAME} Leaderboard Results")
339
 
340
  with gr.Tabs() as tabs:
341
  with gr.Tab("Results: Validation"):
@@ -352,7 +37,6 @@ with gr.Blocks() as demo:
352
  split_name="validation"
353
  )
354
  else:
355
- # Display a message if no data is available
356
  gr.Markdown("No data available for validation split.")
357
 
358
  with gr.Tab("Results: Test"):
 
1
  import matplotlib
2
  matplotlib.use('Agg')
 
 
 
 
 
 
 
 
 
 
3
  import gradio as gr
4
+
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  from ui_components import create_leaderboard_display, get_full_leaderboard_data
7
 
8
  from content import (
9
  CITATION_BUTTON_LABEL,
10
  CITATION_BUTTON_TEXT,
11
+ INTRO_PARAGRAPH
 
 
 
 
 
 
12
  )
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  # --- Global State for Viewers (simple caching) ---
15
  CACHED_VIEWERS = {}
16
  CACHED_TAG_MAPS = {}
17
 
18
+ with gr.Blocks(fill_width=True) as demo:
19
+ gr.Markdown(INTRO_PARAGRAPH, elem_id="intro-paragraph")
 
20
  # --- Leaderboard Display Section ---
21
  gr.Markdown("---")
22
  CATEGORY_NAME = "Overall"
23
+ gr.Markdown(f"## {CATEGORY_NAME} Categories Aggregated")
24
 
25
  with gr.Tabs() as tabs:
26
  with gr.Tab("Results: Validation"):
 
37
  split_name="validation"
38
  )
39
  else:
 
40
  gr.Markdown("No data available for validation split.")
41
 
42
  with gr.Tab("Results: Test"):
requirements.txt CHANGED
@@ -1,5 +1,131 @@
1
- datasets
2
- gradio[oauth]==5.30.0
3
- huggingface-hub
4
- APScheduler
5
- agent-eval==0.1.9
 
1
+ agent-eval==0.1.13
2
+ aiobotocore==2.22.0
3
+ aiofiles==24.1.0
4
+ aiohappyeyeballs==2.6.1
5
+ aiohttp==3.11.18
6
+ aioitertools==0.12.0
7
+ aiosignal==1.3.2
8
+ annotated-types==0.7.0
9
+ anyio==4.9.0
10
+ APScheduler==3.11.0
11
+ async-timeout==5.0.1
12
+ attrs==25.3.0
13
+ Authlib==1.5.2
14
+ beautifulsoup4==4.13.4
15
+ black==25.1.0
16
+ botocore==1.37.3
17
+ certifi==2025.4.26
18
+ cffi==1.17.1
19
+ charset-normalizer==3.4.2
20
+ click==8.1.8
21
+ contourpy==1.3.2
22
+ cryptography==44.0.3
23
+ cycler==0.12.1
24
+ datasets==3.6.0
25
+ debugpy==1.8.14
26
+ dill==0.3.8
27
+ distro==1.9.0
28
+ docstring_parser==0.16
29
+ exceptiongroup==1.2.2
30
+ fastapi==0.115.12
31
+ ffmpy==0.5.0
32
+ filelock==3.18.0
33
+ fonttools==4.58.1
34
+ frozenlist==1.6.0
35
+ fsspec==2025.3.0
36
+ gradio==5.30.0
37
+ gradio_client==1.10.1
38
+ groovy==0.1.2
39
+ h11==0.16.0
40
+ httpcore==1.0.9
41
+ httpx==0.28.1
42
+ huggingface-hub==0.30.2
43
+ idna==3.10
44
+ ijson==3.3.0
45
+ importlib_metadata==8.7.0
46
+ inspect_ai==0.3.94
47
+ isort==6.0.1
48
+ itsdangerous==2.2.0
49
+ Jinja2==3.1.6
50
+ jiter==0.9.0
51
+ jmespath==1.0.1
52
+ jsonlines==4.0.0
53
+ jsonpatch==1.33
54
+ jsonpointer==3.0.0
55
+ jsonschema==4.23.0
56
+ jsonschema-specifications==2025.4.1
57
+ kiwisolver==1.4.8
58
+ linkify-it-py==2.0.3
59
+ litellm==1.68.1
60
+ markdown-it-py==3.0.0
61
+ MarkupSafe==3.0.2
62
+ matplotlib==3.10.3
63
+ mdit-py-plugins==0.4.2
64
+ mdurl==0.1.2
65
+ mmh3==5.1.0
66
+ mplcursors==0.6
67
+ multidict==6.4.3
68
+ multiprocess==0.70.16
69
+ mypy_extensions==1.1.0
70
+ narwhals==1.38.2
71
+ nest-asyncio==1.6.0
72
+ numpy==2.2.5
73
+ openai==1.75.0
74
+ orjson==3.10.18
75
+ packaging==25.0
76
+ pandas==2.2.3
77
+ pathspec==0.12.1
78
+ pillow==11.2.1
79
+ platformdirs==4.3.7
80
+ plotly==6.0.1
81
+ propcache==0.3.1
82
+ psutil==7.0.0
83
+ pyarrow==20.0.0
84
+ pycparser==2.22
85
+ pydantic==2.11.4
86
+ pydantic_core==2.33.2
87
+ pydub==0.25.1
88
+ Pygments==2.19.1
89
+ pyparsing==3.2.3
90
+ python-dateutil==2.9.0.post0
91
+ python-dotenv==1.1.0
92
+ python-multipart==0.0.20
93
+ pytz==2025.2
94
+ PyYAML==6.0.2
95
+ referencing==0.36.2
96
+ regex==2024.11.6
97
+ requests==2.32.3
98
+ rich==13.9.4
99
+ rpds-py==0.24.0
100
+ ruff==0.11.8
101
+ s3fs==2025.3.0
102
+ safehttpx==0.1.6
103
+ seaborn==0.13.2
104
+ semantic-version==2.10.0
105
+ semver==3.0.4
106
+ shellingham==1.5.4
107
+ shortuuid==1.0.13
108
+ six==1.17.0
109
+ sniffio==1.3.1
110
+ soupsieve==2.7
111
+ starlette==0.46.2
112
+ tenacity==9.1.2
113
+ textual==3.2.0
114
+ tiktoken==0.9.0
115
+ tokenizers==0.21.1
116
+ tomli==2.2.1
117
+ tomlkit==0.13.2
118
+ tqdm==4.67.1
119
+ typer==0.15.3
120
+ typing-inspection==0.4.0
121
+ typing_extensions==4.13.2
122
+ tzdata==2025.2
123
+ tzlocal==5.3.1
124
+ uc-micro-py==1.0.3
125
+ urllib3==2.4.0
126
+ uvicorn==0.34.2
127
+ websockets==15.0.1
128
+ wrapt==1.17.2
129
+ xxhash==3.5.0
130
+ yarl==1.20.0
131
+ zipp==3.21.0
submission.py ADDED
@@ -0,0 +1,324 @@
 
1
+ import matplotlib
2
+ matplotlib.use('Agg')
3
+
4
+ import os
5
+ import shutil
6
+ import tarfile
7
+ import tempfile
8
+ from datetime import datetime, timedelta, timezone
9
+ from email.utils import parseaddr
10
+ from pathlib import Path
11
+
12
+ import gradio as gr
13
+ import requests
14
+ from agenteval import (
15
+ process_eval_logs,
16
+ upload_folder_to_hf,
17
+ upload_summary_to_hf,
18
+ )
19
+ from agenteval.models import EvalResult
20
+ from agenteval.leaderboard.upload import sanitize_path_component
21
+ from datasets import Dataset, DatasetDict, VerificationMode, load_dataset
22
+ from datasets.data_files import EmptyDatasetError
23
+ from huggingface_hub import HfApi
24
+
25
+ from content import (
26
+ CITATION_BUTTON_LABEL,
27
+ CITATION_BUTTON_TEXT,
28
+ format_error,
29
+ format_log,
30
+ format_warning,
31
+ )
32
+
33
+ # --- Constants and Configuration ---
34
+ LOCAL_DEBUG = not (os.environ.get("system") == "spaces")
35
+ CONFIG_NAME = "1.0.0-dev1" # This corresponds to 'config' in LeaderboardViewer
36
+ IS_INTERNAL = os.environ.get("IS_INTERNAL", "false").lower() == "true"
37
+
38
+ OWNER = "allenai"
39
+ PROJECT_NAME = "asta-bench" + ("-internal" if IS_INTERNAL else "")
40
+ SUBMISSION_DATASET = f"{OWNER}/{PROJECT_NAME}-submissions"
41
+ SUBMISSION_DATASET_PUBLIC = f"{OWNER}/{PROJECT_NAME}-submissions-public"
42
+ CONTACT_DATASET = f"{OWNER}/{PROJECT_NAME}-contact-info"
43
+ RESULTS_DATASET = f"{OWNER}/{PROJECT_NAME}-results" # This is the repo_id for LeaderboardViewer
44
+ LEADERBOARD_PATH = f"{OWNER}/{PROJECT_NAME}-leaderboard"
45
+
46
+ if LOCAL_DEBUG:
47
+ DATA_DIR = os.path.join(os.path.dirname(__file__), "data", CONFIG_NAME)
48
+ else:
49
+ DATA_DIR = "/home/user/data/" + CONFIG_NAME
50
+ EXTRACTED_DATA_DIR = os.path.join(DATA_DIR, "extracted")
51
+
52
+ api = HfApi()
53
+ MAX_UPLOAD_BYTES = 100 * 1024**2
54
+ AGENTEVAL_MANIFEST_NAME = "agenteval.json"
55
+ os.makedirs(EXTRACTED_DATA_DIR, exist_ok=True)
56
+
57
+ # --- Global State for Viewers (simple caching) ---
58
+ CACHED_VIEWERS = {}
59
+ CACHED_TAG_MAPS = {}
60
+
61
+ # --- Submission Logic (largely unchanged from original, ensure EvalResult and other deps are fine) ---
62
+ def try_load_dataset_submission(*args, **kwargs) -> DatasetDict: # Renamed to avoid conflict if LV has one
63
+ try:
64
+ return load_dataset(*args, **kwargs)
65
+ except EmptyDatasetError:
66
+ return DatasetDict()
67
+ except ValueError: # Handles cases where dataset is empty or ill-formed
68
+ return DatasetDict()
69
+
70
+ def checked_upload_folder(
71
+ api_hf: HfApi, # Renamed to avoid conflict with global api
72
+ folder_path: str,
73
+ repo_id: str,
74
+ config_name_ul: str, # Renamed
75
+ split_ul: str, # Renamed
76
+ submission_name_ul: str, # Renamed
77
+ ) -> str:
78
+ total = 0
79
+ for root, _, files in os.walk(folder_path):
80
+ for f_ul in files: # Renamed
81
+ total += os.path.getsize(os.path.join(root, f_ul))
82
+ if total > MAX_UPLOAD_BYTES:
83
+ raise ValueError(
84
+ f"Upload too large: exceeds {MAX_UPLOAD_BYTES // (1024**2)} MB limit."
85
+ )
86
+ return upload_folder_to_hf(
87
+ api=api_hf, # Use renamed parameter
88
+ folder_path=folder_path,
89
+ repo_id=repo_id,
90
+ config_name=config_name_ul,
91
+ split=split_ul,
92
+ submission_name=submission_name_ul,
93
+ )
94
+
95
+ def add_new_eval(
96
+ val_or_test: str,
97
+ agent_name: str | None,
98
+ agent_description: str,
99
+ agent_url: str,
100
+ openness: str | None,
101
+ degree_of_control: str | None,
102
+ path_to_file: tempfile._TemporaryFileWrapper | None,
103
+ username: str,
104
+ mail: str,
105
+ profile: gr.OAuthProfile,
106
+ # We need global eval_results for checks; this might need rethinking if it's purely display driven now
107
+ # For now, let's assume we still load it for submission checks
108
+ ):
109
+ # Load current eval_results for submission checks
110
+ # This is a bit redundant if display part reloads it, but submission needs its own consistent view
111
+ current_eval_results_for_submission = try_load_dataset_submission(
112
+ RESULTS_DATASET,
113
+ CONFIG_NAME,
114
+ download_mode="force_redownload", # Or a less aggressive mode
115
+ verification_mode=VerificationMode.NO_CHECKS,
116
+ trust_remote_code=True,
117
+ )
118
+ if not agent_name:
119
+ return format_warning("Please provide an agent name.")
120
+
121
+ submission_time = datetime.now(timezone.utc)
122
+ if not username or username.strip() == "":
123
+ username = profile.username # Default to HF username
124
+
125
+ # User account age check
126
+ try:
127
+ user_data_resp = requests.get(f"https://huggingface.co/api/users/{profile.username}/overview")
128
+ user_data_resp.raise_for_status()
129
+ creation_date_str = user_data_resp.json()["createdAt"]
130
+ created_at = datetime.strptime(creation_date_str, "%Y-%m-%dT%H:%M:%S.%fZ").replace(tzinfo=timezone.utc)
131
+ if submission_time - created_at < timedelta(days=60):
132
+ return format_error("This account is not authorized to submit here (account too new).")
133
+ except Exception as e:
134
+ print(f"Error checking user account age: {e}")
135
+ return format_error("Could not verify account age. Please try again later.")
136
+
137
+ # Submission frequency check
138
+ contact_infos = try_load_dataset_submission(
139
+ CONTACT_DATASET, CONFIG_NAME, download_mode="force_redownload",
140
+ verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True
141
+ )
142
+ user_submission_dates = sorted(
143
+ datetime.fromisoformat(row["submit_time"])
144
+ for row in contact_infos.get(val_or_test, []) if row["username_auth"] == profile.username
145
+ )
146
+ if user_submission_dates and (submission_time - user_submission_dates[-1] < timedelta(days=1)):
147
+ return format_error("You already submitted once in the last 24h for this split; please try again later.")
148
+
149
+ # Email validation
150
+ _, parsed_mail = parseaddr(mail)
151
+ if "@" not in parsed_mail:
152
+ return format_warning("Please provide a valid email address.")
153
+
154
+ # Duplicate submission check
155
+ if val_or_test in current_eval_results_for_submission and len(current_eval_results_for_submission[val_or_test]) > 0:
156
+ existing_submissions = current_eval_results_for_submission[val_or_test].to_dict().get("submission", [])
157
+ for sub_item in existing_submissions:
158
+ if (sub_item.get("agent_name", "").lower() == agent_name.lower() and
159
+ sub_item.get("username", "").lower() == username.lower()):
160
+ return format_warning("This agent name by this user has already been submitted to this split.")
161
+
162
+ if path_to_file is None:
163
+ return format_warning("Please attach a .tar.gz file.")
164
+
165
+ safe_username = sanitize_path_component(username)
166
+ safe_agent_name = sanitize_path_component(agent_name)
167
+ extracted_dir = os.path.join(EXTRACTED_DATA_DIR, f"{safe_username}_{safe_agent_name}")
168
+
169
+ # File extraction
170
+ if not LOCAL_DEBUG:
171
+ try:
172
+ if os.path.exists(extracted_dir): shutil.rmtree(extracted_dir)
173
+ os.makedirs(extracted_dir, exist_ok=True)
174
+ with tarfile.open(path_to_file.name, "r:gz") as tar:
175
+ members_extracted = 0
176
+ for member in tar.getmembers():
177
+ if not member.isreg(): continue
178
+ fname = os.path.basename(member.name)
179
+ if not fname or fname.startswith("."): continue
180
+ fobj = tar.extractfile(member)
181
+ if not fobj: continue
182
+ with open(os.path.join(extracted_dir, fname), "wb") as out:
183
+ out.write(fobj.read())
184
+ members_extracted +=1
185
+ if members_extracted == 0:
186
+ return format_error("Submission tarball is empty or contains no valid files.")
187
+ except Exception as e:
188
+ return format_error(f"Error extracting file: {e}. Ensure it's a valid .tar.gz.")
189
+ else: print("mock extracted file", flush=True)
190
+
191
+
192
+ submission_name = f"{safe_username}_{safe_agent_name}_{submission_time.strftime('%Y-%m-%d_%H-%M-%S')}"
193
+
194
+ # 1. Upload raw (unscored) submission files
195
+ if not LOCAL_DEBUG:
196
+ try:
197
+ checked_upload_folder(api, extracted_dir, SUBMISSION_DATASET, CONFIG_NAME, val_or_test, submission_name)
198
+ except ValueError as e: return format_error(str(e))
199
+ except Exception as e: return format_error(f"Failed to upload raw submission: {e}")
200
+ else: print("mock uploaded raw submission", flush=True)
201
+
202
+ # 2. Save contact information
203
+ contact_info = {
204
+ "agent_name": agent_name, "agent_description": agent_description, "url": agent_url,
205
+ "username": username, "username_auth": profile.username, "mail": mail,
206
+ "submit_time": submission_time.isoformat(),
207
+ }
208
+ if val_or_test in contact_infos:
209
+ contact_infos[val_or_test] = contact_infos[val_or_test].add_item(contact_info)
210
+ else:
211
+ contact_infos[val_or_test] = Dataset.from_list([contact_info])
212
+
213
+ if not LOCAL_DEBUG:
214
+ try:
215
+ contact_infos.push_to_hub(CONTACT_DATASET, config_name=CONFIG_NAME)
216
+ except Exception as e: return format_warning(f"Submission recorded, but contact info failed to save: {e}")
217
+ else: print("mock uploaded contact info", flush=True)
218
+
219
+
220
+ # 3. Process and score the submission
221
+ eval_result_obj = None # Define to avoid NameError
222
+ try:
223
+ json_path = Path(extracted_dir) / AGENTEVAL_MANIFEST_NAME
224
+ if not json_path.exists():
225
+ return format_error(f"Missing manifest {AGENTEVAL_MANIFEST_NAME} in submission.")
226
+
227
+ eval_result_obj = EvalResult.model_validate_json(json_path.read_text(encoding="utf-8"))
228
+ if eval_result_obj.suite_config.version != CONFIG_NAME:
229
+ return format_error(f"Suite version mismatch: expected {CONFIG_NAME}, got {eval_result_obj.suite_config.version}.")
230
+ if eval_result_obj.split != val_or_test:
231
+ return format_error(f"Split mismatch: expected {val_or_test}, got {eval_result_obj.split}.")
232
+
233
+ # Re-compute results from logs for integrity
234
+ eval_result_obj.results = process_eval_logs(extracted_dir)[0] # Assuming process_eval_logs returns a tuple/list
235
+ eval_result_obj.save_json(str(json_path)) # Save the re-processed manifest
236
+
237
+ except Exception as e:
238
+ return format_error(f"Error scoring submission: {e}. Check manifest and log files.")
239
+
240
+ # 4. Upload scored submission files
241
+ logs_url_private_val, logs_url_public_val = None, None
242
+ scored_submission_name = f"{submission_name}_scored"
243
+ if not LOCAL_DEBUG:
244
+ try:
245
+ logs_url_private_val = checked_upload_folder(api, extracted_dir, SUBMISSION_DATASET, CONFIG_NAME, val_or_test, scored_submission_name)
246
+ if val_or_test == "validation" and not IS_INTERNAL: # Public copy for validation
247
+ logs_url_public_val = checked_upload_folder(api, extracted_dir, SUBMISSION_DATASET_PUBLIC, CONFIG_NAME, val_or_test, scored_submission_name)
248
+ except ValueError as e: return format_error(str(e))
249
+ except Exception as e: return format_error(f"Failed to upload scored submission: {e}")
250
+ else: print("mock uploaded scored submission", flush=True)
251
+
252
+
253
+ # Update EvalResult with submission details
254
+ eval_result_obj.submission.agent_name = agent_name
255
+ eval_result_obj.submission.agent_description = agent_description
256
+ eval_result_obj.submission.agent_url = agent_url
257
+ eval_result_obj.submission.openness = openness
258
+ eval_result_obj.submission.degree_of_control = degree_of_control
259
+ eval_result_obj.submission.username = username
260
+ eval_result_obj.submission.submit_time = submission_time
261
+ eval_result_obj.submission.logs_url = logs_url_private_val
262
+ eval_result_obj.submission.logs_url_public = logs_url_public_val
263
+
264
+ # 5. Upload summary statistics to RESULTS_DATASET (for the leaderboard)
265
+ if not LOCAL_DEBUG:
266
+ try:
267
+ upload_summary_to_hf(api, eval_result_obj, RESULTS_DATASET, CONFIG_NAME, val_or_test, scored_submission_name)
268
+ except Exception as e:
269
+ return format_error(f"Failed to upload summary results to leaderboard: {e}")
270
+ else: print("mock uploaded results to lb", flush=True)
271
+
272
+ # Invalidate viewer cache for the split that was updated
273
+ if val_or_test in CACHED_VIEWERS:
274
+ del CACHED_VIEWERS[val_or_test]
275
+ if val_or_test in CACHED_TAG_MAPS:
276
+ del CACHED_TAG_MAPS[val_or_test]
277
+
278
+
279
+ return format_log(
280
+ f"Agent '{agent_name}' submitted successfully by '{username}' to '{val_or_test}' split. "
281
+ "Please refresh the leaderboard in a few moments. It may take some time for changes to propagate."
282
+ )
283
+
284
+
285
+ # --- Submission Accordion ---
286
+ with gr.Blocks() as demo:
287
+ gr.Markdown(f"## 🚀 Submit a new agent for evaluation", elem_id="markdown-text")
288
+ with gr.Row():
289
+ with gr.Column():
290
+ level_of_test_radio = gr.Radio(["validation", "test"], value="validation", label="Split")
291
+ agent_name_tb = gr.Textbox(label="Agent Name")
292
+ agent_desc_tb = gr.Textbox(label="Agent Description")
293
+ agent_url_tb = gr.Textbox(label="URL to Agent Information")
294
+ openness_radio = gr.Radio(["Open Source","Open Source Open Weights", "API Available", "Closed"], value=None, label="Openness of Agent")
295
+ degree_of_control_radio = gr.Radio(["Standard","Custom with Standard Search", "Fully Custom"], value=None, label="Agent Tooling")
296
+ with gr.Column():
297
+ username_tb = gr.Textbox(label="Organization or User Name (Defaults to HF username)")
298
+ mail_tb = gr.Textbox(label="Contact Email (Private, for submission issues)")
299
+ file_upload_comp = gr.File(
300
+ label="Submission File (.tar.gz ...)", # Shortened for brevity
301
+ file_types=[".gz", ".tar.gz"]
302
+ )
303
+ with gr.Row():
304
+ gr.LoginButton()
305
+ submit_eval_button = gr.Button("Submit Evaluation")
306
+ submission_result = gr.Markdown()
307
+
308
+ submit_eval_button.click(
309
+ add_new_eval,
310
+ [
311
+ level_of_test_radio,
312
+ agent_name_tb,
313
+ agent_desc_tb,
314
+ agent_url_tb,
315
+ openness_radio,
316
+ degree_of_control_radio,
317
+ file_upload_comp,
318
+ username_tb,
319
+ mail_tb
320
+ ],
321
+ submission_result,
322
+ )
323
+ with gr.Accordion("📙 Citation", open=False):
324
+ gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, elem_id="citation-button-main", interactive=False)
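A minimal sketch of how a submission archive for this form might be assembled (illustrative; the directory and file names are placeholders, and the run directory is assumed to already contain the agenteval.json manifest plus the eval logs produced by agent-eval):

    import tarfile
    from pathlib import Path

    run_dir = Path("my_agent_run")                # holds agenteval.json and the log files
    out_path = Path("my_agent_submission.tar.gz")

    with tarfile.open(out_path, "w:gz") as tar:
        for f in run_dir.iterdir():
            # add_new_eval() extracts regular files by basename and skips dotfiles,
            # so a flat archive without nested directories is sufficient.
            if f.is_file() and not f.name.startswith("."):
                tar.add(f, arcname=f.name)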
ui_components.py CHANGED
@@ -1,13 +1,22 @@
1
  import gradio as gr
2
- from gradio.events import SelectData
3
  import pandas as pd
4
  import plotly.graph_objects as go
5
  import os
 
6
 
7
  from agenteval.leaderboard.view import LeaderboardViewer
8
  from huggingface_hub import HfApi
9
 
10
- from leaderboard_transformer import DataTransformer, transform_raw_dataframe, create_pretty_tag_map, INFORMAL_TO_FORMAL_NAME_MAP, _plot_scatter_plotly, format_cost_column, format_score_column
 
 
 
 
 
 
 
 
 
11
  from content import (
12
  SCATTER_DISCLAIMER,
13
  format_error,
@@ -19,7 +28,7 @@ from content import (
19
 
20
  # --- Constants and Configuration ---
21
  LOCAL_DEBUG = not (os.environ.get("system") == "spaces")
22
- CONFIG_NAME = "1.0.0-dev1" # This corresponds to 'config' in LeaderboardViewer
23
  IS_INTERNAL = os.environ.get("IS_INTERNAL", "false").lower() == "true"
24
 
25
  OWNER = "allenai"
@@ -41,6 +50,24 @@ MAX_UPLOAD_BYTES = 100 * 1024**2
41
  AGENTEVAL_MANIFEST_NAME = "agenteval.json"
42
  os.makedirs(EXTRACTED_DATA_DIR, exist_ok=True)
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
  # --- Global State for Viewers (simple caching) ---
46
  CACHED_VIEWERS = {}
@@ -117,29 +144,48 @@ def create_leaderboard_display(
117
  # The function no longer loads data itself; it filters the data it receives.
118
  transformer = DataTransformer(full_df, tag_map)
119
  df_view, plots_dict = transformer.view(tag=category_name, use_plotly=True)
120
- # format cost columns
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  for col in df_view.columns:
122
  if "Cost" in col:
123
  df_view = format_cost_column(df_view, col)
124
 
125
- # 2. Fill NaN scores with 0
126
  for col in df_view.columns:
127
  if "Score" in col:
128
  df_view = format_score_column(df_view, col)
129
  scatter_plot = plots_dict.get('scatter_plot', go.Figure())
130
 
131
- # 2. Define the UI components with the filtered data.
 
 
 
 
 
 
 
 
132
  df_headers = df_view.columns.tolist()
133
- df_datatypes = ["markdown" if col == "Logs" or "Cost" in col or "Score" in col else "str" for col in df_headers]
134
-
135
- dataframe_component = gr.DataFrame(
136
- headers=df_headers,
137
- value=df_view,
138
- datatype=df_datatypes,
139
- interactive=False,
140
- wrap=True,
141
- column_widths=[100, 100, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 75, 75, 50, 50]
142
- )
143
 
144
  plot_component = gr.Plot(
145
  value=scatter_plot,
@@ -147,8 +193,20 @@ def create_leaderboard_display(
147
  )
148
  gr.HTML(SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")
149
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  # Return the components so they can be referenced elsewhere.
151
- return dataframe_component, plot_component
152
 
153
  def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
154
  """
@@ -178,8 +236,36 @@ def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
178
 
179
  # Fallback for unexpected types
180
  return pd.DataFrame(), {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
 
182
- # --- Detailed Benchmark Display ---
 
 
 
183
  def create_benchmark_details_display(
184
  full_df: pd.DataFrame,
185
  tag_map: dict,
@@ -206,14 +292,14 @@ def create_benchmark_details_display(
206
  # 2. Loop through each benchmark and create its UI components
207
  for benchmark_name in benchmark_names:
208
  with gr.Blocks():
209
- gr.Markdown(f"### {benchmark_name}")
210
 
211
  # 3. Prepare the data for this specific benchmark's table and plot
212
  benchmark_score_col = f"{benchmark_name} Score"
213
  benchmark_cost_col = f"{benchmark_name} Cost"
214
 
215
  # Define the columns needed for the detailed table
216
- table_cols = ['Agent', 'Submitter', 'Date', benchmark_score_col, benchmark_cost_col,'Logs']
217
 
218
  # Filter to only columns that actually exist in the full dataframe
219
  existing_table_cols = [col for col in table_cols if col in full_df.columns]
@@ -224,11 +310,29 @@ def create_benchmark_details_display(
224
 
225
  # Create a specific DataFrame for the table view
226
  benchmark_table_df = full_df[existing_table_cols].copy()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
  # Calculate and add the "Attempted Benchmark" column
228
  def check_benchmark_status(row):
229
  has_score = pd.notna(row.get(benchmark_score_col))
230
  has_cost = pd.notna(row.get(benchmark_cost_col))
231
-
232
  if has_score and has_cost:
233
  return "✅"
234
  if has_score or has_cost:
@@ -246,14 +350,14 @@ def create_benchmark_details_display(
246
  benchmark_table_df = format_cost_column(benchmark_table_df, benchmark_cost_col)
247
  benchmark_table_df = format_score_column(benchmark_table_df, benchmark_score_col)
248
  desired_cols_in_order = [
 
 
 
249
  'Agent',
250
  'Submitter',
251
  'Attempted Benchmark',
252
  benchmark_score_col,
253
  benchmark_cost_col,
254
- 'Openness',
255
- 'Degree of Control',
256
- 'Date',
257
  'Logs'
258
  ]
259
  for col in desired_cols_in_order:
@@ -261,25 +365,13 @@ def create_benchmark_details_display(
261
  benchmark_table_df[col] = pd.NA # Add as an empty column
262
  benchmark_table_df = benchmark_table_df[desired_cols_in_order]
263
  # Rename columns for a cleaner table display, as requested
264
- benchmark_table_df.rename(columns={
265
  benchmark_score_col: 'Score',
266
- benchmark_cost_col: 'Cost'
267
  }, inplace=True)
268
  # Ensure the 'Logs' column is formatted correctly
269
- table_headers = benchmark_table_df.columns.tolist()
270
- # If the column is 'Logs', render as markdown; otherwise, as a string.
271
- df_datatypes = [
272
- "markdown" if col in ["Logs", "Cost", "Score"] else "str"
273
- for col in table_headers
274
- ]
275
-
276
- # Create the Gradio component, now with the correct datatypes
277
- gr.DataFrame(
278
- value=benchmark_table_df,
279
- datatype=df_datatypes,
280
- interactive=False,
281
- wrap=True,
282
- )
283
 
284
  # Create the scatter plot using the full data for context, but plotting benchmark metrics
285
  # This shows all agents on the same axis for better comparison.
@@ -291,3 +383,14 @@ def create_benchmark_details_display(
291
  )
292
  gr.Plot(value=benchmark_plot)
293
  gr.HTML(SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
 
2
  import pandas as pd
3
  import plotly.graph_objects as go
4
  import os
5
+ import re
6
 
7
  from agenteval.leaderboard.view import LeaderboardViewer
8
  from huggingface_hub import HfApi
9
 
10
+ from leaderboard_transformer import (
11
+ DataTransformer,
12
+ transform_raw_dataframe,
13
+ create_pretty_tag_map,
14
+ INFORMAL_TO_FORMAL_NAME_MAP,
15
+ _plot_scatter_plotly,
16
+ format_cost_column,
17
+ format_score_column,
18
+ get_pareto_df,
19
+ )
20
  from content import (
21
  SCATTER_DISCLAIMER,
22
  format_error,
 
28
 
29
  # --- Constants and Configuration ---
30
  LOCAL_DEBUG = not (os.environ.get("system") == "spaces")
31
+ CONFIG_NAME = "1.0.0-dev2" # This corresponds to 'config' in LeaderboardViewer
32
  IS_INTERNAL = os.environ.get("IS_INTERNAL", "false").lower() == "true"
33
 
34
  OWNER = "allenai"
 
50
  AGENTEVAL_MANIFEST_NAME = "agenteval.json"
51
  os.makedirs(EXTRACTED_DATA_DIR, exist_ok=True)
52
 
53
+ # Global variables
54
+ openness_emoji_map = {
55
+ "Closed": '🔴',
56
+ "API Available": '🟠',
57
+ "Open Source": '🟢',
58
+ "Open Source + Open Weights": '🔵'
59
+ }
60
+ control_emoji_map = {
61
+ "Standard": "⭐",
62
+ "Custom with Standard Search": "🔶",
63
+ "Fully Custom": "⚪️",
64
+ }
65
+ legend_markdown = """
66
+ <span>On Pareto curve: 📈</span>
67
+ <span>**Agent Openness**:</span> <span>🔴 Closed</span> <span>🟠 API Available</span> <span>🟢 Open Source</span> <span>🔵 Open Source + Open Weights</span>
68
+ <span>**Agent Tooling**:</span> <span>⭐ Standard</span> <span>🔶 Custom with Standard Search</span> <span>⚪️ Fully Custom</span>
69
+ <span>**COMING SOON:** COLUMN DESCRIPTIONS</span>
70
+ """
71
 
72
  # --- Global State for Viewers (simple caching) ---
73
  CACHED_VIEWERS = {}
 
144
  # The function no longer loads data itself; it filters the data it receives.
145
  transformer = DataTransformer(full_df, tag_map)
146
  df_view, plots_dict = transformer.view(tag=category_name, use_plotly=True)
147
+ pareto_df = get_pareto_df(df_view)
148
+ # Get the list of agents on the frontier. We'll use this list later.
149
+ if not pareto_df.empty and 'id' in pareto_df.columns:
150
+ pareto_agent_names = pareto_df['id'].tolist()
151
+ else:
152
+ pareto_agent_names = []
153
+ df_view['Pareto'] = df_view.apply(
154
+ lambda row: '📈' if row['id'] in pareto_agent_names else '',
155
+ axis=1
156
+ )
157
+ # Map Openness values to their emoji, keeping the original text for unmapped values
158
+ original_openness = df_view['Openness']
159
+ df_view['Openness'] = df_view['Openness'].map(openness_emoji_map).fillna(original_openness)
160
+
161
+ # For this column, we'll use .apply() to handle the "Other" case cleanly.
162
+ df_view['Agent Tooling'] = df_view['Agent Tooling'].apply(
163
+ lambda ctrl: control_emoji_map.get(ctrl, f"{ctrl}" if pd.notna(ctrl) else "")
164
+ )
165
+
166
+
167
+ # Format cost columns
168
  for col in df_view.columns:
169
  if "Cost" in col:
170
  df_view = format_cost_column(df_view, col)
171
 
172
+ # Format score columns
173
  for col in df_view.columns:
174
  if "Score" in col:
175
  df_view = format_score_column(df_view, col)
176
  scatter_plot = plots_dict.get('scatter_plot', go.Figure())
177
 
178
+
179
+ all_cols = df_view.columns.tolist()
180
+ # Remove 'Pareto' from the list and insert it at the beginning
181
+ all_cols.insert(0, all_cols.pop(all_cols.index('Pareto')))
182
+ df_view = df_view[all_cols]
183
+ # Drop internally used columns that are not needed in the display
184
+ columns_to_drop = ['id', 'agent_for_hover']
185
+ df_view = df_view.drop(columns=columns_to_drop, errors='ignore')
186
+
187
  df_headers = df_view.columns.tolist()
188
+ df_datatypes = ["markdown" if col == "Logs" or col == "Agent" or "Cost" in col or "Score" in col else "str" for col in df_headers]
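This column-to-datatype rule is repeated verbatim in the per-benchmark view further down; if it grows, a small shared helper would keep the two in sync. A sketch (the helper name is invented):

```python
def gradio_datatype(col: str) -> str:
    """Markdown is needed for link columns (Logs, Agent) and the formatted Cost/Score columns."""
    return "markdown" if col in ("Logs", "Agent") or "Cost" in col or "Score" in col else "str"

df_datatypes = [gradio_datatype(col) for col in df_headers]
```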
189
 
190
  plot_component = gr.Plot(
191
  value=scatter_plot,
 
193
  )
194
  gr.HTML(SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")
195
 
196
+ # Put table and key into an accordion
197
+ with gr.Accordion("See Table", open=False, elem_id="leaderboard-accordion"):
198
+ dataframe_component = gr.DataFrame(
199
+ headers=df_headers,
200
+ value=df_view,
201
+ datatype=df_datatypes,
202
+ interactive=False,
203
+ wrap=True,
204
+ column_widths=[30, 30, 30, 100, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 50, 30]
205
+ )
206
+ gr.Markdown(value=legend_markdown, elem_id="legend-markdown")
207
+
208
  # Return the components so they can be referenced elsewhere.
209
+ return plot_component, dataframe_component
210
 
211
  def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
212
  """
 
236
 
237
  # Fallback for unexpected types
238
  return pd.DataFrame(), {}
239
+ # Create sub-nav bar for benchmarks
240
+ def create_gradio_anchor_id(text: str) -> str:
241
+ """
242
+ Replicates the ID format created by gr.Markdown(header_links=True).
243
+ Example: "Paper Finder Validation" -> "h-paper-finder-validation"
244
+ """
245
+ text = text.lower()
246
+ text = re.sub(r'\s+', '-', text) # Collapse whitespace runs into hyphens
247
+ text = re.sub(r'[^\w-]', '', text) # Remove non-word characters
248
+ return f"h-{text}"
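Under these slug rules (lowercase, whitespace runs become hyphens, remaining punctuation is dropped), the docstring's example and a hypothetical name with punctuation come out as:

```python
create_gradio_anchor_id("Paper Finder Validation")  # -> "h-paper-finder-validation"
create_gradio_anchor_id("Lit QA (Dev Split)")       # -> "h-lit-qa-dev-split"  (hypothetical name)
```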
249
+ def create_sub_navigation_bar(tag_map: dict, category_name: str):
250
+ """
251
+ Generates and renders the HTML for the anchor-link sub-navigation bar.
252
+ """
253
+ benchmark_names = tag_map.get(category_name, [])
254
+ if not benchmark_names:
255
+ return # Do nothing if there are no benchmarks
256
+
257
+ anchor_links = []
258
+ for name in benchmark_names:
259
+ # Use the helper function to create the correct ID format
260
+ target_id = create_gradio_anchor_id(name)
261
+ anchor_links.append(f"<a href='#{target_id}'>{name}</a>")
262
+
263
+ nav_bar_html = f"<div class='sub-nav-bar'>{' '.join(anchor_links)}</div>"
264
 
265
+ # Use gr.HTML to render the links correctly
266
+ gr.HTML(nav_bar_html)
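For a hypothetical tag-map entry, the generated markup is a single `div` of anchor links whose targets match the header ids produced by `create_gradio_anchor_id`:

```python
# Hypothetical tag map entry (benchmark names invented for illustration):
tag_map = {"Literature Understanding": ["Paper Finder Validation", "Lit QA"]}

# Inside a gr.Blocks() context, create_sub_navigation_bar(tag_map, "Literature Understanding")
# would render roughly:
# <div class='sub-nav-bar'><a href='#h-paper-finder-validation'>Paper Finder Validation</a> <a href='#h-lit-qa'>Lit QA</a></div>
```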
267
+
268
+ # --- Detailed Benchmark Display ---
269
  def create_benchmark_details_display(
270
  full_df: pd.DataFrame,
271
  tag_map: dict,
 
292
  # 2. Loop through each benchmark and create its UI components
293
  for benchmark_name in benchmark_names:
294
  with gr.Blocks():
295
+ gr.Markdown(f"### {benchmark_name}", header_links=True)
296
 
297
  # 3. Prepare the data for this specific benchmark's table and plot
298
  benchmark_score_col = f"{benchmark_name} Score"
299
  benchmark_cost_col = f"{benchmark_name} Cost"
300
 
301
  # Define the columns needed for the detailed table
302
+ table_cols = ['Agent', 'Openness', 'Agent Tooling', 'Submitter', 'Date', benchmark_score_col, benchmark_cost_col, 'Logs', 'id']
303
 
304
  # Filter to only columns that actually exist in the full dataframe
305
  existing_table_cols = [col for col in table_cols if col in full_df.columns]
 
310
 
311
  # Create a specific DataFrame for the table view
312
  benchmark_table_df = full_df[existing_table_cols].copy()
313
+ pareto_df = get_pareto_df(benchmark_table_df)
314
+ # Collect the ids of agents on the Pareto frontier; used just below to flag rows.
315
+ if not pareto_df.empty and 'id' in pareto_df.columns:
316
+ pareto_agent_names = pareto_df['id'].tolist()
317
+ else:
318
+ pareto_agent_names = []
319
+ benchmark_table_df['Pareto'] = benchmark_table_df.apply(
320
+ lambda row: '📈' if row['id'] in pareto_agent_names else '',
321
+ axis=1
322
+ )
323
+
324
+ original_openness = benchmark_table_df['Openness']
325
+ benchmark_table_df['Openness'] = benchmark_table_df['Openness'].map(openness_emoji_map).fillna(original_openness)
326
+
327
+ # For this column, we'll use .apply() to handle the "Other" case cleanly.
328
+ benchmark_table_df['Agent Tooling'] = benchmark_table_df['Agent Tooling'].apply(
329
+ lambda ctrl: control_emoji_map.get(ctrl, f"{ctrl}" if pd.notna(ctrl) else "")
330
+ )
331
+
332
  # Calculate and add the "Attempted Benchmark" column
333
  def check_benchmark_status(row):
334
  has_score = pd.notna(row.get(benchmark_score_col))
335
  has_cost = pd.notna(row.get(benchmark_cost_col))
 
336
  if has_score and has_cost:
337
  return "✅"
338
  if has_score or has_cost:
 
350
  benchmark_table_df = format_cost_column(benchmark_table_df, benchmark_cost_col)
351
  benchmark_table_df = format_score_column(benchmark_table_df, benchmark_score_col)
352
  desired_cols_in_order = [
353
+ 'Pareto',
354
+ 'Openness',
355
+ 'Agent Tooling',
356
  'Agent',
357
  'Submitter',
358
  'Attempted Benchmark',
359
  benchmark_score_col,
360
  benchmark_cost_col,
361
  'Logs'
362
  ]
363
  for col in desired_cols_in_order:
 
365
  benchmark_table_df[col] = pd.NA # Add as an empty column
366
  benchmark_table_df = benchmark_table_df[desired_cols_in_order]
367
  # Rename columns for a cleaner table display, as requested
368
+ benchmark_table_df.rename(columns={
369
  benchmark_score_col: 'Score',
370
+ benchmark_cost_col: 'Cost',
371
  }, inplace=True)
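Note that pandas applies a positional mapper to the row index by default, which is why the explicit `columns=` above matters; a quick illustration with made-up column names:

```python
import pandas as pd

df = pd.DataFrame({"X Score": [0.5], "X Cost": [1.0]})
df.rename({"X Score": "Score"}).columns.tolist()          # ['X Score', 'X Cost'] - index mapper, no effect
df.rename(columns={"X Score": "Score"}).columns.tolist()  # ['Score', 'X Cost']
```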
372
  # Render link and formatted columns (Logs, Agent, Cost, Score) as markdown
373
+ df_headers = benchmark_table_df.columns.tolist()
374
+ df_datatypes = ["markdown" if col == "Logs" or col == "Agent" or "Cost" in col or "Score" in col else "str" for col in df_headers]
375
 
376
  # Create the scatter plot using the full data for context, but plotting benchmark metrics
377
  # This shows all agents on the same axis for better comparison.
 
383
  )
384
  gr.Plot(value=benchmark_plot)
385
  gr.HTML(SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")
386
+ # Put table and key into an accordion
387
+ with gr.Accordion("See Table", open=False, elem_id="leaderboard-accordion"):
388
+ gr.DataFrame(
389
+ headers=df_headers,
390
+ value=benchmark_table_df,
391
+ datatype=df_datatypes,
392
+ interactive=False,
393
+ wrap=True,
394
+ )
395
+ gr.Markdown(value=legend_markdown, elem_id="legend-markdown")
396
+