Add a separate dataset for aggregating requests metadata

Files changed:
- app.py (+6 -2)
- src/submission_uploader.py (+83 -12)
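Summary: the uploader now mirrors every submission request into a second, private dataset. `SubmissionUploader` takes a `private_dataset_id` alongside the public results dataset, the submission form gains a required contact-information field and an optional comment, and a new `_upload_request` helper dumps the request metadata (model details, submitter, contact, comment, a timestamp, and the URL of the results PR) to a JSON file committed to the private repo via a separate PR.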
app.py

```diff
@@ -30,7 +30,9 @@ logging.basicConfig(
     handlers=[logging.StreamHandler()],
 )
 
-submission_uploader = SubmissionUploader(…)
+submission_uploader = SubmissionUploader(
+    dataset_id=os.environ["DATASET_ID"], private_dataset_id=os.environ["PRIVATE_DATASET_ID"]
+)
 
 
 with gr.Blocks() as demo:
@@ -61,7 +63,7 @@ with gr.Blocks() as demo:
         with gr.Column():
             model_folder_textbox = gr.Textbox(
                 label="Model Folder",
-                placeholder="How to call a folder related to this submission in our results dataset.",
+                placeholder="How to call a folder related to this submission in our results dataset (should be unique).",
             )
             model_name_textbox = gr.Textbox(
                 label="Model Name",
@@ -111,6 +113,8 @@ with gr.Blocks() as demo:
                 url_textbox,
                 context_size_textbox,
                 submitted_by_textbox,
+                contact_textbox,
+                comment_textbox,
                 file_output,
             ],
             submission_result,
```
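A minimal sketch of the updated wiring, assuming the same `HF_TOKEN`, `DATASET_ID`, and `PRIVATE_DATASET_ID` environment variables the Space reads (only the env-var names come from the diff; the rest is illustrative):

```python
import os

from src.submission_uploader import SubmissionUploader

# The public repo receives the predictions/results PRs; the private repo
# only aggregates per-request metadata.
submission_uploader = SubmissionUploader(
    dataset_id=os.environ["DATASET_ID"],
    private_dataset_id=os.environ["PRIVATE_DATASET_ID"],
)
```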
src/submission_uploader.py

```diff
@@ -1,6 +1,7 @@
 import json
 import logging
 import os
+import time
 from tempfile import TemporaryDirectory
 from typing import Dict, List, Optional
 
@@ -26,10 +27,11 @@ class SubmissionUploader:
     * https://huggingface.co/spaces/gaia-benchmark/leaderboard
     """
 
-    def __init__(self, dataset_id: str):
+    def __init__(self, dataset_id: str, private_dataset_id: str):
         self._api = HfApi(token=os.environ["HF_TOKEN"])
         self._fs = HfFileSystem(token=os.environ["HF_TOKEN"])
         self._dataset_id = dataset_id
+        self._private_dataset_id = private_dataset_id
 
     def _get_previous_pr(self, pr_title: str) -> Optional[Discussion]:
         """Searches among discussions of dataset repo for a PR with the given title."""
@@ -46,10 +48,10 @@
         self,
         model_name_pretty: str,
         model_availability: str,
-        urls: str,
+        urls: Optional[str],
         context_size: str,
         submitted_by: str,
-    ) -> Dict[str, str]:
+    ) -> Dict[str, Optional[str]]:
         return {
             "model_name": model_name_pretty,
             "model_availability": model_availability,
@@ -58,6 +60,45 @@
             "submitted_by": submitted_by,
         }
 
+    def _upload_request(
+        self,
+        task_id: str,
+        model_folder: str,
+        model_name_pretty: str,
+        model_availability: str,
+        urls: Optional[str],
+        context_size: str,
+        submitted_by: str,
+        contact_information: str,
+        comment: Optional[str],
+        pr_url: str,
+        temp_directory: str,
+    ) -> List[CommitOperationAdd]:
+        request_metadata = {
+            "model_folder": model_folder,
+            "model_name_pretty": model_name_pretty,
+            "model_availability": model_availability,
+            "urls": urls,
+            "context_size": context_size,
+            "submitted_by": submitted_by,
+            "contact_information": contact_information,
+            "comment": comment,
+            "timestamp": time.time(),
+            "pr_url": pr_url,
+        }
+
+        with open(os.path.join(temp_directory, "request_metadata.json"), "w") as f:
+            json.dump(request_metadata, f)
+
+        num_requests_already_present = len(self._fs.ls(f"datasets/{self._private_dataset_id}/{task_id}/"))
+        commit_operations = [
+            CommitOperationAdd(
+                path_in_repo=f"{task_id}/{num_requests_already_present}_{model_folder}.json",
+                path_or_fileobj=os.path.join(temp_directory, "request_metadata.json"),
+            )
+        ]
+        return commit_operations
+
     def _upload_predictions(
         self,
         task_id: str,
```
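For reference, a record written by `_upload_request` looks roughly like this; the keys mirror the dict above, while every value here (and the resulting path) is made up:

```python
import json
import time

request_metadata = {
    "model_folder": "my-model-16k",            # hypothetical values throughout
    "model_name_pretty": "My Model (16k)",
    "model_availability": "Open source",
    "urls": "https://example.com/my-model",
    "context_size": "16000",
    "submitted_by": "Jane Doe",
    "contact_information": "jane@example.com",
    "comment": None,                           # the comment is optional
    "timestamp": time.time(),
    "pr_url": "https://huggingface.co/datasets/example/results/discussions/1",
}

# With, e.g., 4 requests already stored for this task, the file would be
# committed as "<task_id>/4_my-model-16k.json".
print(json.dumps(request_metadata, indent=2))
```

Prefixing the filename with the current request count keeps repeated submissions under the same folder name from overwriting each other's records.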
```diff
@@ -107,7 +148,7 @@
         model_folder: str,
         model_name_pretty: str,
         model_availability: str,
-        urls: str,
+        urls: Optional[str],
         context_size: str,
         submitted_by: str,
         temp_directory: str,
@@ -141,9 +182,11 @@
         model_folder: str,
         model_name_pretty: str,
         model_availability: str,
-        urls: str,
+        urls: Optional[str],
         context_size: str,
         submitted_by: str,
+        contact_information: str,
+        comment: Optional[str],
         filenames: Optional[List[str]],
     ):
         assert task_pretty and task_pretty in TASKS_PRETTY_REVERSE, "Please, select one of the supported tasks."
@@ -158,6 +201,7 @@
 
         assert submitted_by, "Please, specify non-empty information about a submission's author(s)."
         assert filenames, "Please, attach at least one file with predictions."
+        assert contact_information, "Please, fill in the field with contact information."
 
     def upload_files(
         self,
@@ -165,9 +209,11 @@
         model_folder: str,
         model_name_pretty: str,
         model_availability: str,
-        urls: str,
+        urls: Optional[str],
         context_size: str,
         submitted_by: str,
+        contact_information: str,
+        comment: Optional[str],
         filenames: Optional[List[str]],
         force: bool = False,
     ) -> str:
@@ -180,6 +226,8 @@
             urls=urls,
             context_size=context_size,
             submitted_by=submitted_by,
+            contact_information=contact_information,
+            comment=comment,
            filenames=filenames,
         )
         pr_title = f"🚀 New submission to {task_pretty} task: {model_name_pretty} with {context_size} context size from {submitted_by}"
@@ -190,11 +238,10 @@
 
         logging.info("Checking if this request has already been submitted...")
         if not force:
-            if …
-                return styled_warning(f"{model_name_pretty} is already present in {self._dataset_id}.")
+            if model_folder in self._fs.ls(f"datasets/{self._dataset_id}/{task_id}/predictions"):
+                return styled_warning(
+                    f"{model_folder} is already present in {self._dataset_id}, please, select another folder name."
+                )
 
         prev_pr = self._get_previous_pr(pr_title)
         if prev_pr is not None:
```
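One caveat worth noting about the new duplicate check: fsspec-style filesystems such as `HfFileSystem` return full paths from `ls(..., detail=False)` (and info dicts by default), so a membership test against the bare folder name only works if the listing is reduced to basenames first. A hedged sketch of an equivalent check (`folder_already_used` is a hypothetical helper, not part of this commit):

```python
import os

def folder_already_used(fs, dataset_id: str, task_id: str, model_folder: str) -> bool:
    # ls(..., detail=False) yields full paths like
    # "datasets/<dataset_id>/<task_id>/predictions/<folder>"; compare basenames.
    entries = fs.ls(f"datasets/{dataset_id}/{task_id}/predictions", detail=False)
    return model_folder in {os.path.basename(entry) for entry in entries}
```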
```diff
@@ -224,7 +271,7 @@
                 temp_directory=str(d),
             )
 
-            logging.info("Creating commit...")
+            logging.info(f"Creating commit to results dataset...")
             new_pr = self._api.create_commit(
                 repo_id=self._dataset_id,
                 operations=predictions_commit_operations + results_commit_operations,
@@ -233,6 +280,30 @@
                 create_pr=True,
                 repo_type="dataset",
             )
+
+            logging.info(f"Creating commit to requests dataset...")
+            request_commit_operations = self._upload_request(
+                task_id=task_id,
+                model_folder=model_folder,
+                temp_directory=str(d),
+                model_name_pretty=model_name_pretty,
+                model_availability=model_availability,
+                urls=urls,
+                context_size=context_size,
+                submitted_by=submitted_by,
+                contact_information=contact_information,
+                comment=comment,
+                pr_url=new_pr.pr_url,
+            )
+            self._api.create_commit(
+                repo_id=self._private_dataset_id,
+                operations=request_commit_operations,
+                commit_message=pr_title,
+                commit_description=f"""New submission to {task_pretty} task in 🏟️ Long Code Arena benchmark!\n* Model name: {model_name_pretty}\n* Model availability: {model_availability}\n* Context Size: {context_size}\n* Relevant URLs: {urls}\n* Submitted By: {submitted_by}\n* PR: {new_pr.pr_url}\n* Contact information: {contact_information}\n* Comment: {comment}""",
+                create_pr=True,
+                repo_type="dataset",
+            )
+
             return styled_message(f"🎉 PR created at {new_pr.pr_url}.")
 
         except Exception as e:
```
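End to end, the Gradio click handler now passes the two new fields through `upload_files`. A sketch of the call, assuming a `task_pretty` argument precedes the parameters visible in the hunks, with all values invented:

```python
result_html = submission_uploader.upload_files(
    task_pretty="Project-level code completion",  # hypothetical task name
    model_folder="my-model-16k",
    model_name_pretty="My Model (16k)",
    model_availability="Open source",
    urls="https://example.com/my-model",
    context_size="16000",
    submitted_by="Jane Doe",
    contact_information="jane@example.com",  # now required (see the new assert)
    comment="First submission",              # optional
    filenames=["/tmp/predictions.jsonl"],
)
```

On success this opens two PRs: one against the public results dataset carrying the predictions and results, and one against the private requests dataset carrying the metadata JSON.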