Spaces: Running

Quentin Lhoest committed
Commit 9242f47 · 1 Parent(s): 40a1ebe
update task taxonomy
- the `datasets` version comes from the PR on GitHub for now
- the pre-loaded tags come from the PR that updates all the datasets
- you specify the size category yourself

Files changed:
- build_metadata_file.py +3 -1
- metadata_8418b1d4ebac7c59372c5a55556522584891ba9c.json +0 -3
- requirements.txt +1 -1
- tagging_app.py +24 -13
build_metadata_file.py
CHANGED

```diff
@@ -14,6 +14,8 @@ import yaml
 
 from apputils import new_state
 
+DATASETS_BRANCH = "tasks-alignment-with-models"
+
 
 def metadata_from_readme(f: Path) -> Dict:
     with f.open() as fi:
@@ -29,7 +31,7 @@ def load_ds_datas():
     if drepo.exists() and drepo.is_dir():
         check_call(["git", "pull"], cwd=drepo)
     else:
-        check_call(["git", "clone", "https://github.com/huggingface/datasets.git"])
+        check_call(["git", "clone", "-b", DATASETS_BRANCH, "https://github.com/huggingface/datasets.git"])
     head_sha = check_output(["git", "rev-parse", "HEAD"], cwd=drepo)
 
     datasets_md = dict()
```
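For context, this change pins the cloned `datasets` checkout to the branch carrying the new task taxonomy. Below is a minimal, self-contained sketch of that clone-or-update step; the wrapper name `clone_or_update_datasets_repo` and the explicit destination path are illustrative additions, only the git invocations mirror the hunk:

```python
from pathlib import Path
from subprocess import check_call, check_output

DATASETS_BRANCH = "tasks-alignment-with-models"

def clone_or_update_datasets_repo(drepo: Path = Path("datasets")) -> str:
    """Clone (or pull) the datasets repo on the task-taxonomy branch and return the HEAD SHA."""
    if drepo.exists() and drepo.is_dir():
        # already cloned: just fast-forward the existing checkout
        check_call(["git", "pull"], cwd=drepo)
    else:
        # -b checks out the branch with the updated task taxonomy right away
        check_call(["git", "clone", "-b", DATASETS_BRANCH,
                    "https://github.com/huggingface/datasets.git", str(drepo)])
    # same rev-parse call the script uses to capture head_sha
    return check_output(["git", "rev-parse", "HEAD"], cwd=drepo).decode().strip()
```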
metadata_8418b1d4ebac7c59372c5a55556522584891ba9c.json
DELETED

```diff
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a72566a87cb959e17e04840367969ef3a5966db12f82039ca6faea9b87da54d9
-size 29912341
```
requirements.txt
CHANGED

```diff
@@ -1,4 +1,4 @@
 pyyaml
-datasets==1.9.0
 streamlit>=0.88.0
 langcodes[data]
+git+https://github.com/huggingface/datasets.git@update-task-list
```
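Since `datasets` is now installed straight from the `update-task-list` branch instead of the 1.9.0 release, a quick sanity check (purely illustrative, not part of this commit) is to print the installed version at startup; a branch install typically reports a `.dev` version string:

```python
# Illustrative check only: confirm which datasets build the Space is actually running.
import datasets

print(f"datasets version in use: {datasets.__version__}")
```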
tagging_app.py
CHANGED

```diff
@@ -73,20 +73,22 @@ def multiselect(
     if len(invalid_values) > 0:
         w.markdown("Found the following invalid values:")
         w.error(invalid_values)
-    return w.multiselect(markdown, valid_set, default=valid_values, format_func=format_func)
+    return w.multiselect(markdown, valid_set, default=valid_values, format_func=format_func, key=title)
 
 
 def validate_dict(w: st.delta_generator.DeltaGenerator, state_dict: Dict):
     try:
         DatasetMetadata(**state_dict)
+        if not state_dict.get("pretty_name"):
+            raise ValueError("Please specify a non-empty Dataset name.")
         w.markdown("✅ This is a valid tagset! 🤗")
     except Exception as e:
         w.markdown("❌ This is an invalid tagset, here are the errors in it:")
         w.error(e)
 
 
-def
-    if n
+def map_num_examples_to_size_category(n: int) -> str:
+    if n < 0:
         size_cat = "unknown"
     elif n < 1000:
         size_cat = "n<1K"
@@ -212,8 +214,7 @@ state["task_categories"] = multiselect(
     "Task category",
     "What categories of task does the dataset support?",
     values=state["task_categories"],
-    valid_set=list(known_task_ids.keys()),
-    format_func=lambda tg: f"{tg}: {known_task_ids[tg]['description']}",
+    valid_set=sorted(list(known_task_ids.keys())),
 )
 task_specifics = []
 for task_category in state["task_categories"]:
@@ -221,8 +222,8 @@ for task_category in state["task_categories"]:
         leftcol,
         f"Specific _{task_category}_ tasks",
         f"What specific tasks does the dataset support?",
-        values=[ts for ts in (state["task_ids"] or []) if ts in known_task_ids[task_category]
-        valid_set=known_task_ids[task_category]
+        values=[ts for ts in (state["task_ids"] or []) if ts in known_task_ids[task_category].get("subtasks", [])],
+        valid_set=known_task_ids[task_category].get("subtasks", []),
     )
     if "other" in specs:
         other_task = leftcol.text_input(
@@ -355,14 +356,24 @@ initial_num_examples = (
     if initial_infos is not None
     else -1
 )
-
-
-
-
+if initial_num_examples >= 0:
+    initial_size_categories = [map_num_examples_to_size_category(initial_num_examples)]
+else:
+    initial_size_categories = []
+current_size_cats = multiselect(
+    leftcol,
+    f"Size category",
+    f"How many samples are there in the dataset?",
+    values=initial_size_categories,
+    valid_set=known_size_categories,
+)
+if initial_size_categories:
+    leftcol.markdown(f"Computed size category from automatically generated dataset info to: `{initial_size_categories}`")
+prev_size_cats = state.get("size_categories") or []
+ok, nonok = split_known(prev_size_cats, known_size_categories)
 if len(nonok) > 0:
     leftcol.markdown(f"**Found bad codes in existing tagset**:\n{nonok}")
-
-    state["size_categories"] = [initial_size_cats]
+state["size_categories"] = current_size_cats
 
 
 ########################
```
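The new `map_num_examples_to_size_category` helper is only partially visible in the hunk above (the `unknown` and `n<1K` branches). A plausible full version, assuming the Hub's standard size buckets for the remaining thresholds, would look like this sketch:

```python
def map_num_examples_to_size_category(n: int) -> str:
    # n < 0 is the sentinel used when no dataset info is available (initial_num_examples = -1)
    if n < 0:
        size_cat = "unknown"
    elif n < 1_000:
        size_cat = "n<1K"
    elif n < 10_000:
        size_cat = "1K<n<10K"      # buckets from here on are assumed (standard Hub size tags)
    elif n < 100_000:
        size_cat = "10K<n<100K"
    elif n < 1_000_000:
        size_cat = "100K<n<1M"
    elif n < 10_000_000:
        size_cat = "1M<n<10M"
    elif n < 100_000_000:
        size_cat = "10M<n<100M"
    elif n < 1_000_000_000:
        size_cat = "100M<n<1B"
    else:
        size_cat = "n>1B"
    return size_cat

# e.g. map_num_examples_to_size_category(4_500) -> "1K<n<10K"
```

Separately, the `key=title` argument added to the `w.multiselect` call gives each widget a stable, unique Streamlit key, presumably to avoid duplicate-widget collisions now that several multiselects (task, size category, etc.) share very similar labels.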