Spaces:
Running
Running
theo
committed on
Commit
·
326ad7e
1
Parent(s):
26742b2
better state mgmt + use validator script
Browse files- tagging_app.py +74 -57
tagging_app.py
CHANGED
|
@@ -4,6 +4,7 @@ from typing import Callable, List, Tuple
|
|
| 4 |
|
| 5 |
import streamlit as st
|
| 6 |
import yaml
|
|
|
|
| 7 |
|
| 8 |
st.set_page_config(
|
| 9 |
page_title="HF Dataset Tagging App",
|
|
@@ -128,31 +129,52 @@ Beware that clicking pre-load will overwrite the current state!
|
|
| 128 |
)
|
| 129 |
|
| 130 |
|
| 131 |
-
|
| 132 |
-
preload =
|
| 133 |
preloaded_id = None
|
|
|
|
| 134 |
did_index = 0
|
| 135 |
if len(preload) == 1 and preload[0] in all_dataset_ids:
|
| 136 |
preloaded_id, *_ = preload
|
| 137 |
-
|
|
|
|
| 138 |
did_index = all_dataset_ids.index(preloaded_id)
|
| 139 |
|
| 140 |
-
|
| 141 |
-
|
|
|
|
| 142 |
leftbtn, rightbtn = st.sidebar.beta_columns(2)
|
| 143 |
-
if leftbtn.button("pre-load
|
| 144 |
-
|
| 145 |
-
|
|
|
|
| 146 |
if rightbtn.button("flush state"):
|
| 147 |
state = new_state()
|
|
|
|
|
|
|
| 148 |
st.experimental_set_query_params()
|
| 149 |
|
| 150 |
-
if preloaded_id is not None:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
st.sidebar.markdown(
|
| 152 |
f"""
|
| 153 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
```yaml
|
| 155 |
-
{yaml.dump(
|
| 156 |
```
|
| 157 |
"""
|
| 158 |
)
|
|
@@ -162,7 +184,7 @@ leftcol, _, rightcol = st.beta_columns([12, 1, 12])
|
|
| 162 |
|
| 163 |
|
| 164 |
leftcol.markdown("### Supported tasks")
|
| 165 |
-
task_categories = multiselect(
|
| 166 |
leftcol,
|
| 167 |
"Task category",
|
| 168 |
"What categories of task does the dataset support?",
|
|
@@ -171,27 +193,27 @@ task_categories = multiselect(
|
|
| 171 |
format_func=lambda tg: f"{tg}: {task_set[tg]['description']}",
|
| 172 |
)
|
| 173 |
task_specifics = []
|
| 174 |
-
for tg in task_categories:
|
| 175 |
-
|
| 176 |
leftcol,
|
| 177 |
-
"Specific tasks",
|
| 178 |
-
f"What specific
|
| 179 |
-
values=[ts for ts in state["task_ids"] if ts in task_set[tg]["options"]],
|
| 180 |
valid_set=task_set[tg]["options"],
|
| 181 |
)
|
| 182 |
-
if "other" in
|
| 183 |
other_task = st.text_input(
|
| 184 |
"You selected 'other' task. Please enter a short hyphen-separated description for the task:",
|
| 185 |
value="my-task-description",
|
| 186 |
)
|
| 187 |
st.write(f"Registering {tg}-other-{other_task} task")
|
| 188 |
-
|
| 189 |
-
task_specifics +=
|
|
|
|
| 190 |
|
| 191 |
|
| 192 |
leftcol.markdown("### Languages")
|
| 193 |
-
|
| 194 |
-
multilinguality = multiselect(
|
| 195 |
leftcol,
|
| 196 |
"Monolingual?",
|
| 197 |
"Does the dataset contain more than one language?",
|
|
@@ -200,16 +222,15 @@ multilinguality = multiselect(
|
|
| 200 |
format_func=lambda m: f"{m} : {multilinguality_set[m]}",
|
| 201 |
)
|
| 202 |
|
| 203 |
-
if "other" in multilinguality:
|
| 204 |
other_multilinguality = st.text_input(
|
| 205 |
"You selected 'other' type of multilinguality. Please enter a short hyphen-separated description:",
|
| 206 |
value="my-multilinguality",
|
| 207 |
)
|
| 208 |
st.write(f"Registering other-{other_multilinguality} multilinguality")
|
| 209 |
-
multilinguality[multilinguality.index("other")] = f"other-{other_multilinguality}"
|
| 210 |
|
| 211 |
-
|
| 212 |
-
languages = multiselect(
|
| 213 |
leftcol,
|
| 214 |
"Languages",
|
| 215 |
"What languages are represented in the dataset?",
|
|
@@ -220,14 +241,14 @@ languages = multiselect(
|
|
| 220 |
|
| 221 |
|
| 222 |
leftcol.markdown("### Dataset creators")
|
| 223 |
-
language_creators = multiselect(
|
| 224 |
leftcol,
|
| 225 |
"Data origin",
|
| 226 |
"Where does the text in the dataset come from?",
|
| 227 |
values=state["language_creators"],
|
| 228 |
valid_set=creator_set["language"],
|
| 229 |
)
|
| 230 |
-
annotations_creators = multiselect(
|
| 231 |
leftcol,
|
| 232 |
"Annotations origin",
|
| 233 |
"Where do the annotations in the dataset come from?",
|
|
@@ -236,7 +257,7 @@ annotations_creators = multiselect(
|
|
| 236 |
)
|
| 237 |
|
| 238 |
|
| 239 |
-
licenses = multiselect(
|
| 240 |
leftcol,
|
| 241 |
"Licenses",
|
| 242 |
"What licenses is the dataset under?",
|
|
@@ -244,13 +265,13 @@ licenses = multiselect(
|
|
| 244 |
values=state["licenses"],
|
| 245 |
format_func=lambda l: f"{l} : {license_set[l]}",
|
| 246 |
)
|
| 247 |
-
if "other" in licenses:
|
| 248 |
other_license = st.text_input(
|
| 249 |
"You selected 'other' type of license. Please enter a short hyphen-separated description:",
|
| 250 |
value="my-license",
|
| 251 |
)
|
| 252 |
st.write(f"Registering other-{other_license} license")
|
| 253 |
-
licenses[licenses.index("other")] = f"other-{other_license}"
|
| 254 |
|
| 255 |
# link to supported datasets
|
| 256 |
pre_select_ext_a = []
|
|
@@ -258,17 +279,16 @@ if "original" in state["source_datasets"]:
|
|
| 258 |
pre_select_ext_a += ["original"]
|
| 259 |
if any([p.startswith("extended") for p in state["source_datasets"]]):
|
| 260 |
pre_select_ext_a += ["extended"]
|
| 261 |
-
extended = multiselect(
|
| 262 |
leftcol,
|
| 263 |
"Relations to existing work",
|
| 264 |
"Does the dataset contain original data and/or was it extended from other datasets?",
|
| 265 |
values=pre_select_ext_a,
|
| 266 |
valid_set=["original", "extended"],
|
| 267 |
)
|
| 268 |
-
source_datasets = ["original"] if "original" in extended else []
|
| 269 |
|
| 270 |
-
|
| 271 |
-
if "extended" in extended:
|
| 272 |
pre_select_ext_b = [p.split("|")[1] for p in state["source_datasets"] if p.startswith("extended")]
|
| 273 |
extended_sources = multiselect(
|
| 274 |
leftcol,
|
|
@@ -284,43 +304,40 @@ if "extended" in extended:
|
|
| 284 |
)
|
| 285 |
st.write(f"Registering other-{other_extended_sources} dataset")
|
| 286 |
extended_sources[extended_sources.index("other")] = f"other-{other_extended_sources}"
|
| 287 |
-
source_datasets += [f"extended|{src}" for src in extended_sources]
|
| 288 |
|
| 289 |
size_cats = ["unknown", "n<1K", "1K<n<10K", "10K<n<100K", "100K<n<1M", "n>1M"]
|
| 290 |
current_size_cats = state.get("size_categories") or ["unknown"]
|
| 291 |
ok, nonok = split_known(current_size_cats, size_cats)
|
| 292 |
if len(nonok) > 0:
|
| 293 |
leftcol.markdown(f"**Found bad codes in existing tagset**:\n{nonok}")
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
)
|
|
|
|
|
|
|
| 299 |
|
| 300 |
|
| 301 |
########################
|
| 302 |
## Show results
|
| 303 |
########################
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
"annotations_creators": annotations_creators,
|
| 312 |
-
"source_datasets": source_datasets,
|
| 313 |
-
"size_categories": size_category,
|
| 314 |
-
"licenses": licenses,
|
| 315 |
-
}
|
| 316 |
-
)
|
| 317 |
rightcol.markdown(
|
| 318 |
f"""
|
| 319 |
### Finalized tag set
|
| 320 |
|
| 321 |
-
|
| 322 |
|
| 323 |
```yaml
|
| 324 |
-
{
|
| 325 |
-
```
|
|
|
|
| 326 |
)
|
|
|
|
| 4 |
|
| 5 |
import streamlit as st
|
| 6 |
import yaml
|
| 7 |
+
from datasets.utils.metadata_validator import DatasetMetadata
|
| 8 |
|
| 9 |
st.set_page_config(
|
| 10 |
page_title="HF Dataset Tagging App",
|
|
|
|
| 129 |
)
|
| 130 |
|
| 131 |
|
| 132 |
+
queryparams = st.experimental_get_query_params()
|
| 133 |
+
preload = queryparams.get("preload_dataset", list())
|
| 134 |
preloaded_id = None
|
| 135 |
+
initial_state = None
|
| 136 |
did_index = 0
|
| 137 |
if len(preload) == 1 and preload[0] in all_dataset_ids:
|
| 138 |
preloaded_id, *_ = preload
|
| 139 |
+
initial_state = existing_tag_sets.get(preloaded_id)
|
| 140 |
+
state = initial_state or new_state()
|
| 141 |
did_index = all_dataset_ids.index(preloaded_id)
|
| 142 |
|
| 143 |
+
preloaded_id = st.sidebar.selectbox(
|
| 144 |
+
label="Choose dataset to load tag set from", options=all_dataset_ids, index=did_index
|
| 145 |
+
)
|
| 146 |
leftbtn, rightbtn = st.sidebar.beta_columns(2)
|
| 147 |
+
if leftbtn.button("pre-load"):
|
| 148 |
+
initial_state = existing_tag_sets[preloaded_id]
|
| 149 |
+
state = initial_state or new_state()
|
| 150 |
+
st.experimental_set_query_params(preload_dataset=preloaded_id)
|
| 151 |
if rightbtn.button("flush state"):
|
| 152 |
state = new_state()
|
| 153 |
+
initial_state = None
|
| 154 |
+
preloaded_id = None
|
| 155 |
st.experimental_set_query_params()
|
| 156 |
|
| 157 |
+
if preloaded_id is not None and initial_state is not None:
|
| 158 |
+
try:
|
| 159 |
+
DatasetMetadata(**initial_state)
|
| 160 |
+
valid = "✔️ This is a valid tagset!"
|
| 161 |
+
except Exception as e:
|
| 162 |
+
valid = f"""
|
| 163 |
+
🙁 This is an invalid tagset, here are the errors in it:
|
| 164 |
+
```
|
| 165 |
+
{e}
|
| 166 |
+
```
|
| 167 |
+
You're _very_ welcome to fix these issues and submit a new PR on [`datasets`](https://github.com/huggingface/datasets/)
|
| 168 |
+
"""
|
| 169 |
st.sidebar.markdown(
|
| 170 |
f"""
|
| 171 |
+
---
|
| 172 |
+
The current base tagset is [`{preloaded_id}`](https://huggingface.co/datasets/{preloaded_id})
|
| 173 |
+
{valid}
|
| 174 |
+
Here is the matching yaml block:
|
| 175 |
+
|
| 176 |
```yaml
|
| 177 |
+
{yaml.dump(initial_state)}
|
| 178 |
```
|
| 179 |
"""
|
| 180 |
)
|
|
|
|
| 184 |
|
| 185 |
|
| 186 |
leftcol.markdown("### Supported tasks")
|
| 187 |
+
state["task_categories"] = multiselect(
|
| 188 |
leftcol,
|
| 189 |
"Task category",
|
| 190 |
"What categories of task does the dataset support?",
|
|
|
|
| 193 |
format_func=lambda tg: f"{tg}: {task_set[tg]['description']}",
|
| 194 |
)
|
| 195 |
task_specifics = []
|
| 196 |
+
for tg in state["task_categories"]:
|
| 197 |
+
specs = multiselect(
|
| 198 |
leftcol,
|
| 199 |
+
f"Specific _{tg}_ tasks",
|
| 200 |
+
f"What specific tasks does the dataset support?",
|
| 201 |
+
values=[ts for ts in (state["task_ids"] or []) if ts in task_set[tg]["options"]],
|
| 202 |
valid_set=task_set[tg]["options"],
|
| 203 |
)
|
| 204 |
+
if "other" in specs:
|
| 205 |
other_task = st.text_input(
|
| 206 |
"You selected 'other' task. Please enter a short hyphen-separated description for the task:",
|
| 207 |
value="my-task-description",
|
| 208 |
)
|
| 209 |
st.write(f"Registering {tg}-other-{other_task} task")
|
| 210 |
+
specs[specs.index("other")] = f"{tg}-other-{other_task}"
|
| 211 |
+
task_specifics += specs
|
| 212 |
+
state["task_ids"] = task_specifics
|
| 213 |
|
| 214 |
|
| 215 |
leftcol.markdown("### Languages")
|
| 216 |
+
state["multilinguality"] = multiselect(
|
|
|
|
| 217 |
leftcol,
|
| 218 |
"Monolingual?",
|
| 219 |
"Does the dataset contain more than one language?",
|
|
|
|
| 222 |
format_func=lambda m: f"{m} : {multilinguality_set[m]}",
|
| 223 |
)
|
| 224 |
|
| 225 |
+
if "other" in state["multilinguality"]:
|
| 226 |
other_multilinguality = st.text_input(
|
| 227 |
"You selected 'other' type of multilinguality. Please enter a short hyphen-separated description:",
|
| 228 |
value="my-multilinguality",
|
| 229 |
)
|
| 230 |
st.write(f"Registering other-{other_multilinguality} multilinguality")
|
| 231 |
+
state["multilinguality"][state["multilinguality"].index("other")] = f"other-{other_multilinguality}"
|
| 232 |
|
| 233 |
+
state["languages"] = multiselect(
|
|
|
|
| 234 |
leftcol,
|
| 235 |
"Languages",
|
| 236 |
"What languages are represented in the dataset?",
|
|
|
|
| 241 |
|
| 242 |
|
| 243 |
leftcol.markdown("### Dataset creators")
|
| 244 |
+
state["language_creators"] = multiselect(
|
| 245 |
leftcol,
|
| 246 |
"Data origin",
|
| 247 |
"Where does the text in the dataset come from?",
|
| 248 |
values=state["language_creators"],
|
| 249 |
valid_set=creator_set["language"],
|
| 250 |
)
|
| 251 |
+
state["annotations_creators"] = multiselect(
|
| 252 |
leftcol,
|
| 253 |
"Annotations origin",
|
| 254 |
"Where do the annotations in the dataset come from?",
|
|
|
|
| 257 |
)
|
| 258 |
|
| 259 |
|
| 260 |
+
state["licenses"] = multiselect(
|
| 261 |
leftcol,
|
| 262 |
"Licenses",
|
| 263 |
"What licenses is the dataset under?",
|
|
|
|
| 265 |
values=state["licenses"],
|
| 266 |
format_func=lambda l: f"{l} : {license_set[l]}",
|
| 267 |
)
|
| 268 |
+
if "other" in state["licenses"]:
|
| 269 |
other_license = st.text_input(
|
| 270 |
"You selected 'other' type of license. Please enter a short hyphen-separated description:",
|
| 271 |
value="my-license",
|
| 272 |
)
|
| 273 |
st.write(f"Registering other-{other_license} license")
|
| 274 |
+
state["licenses"][state["licenses"].index("other")] = f"other-{other_license}"
|
| 275 |
|
| 276 |
# link to supported datasets
|
| 277 |
pre_select_ext_a = []
|
|
|
|
| 279 |
pre_select_ext_a += ["original"]
|
| 280 |
if any([p.startswith("extended") for p in state["source_datasets"]]):
|
| 281 |
pre_select_ext_a += ["extended"]
|
| 282 |
+
state["extended"] = multiselect(
|
| 283 |
leftcol,
|
| 284 |
"Relations to existing work",
|
| 285 |
"Does the dataset contain original data and/or was it extended from other datasets?",
|
| 286 |
values=pre_select_ext_a,
|
| 287 |
valid_set=["original", "extended"],
|
| 288 |
)
|
| 289 |
+
state["source_datasets"] = ["original"] if "original" in state["extended"] else []
|
| 290 |
|
| 291 |
+
if "extended" in state["extended"]:
|
|
|
|
| 292 |
pre_select_ext_b = [p.split("|")[1] for p in state["source_datasets"] if p.startswith("extended")]
|
| 293 |
extended_sources = multiselect(
|
| 294 |
leftcol,
|
|
|
|
| 304 |
)
|
| 305 |
st.write(f"Registering other-{other_extended_sources} dataset")
|
| 306 |
extended_sources[extended_sources.index("other")] = f"other-{other_extended_sources}"
|
| 307 |
+
state["source_datasets"] += [f"extended|{src}" for src in extended_sources]
|
| 308 |
|
| 309 |
size_cats = ["unknown", "n<1K", "1K<n<10K", "10K<n<100K", "100K<n<1M", "n>1M"]
|
| 310 |
current_size_cats = state.get("size_categories") or ["unknown"]
|
| 311 |
ok, nonok = split_known(current_size_cats, size_cats)
|
| 312 |
if len(nonok) > 0:
|
| 313 |
leftcol.markdown(f"**Found bad codes in existing tagset**:\n{nonok}")
|
| 314 |
+
state["size_categories"] = [
|
| 315 |
+
leftcol.selectbox(
|
| 316 |
+
"What is the size category of the dataset?",
|
| 317 |
+
options=size_cats,
|
| 318 |
+
index=size_cats.index(ok[0]) if len(ok) > 0 else 0,
|
| 319 |
+
)
|
| 320 |
+
]
|
| 321 |
|
| 322 |
|
| 323 |
########################
|
| 324 |
## Show results
|
| 325 |
########################
|
| 326 |
+
try:
|
| 327 |
+
DatasetMetadata(**state)
|
| 328 |
+
valid = "✅ Validated! Copy it into your dataset's `README.md` header! 🤗 "
|
| 329 |
+
except Exception as e:
|
| 330 |
+
valid = f"""🙁 Could not validate:
|
| 331 |
+
```{e}```
|
| 332 |
+
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 333 |
rightcol.markdown(
|
| 334 |
f"""
|
| 335 |
### Finalized tag set
|
| 336 |
|
| 337 |
+
{valid}
|
| 338 |
|
| 339 |
```yaml
|
| 340 |
+
{yaml.dump(state)}
|
| 341 |
+
```
|
| 342 |
+
""",
|
| 343 |
)
|