theo committed · 26742b2 · Parent(s): 2656c08

refactor multiselect to handle error uniformly

Files changed:
- tagging_app.py (+92 −53)

tagging_app.py CHANGED
```diff
@@ -1,6 +1,6 @@
 import json
 from pathlib import Path
-from typing import List, Tuple
+from typing import Callable, List, Tuple
 
 import streamlit as st
 import yaml
```
```diff
@@ -59,10 +59,32 @@ def load_ds_datas():
 
 
 def split_known(vals: List[str], okset: List[str]) -> Tuple[List[str], List[str]]:
+    if vals is None:
+        return [], []
    return [v for v in vals if v in okset], [v for v in vals if v not in okset]
 
 
-def new_pre_loaded():
+def multiselect(
+    w: st.delta_generator.DeltaGenerator,
+    title: str,
+    markdown: str,
+    values: List[str],
+    valid_set: List[str],
+    format_func: Callable = str,
+):
+    valid_values, invalid_values = split_known(values, valid_set)
+    w.markdown(
+        """
+#### {title}
+{errors}
+""".format(
+            title=title, errors="" if len(invalid_values) == 0 else f"_Found invalid values:_ `{invalid_values}`"
+        )
+    )
+    return w.multiselect(markdown, valid_set, default=valid_values, format_func=format_func)
+
+
+def new_state():
     return {
         "task_categories": [],
         "task_ids": [],
```
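The new `multiselect` wrapper is the heart of this commit: every tag widget now funnels its pre-loaded values through `split_known`, surfaces the invalid ones in a markdown notice, and passes only valid ones to Streamlit as defaults. A minimal sketch of that filtering step on its own (the language codes here are invented for illustration):

```python
from typing import List, Tuple

def split_known(vals: List[str], okset: List[str]) -> Tuple[List[str], List[str]]:
    # New guard from this commit: a missing/None tag list no longer raises.
    if vals is None:
        return [], []
    return [v for v in vals if v in okset], [v for v in vals if v not in okset]

# Valid values become the widget defaults; invalid ones end up in the
# "_Found invalid values:_" notice rendered above the widget.
valid, invalid = split_known(["en", "xx-not-a-code"], ["en", "fr", "de"])
print(valid)    # ['en']
print(invalid)  # ['xx-not-a-code']
```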
```diff
@@ -76,7 +98,7 @@ def new_pre_loaded():
 }
 
 
-pre_loaded = new_pre_loaded()
+state = new_state()
 datasets_md = load_ds_datas()
 existing_tag_sets = {name: mds["metadata"] for name, mds in datasets_md.items()}
 all_dataset_ids = list(existing_tag_sets.keys())
```
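`existing_tag_sets[...]` holds whatever `metadata` a dataset shipped, which may be `None` or empty; the `or new_state()` fallback used below covers both cases, since `None` and `{}` are falsy in Python. A toy illustration (the dataset entries are invented):

```python
def new_state():
    # Trimmed to two keys here; the real function returns the full empty tag set.
    return {"task_categories": [], "task_ids": []}

existing_tag_sets = {
    "some-tagged-dataset": {"task_categories": ["question-answering"], "task_ids": []},
    "some-untagged-dataset": None,
}

state = existing_tag_sets["some-untagged-dataset"] or new_state()
print(state["task_categories"])  # [] -- fell back to a fresh state
```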
````diff
@@ -112,39 +134,50 @@ preloaded_id = None
 did_index = 0
 if len(preload) == 1 and preload[0] in all_dataset_ids:
     preloaded_id, *_ = preload
-    pre_loaded = existing_tag_sets[preloaded_id]
+    state = existing_tag_sets[preloaded_id] or new_state()
     did_index = all_dataset_ids.index(preloaded_id)
 
 did = st.sidebar.selectbox(label="Choose dataset to load tag set from", options=all_dataset_ids, index=did_index)
 
 leftbtn, rightbtn = st.sidebar.beta_columns(2)
 if leftbtn.button("pre-load tagset"):
-    pre_loaded = existing_tag_sets[did]
+    state = existing_tag_sets[did] or new_state()
     st.experimental_set_query_params(preload_dataset=did)
 if rightbtn.button("flush state"):
-    pre_loaded = new_pre_loaded()
+    state = new_state()
     st.experimental_set_query_params()
 
 if preloaded_id is not None:
-    st.sidebar.markdown(…)
+    st.sidebar.markdown(
+        f"""
+    Took [`{preloaded_id}`](https://huggingface.co/datasets/{preloaded_id}) as base tagset:
+    ```yaml
+    {yaml.dump(state)}
+    ```
+    """
+    )
 
 
 leftcol, _, rightcol = st.beta_columns([12, 1, 12])
 
 
 leftcol.markdown("### Supported tasks")
-task_categories = leftcol.multiselect(
+task_categories = multiselect(
+    leftcol,
+    "Task category",
     "What categories of task does the dataset support?",
-    options=list(task_set.keys()),
-    default=pre_loaded["task_categories"],
-    format_func=lambda tg: f"{tg}…
+    values=state["task_categories"],
+    valid_set=list(task_set.keys()),
+    format_func=lambda tg: f"{tg}: {task_set[tg]['description']}",
 )
 task_specifics = []
 for tg in task_categories:
-    task_specs = leftcol.multiselect(
+    task_specs = multiselect(
+        leftcol,
+        "Specific tasks",
         f"What specific *{tg}* tasks does the dataset support?",
-        options=task_set[tg]["options"],
-        default=[ts for ts in pre_loaded["task_ids"] if ts in task_set[tg]["options"]],
+        values=[ts for ts in state["task_ids"] if ts in task_set[tg]["options"]],
+        valid_set=task_set[tg]["options"],
     )
     if "other" in task_specs:
         other_task = st.text_input(
````
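The two buttons round-trip the chosen dataset through the URL query string, so a rerun of the script can rebuild `preload`. A sketch of both sides, assuming the read side (not shown in this hunk) uses `st.experimental_get_query_params`; the dataset id is a placeholder:

```python
import streamlit as st

# Write side ("pre-load tagset" button): persist the choice in the URL.
st.experimental_set_query_params(preload_dataset="some-dataset-id")

# Read side (top of the script on the next rerun): values come back as
# {name: [values]} lists, which is why the app unpacks with
# `preloaded_id, *_ = preload`.
preload = st.experimental_get_query_params().get("preload_dataset", [])
```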
```diff
@@ -157,13 +190,13 @@ for tg in task_categories:
 
 
 leftcol.markdown("### Languages")
-filtered_existing_languages = [lgc for lgc in set(pre_loaded["languages"]) if lgc not in language_set_restricted]
-pre_loaded["languages"] = [lgc for lgc in set(pre_loaded["languages"]) if lgc in language_set_restricted]
 
-multilinguality = leftcol.multiselect(
+multilinguality = multiselect(
+    leftcol,
+    "Monolingual?",
     "Does the dataset contain more than one language?",
-    options=list(multilinguality_set.keys()),
-    default=pre_loaded["multilinguality"],
+    values=state["multilinguality"],
+    valid_set=list(multilinguality_set.keys()),
     format_func=lambda m: f"{m} : {multilinguality_set[m]}",
 )
 
```
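This hunk is where the commit title pays off: the hand-rolled `filtered_existing_languages` bookkeeping disappears because the shared `multiselect` helper now splits out bad codes and renders the notice itself. The notice is plain markdown formatting, roughly as below (the invalid code is invented):

```python
title = "Languages"
invalid_values = ["xx-not-a-code"]

# Same string the helper builds before calling w.markdown(...).
notice = "" if len(invalid_values) == 0 else f"_Found invalid values:_ `{invalid_values}`"
print(f"#### {title}\n{notice}")
```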
```diff
@@ -175,41 +208,40 @@ if "other" in multilinguality:
     st.write(f"Registering other-{other_multilinguality} multilinguality")
     multilinguality[multilinguality.index("other")] = f"other-{other_multilinguality}"
 
-…
-…
-languages = leftcol.multiselect(
+
+languages = multiselect(
+    leftcol,
+    "Languages",
     "What languages are represented in the dataset?",
-    options=list(language_set_restricted.keys()),
-    default=pre_loaded["languages"],
+    values=state["languages"],
+    valid_set=list(language_set_restricted.keys()),
     format_func=lambda m: f"{m} : {language_set_restricted[m]}",
 )
 
 
 leftcol.markdown("### Dataset creators")
-…
-…
-…
-language_creators = leftcol.multiselect(
+language_creators = multiselect(
+    leftcol,
+    "Data origin",
     "Where does the text in the dataset come from?",
-    options=creator_set["language"],
-    default=pre_loaded["language_creators"],
+    values=state["language_creators"],
+    valid_set=creator_set["language"],
 )
-…
-…
-…
-annotations_creators = leftcol.multiselect(
+annotations_creators = multiselect(
+    leftcol,
+    "Annotations origin",
     "Where do the annotations in the dataset come from?",
-    options=creator_set["annotations"],
-    default=pre_loaded["annotations_creators"],
+    values=state["annotations_creators"],
+    valid_set=creator_set["annotations"],
 )
 
-…
-…
-licenses = leftcol.multiselect(
-…
+
+licenses = multiselect(
+    leftcol,
+    "Licenses",
     "What licenses is the dataset under?",
-    options=list(license_set.keys()),
-    default=pre_loaded["licenses"],
+    valid_set=list(license_set.keys()),
+    values=state["licenses"],
     format_func=lambda l: f"{l} : {license_set[l]}",
 )
 if "other" in licenses:
```
```diff
@@ -219,24 +251,31 @@ if "other" in licenses:
     )
     st.write(f"Registering other-{other_license} license")
     licenses[licenses.index("other")] = f"other-{other_license}"
-…
+
+# link to supported datasets
 pre_select_ext_a = []
-if "original" in pre_loaded["source_datasets"]:
+if "original" in state["source_datasets"]:
     pre_select_ext_a += ["original"]
-if any([p.startswith("extended") for p in pre_loaded["source_datasets"]]):
+if any([p.startswith("extended") for p in state["source_datasets"]]):
     pre_select_ext_a += ["extended"]
-extended = leftcol.multiselect(
+extended = multiselect(
+    leftcol,
+    "Relations to existing work",
     "Does the dataset contain original data and/or was it extended from other datasets?",
-    options=["original", "extended"],
-    default=pre_select_ext_a,
+    values=pre_select_ext_a,
+    valid_set=["original", "extended"],
 )
 source_datasets = ["original"] if "original" in extended else []
+
+# todo: show bad tags
 if "extended" in extended:
-    pre_select_ext_b = [p.split("|")[1] for p in pre_loaded["source_datasets"] if p.startswith("extended")]
-    extended_sources = leftcol.multiselect(
+    pre_select_ext_b = [p.split("|")[1] for p in state["source_datasets"] if p.startswith("extended")]
+    extended_sources = multiselect(
+        leftcol,
+        "Linked datasets",
         "Which other datasets does this one use data from?",
-        options=all_dataset_ids + ["other"],
-        default=pre_select_ext_b,
+        values=pre_select_ext_b,
+        valid_set=all_dataset_ids + ["other"],
     )
     if "other" in extended_sources:
         other_extended_sources = st.text_input(
```
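`source_datasets` stores provenance as flat strings: `"original"` plus one `"extended|<dataset_id>"` entry per upstream dataset, and the pre-selection code above simply inverts that encoding. A round-trip sketch with invented ids:

```python
source_datasets = ["original", "extended|squad", "extended|other-upstream"]

# Derive the widget pre-selections from the stored tags, as the app does.
pre_select_ext_a = []
if "original" in source_datasets:
    pre_select_ext_a += ["original"]
if any(p.startswith("extended") for p in source_datasets):
    pre_select_ext_a += ["extended"]
pre_select_ext_b = [p.split("|")[1] for p in source_datasets if p.startswith("extended")]

print(pre_select_ext_a)  # ['original', 'extended']
print(pre_select_ext_b)  # ['squad', 'other-upstream']

# Rebuild the tag list from widget output, mirroring the write path.
rebuilt = ["original"] + [f"extended|{src}" for src in pre_select_ext_b]
print(rebuilt == source_datasets)  # True
```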
```diff
@@ -248,7 +287,7 @@ if "extended" in extended:
     source_datasets += [f"extended|{src}" for src in extended_sources]
 
 size_cats = ["unknown", "n<1K", "1K<n<10K", "10K<n<100K", "100K<n<1M", "n>1M"]
-current_size_cats = pre_loaded["size_categories"] or ["unknown"]
+current_size_cats = state.get("size_categories") or ["unknown"]
 ok, nonok = split_known(current_size_cats, size_cats)
 if len(nonok) > 0:
     leftcol.markdown(f"**Found bad codes in existing tagset**:\n{nonok}")
```
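The size-category check reuses `split_known` directly, and the switch to `state.get("size_categories") or ["unknown"]` makes a missing or empty key default safely instead of raising a `KeyError`. For example (the stale code is invented):

```python
size_cats = ["unknown", "n<1K", "1K<n<10K", "10K<n<100K", "100K<n<1M", "n>1M"]

def split_known(vals, okset):
    if vals is None:
        return [], []
    return [v for v in vals if v in okset], [v for v in vals if v not in okset]

state = {}  # e.g. a tag set that never recorded sizes
current_size_cats = state.get("size_categories") or ["unknown"]
ok, nonok = split_known(current_size_cats + ["10k-100k"], size_cats)
print(ok)     # ['unknown']
print(nonok)  # ['10k-100k']
```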