Spaces:
Running
Running
Merge pull request #5 from lhoestq/load-local-only-by-default
Browse files
- tagging_app.py +12 -7
tagging_app.py
CHANGED
|
@@ -3,12 +3,18 @@ import datasets
|
|
| 3 |
import json
|
| 4 |
import os
|
| 5 |
import streamlit as st
|
|
|
|
| 6 |
import yaml
|
| 7 |
-
|
| 8 |
from dataclasses import asdict
|
|
|
|
|
|
|
|
|
|
| 9 |
from glob import glob
|
| 10 |
from os.path import join as pjoin
|
| 11 |
|
|
|
|
|
|
|
|
|
|
| 12 |
st.set_page_config(
|
| 13 |
page_title="HF Dataset Tagging App",
|
| 14 |
page_icon="https://huggingface.co/front/assets/huggingface_logo.svg",
|
|
@@ -132,7 +138,7 @@ def load_all_dataset_infos(dataset_list):
|
|
| 132 |
def load_existing_tags():
|
| 133 |
has_tags = {}
|
| 134 |
for fname in glob("saved_tags/*/*/tags.json"):
|
| 135 |
-
_, did, cid, _ = fname.split("/")
|
| 136 |
has_tags[did] = has_tags.get(did, {})
|
| 137 |
has_tags[did][cid] = fname
|
| 138 |
return has_tags
|
|
@@ -160,9 +166,9 @@ to pre-load the tag sets from another dataset or configuration to avoid too much
|
|
| 160 |
The tag sets are saved in JSON format, but you can print a YAML version in the right-most column to copy-paste to the config README.md
|
| 161 |
"""
|
| 162 |
|
| 163 |
-
all_dataset_ids = copy.deepcopy(get_dataset_list())
|
| 164 |
existing_tag_sets = load_existing_tags()
|
| 165 |
-
|
|
|
|
| 166 |
|
| 167 |
st.sidebar.markdown(app_desc)
|
| 168 |
|
|
@@ -181,6 +187,7 @@ dataset_id = st.sidebar.selectbox(
|
|
| 181 |
index=0,
|
| 182 |
)
|
| 183 |
|
|
|
|
| 184 |
if dataset_id == "local dataset":
|
| 185 |
path_to_info = st.sidebar.text_input("Please enter the path to the folder where the dataset_infos.json file was generated", "/path/to/dataset/")
|
| 186 |
if path_to_info not in ["/path/to/dataset/", ""]:
|
|
@@ -249,8 +256,6 @@ c2.markdown(f"### Writing tags for: {dataset_id} / {config_id}")
|
|
| 249 |
##########
|
| 250 |
c2.markdown("#### Pre-loading an existing tag set")
|
| 251 |
|
| 252 |
-
existing_tag_sets = load_existing_tags()
|
| 253 |
-
|
| 254 |
pre_loaded = {
|
| 255 |
"task_categories": [],
|
| 256 |
"task_ids": [],
|
|
@@ -442,7 +447,7 @@ with c3.beta_expander("Show JSON output for the current config"):
|
|
| 442 |
|
| 443 |
with c3.beta_expander("Show YAML output aggregating the tags saved for all configs"):
|
| 444 |
task_saved_configs = dict([
|
| 445 |
-
(fname.split("/")[-2], json.load(open(fname)))
|
| 446 |
for fname in glob(f"saved_tags/{dataset_id}/*/tags.json")
|
| 447 |
])
|
| 448 |
aggregate_config = {}
|
|
|
|
| 3 |
import json
|
| 4 |
import os
|
| 5 |
import streamlit as st
|
| 6 |
+
import sys
|
| 7 |
import yaml
|
|
|
|
| 8 |
from dataclasses import asdict
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from typing import Dict
|
| 11 |
+
|
| 12 |
from glob import glob
|
| 13 |
from os.path import join as pjoin
|
| 14 |
|
| 15 |
+
|
| 16 |
+
load_remote_datasets = "--load_remote_datasets" in sys.argv[1:]
|
| 17 |
+
|
| 18 |
st.set_page_config(
|
| 19 |
page_title="HF Dataset Tagging App",
|
| 20 |
page_icon="https://huggingface.co/front/assets/huggingface_logo.svg",
|
|
|
|
| 138 |
def load_existing_tags():
|
| 139 |
has_tags = {}
|
| 140 |
for fname in glob("saved_tags/*/*/tags.json"):
|
| 141 |
+
_, did, cid, _ = fname.split(os.sep)
|
| 142 |
has_tags[did] = has_tags.get(did, {})
|
| 143 |
has_tags[did][cid] = fname
|
| 144 |
return has_tags
|
|
|
|
| 166 |
The tag sets are saved in JSON format, but you can print a YAML version in the right-most column to copy-paste to the config README.md
|
| 167 |
"""
|
| 168 |
|
|
|
|
| 169 |
existing_tag_sets = load_existing_tags()
|
| 170 |
+
all_dataset_ids = list(existing_tag_sets.keys()) if not load_remote_datasets else copy.deepcopy(get_dataset_list())
|
| 171 |
+
all_dataset_infos = {} if not load_remote_datasets else load_all_dataset_infos(all_dataset_ids)
|
| 172 |
|
| 173 |
st.sidebar.markdown(app_desc)
|
| 174 |
|
|
|
|
| 187 |
index=0,
|
| 188 |
)
|
| 189 |
|
| 190 |
+
all_info_dicts = {}
|
| 191 |
if dataset_id == "local dataset":
|
| 192 |
path_to_info = st.sidebar.text_input("Please enter the path to the folder where the dataset_infos.json file was generated", "/path/to/dataset/")
|
| 193 |
if path_to_info not in ["/path/to/dataset/", ""]:
|
|
|
|
| 256 |
##########
|
| 257 |
c2.markdown("#### Pre-loading an existing tag set")
|
| 258 |
|
|
|
|
|
|
|
| 259 |
pre_loaded = {
|
| 260 |
"task_categories": [],
|
| 261 |
"task_ids": [],
|
|
|
|
| 447 |
|
| 448 |
with c3.beta_expander("Show YAML output aggregating the tags saved for all configs"):
|
| 449 |
task_saved_configs = dict([
|
| 450 |
+
(Path(fname).parent.name, json.load(open(fname)))
|
| 451 |
for fname in glob(f"saved_tags/{dataset_id}/*/tags.json")
|
| 452 |
])
|
| 453 |
aggregate_config = {}
|