Commit 8b77729
Parent(s): 08a65ff
theo committed:
dockerfile builder + metadata builder

Files changed:
- .gitignore (+1, -0)
- build_docker_image.sh (+21, -0)
- build_metadata_file.py (+56, -0)
- language_set.json (+0, -1)
- tagging_app.py (+82, -54)
.gitignore (ADDED)

```diff
@@ -0,0 +1 @@
+.idea
```
build_docker_image.sh (ADDED)

```diff
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+
+
+cleanup() {
+  rm -f Dockerfile
+}
+
+trap cleanup ERR EXIT
+
+cat > Dockerfile << EOF
+FROM python
+COPY requirements.txt .
+COPY tagging_app.py .
+RUN pip install -r requirements.txt
+CMD ["streamlit", "run", "tagging_app.py"]
+EOF
+
+set -eEx
+
+./build_metadata_file.py
+docker build -t dataset-tagger .
```
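A note on running the result: the generated Dockerfile copies only `requirements.txt` and `tagging_app.py` into the image, while the app also reads `task_set.json`, `license_set.json`, `language_set.json`, and a `metadata_*.json` export at runtime, so those presumably have to be mounted or baked in separately; something like `docker run -p 8501:8501 -v "$PWD":/app -w /app dataset-tagger` would do it (8501 is Streamlit's default port; the run command is an assumption, not part of this commit). Also, `./build_metadata_file.py` is invoked directly, so it needs its executable bit set.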
    	
build_metadata_file.py (ADDED)

```diff
@@ -0,0 +1,56 @@
+#!/usr/bin/env python
+
+""" This script will clone the `datasets` repository in your current directory and parse all currently available
+    metadata, from the `README.md` yaml headers and the automatically generated json files.
+    It dumps the results in a `metadata_{current-commit-of-datasets}.json` file.
+"""
+
+import json
+from pathlib import Path
+from subprocess import check_call, check_output
+from typing import Dict
+
+import yaml
+
+
+def metadata_from_readme(f: Path) -> Dict:
+    with f.open() as fi:
+        content = [line.strip() for line in fi]
+
+    if content[0] == "---" and "---" in content[1:]:
+        yamlblock = "\n".join(content[1 : content[1:].index("---") + 1])
+        return yaml.safe_load(yamlblock) or dict()
+
+
+def load_ds_datas():
+    drepo = Path("datasets")
+    if drepo.exists() and drepo.is_dir():
+        check_call(["git", "pull"], cwd=str((Path.cwd() / "datasets").absolute()))
+    else:
+        check_call(["git", "clone", "https://github.com/huggingface/datasets.git"])
+    head_sha = check_output(["git", "rev-parse", "HEAD"])
+
+    datasets_md = dict()
+
+    for ddir in sorted((drepo / "datasets").iterdir(), key=lambda d: d.name):
+
+        try:
+            metadata = metadata_from_readme(ddir / "README.md")
+        except:
+            metadata = None
+
+        try:
+            with (ddir / "dataset_infos.json").open() as fi:
+                infos = json.load(fi)
+        except:
+            infos = None
+
+        if metadata is not None and len(metadata) > 0:
+            datasets_md[ddir.name] = dict(metadata=metadata, infos=infos)
+    return head_sha.decode().strip(), datasets_md
+
+
+if __name__ == "__main__":
+    head_sha, datas = load_ds_datas()
+    with open(f"metadata_{head_sha}.json", "w") as fi:
+        fi.write(json.dumps(datas))
```
        language_set.json
    CHANGED
    
    | 
         @@ -345,7 +345,6 @@ 
     | 
|
| 345 | 
         
             
                "pro": "Old Proven\u00e7al (to 1500), Old Occitan (to 1500)",
         
     | 
| 346 | 
         
             
                "ps": "Pushto, Pashto",
         
     | 
| 347 | 
         
             
                "pt": "Portuguese",
         
     | 
| 348 | 
         
            -
                "qaa..qtz": "Private use",
         
     | 
| 349 | 
         
             
                "qu": "Quechua",
         
     | 
| 350 | 
         
             
                "raj": "Rajasthani",
         
     | 
| 351 | 
         
             
                "rap": "Rapanui",
         
     | 
| 
         | 
|
| 345 | 
         
             
                "pro": "Old Proven\u00e7al (to 1500), Old Occitan (to 1500)",
         
     | 
| 346 | 
         
             
                "ps": "Pushto, Pashto",
         
     | 
| 347 | 
         
             
                "pt": "Portuguese",
         
     | 
| 
         | 
|
| 348 | 
         
             
                "qu": "Quechua",
         
     | 
| 349 | 
         
             
                "raj": "Rajasthani",
         
     | 
| 350 | 
         
             
                "rap": "Rapanui",
         
     | 
    	
tagging_app.py (CHANGED)

````diff
@@ -1,9 +1,7 @@
 import json
-import …
-from …
-from glob import glob
+from pathlib import Path
+from typing import List, Tuple
 
-import datasets
 import streamlit as st
 import yaml
 
@@ -17,7 +15,6 @@ st.set_page_config(
 task_set = json.load(open("task_set.json"))
 license_set = json.load(open("license_set.json"))
 language_set_restricted = json.load(open("language_set.json"))
-language_set = json.load(open("language_set_full.json"))
 
 multilinguality_set = {
     "monolingual": "contains a single language",
@@ -49,13 +46,21 @@ creator_set = {
 ########################
 
 
-…
-…
-    …
-        …
-        …
-        …
-    …
+@st.cache(allow_output_mutation=True)
+def load_ds_datas():
+    metada_exports = sorted(
+        [f for f in Path.cwd().iterdir() if f.name.startswith("metadata_")],
+        key=lambda f: f.lstat().st_mtime,
+        reverse=True,
+    )
+    if len(metada_exports) == 0:
+        raise ValueError("need to run ./build_metada_file.py at least once")
+    with metada_exports[0].open() as fi:
+        return json.load(fi)
+
+
+def split_known(vals: List[str], okset: List[str]) -> Tuple[List[str], List[str]]:
+    return [v for v in vals if v in okset], [v for v in vals if v not in okset]
 
 
 def new_pre_loaded():
@@ -73,8 +78,8 @@ def new_pre_loaded():
 
 
 pre_loaded = new_pre_loaded()
-
-existing_tag_sets = …
+datasets_md = load_ds_datas()
+existing_tag_sets = {name: mds["metadata"] for name, mds in datasets_md.items()}
 all_dataset_ids = list(existing_tag_sets.keys())
 
 
@@ -104,34 +109,29 @@ Beware that clicking pre-load will overwrite the current state!
 
 qp = st.experimental_get_query_params()
 preload = qp.get("preload_dataset", list())
-
+preloaded_id = None
+did_index = 0
 if len(preload) == 1 and preload[0] in all_dataset_ids:
-    …
-    …
-    …
-    did_index = all_dataset_ids.index(did_qp)
+    preloaded_id, *_ = preload
+    pre_loaded = existing_tag_sets[preloaded_id] or new_pre_loaded()
+    did_index = all_dataset_ids.index(preloaded_id)
 
 did = st.sidebar.selectbox(label="Choose dataset to load tag set from", options=all_dataset_ids, index=did_index)
-if len(existing_tag_sets[did]) > 1:
-    cid = st.sidebar.selectbox(
-        label="Choose config to load tag set from",
-        options=list(existing_tag_sets[did].keys()),
-        index=0,
-    )
-else:
-    cid = next(iter(existing_tag_sets[did].keys()))
 
-
-
+leftbtn, rightbtn = st.sidebar.beta_columns(2)
+if leftbtn.button("pre-load tagset"):
+    pre_loaded = existing_tag_sets[did] or new_pre_loaded()
     st.experimental_set_query_params(preload_dataset=did)
-if …
+if rightbtn.button("flush state"):
     pre_loaded = new_pre_loaded()
     st.experimental_set_query_params()
 
-
+if preloaded_id is not None:
+    st.sidebar.markdown(f"Took [{preloaded_id}](https://huggingface.co/datasets/{preloaded_id}) as base tagset.")
+
 
-
-pre_loaded["languages"] = list(set(pre_loaded["languages"]))
+leftcol, _, rightcol = st.beta_columns([12, 1, 12])
 
 
 leftcol.markdown("### Supported tasks")
 task_categories = leftcol.multiselect(
@@ -156,13 +156,18 @@ for tg in task_categories:
         task_specs[task_specs.index("other")] = f"{tg}-other-{other_task}"
     task_specifics += task_specs
 
+
 leftcol.markdown("### Languages")
+filtered_existing_languages = [lgc for lgc in set(pre_loaded["languages"]) if lgc not in language_set_restricted]
+pre_loaded["languages"] = [lgc for lgc in set(pre_loaded["languages"]) if lgc in language_set_restricted]
+
 multilinguality = leftcol.multiselect(
     "Does the dataset contain more than one language?",
     options=list(multilinguality_set.keys()),
     default=pre_loaded["multilinguality"],
     format_func=lambda m: f"{m} : {multilinguality_set[m]}",
 )
+
 if "other" in multilinguality:
     other_multilinguality = st.text_input(
         "You selected 'other' type of multilinguality. Please enter a short hyphen-separated description:",
@@ -170,28 +175,42 @@ if "other" in multilinguality:
     )
     st.write(f"Registering other-{other_multilinguality} multilinguality")
     multilinguality[multilinguality.index("other")] = f"other-{other_multilinguality}"
+
+if len(filtered_existing_languages) > 0:
+    leftcol.markdown(f"**Found bad language codes in existing tagset**:\n{filtered_existing_languages}")
 languages = leftcol.multiselect(
     "What languages are represented in the dataset?",
-    options=list(…
+    options=list(language_set_restricted.keys()),
     default=pre_loaded["languages"],
-    format_func=lambda m: f"{m} : {…
+    format_func=lambda m: f"{m} : {language_set_restricted[m]}",
 )
 
+
 leftcol.markdown("### Dataset creators")
+ok, nonok = split_known(pre_loaded["language_creators"], creator_set["language"])
+if len(nonok) > 0:
+    leftcol.markdown(f"**Found bad codes in existing tagset**:\n{nonok}")
 language_creators = leftcol.multiselect(
     "Where does the text in the dataset come from?",
    options=creator_set["language"],
-    default=…
+    default=ok,
 )
+ok, nonok = split_known(pre_loaded["annotations_creators"], creator_set["annotations"])
+if len(nonok) > 0:
+    leftcol.markdown(f"**Found bad codes in existing tagset**:\n{nonok}")
 annotations_creators = leftcol.multiselect(
     "Where do the annotations in the dataset come from?",
     options=creator_set["annotations"],
-    default=…
+    default=ok,
 )
+
+ok, nonok = split_known(pre_loaded["licenses"], list(license_set.keys()))
+if len(nonok) > 0:
+    leftcol.markdown(f"**Found bad codes in existing tagset**:\n{nonok}")
 licenses = leftcol.multiselect(
     "What licenses is the dataset under?",
     options=list(license_set.keys()),
-    default=…
+    default=ok,
     format_func=lambda l: f"{l} : {license_set[l]}",
 )
 if "other" in licenses:
@@ -228,33 +247,42 @@ if "extended" in extended:
         st.write(f"Registering other-{other_extended_sources} dataset")
         extended_sources[extended_sources.index("other")] = f"other-{other_extended_sources}"
     source_datasets += [f"extended|{src}" for src in extended_sources]
+
+size_cats = ["unknown", "n<1K", "1K<n<10K", "10K<n<100K", "100K<n<1M", "n>1M"]
+current_size_cats = pre_loaded.get("size_categories") or ["unknown"]
+ok, nonok = split_known(current_size_cats, size_cats)
+if len(nonok) > 0:
+    leftcol.markdown(f"**Found bad codes in existing tagset**:\n{nonok}")
 size_category = leftcol.selectbox(
     "What is the size category of the dataset?",
-    options=…
-    index=[…
-        (pre_loaded.get("size_categories") or ["unknown"])[0]
-    ),
+    options=size_cats,
+    index=size_cats.index(ok[0]) if len(ok) > 0 else 0,
 )
 
 
 ########################
 ## Show results
 ########################
+yamlblock = yaml.dump(
+    {
+        "task_categories": task_categories,
+        "task_ids": task_specifics,
+        "multilinguality": multilinguality,
+        "languages": languages,
+        "language_creators": language_creators,
+        "annotations_creators": annotations_creators,
+        "source_datasets": source_datasets,
+        "size_categories": size_category,
+        "licenses": licenses,
+    }
+)
 rightcol.markdown(
     f"""
 ### Finalized tag set
-{…
-
-    "task_ids": task_specifics,
-    "multilinguality": multilinguality,
-    "languages": languages,
-    "language_creators": language_creators,
-    "annotations_creators": annotations_creators,
-    "source_datasets": source_datasets,
-    "size_categories": size_category,
-    "licenses": licenses,
-})}
-```
-"""
+
+Copy it into your dataset's `README.md` header! 🤗
+
 ```yaml
+{yamlblock}
+```""",
 )
````
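The recurring `ok, nonok = split_known(...)` pattern in the diff above is what lets the app pre-load an existing tag set without feeding stale values into the multiselect widgets: only codes the UI still accepts become defaults, and the rest are surfaced as "bad code" warnings. A standalone sketch of the helper, with hypothetical tag values:

```python
from typing import List, Tuple


def split_known(vals: List[str], okset: List[str]) -> Tuple[List[str], List[str]]:
    # Partition `vals` into (known, unknown) relative to `okset`, preserving
    # order; same one-liner as the helper added in this commit.
    return [v for v in vals if v in okset], [v for v in vals if v not in okset]


# Hypothetical stored tag list: "fr-old" is not a valid option, so it lands in
# `nonok` and gets shown as a warning instead of being passed as an invalid
# multiselect default (which would make Streamlit error out).
ok, nonok = split_known(["en", "fr-old", "de"], ["en", "de", "fr"])
assert ok == ["en", "de"]
assert nonok == ["fr-old"]
```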