Spaces:
Running
Running
| #!/usr/bin/env python | |
| """ This script will clone the `datasets` repository in your current directory and parse all currently available | |
| metadata, from the `README.md` yaml headers and the automatically generated json files. | |
| It dumps the results in a `metadata_{current-commit-of-datasets}.json` file. | |
| """ | |
| import json | |
| from pathlib import Path | |
| from subprocess import check_call, check_output | |
| from typing import Dict | |
| import yaml | |
| from apputils import new_state | |
| DATASETS_BRANCH = "tasks-alignment-with-models" | |
def metadata_from_readme(f: Path) -> Dict:
    """Parse the YAML front matter at the top of a README file.

    The front matter is the block that starts with a ``---`` line on the very
    first line of the file and ends at the next ``---`` line.

    Args:
        f: Path of the README file to read.

    Returns:
        The parsed front matter, or an empty dict when the YAML parses to a
        falsy value (e.g. an empty block). Returns ``None`` when the file is
        empty, does not start with ``---``, or has no closing ``---`` line.
        NOTE: the ``Dict`` annotation does not reflect the ``None`` case.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
        Any error raised by ``yaml.safe_load`` on malformed YAML.
    """
    # Dataset cards are expected to be UTF-8; do not depend on the locale.
    with f.open(encoding="utf-8") as fi:
        content = [line.rstrip() for line in fi]

    # An empty file has no first line to inspect (this used to raise
    # IndexError); a file not starting with "---" has no front matter.
    if not content or content[0] != "---":
        return None

    # Locate the closing delimiter, searching after the opening line.
    try:
        closing = content.index("---", 1)
    except ValueError:
        return None

    yamlblock = "\n".join(content[1:closing])
    return yaml.safe_load(yamlblock) or dict()
def load_ds_datas():
    """Fetch the `datasets` repository and collect per-dataset metadata.

    If a ``datasets`` directory already exists in the current working
    directory, ``git pull`` is run inside it; otherwise the repository is
    cloned on the ``DATASETS_BRANCH`` branch. Every entry of
    ``datasets/datasets`` is then visited in name order.

    Returns:
        A tuple ``(head_sha, datasets_md)`` where ``head_sha`` is the output of
        ``git rev-parse HEAD`` as a stripped string and ``datasets_md`` maps
        each entry name to ``dict(metadata=..., infos=...)``. ``infos`` is the
        parsed ``dataset_infos.json`` or ``None`` when it cannot be loaded.

    Raises:
        subprocess.CalledProcessError: If a git command fails.
    """
    drepo = Path("datasets")
    if drepo.exists() and drepo.is_dir():
        check_call(["git", "pull"], cwd=drepo)
    else:
        check_call(["git", "clone", "-b", DATASETS_BRANCH, "https://github.com/huggingface/datasets.git"])
    head_sha = check_output(["git", "rev-parse", "HEAD"], cwd=drepo)

    datasets_md = dict()

    for ddir in sorted((drepo / "datasets").iterdir(), key=lambda d: d.name):
        # Best effort: a missing or malformed README must not abort the run.
        # Catch Exception (not a bare except) so Ctrl-C / SystemExit still
        # propagate.
        try:
            metadata = metadata_from_readme(ddir / "README.md")
        except Exception:
            metadata = None
        # Fall back to whatever new_state() provides when no usable header
        # was found (presumably a blank metadata template -- confirm in
        # apputils).
        if not metadata:
            metadata = new_state()

        # Same best-effort policy for the generated infos file.
        try:
            with (ddir / "dataset_infos.json").open(encoding="utf-8") as fi:
                infos = json.load(fi)
        except Exception:
            infos = None

        datasets_md[ddir.name] = dict(metadata=metadata, infos=infos)
    return head_sha.decode().strip(), datasets_md
if __name__ == "__main__":
    # Collect everything first, then dump it into a file named after the
    # commit of the `datasets` checkout that was parsed.
    head_sha, datas = load_ds_datas()
    fn = "metadata_{}.json".format(head_sha)
    print(f"writing to '{fn}'")
    serialized = json.dumps(datas)
    Path(fn).write_text(serialized)