Spaces:

huggingface
/

datasets-tagging

Running

datasets-tagging / build_metadata_file.py

Quentin Lhoest

update task taxonomy

9242f47 over 3 years ago

1.93 kB

	#!/usr/bin/env python

	""" This script will clone the `datasets` repository in your current directory and parse all currently available
	metadata, from the `README.md` yaml headers and the automatically generated json files.
	It dumps the results in a `metadata_{current-commit-of-datasets}.json` file.
	"""

	import json
	from pathlib import Path
	from subprocess import check_call, check_output
	from typing import Dict

	import yaml

	from apputils import new_state

	# Branch of the `datasets` repository that `load_ds_datas` passes to
	# `git clone -b` when no local `datasets` checkout exists yet.
	DATASETS_BRANCH = "tasks-alignment-with-models"


	def metadata_from_readme(f: Path) -> Dict:
	    """Parse the YAML front-matter block at the top of a README file.

	    The front matter is the text between a first line that is exactly
	    ``---`` and the next line that is exactly ``---`` (trailing whitespace
	    on each line is ignored).

	    Args:
	        f: Path of the README file to read.

	    Returns:
	        Whatever ``yaml.safe_load`` produces for the block, or an empty dict
	        when that value is falsy (e.g. an empty block). ``None`` when the
	        file is empty, does not start with ``---``, or has no closing
	        ``---`` line.

	    Raises:
	        OSError: if the file cannot be opened.
	        UnicodeDecodeError: if the file is not valid UTF-8.
	        yaml.YAMLError: if the front matter is not valid YAML.
	    """
	    # Read as UTF-8 explicitly so the result does not depend on the
	    # platform's default locale encoding.
	    with f.open(encoding="utf-8") as fi:
	        content = [line.rstrip() for line in fi]

	    # An empty file has no first line to inspect; treat it like a README
	    # without front matter instead of raising IndexError.
	    if not content or content[0] != "---":
	        return None

	    # Look for the closing fence starting after the opening one.
	    try:
	        closing = content.index("---", 1)
	    except ValueError:
	        return None

	    return yaml.safe_load("\n".join(content[1:closing])) or dict()


	def load_ds_datas():
	    """Collect README and ``dataset_infos.json`` metadata for every dataset.

	    When a ``datasets`` directory already exists in the current working
	    directory it is updated with ``git pull``; otherwise the
	    ``DATASETS_BRANCH`` branch of the repository is cloned there. Every
	    entry of ``datasets/datasets`` is then visited in name order.

	    Returns:
	        A ``(head_sha, datasets_md)`` tuple: the HEAD commit hash of the
	        checkout as a stripped string, and a dict mapping each entry name
	        to ``{"metadata": ..., "infos": ...}``. ``metadata`` falls back to
	        ``new_state()`` when the README yields nothing usable; ``infos`` is
	        ``None`` when ``dataset_infos.json`` cannot be loaded.

	    Raises:
	        subprocess.CalledProcessError: if a git command exits non-zero.
	    """
	    drepo = Path("datasets")
	    if drepo.is_dir():
	        # NOTE(review): this pulls whichever branch is currently checked
	        # out in the existing clone; it does not switch to DATASETS_BRANCH.
	        check_call(["git", "pull"], cwd=drepo)
	    else:
	        check_call(["git", "clone", "-b", DATASETS_BRANCH, "https://github.com/huggingface/datasets.git"])
	    head_sha = check_output(["git", "rev-parse", "HEAD"], cwd=drepo)

	    datasets_md = dict()

	    for ddir in sorted((drepo / "datasets").iterdir(), key=lambda d: d.name):
	        # Reading metadata is best effort: a missing or malformed README
	        # must not abort the whole run. `except Exception` (rather than a
	        # bare `except`) still lets Ctrl-C and SystemExit propagate.
	        try:
	            metadata = metadata_from_readme(ddir / "README.md")
	        except Exception:
	            metadata = None
	        # Truthiness covers None and empty containers, and does not fail on
	        # values without a length.
	        if not metadata:
	            metadata = new_state()

	        # Same best-effort policy for the generated infos file.
	        try:
	            with (ddir / "dataset_infos.json").open(encoding="utf-8") as fi:
	                infos = json.load(fi)
	        except Exception:
	            infos = None

	        datasets_md[ddir.name] = dict(metadata=metadata, infos=infos)
	    return head_sha.decode().strip(), datasets_md


	if __name__ == "__main__":
	    # Script entry point: gather all metadata, then serialise it to a JSON
	    # file named after the commit of the `datasets` checkout.
	    commit, all_metadata = load_ds_datas()
	    out_name = f"metadata_{commit}.json"
	    print(f"writing to '{out_name}'")
	    with open(out_name, "w") as sink:
	        serialized = json.dumps(all_metadata)
	        sink.write(serialized)