| import os, sys | |
| import argparse | |
| from datasets import load_dataset, concatenate_datasets, Dataset | |
| from huggingface_hub import login | |
# Put the directory containing this script at the front of the import search
# path so modules located next to it can be imported.
path = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, path)
def merge_dataset(datapaths, split="train") -> Dataset:
    """Load several datasets and concatenate them into a single one.

    Args:
        datapaths: Comma-separated dataset identifiers; each entry is passed
            to ``load_dataset``. Whitespace around an entry and empty entries
            (for example a trailing comma) are ignored.
        split: Name of the split requested from every dataset. Defaults to
            ``"train"``.

    Returns:
        The loaded dataset when only one entry is given, otherwise the
        result of ``concatenate_datasets`` over all loaded datasets, in the
        order they were listed.

    Raises:
        ValueError: If ``datapaths`` contains no non-empty entry.
    """
    names = [name.strip() for name in datapaths.split(",") if name.strip()]
    if not names:
        raise ValueError("datapaths must contain at least one dataset path")

    parts = [load_dataset(name, split=split) for name in names]
    if len(parts) == 1:
        return parts[0]
    # The merged result must be what is returned; previously the result of
    # the concatenation was discarded and only the first dataset came back.
    # NOTE(review): assumes the listed datasets are compatible for
    # concatenation -- confirm against the datasets being merged.
    return concatenate_datasets(parts)
if __name__ == "__main__":
    # Command-line interface: the datasets to merge and the Hub credentials.
    parser = argparse.ArgumentParser()
    parser.add_argument("--datapaths", type=str, default="")
    parser.add_argument("--huggingface_hub_token", type=str, default="")
    # NOTE(review): --split is parsed but not used anywhere below.
    parser.add_argument("--split", type=str, default="train")
    args = parser.parse_args()

    # Fail early, before contacting the Hub, when there is nothing to merge.
    if not args.datapaths.strip():
        parser.error("--datapaths must list at least one dataset")

    # Echo the effective arguments, masking the access token so the secret
    # does not end up in terminal scrollback or captured logs.
    shown = {
        key: ("***" if key == "huggingface_hub_token" and value else value)
        for key, value in vars(args).items()
    }
    print("=========================================")
    print('\n'.join(f' + {k}={v}' for k, v in shown.items()))
    print("=========================================")

    login(token=args.huggingface_hub_token)
    print("Successfully logged in to Huggingface Hub")

    dataset = merge_dataset(datapaths=args.datapaths)

    # Repository id under which the merged dataset is pushed.
    DATASET_ID = "qds-triplet-dialogsum"
    dataset.push_to_hub(DATASET_ID)
    print(f"Successful push to Huggingface Hub: {DATASET_ID}")