# Copyright 2024-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import random

import numpy as np
import torch
from datasets import load_dataset


"""
doc https://huggingface.co/docs/datasets/loading
doc https://huggingface.co/docs/datasets/process
doc https://huggingface.co/blog/llama2#how-to-prompt-llama-2
"""


def set_seed(seed):
    # Seed the stdlib RNG as well: the samplers below rely on random.randint.
    random.seed(seed)
    np.random.seed(seed)
    torch.random.manual_seed(seed)


def sample_train_loaders(name, tokenizer, nsamples=128, seed=0, seqlen=2048):
    set_seed(seed)
    if "wikitext2" in name:
        traindata = load_dataset(
            "wikitext",
            "wikitext-2-raw-v1",
            split="train",
        )
        traindata = "\n\n".join(traindata["text"])
    elif "c4" in name:
        traindata = load_dataset(
            "allenai/c4",
            "allenai--c4",
            data_files={"train": "en/c4-train.00000-of-01024.json.gz"},
            split="train",
        )
        traindata = "\n\n".join(traindata["text"])
    else:
        raise NotImplementedError

    trainloader = []
    for _ in range(nsamples):
        # Pick a random window of 2 * seqlen characters, then keep the first
        # seqlen tokens after tokenization.
        i = random.randint(0, len(traindata) - seqlen * 2 - 1)
        j = i + seqlen * 2
        trainenc = tokenizer(traindata[i:j], return_tensors="pt")
        inp = trainenc.input_ids[:, :seqlen]
        trainloader.append(inp)
    return trainloader


def get_redpajama_train(tokenizer, percent=10, seed=3, batch_size=128, max_length=2048):
    def tokenization(example):
        return tokenizer(example["text"], truncation=True, max_length=max_length)

    if percent != 100:
        split = f"train[:{int(850000 * percent / 100)}]"
    else:
        split = "train"
    dataset = load_dataset("togethercomputer/RedPajama-Data-1T-Sample", split=split)
    processed_dataset = dataset.map(tokenization, batched=True, batch_size=batch_size, num_proc=os.cpu_count())
    return processed_dataset


def get_english_quote(dataset_name, tokenizer):
    data = load_dataset(dataset_name)
    data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)
    return data["train"]


def get_qat_dataset(name, tokenizer, data_percent):
    if name == "red_pajama":
        data = get_redpajama_train(tokenizer, data_percent)
    elif name == "Abirate/english_quotes":
        data = get_english_quote(name, tokenizer)
    else:
        raise NotImplementedError
    data = data.shuffle()
    return data


llama_chat_format = """[INST] <<SYS>>
"Below is an instruction that describes a task. Write a response that appropriately completes the request."
<</SYS>>

{instruction} [/INST] {response}
"""
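# Rendered, the template looks like this (hedged illustration; the
# instruction/response pair below is made up, and the <<SYS>>/<</SYS>> markers
# follow the Llama-2 prompt convention from the blog post linked above):
#
#   >>> print(llama_chat_format.format(instruction="List three primes.", response="2, 3, 5"))
#   [INST] <<SYS>>
#   "Below is an instruction that describes a task. Write a response that appropriately completes the request."
#   <</SYS>>
#
#   List three primes. [/INST] 2, 3, 5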


def get_calib_data(name, tokenizer, model_id, nsamples, seqlen=2048, seed=3):
    print(f"get_data_from: {name}, nsamples={nsamples}, seqlen={seqlen}, seed={seed}")
    cache_file = f"cache/{name}_{model_id.replace('/', '_')}_{nsamples}_{seqlen}_{seed}.pt"
    traindataset = []
    if not os.path.exists("cache"):
        os.makedirs("cache")
    if os.path.exists(cache_file):
        # Reuse previously tokenized calibration samples when available.
        print(f"found data file: {cache_file}")
        traindataset = torch.load(cache_file)
        print("loaded ...")
        return traindataset
    if name == "c4":
        traindata = load_dataset(
            "allenai/c4",
            "allenai--c4",
            data_files={"train": "en/c4-train.00000-of-01024.json.gz"},
            split="train",
        )
        tot_text = "\n\n".join(traindata["text"])
    elif name == "wikitext2":
        traindata = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
        tot_text = "\n\n".join(traindata["text"])
    elif name == "ptb":
        traindata = load_dataset(
            "ptb_text_only",
            "penn_treebank",
            split="train",
        )
        tot_text = "\n\n".join(traindata["sentence"])
    elif name == "trivia_qa":
        traindata = load_dataset("trivia_qa", "rc", split="train")
        tot_text = "\n\n".join(traindata["question"])
    elif name == "nqopen":
        traindata = load_dataset("nq_open", split="train")
        tot_text = "\n\n".join(traindata["question"])
    elif name == "alpaca":
        selected_data_dict = load_dataset("iboing/alpaca_data", split="train").shuffle(seed=seed).take(nsamples)
        for example in selected_data_dict:
            # Only keep instruction-only examples (no extra "input" context).
            if example.get("input", "") == "":
                s = llama_chat_format.format(instruction=example["instruction"], response=example["output"])
                trainenc = tokenizer(s, return_tensors="pt")
                inp = trainenc.input_ids[:, :seqlen]
                attention_mask = torch.ones_like(inp)
                traindataset.append({"input_ids": inp, "attention_mask": attention_mask})
        print("example instruction:", s)
        torch.save(traindataset, cache_file)
        return traindataset
    elif name == "MetaMATH":
        selected_data_dict = load_dataset("iboing/MetaMathQA-395K", split="train").shuffle(seed=seed).take(nsamples)
        for example in selected_data_dict:
            if example.get("input", "") == "":
                s = llama_chat_format.format(instruction=example["query"], response=example["response"])
                trainenc = tokenizer(s, return_tensors="pt")
                inp = trainenc.input_ids[:, :seqlen]
                attention_mask = torch.ones_like(inp)
                traindataset.append({"input_ids": inp, "attention_mask": attention_mask})
        print("example instruction:", s)
        torch.save(traindataset, cache_file)
        return traindataset
    elif name == "codefeedback":
        selected_data_dict = (
            load_dataset("iboing/CodeFeedback-Filtered-Instruction", split="train").shuffle(seed=seed).take(nsamples)
        )
        for example in selected_data_dict:
            if example.get("input", "") == "":
                s = llama_chat_format.format(instruction=example["query"], response=example["answer"])
                trainenc = tokenizer(s, return_tensors="pt")
                inp = trainenc.input_ids[:, :seqlen]
                attention_mask = torch.ones_like(inp)
                traindataset.append({"input_ids": inp, "attention_mask": attention_mask})
        print("example instruction:", s)
        torch.save(traindataset, cache_file)
        return traindataset
    elif name == "WizLMinstruct":
        selected_data_dict = (
            load_dataset("iboing/WizardLM_evol_instruct_V2_143k", split="train").shuffle(seed=seed).take(nsamples)
        )
        for example in selected_data_dict:
            if example.get("input", "") == "":
                s = llama_chat_format.format(
                    instruction=example["conversation"][0]["human"], response=example["conversation"][0]["assistant"]
                )
                trainenc = tokenizer(s, return_tensors="pt")
                inp = trainenc.input_ids[:, :seqlen]
                attention_mask = torch.ones_like(inp)
                traindataset.append({"input_ids": inp, "attention_mask": attention_mask})
        print("example instruction:", s)
        torch.save(traindataset, cache_file)
        return traindataset
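    # The instruction-tuning branches above each build, cache, and return
    # their samples directly; the plain-text corpora (c4, wikitext2, ptb,
    # trivia_qa, nqopen) only assemble `tot_text` and fall through to the
    # random-window sampler below.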
    else:
        raise NotImplementedError
    print(f"tot_text={len(tot_text)}")
    for _ in range(nsamples):
        # Sample a random start offset, take a generous window of characters
        # (seqlen * 10), and keep the first seqlen tokens after tokenization.
        i = random.randint(0, len(tot_text) - seqlen - 1)
        j = i + seqlen * 10
        trainenc = tokenizer(tot_text[i:j], return_tensors="pt")
        inp = trainenc.input_ids[:, :seqlen]
        attention_mask = torch.ones_like(inp)
        traindataset.append({"input_ids": inp, "attention_mask": attention_mask})
    torch.save(traindataset, cache_file)
    return traindataset


def get_eval_loaders(name, tokenizer):
    if "wikitext2" in name:
        testdata = load_dataset(
            "wikitext",
            "wikitext-2-raw-v1",
            split="test",
        )
        testenc = tokenizer("\n\n".join(testdata["text"]), return_tensors="pt")
        return testenc
    if "ptb" in name:
        valdata = load_dataset(
            "ptb_text_only",
            "penn_treebank",
            split="validation",
        )
        testenc = tokenizer("\n\n".join(valdata["sentence"]), return_tensors="pt")
        return testenc
    if "c4" in name:
        testdata = load_dataset(
            "allenai/c4",
            "allenai--c4",
            data_files={"validation": "en/c4-validation.00000-of-00008.json.gz"},
            split="validation",
        )
        testenc = tokenizer("\n\n".join(testdata["text"]), return_tensors="pt")
        return testenc
    raise NotImplementedError
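

# Minimal usage sketch (illustrative, not part of the original module; the
# checkpoint name is an assumption and any causal-LM tokenizer works):
#
#   from transformers import AutoTokenizer
#
#   tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
#   calib = get_calib_data("wikitext2", tokenizer, "meta-llama/Llama-2-7b-hf", nsamples=128, seqlen=2048)
#   testenc = get_eval_loaders("wikitext2", tokenizer)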