Spaces:

ferid197
/

LLaMA-Factory

Runtime error

App Files Files Community

LLaMA-Factory / scripts /stat_utils /length_cdf.py

ferid197

Upload folder using huggingface_hub

e81015c verified 7 months ago

raw

history blame contribute delete

2.49 kB

	# Copyright 2025 the LlamaFactory team.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	from collections import defaultdict

	import fire
	from tqdm import tqdm

	from llamafactory.data import get_dataset, get_template_and_fix_tokenizer
	from llamafactory.hparams import get_train_args
	from llamafactory.model import load_tokenizer


	def length_cdf(
	    model_name_or_path: str,
	    dataset: str = "alpaca_en_demo",
	    dataset_dir: str = "data",
	    template: str = "default",
	    interval: int = 1000,
	):
	    r"""Print the cumulative distribution of tokenized sample lengths in a dataset.

	    Each training sample is assigned to the bucket
	    ``[k * interval, (k + 1) * interval)`` according to ``len(input_ids)``. For
	    every non-empty bucket, in ascending order, one line is printed with the
	    cumulative number (and percentage) of samples whose length is below the
	    bucket's upper bound.

	    Usage: export CUDA_VISIBLE_DEVICES=0
	    python length_cdf.py --model_name_or_path path_to_model --dataset alpaca_en_demo --template default

	    Args:
	        model_name_or_path: Forwarded to ``get_train_args``; the resulting model
	            arguments are used to load the tokenizer.
	        dataset: Dataset name forwarded to ``get_train_args``.
	        dataset_dir: Dataset directory forwarded to ``get_train_args``.
	        template: Template name forwarded to ``get_train_args``.
	        interval: Bucket width in tokens; must be a positive integer.

	    Raises:
	        ValueError: If ``interval`` is not positive.
	    """
	    # Fail fast: a zero interval would otherwise raise ZeroDivisionError only
	    # after the tokenizer and dataset were loaded, and a negative one would
	    # produce misleading bucket labels.
	    if interval <= 0:
	        raise ValueError(f"interval must be a positive integer, got {interval!r}.")

	    model_args, data_args, training_args, _, _ = get_train_args(
	        dict(
	            stage="sft",
	            model_name_or_path=model_name_or_path,
	            dataset=dataset,
	            dataset_dir=dataset_dir,
	            template=template,
	            # NOTE(review): very large cutoff, presumably so samples are not
	            # truncated before their length is measured - confirm.
	            cutoff_len=1_000_000,
	            preprocessing_num_workers=16,
	            output_dir="dummy_dir",
	            overwrite_cache=True,
	            do_train=True,
	        )
	    )
	    tokenizer_module = load_tokenizer(model_args)
	    # Keep the ``template`` parameter (a name) distinct from the template object.
	    chat_template = get_template_and_fix_tokenizer(tokenizer_module["tokenizer"], data_args)
	    trainset = get_dataset(chat_template, model_args, data_args, training_args, "sft", **tokenizer_module)[
	        "train_dataset"
	    ]
	    total_num = len(trainset)

	    # Map each bucket's lower bound to the number of samples falling into it.
	    length_dict = defaultdict(int)
	    for sample in tqdm(trainset["input_ids"], desc="Collecting lengths"):
	        length_dict[len(sample) // interval * interval] += 1

	    count_accu = 0
	    for length, count in sorted(length_dict.items()):
	        count_accu += count
	        # Derive the percentage from the integer running count rather than
	        # summing floats, so rounding error cannot accumulate across buckets.
	        prob_accu = count_accu / total_num * 100
	        print(f"{count_accu:d} ({prob_accu:.2f}%) samples have length < {length + interval}.")


	# Script entry point: hand ``length_cdf`` to python-fire so it can be invoked
	# from the command line (see the usage line in its docstring).
	if __name__ == "__main__":
	    fire.Fire(length_cdf)