# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
#               2024 Alibaba Inc (authors: Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import math
from functools import partial

import torch
import torch.distributed as dist
from torch.utils.data import IterableDataset

from cosyvoice.utils.file_utils import read_lists


class Processor(IterableDataset):
    """ Lazily apply a processing function `f` to the samples yielded by
        `source`; Processor objects can be chained to build a data pipeline.
    """

    def __init__(self, source, f, *args, **kw):
        assert callable(f)
        self.source = source
        self.f = f
        self.args = args
        self.kw = kw

    def set_epoch(self, epoch):
        self.source.set_epoch(epoch)

    def __iter__(self):
        """ Return an iterator over the source dataset processed by the
            given processor.
        """
        assert self.source is not None
        assert callable(self.f)
        return self.f(iter(self.source), *self.args, **self.kw)

    def apply(self, f):
        assert callable(f)
        return Processor(self, f, *self.args, **self.kw)
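

def _processor_example():
    """ A minimal sketch of the Processor chaining pattern: a processor
        function takes an iterator and yields transformed samples, so chained
        Processors compose lazily without materializing intermediate lists.
        `_ListSource` is a hypothetical toy source, not part of the pipeline.
    """
    def double(data):
        for x in data:
            yield x * 2

    class _ListSource(IterableDataset):

        def __init__(self, items):
            self.items = items

        def set_epoch(self, epoch):
            pass  # no-op here; real sources use the epoch as a shuffle seed

        def __iter__(self):
            return iter(self.items)

    proc = Processor(_ListSource([1, 2, 3]), double).apply(double)
    return list(proc)  # [4, 8, 12]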


class DistributedSampler:

    def __init__(self, shuffle=True, partition=True):
        self.epoch = -1
        self.update()
        self.shuffle = shuffle
        self.partition = partition

    def update(self):
        assert dist.is_available()
        if dist.is_initialized():
            self.rank = dist.get_rank()
            self.world_size = dist.get_world_size()
        else:
            self.rank = 0
            self.world_size = 1
        worker_info = torch.utils.data.get_worker_info()
        if worker_info is None:
            self.worker_id = 0
            self.num_workers = 1
        else:
            self.worker_id = worker_info.id
            self.num_workers = worker_info.num_workers
        return dict(rank=self.rank,
                    world_size=self.world_size,
                    worker_id=self.worker_id,
                    num_workers=self.num_workers)

    def set_epoch(self, epoch):
        self.epoch = epoch
    def sample(self, data):
        """ Sample data according to rank/world_size/num_workers

            Args:
                data(List): input data list

            Returns:
                List: list of sampled indexes into `data`
        """
        data = list(range(len(data)))
        # force datalist even: pad by repetition so every rank/worker
        # receives at least one entry
        if self.partition:
            if self.shuffle:
                random.Random(self.epoch).shuffle(data)
            if len(data) < self.world_size:
                data = data * math.ceil(self.world_size / len(data))
                data = data[:self.world_size]
            data = data[self.rank::self.world_size]
        if len(data) < self.num_workers:
            data = data * math.ceil(self.num_workers / len(data))
            data = data[:self.num_workers]
        data = data[self.worker_id::self.num_workers]
        return data
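

def _sampler_example():
    """ A minimal sketch, not used by the training pipeline: sample() returns
        shuffled indexes into the data list, partitioned first across
        distributed ranks and then across dataloader workers, so each worker
        iterates a disjoint slice when partition=True.
    """
    sampler = DistributedSampler(shuffle=True, partition=True)
    sampler.set_epoch(0)
    sampler.update()  # rank 0 / world_size 1 if torch.distributed is not initialized
    # a deterministic permutation of [0, 1, 2, 3], seeded by the epoch
    return sampler.sample(['a.tar', 'b.tar', 'c.tar', 'd.tar'])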


class DataList(IterableDataset):

    def __init__(self, lists, shuffle=True, partition=True):
        self.lists = lists
        self.sampler = DistributedSampler(shuffle, partition)

    def set_epoch(self, epoch):
        self.sampler.set_epoch(epoch)

    def __iter__(self):
        sampler_info = self.sampler.update()
        indexes = self.sampler.sample(self.lists)
        for index in indexes:
            data = dict(src=self.lists[index])
            data.update(sampler_info)
            yield data
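

def _datalist_example():
    """ A minimal sketch, assuming two hypothetical shard files: each yielded
        sample is a dict carrying the source entry plus the sampler info,
        e.g. {'src': 'shard_0.tar', 'rank': 0, 'world_size': 1,
        'worker_id': 0, 'num_workers': 1} outside torch.distributed.
    """
    data = DataList(['shard_0.tar', 'shard_1.tar'], shuffle=False)
    data.set_epoch(0)
    return list(data)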


def Dataset(data_list_file,
            data_pipeline,
            mode='train',
            gan=False,
            dpo=False,
            shuffle=True,
            partition=True):
    """ Construct dataset from arguments

        There are two shuffle stages in the Dataset. The first is a global
        shuffle at the shard/raw file level; the second is a shuffle at the
        training-sample level inside the pipeline.

        Args:
            data_list_file(str): file listing one shard/raw entry per line
            data_pipeline(List[Callable]): processing functions applied in order
            mode(str): pipeline mode, forwarded to every stage
            gan(bool): flag bound into the last (padding) stage
            dpo(bool): flag bound into the last (padding) stage
            shuffle(bool): whether to shuffle the data list
            partition(bool): whether to do data partition in terms of rank
    """
    lists = read_lists(data_list_file)
    dataset = DataList(lists,
                       shuffle=shuffle,
                       partition=partition)
    # bind the gan/dpo flags into the last pipeline stage (the padding func)
    data_pipeline[-1] = partial(data_pipeline[-1], gan=gan, dpo=dpo)
    for func in data_pipeline:
        dataset = Processor(dataset, func, mode=mode)
    return dataset
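

def _dataset_example():
    """ A minimal usage sketch, assuming a hypothetical 'train.data.list'
        file (one shard/raw entry per line) and toy pipeline stages; real
        stages come from the recipe config. Every stage must accept a `mode`
        kwarg, and the last stage must also accept `gan` and `dpo`, since
        Dataset binds them with functools.partial.
    """
    def passthrough(data, mode='train'):
        for sample in data:
            yield sample

    def final_stage(data, gan=False, dpo=False, mode='train'):
        for sample in data:
            yield sample

    dataset = Dataset('train.data.list',
                      data_pipeline=[passthrough, final_stage],
                      mode='train', shuffle=True, partition=True)
    dataset.set_epoch(0)
    # iterate directly or wrap in torch.utils.data.DataLoader
    return dataset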