Spaces:

amphion
/

Text-to-Speech

Running

App Files Files Community

Text-to-Speech / models /svc /diffusion /diffusion_trainer.py

zyingt

Upload 685 files

0d80816 almost 2 years ago

raw

history blame contribute delete

3.39 kB

	# Copyright (c) 2023 Amphion.
	#
	# This source code is licensed under the MIT license found in the
	# LICENSE file in the root directory of this source tree.

	import torch
	from diffusers import DDPMScheduler

	from models.svc.base import SVCTrainer
	from modules.encoder.condition_encoder import ConditionEncoder
	from .diffusion_wrapper import DiffusionWrapper


	class DiffusionTrainer(SVCTrainer):
	    r"""The base trainer for all diffusion models. It inherits from SVCTrainer and
	    implements ``_build_model`` and ``_forward_step`` methods.
	    """

	    def __init__(self, args=None, cfg=None):
	        r"""Initialize the base trainer, then set up the diffusion noise scheduler.

	        Args:
	            args: Forwarded unchanged to ``SVCTrainer.__init__``.
	            cfg: Forwarded unchanged to ``SVCTrainer.__init__``. Afterwards
	                ``self.cfg.model.diffusion.scheduler_settings`` is read, so the
	                base initializer is expected to populate ``self.cfg``
	                (NOTE(review): confirm against ``SVCTrainer``).
	        """
	        SVCTrainer.__init__(self, args, cfg)

	        # Only for SVC tasks using diffusion
	        scheduler_settings = self.cfg.model.diffusion.scheduler_settings
	        self.noise_scheduler = DDPMScheduler(**scheduler_settings)
	        # Exclusive upper bound used when sampling timesteps in ``_forward_step``.
	        self.diffusion_timesteps = scheduler_settings.num_train_timesteps

	    ### Following are methods only for diffusion models ###
	    def _build_model(self):
	        r"""Build the model for training. This function is called in ``__init__`` function.

	        Side effects:
	            Copies ``f0_min`` / ``f0_max`` from ``cfg.preprocess`` into
	            ``cfg.model.condition_encoder`` and sets ``self.condition_encoder``
	            and ``self.acoustic_mapper``.

	        Returns:
	            torch.nn.ModuleList: ``[condition_encoder, acoustic_mapper]``.
	        """

	        # TODO: sort out the config
	        encoder_cfg = self.cfg.model.condition_encoder
	        encoder_cfg.f0_min = self.cfg.preprocess.f0_min
	        encoder_cfg.f0_max = self.cfg.preprocess.f0_max

	        self.condition_encoder = ConditionEncoder(self.cfg.model.condition_encoder)
	        self.acoustic_mapper = DiffusionWrapper(self.cfg)
	        model = torch.nn.ModuleList([self.condition_encoder, self.acoustic_mapper])

	        # Report the size of each sub-module (in millions of parameters).
	        num_of_params_encoder = self.count_parameters(self.condition_encoder)
	        num_of_params_am = self.count_parameters(self.acoustic_mapper)
	        num_of_params = num_of_params_encoder + num_of_params_am
	        log = "Diffusion Model's Parameters: #Encoder is {:.2f}M, #Diffusion is {:.2f}M. The total is {:.2f}M".format(
	            num_of_params_encoder / 1e6, num_of_params_am / 1e6, num_of_params / 1e6
	        )
	        self.logger.info(log)

	        return model

	    def count_parameters(self, model):
	        r"""Count the total number of parameter elements in ``model``.

	        Args:
	            model: Either an object exposing ``parameters()`` (e.g. a
	                ``torch.nn.Module``) or a ``dict`` whose values expose it.

	        Returns:
	            int: Sum of ``numel()`` over every parameter. The count is an
	            integer for both input kinds (``0`` for an empty dict).
	        """
	        # Normalize both accepted input kinds to an iterable of modules so a
	        # single integer sum handles them uniformly.
	        modules = model.values() if isinstance(model, dict) else (model,)
	        return sum(
	            param.numel() for module in modules for param in module.parameters()
	        )

	    def _forward_step(self, batch):
	        r"""Forward step for training and inference. This function is called
	        in ``_train_step`` & ``_test_step`` function.

	        Args:
	            batch: Mapping that provides at least ``"mel"`` and ``"mask"``; the
	                whole batch is also passed to the condition encoder.

	        Returns:
	            The loss produced by ``self._compute_loss`` between the model
	            output and the sampled noise.
	        """

	        device = self.accelerator.device

	        mel_input = batch["mel"]
	        # Noise with the same shape as the mel input; it is also the target
	        # the model output is compared against below.
	        noise = torch.randn_like(mel_input, device=device, dtype=torch.float32)
	        batch_size = mel_input.size(0)
	        # One timestep per batch item, drawn from [0, diffusion_timesteps).
	        timesteps = torch.randint(
	            0,
	            self.diffusion_timesteps,
	            (batch_size,),
	            device=device,
	            dtype=torch.long,
	        )

	        noisy_mel = self.noise_scheduler.add_noise(mel_input, noise, timesteps)
	        conditioner = self.condition_encoder(batch)

	        y_pred = self.acoustic_mapper(noisy_mel, timesteps, conditioner)

	        # TODO: Predict noise or gt should be configurable
	        loss = self._compute_loss(self.criterion, y_pred, noise, batch["mask"])
	        self._check_nan(loss, y_pred, noise)

	        # FIXME: Clarify that we should not divide it with batch size here
	        return loss