# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from dataclasses import dataclass, field

from ..trainer.utils import OnPolicyConfig


@dataclass
class RLOOConfig(OnPolicyConfig):
| r""" | |
| Configuration class for the [`RLOOTrainer`]. | |
| This class includes only the parameters that are specific to RLOO training. For a full list of training arguments, | |
| please refer to the [`~transformers.TrainingArguments`] and [`OnPolicyConfig`] documentation. Note that default | |
| values in this class may differ from those in [`~transformers.TrainingArguments`]. | |
| Using [`~transformers.HfArgumentParser`] we can turn this class into | |
| [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the | |
| command line. | |
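
    A minimal sketch of that command-line flow (the script invocation shown in the comment is illustrative, not part
    of this module):

    ```python
    from transformers import HfArgumentParser

    from trl import RLOOConfig

    # e.g. `python train_rloo.py --output_dir rloo_out --rloo_k 4 --kl_coef 0.03`
    parser = HfArgumentParser(RLOOConfig)
    (config,) = parser.parse_args_into_dataclasses()
    ```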
    Parameters:
        exp_name (`str`, *optional*, defaults to `os.path.basename(__file__)[: -len(".py")]`):
            Name of this experiment.
        reward_model_path (`str`, *optional*, defaults to `"EleutherAI/pythia-160m"`):
            Path to the reward model.
        num_ppo_epochs (`int`, *optional*, defaults to `4`):
            Number of epochs to train.
        whiten_rewards (`bool`, *optional*, defaults to `False`):
            Whether to whiten the rewards.
        kl_coef (`float`, *optional*, defaults to `0.05`):
            KL coefficient.
        cliprange (`float`, *optional*, defaults to `0.2`):
            Clip range.
        rloo_k (`int`, *optional*, defaults to `2`):
            REINFORCE Leave-One-Out (RLOO) number of online samples per prompt; see the sketch after this parameter
            list for how it enters the leave-one-out baseline.
        normalize_reward (`bool`, *optional*, defaults to `False`):
            Whether to normalize rewards.
        reward_clip_range (`float`, *optional*, defaults to `10.0`):
            Clip range for rewards.
        normalize_advantage (`bool`, *optional*, defaults to `False`):
            Whether to normalize advantages.
        token_level_kl (`bool`, *optional*, defaults to `False`):
            Whether to use token-level KL penalty or sequence-level KL penalty.
        ds3_gather_for_generation (`bool`, *optional*, defaults to `True`):
            This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for
            generation, improving generation speed. However, disabling this option allows training models that exceed
            the VRAM capacity of a single GPU, albeit at the cost of slower generation.
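
    Below is a schematic sketch of the leave-one-out baseline that `rloo_k` controls: `rloo_k` completions are drawn
    per prompt, and each completion's advantage is its reward minus the mean reward of the other `rloo_k - 1`
    completions. This illustrates the idea only; it is not the trainer's exact implementation, which also folds the
    KL penalty into the reward.

    ```python
    import torch

    rloo_k = 2  # matches the default above
    rewards = torch.tensor([1.0, 3.0])  # rewards for the rloo_k completions of one prompt
    baseline = (rewards.sum() - rewards) / (rloo_k - 1)  # leave-one-out mean for each completion
    advantages = rewards - baseline  # tensor([-2., 2.])
    ```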
| """ | |
    exp_name: str = field(
        default=os.path.basename(__file__)[:-3],
        metadata={"help": "Name of this experiment."},
    )
    reward_model_path: str = field(
        default="EleutherAI/pythia-160m",
        metadata={"help": "Path to the reward model."},
    )
    num_ppo_epochs: int = field(
        default=4,
        metadata={"help": "Number of epochs to train."},
    )
    whiten_rewards: bool = field(
        default=False,
        metadata={"help": "Whether to whiten the rewards."},
    )
    kl_coef: float = field(
        default=0.05,
        metadata={"help": "KL coefficient."},
    )
    cliprange: float = field(
        default=0.2,
        metadata={"help": "Clip range."},
    )
    rloo_k: int = field(
        default=2,
        metadata={"help": "REINFORCE Leave-One-Out (RLOO) number of online samples per prompt."},
    )
    normalize_reward: bool = field(
        default=False,
        metadata={"help": "Whether to normalize rewards"},
    )
    reward_clip_range: float = field(
        default=10.0,
        metadata={"help": "Clip range for rewards"},
    )
    normalize_advantage: bool = field(
        default=False,
        metadata={"help": "Whether to normalize advantages"},
    )
    token_level_kl: bool = field(
        default=False,
        metadata={"help": "Whether to use token-level KL penalty or sequence-level KL penalty"},
    )
    ds3_gather_for_generation: bool = field(
        default=True,
        metadata={
            "help": "This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for "
            "generation, improving generation speed. However, disabling this option allows training models that "
            "exceed the VRAM capacity of a single GPU, albeit at the cost of slower generation."
        },
    )