# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from dataclasses import dataclass, field

from ..trainer.utils import OnPolicyConfig


@dataclass
class RLOOConfig(OnPolicyConfig):
| r""" | |
| Configuration class for the [`RLOOTrainer`]. | |
| This class includes only the parameters that are specific to RLOO training. For a full list of training arguments, | |
| please refer to the [`~transformers.TrainingArguments`] and [`OnPolicyConfig`] documentation. Note that default | |
| values in this class may differ from those in [`~transformers.TrainingArguments`]. | |
| Using [`~transformers.HfArgumentParser`] we can turn this class into | |
| [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the | |
| command line. | |
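
    A minimal sketch of that command-line flow (the script invocation shown in the comment is illustrative, not part
    of this module):

    ```python
    from transformers import HfArgumentParser

    from trl import RLOOConfig

    # e.g. `python train_rloo.py --output_dir rloo_out --rloo_k 4 --kl_coef 0.03`
    parser = HfArgumentParser(RLOOConfig)
    (config,) = parser.parse_args_into_dataclasses()
    ```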
    Parameters:
        exp_name (`str`, *optional*, defaults to `os.path.basename(__file__)[: -len(".py")]`):
            Name of this experiment.
        reward_model_path (`str`, *optional*, defaults to `"EleutherAI/pythia-160m"`):
            Path to the reward model.
        num_ppo_epochs (`int`, *optional*, defaults to `4`):
            Number of epochs to train.
        whiten_rewards (`bool`, *optional*, defaults to `False`):
            Whether to whiten the rewards.
        kl_coef (`float`, *optional*, defaults to `0.05`):
            KL coefficient.
        cliprange (`float`, *optional*, defaults to `0.2`):
            Clip range.
        rloo_k (`int`, *optional*, defaults to `2`):
            REINFORCE Leave-One-Out (RLOO) number of online samples per prompt; see the sketch after this parameter
            list for how it enters the leave-one-out baseline.
        normalize_reward (`bool`, *optional*, defaults to `False`):
            Whether to normalize rewards.
        reward_clip_range (`float`, *optional*, defaults to `10.0`):
            Clip range for rewards.
        normalize_advantage (`bool`, *optional*, defaults to `False`):
            Whether to normalize advantages.
        token_level_kl (`bool`, *optional*, defaults to `False`):
            Whether to use token-level KL penalty or sequence-level KL penalty.
        ds3_gather_for_generation (`bool`, *optional*, defaults to `True`):
            This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for
            generation, improving generation speed. However, disabling this option allows training models that exceed
            the VRAM capacity of a single GPU, albeit at the cost of slower generation.
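
    Below is a schematic sketch of the leave-one-out baseline that `rloo_k` controls: `rloo_k` completions are drawn
    per prompt, and each completion's advantage is its reward minus the mean reward of the other `rloo_k - 1`
    completions. This illustrates the idea only; it is not the trainer's exact implementation, which also folds the
    KL penalty into the reward.

    ```python
    import torch

    rloo_k = 2  # matches the default above
    rewards = torch.tensor([1.0, 3.0])  # rewards for the rloo_k completions of one prompt
    baseline = (rewards.sum() - rewards) / (rloo_k - 1)  # leave-one-out mean for each completion
    advantages = rewards - baseline  # tensor([-2., 2.])
    ```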
| """ | |
    exp_name: str = field(
        default=os.path.basename(__file__)[:-3],
        metadata={"help": "Name of this experiment."},
    )
    reward_model_path: str = field(
        default="EleutherAI/pythia-160m",
        metadata={"help": "Path to the reward model."},
    )
    num_ppo_epochs: int = field(
        default=4,
        metadata={"help": "Number of epochs to train."},
    )
    whiten_rewards: bool = field(
        default=False,
        metadata={"help": "Whether to whiten the rewards."},
    )
    kl_coef: float = field(
        default=0.05,
        metadata={"help": "KL coefficient."},
    )
    cliprange: float = field(
        default=0.2,
        metadata={"help": "Clip range."},
    )
    rloo_k: int = field(
        default=2,
        metadata={"help": "REINFORCE Leave-One-Out (RLOO) number of online samples per prompt."},
    )
    normalize_reward: bool = field(
        default=False,
        metadata={"help": "Whether to normalize rewards"},
    )
    reward_clip_range: float = field(
        default=10.0,
        metadata={"help": "Clip range for rewards"},
    )
    normalize_advantage: bool = field(
        default=False,
        metadata={"help": "Whether to normalize advantages"},
    )
    token_level_kl: bool = field(
        default=False,
        metadata={"help": "Whether to use token-level KL penalty or sequence-level KL penalty"},
    )
    ds3_gather_for_generation: bool = field(
        default=True,
        metadata={
            "help": "This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for "
            "generation, improving generation speed. However, disabling this option allows training models that "
            "exceed the VRAM capacity of a single GPU, albeit at the cost of slower generation."
        },
    )