# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import tempfile
import unittest

from datasets import load_dataset
from parameterized import parameterized
from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer

from trl import (
    BCOConfig,
    BCOTrainer,
    CPOConfig,
    CPOTrainer,
    DPOConfig,
    DPOTrainer,
    KTOConfig,
    KTOTrainer,
    NashMDConfig,
    NashMDTrainer,
    OnlineDPOConfig,
    OnlineDPOTrainer,
    ORPOConfig,
    ORPOTrainer,
    RewardConfig,
    RewardTrainer,
    SFTConfig,
    SFTTrainer,
    XPOConfig,
    XPOTrainer,
)

from .testing_utils import require_sklearn


class TrainerArgTester(unittest.TestCase):
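    # Each test builds a trainer config with non-default values and checks that every
    # argument is propagated unchanged to `trainer.args`.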
    @require_sklearn
    def test_bco(self):
| model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5" | |
| tokenizer = AutoTokenizer.from_pretrained(model_id) | |
| dataset = load_dataset("trl-internal-testing/zen", "standard_unpaired_preference", split="train") | |
| with tempfile.TemporaryDirectory() as tmp_dir: | |
| training_args = BCOConfig( | |
| tmp_dir, | |
| max_length=256, | |
| max_prompt_length=64, | |
| max_completion_length=64, | |
| beta=0.5, | |
| label_pad_token_id=-99, | |
| padding_value=-99, | |
| truncation_mode="keep_start", | |
| # generate_during_eval=True, # ignore this one, it requires wandb | |
| is_encoder_decoder=True, | |
| precompute_ref_log_probs=True, | |
| model_init_kwargs={"trust_remote_code": True}, | |
| ref_model_init_kwargs={"trust_remote_code": True}, | |
| dataset_num_proc=4, | |
| prompt_sample_size=512, | |
| min_density_ratio=0.2, | |
| max_density_ratio=20.0, | |
| ) | |
| trainer = BCOTrainer( | |
| model=model_id, | |
| ref_model=model_id, | |
| args=training_args, | |
| train_dataset=dataset, | |
| processing_class=tokenizer, | |
| ) | |
| self.assertEqual(trainer.args.max_length, 256) | |
| self.assertEqual(trainer.args.max_prompt_length, 64) | |
| self.assertEqual(trainer.args.max_completion_length, 64) | |
| self.assertEqual(trainer.args.beta, 0.5) | |
| self.assertEqual(trainer.args.label_pad_token_id, -99) | |
| self.assertEqual(trainer.args.padding_value, -99) | |
| self.assertEqual(trainer.args.truncation_mode, "keep_start") | |
| # self.assertEqual(trainer.args.generate_during_eval, True) | |
| self.assertEqual(trainer.args.is_encoder_decoder, True) | |
| self.assertEqual(trainer.args.precompute_ref_log_probs, True) | |
| self.assertEqual(trainer.args.model_init_kwargs, {"trust_remote_code": True}) | |
| self.assertEqual(trainer.args.ref_model_init_kwargs, {"trust_remote_code": True}) | |
| self.assertEqual(trainer.args.dataset_num_proc, 4) | |
| self.assertEqual(trainer.args.prompt_sample_size, 512) | |
| self.assertEqual(trainer.args.min_density_ratio, 0.2) | |
| self.assertEqual(trainer.args.max_density_ratio, 20.0) | |
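
    # CPOConfig arguments should reach CPOTrainer's args unchanged.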
    def test_cpo(self):
        model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        dataset = load_dataset("trl-internal-testing/zen", "standard_preference", split="train")
        with tempfile.TemporaryDirectory() as tmp_dir:
            training_args = CPOConfig(
                tmp_dir,
                max_length=256,
                max_prompt_length=64,
                max_completion_length=64,
                beta=0.5,
                label_smoothing=0.5,
                loss_type="hinge",
                disable_dropout=False,
                cpo_alpha=0.5,
                simpo_gamma=0.2,
                label_pad_token_id=-99,
                padding_value=-99,
                truncation_mode="keep_start",
                # generate_during_eval=True, # ignore this one, it requires wandb
                is_encoder_decoder=True,
                model_init_kwargs={"trust_remote_code": True},
                dataset_num_proc=4,
            )
            trainer = CPOTrainer(model=model_id, args=training_args, train_dataset=dataset, processing_class=tokenizer)
            self.assertEqual(trainer.args.max_length, 256)
            self.assertEqual(trainer.args.max_prompt_length, 64)
            self.assertEqual(trainer.args.max_completion_length, 64)
            self.assertEqual(trainer.args.beta, 0.5)
            self.assertEqual(trainer.args.label_smoothing, 0.5)
            self.assertEqual(trainer.args.loss_type, "hinge")
            self.assertEqual(trainer.args.disable_dropout, False)
            self.assertEqual(trainer.args.cpo_alpha, 0.5)
            self.assertEqual(trainer.args.simpo_gamma, 0.2)
            self.assertEqual(trainer.args.label_pad_token_id, -99)
            self.assertEqual(trainer.args.padding_value, -99)
            self.assertEqual(trainer.args.truncation_mode, "keep_start")
            # self.assertEqual(trainer.args.generate_during_eval, True)
            self.assertEqual(trainer.args.is_encoder_decoder, True)
            self.assertEqual(trainer.args.model_init_kwargs, {"trust_remote_code": True})
            self.assertEqual(trainer.args.dataset_num_proc, 4)
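
    # DPOConfig has the largest argument surface; sync_ref_model is left untested because
    # it cannot be combined with precompute_ref_log_probs.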
    def test_dpo(self):
        model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        dataset = load_dataset("trl-internal-testing/zen", "standard_preference", split="train")
        with tempfile.TemporaryDirectory() as tmp_dir:
            training_args = DPOConfig(
                tmp_dir,
                beta=0.5,
                label_smoothing=0.5,
                loss_type="hinge",
                label_pad_token_id=-99,
                padding_value=-99,
                truncation_mode="keep_start",
                max_length=256,
                max_prompt_length=64,
                max_completion_length=64,
                disable_dropout=False,
                # generate_during_eval=True, # ignore this one, it requires wandb
                precompute_ref_log_probs=True,
                dataset_num_proc=4,
                model_init_kwargs={"trust_remote_code": True},
                ref_model_init_kwargs={"trust_remote_code": True},
                model_adapter_name="dummy_adapter",
                ref_adapter_name="dummy_adapter",
                reference_free=True,
                force_use_ref_model=True,
                f_divergence_type="js_divergence",
                f_alpha_divergence_coef=0.5,
                # sync_ref_model=True, # cannot be True when precompute_ref_log_probs=True. Don't test this.
                ref_model_mixup_alpha=0.5,
                ref_model_sync_steps=32,
                rpo_alpha=0.5,
                discopop_tau=0.1,
            )
            trainer = DPOTrainer(
                model=model_id,
                ref_model=model_id,
                args=training_args,
                train_dataset=dataset,
                processing_class=tokenizer,
            )
            self.assertEqual(trainer.args.beta, 0.5)
            self.assertEqual(trainer.args.label_smoothing, 0.5)
            self.assertEqual(trainer.args.loss_type, "hinge")
            self.assertEqual(trainer.args.label_pad_token_id, -99)
            self.assertEqual(trainer.args.padding_value, -99)
            self.assertEqual(trainer.args.truncation_mode, "keep_start")
            self.assertEqual(trainer.args.max_length, 256)
            self.assertEqual(trainer.args.max_prompt_length, 64)
            self.assertEqual(trainer.args.max_completion_length, 64)
            self.assertEqual(trainer.args.disable_dropout, False)
            # self.assertEqual(trainer.args.generate_during_eval, True)
            self.assertEqual(trainer.args.precompute_ref_log_probs, True)
            self.assertEqual(trainer.args.dataset_num_proc, 4)
            self.assertEqual(trainer.args.model_init_kwargs, {"trust_remote_code": True})
            self.assertEqual(trainer.args.ref_model_init_kwargs, {"trust_remote_code": True})
            self.assertEqual(trainer.args.model_adapter_name, "dummy_adapter")
            self.assertEqual(trainer.args.ref_adapter_name, "dummy_adapter")
            self.assertEqual(trainer.args.reference_free, True)
            self.assertEqual(trainer.args.force_use_ref_model, True)
            self.assertEqual(trainer.args.f_divergence_type, "js_divergence")
            self.assertEqual(trainer.args.f_alpha_divergence_coef, 0.5)
            # self.assertEqual(trainer.args.sync_ref_model, True)
            self.assertEqual(trainer.args.ref_model_mixup_alpha, 0.5)
            self.assertEqual(trainer.args.ref_model_sync_steps, 32)
            self.assertEqual(trainer.args.rpo_alpha, 0.5)
            self.assertEqual(trainer.args.discopop_tau, 0.1)
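
    # KTOConfig arguments, including the desirable/undesirable loss weights, should pass through unchanged.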
    def test_kto(self):
        model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        dataset = load_dataset("trl-internal-testing/zen", "standard_unpaired_preference", split="train")
        with tempfile.TemporaryDirectory() as tmp_dir:
            training_args = KTOConfig(
                tmp_dir,
                max_length=256,
                max_prompt_length=64,
                max_completion_length=64,
                beta=0.5,
                desirable_weight=0.5,
                undesirable_weight=0.5,
                label_pad_token_id=-99,
                padding_value=-99,
                truncation_mode="keep_start",
                # generate_during_eval=True, # ignore this one, it requires wandb
                is_encoder_decoder=True,
                precompute_ref_log_probs=True,
                model_init_kwargs={"trust_remote_code": True},
                ref_model_init_kwargs={"trust_remote_code": True},
                dataset_num_proc=4,
            )
            trainer = KTOTrainer(
                model=model_id,
                ref_model=model_id,
                args=training_args,
                train_dataset=dataset,
                processing_class=tokenizer,
            )
            self.assertEqual(trainer.args.max_length, 256)
            self.assertEqual(trainer.args.max_prompt_length, 64)
            self.assertEqual(trainer.args.max_completion_length, 64)
            self.assertEqual(trainer.args.beta, 0.5)
            self.assertEqual(trainer.args.desirable_weight, 0.5)
            self.assertEqual(trainer.args.undesirable_weight, 0.5)
            self.assertEqual(trainer.args.label_pad_token_id, -99)
            self.assertEqual(trainer.args.padding_value, -99)
            self.assertEqual(trainer.args.truncation_mode, "keep_start")
            # self.assertEqual(trainer.args.generate_during_eval, True)
            self.assertEqual(trainer.args.is_encoder_decoder, True)
            self.assertEqual(trainer.args.precompute_ref_log_probs, True)
            self.assertEqual(trainer.args.model_init_kwargs, {"trust_remote_code": True})
            self.assertEqual(trainer.args.ref_model_init_kwargs, {"trust_remote_code": True})
            self.assertEqual(trainer.args.dataset_num_proc, 4)
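
    # Parameterized over both forms of mixture_coef: a single float or a per-epoch list of floats.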
    @parameterized.expand([(False,), (True,)])
    def test_nash_md(self, mixtures_coef_list):
| model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5" | |
| tokenizer = AutoTokenizer.from_pretrained(model_id) | |
| model = AutoModelForCausalLM.from_pretrained(model_id) | |
| ref_model = AutoModelForCausalLM.from_pretrained(model_id) | |
| reward_model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=1) | |
| dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train") | |
| with tempfile.TemporaryDirectory() as tmp_dir: | |
| training_args = NashMDConfig( | |
| tmp_dir, | |
| mixture_coef=0.5 if not mixtures_coef_list else [0.5, 0.6], | |
| ) | |
| trainer = NashMDTrainer( | |
| args=training_args, | |
| processing_class=tokenizer, | |
| model=model, | |
| ref_model=ref_model, | |
| reward_model=reward_model, | |
| train_dataset=dataset, | |
| ) | |
| self.assertEqual(trainer.args.mixture_coef, 0.5 if not mixtures_coef_list else [0.5, 0.6]) | |
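
    # Parameterized over both forms of beta: a single float or a per-epoch list of floats.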
    @parameterized.expand([(False,), (True,)])
    def test_online_dpo(self, beta_list):
| model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5" | |
| tokenizer = AutoTokenizer.from_pretrained(model_id) | |
| model = AutoModelForCausalLM.from_pretrained(model_id) | |
| ref_model = AutoModelForCausalLM.from_pretrained(model_id) | |
| reward_model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=1) | |
| dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train") | |
| with tempfile.TemporaryDirectory() as tmp_dir: | |
| training_args = OnlineDPOConfig( | |
| tmp_dir, | |
| max_new_tokens=42, | |
| temperature=0.5, | |
| missing_eos_penalty=0.33, | |
| beta=0.6 if not beta_list else [0.6, 0.7], | |
| loss_type="hinge", | |
| dataset_num_proc=4, | |
| ) | |
| trainer = OnlineDPOTrainer( | |
| model=model, | |
| ref_model=ref_model, | |
| reward_model=reward_model, | |
| args=training_args, | |
| train_dataset=dataset, | |
| processing_class=tokenizer, | |
| reward_processing_class=tokenizer, | |
| ) | |
| self.assertEqual(trainer.args.max_new_tokens, 42) | |
| self.assertEqual(trainer.args.temperature, 0.5) | |
| self.assertEqual(trainer.args.missing_eos_penalty, 0.33) | |
| self.assertEqual(trainer.args.beta, 0.6 if not beta_list else [0.6, 0.7]) | |
| self.assertEqual(trainer.args.loss_type, "hinge") | |
| self.assertEqual(trainer.args.dataset_num_proc, 4) | |
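
    # ORPOConfig arguments should reach ORPOTrainer's args unchanged.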
    def test_orpo(self):
        model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        dataset = load_dataset("trl-internal-testing/zen", "standard_preference", split="train")
        with tempfile.TemporaryDirectory() as tmp_dir:
            training_args = ORPOConfig(
                tmp_dir,
                max_length=256,
                max_prompt_length=64,
                max_completion_length=64,
                beta=0.5,
                disable_dropout=False,
                label_pad_token_id=-99,
                padding_value=-99,
                truncation_mode="keep_start",
                # generate_during_eval=True, # ignore this one, it requires wandb
                is_encoder_decoder=True,
                model_init_kwargs={"trust_remote_code": True},
                dataset_num_proc=4,
            )
            trainer = ORPOTrainer(
                model=model_id, args=training_args, train_dataset=dataset, processing_class=tokenizer
            )
            self.assertEqual(trainer.args.max_length, 256)
            self.assertEqual(trainer.args.max_prompt_length, 64)
            self.assertEqual(trainer.args.max_completion_length, 64)
            self.assertEqual(trainer.args.beta, 0.5)
            self.assertEqual(trainer.args.disable_dropout, False)
            self.assertEqual(trainer.args.label_pad_token_id, -99)
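
    # RewardConfig arguments should pass through to RewardTrainer unchanged.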
    def test_reward(self):
        model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=1)
| dataset = load_dataset("trl-internal-testing/zen", "standard_preference", split="train") | |
| with tempfile.TemporaryDirectory() as tmp_dir: | |
| training_args = RewardConfig( | |
| tmp_dir, | |
| max_length=256, | |
| dataset_num_proc=4, | |
| center_rewards_coefficient=0.1, | |
| ) | |
| trainer = RewardTrainer( | |
| model=model, | |
| args=training_args, | |
| train_dataset=dataset, | |
| processing_class=tokenizer, | |
| ) | |
| self.assertEqual(trainer.args.max_length, 256) | |
| self.assertEqual(trainer.args.dataset_num_proc, 4) | |
| self.assertEqual(trainer.args.center_rewards_coefficient, 0.1) | |
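
    # SFTConfig dataset-processing arguments (packing, dataset_kwargs, eval_packing, ...) should
    # pass through unchanged.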
    def test_sft(self):
        model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
        dataset = load_dataset("trl-internal-testing/zen", "standard_language_modeling", split="train")
        with tempfile.TemporaryDirectory() as tmp_dir:
            training_args = SFTConfig(
                tmp_dir,
                dataset_text_field="dummy_text_field",
                packing=True,
                max_length=256,
                dataset_num_proc=4,
                neftune_noise_alpha=0.1,
                model_init_kwargs={"trust_remote_code": True},
                dataset_kwargs={"append_concat_token": True, "skip_prepare_dataset": True},
                eval_packing=True,
            )
            trainer = SFTTrainer(model_id, args=training_args, train_dataset=dataset)
            self.assertEqual(trainer.args.dataset_text_field, "dummy_text_field")
            self.assertEqual(trainer.args.packing, True)
            self.assertEqual(trainer.args.max_length, 256)
            self.assertEqual(trainer.args.dataset_num_proc, 4)
            self.assertEqual(trainer.args.neftune_noise_alpha, 0.1)
            self.assertEqual(trainer.args.model_init_kwargs, {"trust_remote_code": True})
            self.assertIn("append_concat_token", trainer.args.dataset_kwargs)
            self.assertEqual(trainer.args.dataset_kwargs["append_concat_token"], True)
            self.assertEqual(trainer.args.eval_packing, True)
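
    # Parameterized over both forms of alpha: a single float or a per-epoch list of floats.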
    @parameterized.expand([(False,), (True,)])
    def test_xpo(self, alpha_list):
| model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5" | |
| tokenizer = AutoTokenizer.from_pretrained(model_id) | |
| model = AutoModelForCausalLM.from_pretrained(model_id) | |
| ref_model = AutoModelForCausalLM.from_pretrained(model_id) | |
| reward_model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=1) | |
| dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train") | |
| with tempfile.TemporaryDirectory() as tmp_dir: | |
| training_args = XPOConfig( | |
| tmp_dir, | |
| alpha=0.5 if not alpha_list else [0.5, 0.6], | |
| ) | |
| trainer = XPOTrainer( | |
| args=training_args, | |
| processing_class=tokenizer, | |
| model=model, | |
| ref_model=ref_model, | |
| reward_model=reward_model, | |
| train_dataset=dataset, | |
| ) | |
| self.assertEqual(trainer.args.alpha, 0.5 if not alpha_list else [0.5, 0.6]) | |