{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.996, "eval_steps": 500, "global_step": 249, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 599.9553680419922, "epoch": 0.004, "grad_norm": 0.24663478136062622, "kl": 0.0, "learning_rate": 8.000000000000001e-07, "loss": -0.0302, "reward": 0.1160714365541935, "reward_std": 0.20195358991622925, "rewards/accuracy_reward": 0.1160714365541935, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 644.7827648586697, "epoch": 0.04, "grad_norm": 0.2229214757680893, "kl": 0.00017437669965955947, "learning_rate": 8.000000000000001e-06, "loss": -0.0164, "reward": 0.13591270407454836, "reward_std": 0.1599799825085534, "rewards/accuracy_reward": 0.13591270407454836, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 693.7643196105957, "epoch": 0.08, "grad_norm": 0.21021594107151031, "kl": 0.005759429931640625, "learning_rate": 1.6000000000000003e-05, "loss": -0.0059, "reward": 0.22232143711298705, "reward_std": 0.23315932899713515, "rewards/accuracy_reward": 0.22232143711298705, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 821.8294998168946, "epoch": 0.12, "grad_norm": 0.2534916400909424, "kl": 0.02135162353515625, "learning_rate": 1.997807645233842e-05, "loss": -0.001, "reward": 0.33482144214212894, "reward_std": 0.34336078241467477, "rewards/accuracy_reward": 0.33482144214212894, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 877.2330703735352, "epoch": 0.16, "grad_norm": 0.2218112200498581, "kl": 0.0388702392578125, "learning_rate": 1.9803328406604252e-05, "loss": 0.004, "reward": 0.36428573317825796, "reward_std": 0.34514649510383605, "rewards/accuracy_reward": 0.36428573317825796, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 610.845565032959, "epoch": 0.2, "grad_norm": 0.3174823522567749, "kl": 0.064056396484375, "learning_rate": 1.9457233587073177e-05, "loss": 0.0214, "reward": 0.4857143074274063, "reward_std": 0.36878507286310197, "rewards/accuracy_reward": 0.4857143074274063, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 555.7643127441406, "epoch": 0.24, "grad_norm": 0.34336915612220764, "kl": 0.214013671875, "learning_rate": 1.8946528335730344e-05, "loss": 0.0412, "reward": 0.4758928783237934, "reward_std": 0.382133661583066, "rewards/accuracy_reward": 0.4758928783237934, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 738.0321739196777, "epoch": 0.28, "grad_norm": 0.3574662208557129, "kl": 0.13843994140625, "learning_rate": 1.8281152949374527e-05, "loss": 0.0249, "reward": 0.4142857350409031, "reward_std": 0.37416205164045097, "rewards/accuracy_reward": 0.4142857350409031, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 609.2402069091797, "epoch": 0.32, "grad_norm": 0.23834314942359924, "kl": 0.12186279296875, "learning_rate": 1.7474058203047863e-05, "loss": 0.0057, "reward": 0.4830357365310192, "reward_std": 0.3151938086375594, "rewards/accuracy_reward": 0.4830357365310192, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 633.9777038574218, "epoch": 0.36, "grad_norm": 0.3009245991706848, "kl": 0.10828857421875, "learning_rate": 1.6540953277930925e-05, "loss": 0.0113, "reward": 0.5000000216066838, "reward_std": 0.3544993594288826, "rewards/accuracy_reward": 0.5000000216066838, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 609.4643142700195, "epoch": 0.4, "grad_norm": 0.2935497462749481, "kl": 0.12471923828125, "learning_rate": 1.55e-05, "loss": 0.0182, "reward": 0.5151785992085933, "reward_std": 0.3775656022131443, "rewards/accuracy_reward": 0.5151785992085933, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 521.8821655273438, "epoch": 0.44, "grad_norm": 0.31697750091552734, "kl": 0.14866943359375, "learning_rate": 1.437145934074321e-05, "loss": 0.0108, "reward": 0.5276785962283611, "reward_std": 0.3804761566221714, "rewards/accuracy_reward": 0.5276785962283611, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 525.0000259399415, "epoch": 0.48, "grad_norm": 0.38678815960884094, "kl": 0.13438720703125, "learning_rate": 1.317729706039701e-05, "loss": 0.0101, "reward": 0.5142857365310192, "reward_std": 0.3935286600142717, "rewards/accuracy_reward": 0.5142857365310192, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 613.3500282287598, "epoch": 0.52, "grad_norm": 0.30911684036254883, "kl": 0.12950439453125, "learning_rate": 1.1940756169408882e-05, "loss": 0.006, "reward": 0.46875002309679986, "reward_std": 0.3613659642636776, "rewards/accuracy_reward": 0.46875002309679986, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 636.9446754455566, "epoch": 0.56, "grad_norm": 0.28472012281417847, "kl": 0.12186279296875, "learning_rate": 1.0685904529677496e-05, "loss": 0.0075, "reward": 0.449107164144516, "reward_std": 0.36206651106476784, "rewards/accuracy_reward": 0.449107164144516, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 864.2687881469726, "epoch": 0.6, "grad_norm": 0.2347540706396103, "kl": 0.09737548828125, "learning_rate": 9.437166400997629e-06, "loss": 0.0228, "reward": 0.4017857346683741, "reward_std": 0.37069899663329126, "rewards/accuracy_reward": 0.4017857346683741, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 839.4205764770508, "epoch": 0.64, "grad_norm": 0.2780732214450836, "kl": 0.11231689453125, "learning_rate": 8.218847050625476e-06, "loss": 0.0131, "reward": 0.43303573224693537, "reward_std": 0.3525656070560217, "rewards/accuracy_reward": 0.43303573224693537, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 762.250927734375, "epoch": 0.68, "grad_norm": 0.24334457516670227, "kl": 0.11146240234375, "learning_rate": 7.054659678898304e-06, "loss": -0.0166, "reward": 0.4669643059372902, "reward_std": 0.4058806154876947, "rewards/accuracy_reward": 0.4669643059372902, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 653.9178886413574, "epoch": 0.72, "grad_norm": 0.293698787689209, "kl": 0.10758056640625, "learning_rate": 5.96726386876328e-06, "loss": 0.0285, "reward": 0.5089285977184772, "reward_std": 0.3675716958940029, "rewards/accuracy_reward": 0.5089285977184772, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 607.0893157958984, "epoch": 0.76, "grad_norm": 0.3261861801147461, "kl": 0.10914306640625, "learning_rate": 4.977824542770279e-06, "loss": -0.0012, "reward": 0.5026785939931869, "reward_std": 0.387619011849165, "rewards/accuracy_reward": 0.5026785939931869, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 641.3634246826172, "epoch": 0.8, "grad_norm": 0.34083282947540283, "kl": 0.11761474609375, "learning_rate": 4.1056000119292e-06, "loss": 0.0296, "reward": 0.5035714529454708, "reward_std": 0.3847282975912094, "rewards/accuracy_reward": 0.5035714529454708, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 684.0607452392578, "epoch": 0.84, "grad_norm": 0.35424429178237915, "kl": 0.13974609375, "learning_rate": 3.367567134592167e-06, "loss": 0.0385, "reward": 0.5000000253319741, "reward_std": 0.3738659627735615, "rewards/accuracy_reward": 0.5000000253319741, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 743.4545013427735, "epoch": 0.88, "grad_norm": 0.5738213658332825, "kl": 0.18140869140625, "learning_rate": 2.778090881216592e-06, "loss": -0.0031, "reward": 0.48482145369052887, "reward_std": 0.36493739411234855, "rewards/accuracy_reward": 0.48482145369052887, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 736.5125328063965, "epoch": 0.92, "grad_norm": 0.34698280692100525, "kl": 0.1845947265625, "learning_rate": 2.34864473655513e-06, "loss": 0.0267, "reward": 0.5008928813040257, "reward_std": 0.3658943545073271, "rewards/accuracy_reward": 0.5008928813040257, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 654.1107444763184, "epoch": 0.96, "grad_norm": 0.27743178606033325, "kl": 0.1630615234375, "learning_rate": 2.087587381325867e-06, "loss": 0.0044, "reward": 0.5339285939931869, "reward_std": 0.35324631109833715, "rewards/accuracy_reward": 0.5339285939931869, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 649.8173192342123, "epoch": 0.996, "kl": 0.14461263020833334, "reward": 0.5496032014489174, "reward_std": 0.36488261736101574, "rewards/accuracy_reward": 0.5496032014489174, "step": 249, "total_flos": 0.0, "train_loss": 0.012884760174496347, "train_runtime": 16946.151, "train_samples_per_second": 0.413, "train_steps_per_second": 0.015 } ], "logging_steps": 10, "max_steps": 250, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }