{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 3741, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008019246190858059, "grad_norm": 151.16231984056856, "learning_rate": 1.3333333333333333e-09, "logits/chosen": 1.015625, "logits/rejected": -0.64453125, "logps/chosen": -486.0, "logps/rejected": -97.0, "loss": 0.6914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.00801924619085806, "grad_norm": 157.68454909718238, "learning_rate": 1.3333333333333334e-08, "logits/chosen": 0.45703125, "logits/rejected": 0.2158203125, "logps/chosen": -448.0, "logps/rejected": -139.0, "loss": 0.7014, "rewards/accuracies": 0.2222222238779068, "rewards/chosen": -0.005523681640625, "rewards/margins": -0.0026397705078125, "rewards/rejected": -0.002777099609375, "step": 10 }, { "epoch": 0.01603849238171612, "grad_norm": 140.01201261980347, "learning_rate": 2.6666666666666667e-08, "logits/chosen": 0.5625, "logits/rejected": 0.1494140625, "logps/chosen": -470.0, "logps/rejected": -142.0, "loss": 0.7047, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.0150146484375, "rewards/margins": -0.00750732421875, "rewards/rejected": -0.00750732421875, "step": 20 }, { "epoch": 0.024057738572574178, "grad_norm": 189.67162885270034, "learning_rate": 4e-08, "logits/chosen": 0.7265625, "logits/rejected": 0.36328125, "logps/chosen": -448.0, "logps/rejected": -140.0, "loss": 0.6836, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.02001953125, "rewards/margins": 0.01373291015625, "rewards/rejected": 0.006256103515625, "step": 30 }, { "epoch": 0.03207698476343224, "grad_norm": 126.36839679937302, "learning_rate": 5.3333333333333334e-08, "logits/chosen": 0.47265625, "logits/rejected": 0.06103515625, "logps/chosen": -456.0, "logps/rejected": -128.0, "loss": 0.666, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.03759765625, "rewards/margins": 0.047607421875, "rewards/rejected": -0.010009765625, "step": 40 }, { "epoch": 0.040096230954290296, "grad_norm": 170.43264876127887, "learning_rate": 6.666666666666667e-08, "logits/chosen": 0.484375, "logits/rejected": 0.384765625, "logps/chosen": -462.0, "logps/rejected": -134.0, "loss": 0.6311, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.1103515625, "rewards/margins": 0.154296875, "rewards/rejected": -0.043701171875, "step": 50 }, { "epoch": 0.048115477145148355, "grad_norm": 115.9123010728823, "learning_rate": 8e-08, "logits/chosen": 0.333984375, "logits/rejected": 0.1748046875, "logps/chosen": -466.0, "logps/rejected": -134.0, "loss": 0.6098, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.150390625, "rewards/margins": 0.185546875, "rewards/rejected": -0.03564453125, "step": 60 }, { "epoch": 0.056134723336006415, "grad_norm": 121.61291096101645, "learning_rate": 9.333333333333334e-08, "logits/chosen": 0.375, "logits/rejected": 0.216796875, "logps/chosen": -448.0, "logps/rejected": -152.0, "loss": 0.5172, "rewards/accuracies": 1.0, "rewards/chosen": 0.279296875, "rewards/margins": 0.41015625, "rewards/rejected": -0.130859375, "step": 70 }, { "epoch": 0.06415396952686447, "grad_norm": 92.3990972398572, "learning_rate": 1.0666666666666667e-07, "logits/chosen": 0.7265625, "logits/rejected": 0.470703125, "logps/chosen": -484.0, "logps/rejected": -134.0, "loss": 0.4258, "rewards/accuracies": 1.0, "rewards/chosen": 0.46484375, "rewards/margins": 0.63671875, "rewards/rejected": -0.169921875, "step": 80 }, { "epoch": 0.07217321571772253, "grad_norm": 74.84694443967108, "learning_rate": 1.2e-07, "logits/chosen": 0.357421875, "logits/rejected": 0.38671875, "logps/chosen": -498.0, "logps/rejected": -144.0, "loss": 0.3945, "rewards/accuracies": 1.0, "rewards/chosen": 0.57421875, "rewards/margins": 0.8046875, "rewards/rejected": -0.2275390625, "step": 90 }, { "epoch": 0.08019246190858059, "grad_norm": 52.15625730194694, "learning_rate": 1.3333333333333334e-07, "logits/chosen": 0.5859375, "logits/rejected": 0.1826171875, "logps/chosen": -476.0, "logps/rejected": -146.0, "loss": 0.258, "rewards/accuracies": 1.0, "rewards/chosen": 0.95703125, "rewards/margins": 1.2890625, "rewards/rejected": -0.3359375, "step": 100 }, { "epoch": 0.08821170809943865, "grad_norm": 36.83176654746254, "learning_rate": 1.4666666666666666e-07, "logits/chosen": 0.515625, "logits/rejected": 0.28125, "logps/chosen": -494.0, "logps/rejected": -137.0, "loss": 0.1608, "rewards/accuracies": 1.0, "rewards/chosen": 1.375, "rewards/margins": 1.9140625, "rewards/rejected": -0.5390625, "step": 110 }, { "epoch": 0.09623095429029671, "grad_norm": 23.888739991484748, "learning_rate": 1.6e-07, "logits/chosen": 0.5859375, "logits/rejected": 0.36328125, "logps/chosen": -462.0, "logps/rejected": -135.0, "loss": 0.1189, "rewards/accuracies": 1.0, "rewards/chosen": 1.703125, "rewards/margins": 2.375, "rewards/rejected": -0.6640625, "step": 120 }, { "epoch": 0.10425020048115477, "grad_norm": 15.853450776043777, "learning_rate": 1.7333333333333332e-07, "logits/chosen": 0.34375, "logits/rejected": 0.2314453125, "logps/chosen": -460.0, "logps/rejected": -152.0, "loss": 0.0854, "rewards/accuracies": 1.0, "rewards/chosen": 1.890625, "rewards/margins": 2.71875, "rewards/rejected": -0.828125, "step": 130 }, { "epoch": 0.11226944667201283, "grad_norm": 17.76048627628574, "learning_rate": 1.8666666666666667e-07, "logits/chosen": 0.376953125, "logits/rejected": 0.1865234375, "logps/chosen": -476.0, "logps/rejected": -142.0, "loss": 0.0665, "rewards/accuracies": 1.0, "rewards/chosen": 2.1875, "rewards/margins": 3.28125, "rewards/rejected": -1.1015625, "step": 140 }, { "epoch": 0.12028869286287089, "grad_norm": 4.629661113898006, "learning_rate": 2e-07, "logits/chosen": 0.57421875, "logits/rejected": 0.298828125, "logps/chosen": -464.0, "logps/rejected": -155.0, "loss": 0.0364, "rewards/accuracies": 1.0, "rewards/chosen": 2.6875, "rewards/margins": 4.03125, "rewards/rejected": -1.359375, "step": 150 }, { "epoch": 0.12830793905372895, "grad_norm": 3.0673728757698404, "learning_rate": 2.1333333333333334e-07, "logits/chosen": 0.49609375, "logits/rejected": 0.37890625, "logps/chosen": -444.0, "logps/rejected": -153.0, "loss": 0.0226, "rewards/accuracies": 1.0, "rewards/chosen": 3.015625, "rewards/margins": 4.65625, "rewards/rejected": -1.6328125, "step": 160 }, { "epoch": 0.136327185244587, "grad_norm": 9.941318270300394, "learning_rate": 2.2666666666666663e-07, "logits/chosen": 0.1455078125, "logits/rejected": 0.515625, "logps/chosen": -446.0, "logps/rejected": -188.0, "loss": 0.0182, "rewards/accuracies": 1.0, "rewards/chosen": 3.46875, "rewards/margins": 5.625, "rewards/rejected": -2.15625, "step": 170 }, { "epoch": 0.14434643143544507, "grad_norm": 1.3010921876710215, "learning_rate": 2.4e-07, "logits/chosen": 0.5, "logits/rejected": 0.474609375, "logps/chosen": -404.0, "logps/rejected": -178.0, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": 3.703125, "rewards/margins": 6.0, "rewards/rejected": -2.28125, "step": 180 }, { "epoch": 0.15236567762630313, "grad_norm": 6.434919601365227, "learning_rate": 2.533333333333333e-07, "logits/chosen": 0.6484375, "logits/rejected": 0.51171875, "logps/chosen": -428.0, "logps/rejected": -156.0, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": 4.34375, "rewards/margins": 6.875, "rewards/rejected": -2.5, "step": 190 }, { "epoch": 0.16038492381716118, "grad_norm": 1.4699035544697376, "learning_rate": 2.6666666666666667e-07, "logits/chosen": 0.6328125, "logits/rejected": 0.49609375, "logps/chosen": -428.0, "logps/rejected": -165.0, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": 4.65625, "rewards/margins": 7.21875, "rewards/rejected": -2.5625, "step": 200 }, { "epoch": 0.16840417000801924, "grad_norm": 0.21322645163978157, "learning_rate": 2.8e-07, "logits/chosen": 0.4453125, "logits/rejected": 0.5078125, "logps/chosen": -400.0, "logps/rejected": -163.0, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 4.5625, "rewards/margins": 7.6875, "rewards/rejected": -3.140625, "step": 210 }, { "epoch": 0.1764234161988773, "grad_norm": 2.789746384228196, "learning_rate": 2.933333333333333e-07, "logits/chosen": 0.62890625, "logits/rejected": 0.578125, "logps/chosen": -398.0, "logps/rejected": -159.0, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 4.59375, "rewards/margins": 7.5625, "rewards/rejected": -2.96875, "step": 220 }, { "epoch": 0.18444266238973536, "grad_norm": 0.4354358263881392, "learning_rate": 3.066666666666666e-07, "logits/chosen": 0.478515625, "logits/rejected": 0.5, "logps/chosen": -444.0, "logps/rejected": -161.0, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 4.96875, "rewards/margins": 8.25, "rewards/rejected": -3.28125, "step": 230 }, { "epoch": 0.19246190858059342, "grad_norm": 0.22054875816521427, "learning_rate": 3.2e-07, "logits/chosen": 0.416015625, "logits/rejected": 0.478515625, "logps/chosen": -390.0, "logps/rejected": -191.0, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 4.84375, "rewards/margins": 8.625, "rewards/rejected": -3.75, "step": 240 }, { "epoch": 0.20048115477145148, "grad_norm": 0.15707631274986159, "learning_rate": 3.333333333333333e-07, "logits/chosen": 0.337890625, "logits/rejected": 0.3203125, "logps/chosen": -420.0, "logps/rejected": -159.0, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 5.0, "rewards/margins": 8.375, "rewards/rejected": -3.375, "step": 250 }, { "epoch": 0.20850040096230954, "grad_norm": 0.7731514946431466, "learning_rate": 3.4666666666666665e-07, "logits/chosen": 0.431640625, "logits/rejected": 0.49609375, "logps/chosen": -408.0, "logps/rejected": -177.0, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 4.875, "rewards/margins": 8.875, "rewards/rejected": -3.96875, "step": 260 }, { "epoch": 0.2165196471531676, "grad_norm": 1.6784457208768384, "learning_rate": 3.6e-07, "logits/chosen": 0.4375, "logits/rejected": 0.40625, "logps/chosen": -416.0, "logps/rejected": -175.0, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 5.875, "rewards/margins": 9.6875, "rewards/rejected": -3.8125, "step": 270 }, { "epoch": 0.22453889334402566, "grad_norm": 0.6217122500025944, "learning_rate": 3.7333333333333334e-07, "logits/chosen": 0.7421875, "logits/rejected": 0.5546875, "logps/chosen": -404.0, "logps/rejected": -166.0, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 5.34375, "rewards/margins": 9.4375, "rewards/rejected": -4.0625, "step": 280 }, { "epoch": 0.23255813953488372, "grad_norm": 0.06528165803173126, "learning_rate": 3.8666666666666664e-07, "logits/chosen": 0.48828125, "logits/rejected": 0.625, "logps/chosen": -406.0, "logps/rejected": -193.0, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 5.46875, "rewards/margins": 9.75, "rewards/rejected": -4.3125, "step": 290 }, { "epoch": 0.24057738572574178, "grad_norm": 0.09562662477437295, "learning_rate": 4e-07, "logits/chosen": 0.55078125, "logits/rejected": 0.51171875, "logps/chosen": -376.0, "logps/rejected": -173.0, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 5.46875, "rewards/margins": 9.75, "rewards/rejected": -4.25, "step": 300 }, { "epoch": 0.24859663191659984, "grad_norm": 0.06706200198313735, "learning_rate": 4.1333333333333333e-07, "logits/chosen": 0.75, "logits/rejected": 0.7265625, "logps/chosen": -414.0, "logps/rejected": -174.0, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 5.9375, "rewards/margins": 10.375, "rewards/rejected": -4.40625, "step": 310 }, { "epoch": 0.2566158781074579, "grad_norm": 0.009122064059245512, "learning_rate": 4.266666666666667e-07, "logits/chosen": 0.5546875, "logits/rejected": 0.66796875, "logps/chosen": -404.0, "logps/rejected": -193.0, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 5.9375, "rewards/margins": 11.125, "rewards/rejected": -5.21875, "step": 320 }, { "epoch": 0.264635124298316, "grad_norm": 0.025542363556422566, "learning_rate": 4.3999999999999997e-07, "logits/chosen": 0.458984375, "logits/rejected": 0.5859375, "logps/chosen": -390.0, "logps/rejected": -182.0, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 5.8125, "rewards/margins": 10.5625, "rewards/rejected": -4.75, "step": 330 }, { "epoch": 0.272654370489174, "grad_norm": 0.0051795006658474155, "learning_rate": 4.5333333333333326e-07, "logits/chosen": 0.490234375, "logits/rejected": 0.54296875, "logps/chosen": -422.0, "logps/rejected": -190.0, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 6.65625, "rewards/margins": 11.625, "rewards/rejected": -4.96875, "step": 340 }, { "epoch": 0.2806736166800321, "grad_norm": 0.042353248890211505, "learning_rate": 4.6666666666666666e-07, "logits/chosen": 0.5625, "logits/rejected": 0.490234375, "logps/chosen": -390.0, "logps/rejected": -171.0, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 5.96875, "rewards/margins": 10.75, "rewards/rejected": -4.78125, "step": 350 }, { "epoch": 0.28869286287089013, "grad_norm": 0.00902672610786128, "learning_rate": 4.8e-07, "logits/chosen": 0.482421875, "logits/rejected": 0.435546875, "logps/chosen": -428.0, "logps/rejected": -179.0, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 6.875, "rewards/margins": 12.0, "rewards/rejected": -5.0625, "step": 360 }, { "epoch": 0.2967121090617482, "grad_norm": 0.009917622772121312, "learning_rate": 4.933333333333333e-07, "logits/chosen": 0.5390625, "logits/rejected": 0.58203125, "logps/chosen": -404.0, "logps/rejected": -178.0, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 6.3125, "rewards/margins": 11.25, "rewards/rejected": -4.96875, "step": 370 }, { "epoch": 0.30473135525260625, "grad_norm": 0.041117543355245464, "learning_rate": 4.992572786690433e-07, "logits/chosen": 0.7265625, "logits/rejected": 0.578125, "logps/chosen": -398.0, "logps/rejected": -195.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 6.4375, "rewards/margins": 11.6875, "rewards/rejected": -5.25, "step": 380 }, { "epoch": 0.31275060144346434, "grad_norm": 0.1466010873112578, "learning_rate": 4.9777183600713e-07, "logits/chosen": 0.4609375, "logits/rejected": 0.396484375, "logps/chosen": -422.0, "logps/rejected": -173.0, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 6.59375, "rewards/margins": 11.3125, "rewards/rejected": -4.75, "step": 390 }, { "epoch": 0.32076984763432237, "grad_norm": 0.005506135969581839, "learning_rate": 4.962863933452169e-07, "logits/chosen": 0.6015625, "logits/rejected": 0.7578125, "logps/chosen": -388.0, "logps/rejected": -192.0, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 6.375, "rewards/margins": 12.375, "rewards/rejected": -6.03125, "step": 400 }, { "epoch": 0.32878909382518046, "grad_norm": 0.03493478886748467, "learning_rate": 4.948009506833036e-07, "logits/chosen": 0.578125, "logits/rejected": 0.828125, "logps/chosen": -396.0, "logps/rejected": -206.0, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 6.78125, "rewards/margins": 13.375, "rewards/rejected": -6.5625, "step": 410 }, { "epoch": 0.3368083400160385, "grad_norm": 0.0052981473953303245, "learning_rate": 4.933155080213904e-07, "logits/chosen": 0.51953125, "logits/rejected": 0.86328125, "logps/chosen": -412.0, "logps/rejected": -204.0, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 6.59375, "rewards/margins": 12.6875, "rewards/rejected": -6.09375, "step": 420 }, { "epoch": 0.3448275862068966, "grad_norm": 0.0028446773540384642, "learning_rate": 4.918300653594771e-07, "logits/chosen": 0.68359375, "logits/rejected": 0.6015625, "logps/chosen": -424.0, "logps/rejected": -194.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 6.84375, "rewards/margins": 12.9375, "rewards/rejected": -6.09375, "step": 430 }, { "epoch": 0.3528468323977546, "grad_norm": 0.027327469420180828, "learning_rate": 4.903446226975638e-07, "logits/chosen": 0.33203125, "logits/rejected": 0.53515625, "logps/chosen": -418.0, "logps/rejected": -185.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 6.59375, "rewards/margins": 12.5, "rewards/rejected": -5.90625, "step": 440 }, { "epoch": 0.3608660785886127, "grad_norm": 0.0018353345740164588, "learning_rate": 4.888591800356506e-07, "logits/chosen": 0.58203125, "logits/rejected": 0.7421875, "logps/chosen": -394.0, "logps/rejected": -196.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 6.78125, "rewards/margins": 13.0, "rewards/rejected": -6.25, "step": 450 }, { "epoch": 0.3688853247794707, "grad_norm": 0.04093093471850283, "learning_rate": 4.873737373737373e-07, "logits/chosen": 0.40625, "logits/rejected": 0.62890625, "logps/chosen": -402.0, "logps/rejected": -188.0, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 6.875, "rewards/margins": 12.8125, "rewards/rejected": -5.9375, "step": 460 }, { "epoch": 0.3769045709703288, "grad_norm": 0.010823756561171452, "learning_rate": 4.858882947118241e-07, "logits/chosen": 0.400390625, "logits/rejected": 0.56640625, "logps/chosen": -418.0, "logps/rejected": -200.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.25, "rewards/margins": 13.4375, "rewards/rejected": -6.15625, "step": 470 }, { "epoch": 0.38492381716118684, "grad_norm": 0.01371851306777804, "learning_rate": 4.844028520499108e-07, "logits/chosen": 0.640625, "logits/rejected": 0.74609375, "logps/chosen": -416.0, "logps/rejected": -188.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.46875, "rewards/margins": 13.5, "rewards/rejected": -6.03125, "step": 480 }, { "epoch": 0.39294306335204493, "grad_norm": 0.03659552986074992, "learning_rate": 4.829174093879975e-07, "logits/chosen": 0.466796875, "logits/rejected": 0.59375, "logps/chosen": -384.0, "logps/rejected": -183.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 6.625, "rewards/margins": 12.875, "rewards/rejected": -6.1875, "step": 490 }, { "epoch": 0.40096230954290296, "grad_norm": 0.004051370833936493, "learning_rate": 4.814319667260843e-07, "logits/chosen": 0.66796875, "logits/rejected": 0.87109375, "logps/chosen": -394.0, "logps/rejected": -203.0, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 6.5, "rewards/margins": 13.0, "rewards/rejected": -6.53125, "step": 500 }, { "epoch": 0.40898155573376105, "grad_norm": 0.0008665658642914715, "learning_rate": 4.799465240641711e-07, "logits/chosen": 0.6015625, "logits/rejected": 0.47265625, "logps/chosen": -398.0, "logps/rejected": -195.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 6.96875, "rewards/margins": 13.1875, "rewards/rejected": -6.25, "step": 510 }, { "epoch": 0.4170008019246191, "grad_norm": 0.006834277686765966, "learning_rate": 4.784610814022579e-07, "logits/chosen": 0.52734375, "logits/rejected": 0.796875, "logps/chosen": -416.0, "logps/rejected": -194.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 7.28125, "rewards/margins": 13.75, "rewards/rejected": -6.46875, "step": 520 }, { "epoch": 0.42502004811547717, "grad_norm": 0.20693223020514473, "learning_rate": 4.769756387403446e-07, "logits/chosen": 0.396484375, "logits/rejected": 0.453125, "logps/chosen": -424.0, "logps/rejected": -212.0, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 6.78125, "rewards/margins": 13.3125, "rewards/rejected": -6.5625, "step": 530 }, { "epoch": 0.4330392943063352, "grad_norm": 0.2847841118184419, "learning_rate": 4.7549019607843133e-07, "logits/chosen": 0.294921875, "logits/rejected": 0.703125, "logps/chosen": -404.0, "logps/rejected": -213.0, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 6.75, "rewards/margins": 13.1875, "rewards/rejected": -6.46875, "step": 540 }, { "epoch": 0.4410585404971933, "grad_norm": 0.003502732261053006, "learning_rate": 4.740047534165181e-07, "logits/chosen": 0.408203125, "logits/rejected": 0.578125, "logps/chosen": -414.0, "logps/rejected": -188.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.5, "rewards/margins": 13.875, "rewards/rejected": -6.375, "step": 550 }, { "epoch": 0.4490777866880513, "grad_norm": 0.2288317625574338, "learning_rate": 4.725193107546048e-07, "logits/chosen": 0.50390625, "logits/rejected": 0.51171875, "logps/chosen": -418.0, "logps/rejected": -194.0, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 7.1875, "rewards/margins": 13.5625, "rewards/rejected": -6.375, "step": 560 }, { "epoch": 0.4570970328789094, "grad_norm": 0.004529214200899048, "learning_rate": 4.7103386809269156e-07, "logits/chosen": 0.48828125, "logits/rejected": 0.462890625, "logps/chosen": -414.0, "logps/rejected": -199.0, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 7.25, "rewards/margins": 13.875, "rewards/rejected": -6.625, "step": 570 }, { "epoch": 0.46511627906976744, "grad_norm": 0.0012884084946254593, "learning_rate": 4.695484254307783e-07, "logits/chosen": 0.6328125, "logits/rejected": 0.64453125, "logps/chosen": -388.0, "logps/rejected": -201.0, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 6.875, "rewards/margins": 13.875, "rewards/rejected": -7.0, "step": 580 }, { "epoch": 0.4731355252606255, "grad_norm": 0.020377739990682507, "learning_rate": 4.680629827688651e-07, "logits/chosen": 0.5078125, "logits/rejected": 0.55859375, "logps/chosen": -416.0, "logps/rejected": -191.0, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 7.84375, "rewards/margins": 14.5625, "rewards/rejected": -6.71875, "step": 590 }, { "epoch": 0.48115477145148355, "grad_norm": 0.013163367961936954, "learning_rate": 4.6657754010695184e-07, "logits/chosen": 0.373046875, "logits/rejected": 0.56640625, "logps/chosen": -394.0, "logps/rejected": -213.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.25, "rewards/margins": 14.75, "rewards/rejected": -7.5, "step": 600 }, { "epoch": 0.48917401764234164, "grad_norm": 0.0006534451115061735, "learning_rate": 4.650920974450386e-07, "logits/chosen": 0.5546875, "logits/rejected": 0.82421875, "logps/chosen": -386.0, "logps/rejected": -212.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 7.53125, "rewards/margins": 15.125, "rewards/rejected": -7.625, "step": 610 }, { "epoch": 0.4971932638331997, "grad_norm": 0.00541313718336999, "learning_rate": 4.636066547831253e-07, "logits/chosen": 0.478515625, "logits/rejected": 0.47265625, "logps/chosen": -410.0, "logps/rejected": -185.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 6.9375, "rewards/margins": 13.5, "rewards/rejected": -6.5625, "step": 620 }, { "epoch": 0.5052125100240578, "grad_norm": 0.010487472203453516, "learning_rate": 4.6212121212121207e-07, "logits/chosen": 0.62890625, "logits/rejected": 0.7578125, "logps/chosen": -406.0, "logps/rejected": -214.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 7.75, "rewards/margins": 15.4375, "rewards/rejected": -7.71875, "step": 630 }, { "epoch": 0.5132317562149158, "grad_norm": 0.00034659542513365186, "learning_rate": 4.6063576945929883e-07, "logits/chosen": 0.41015625, "logits/rejected": 0.78125, "logps/chosen": -420.0, "logps/rejected": -217.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.375, "rewards/margins": 15.125, "rewards/rejected": -7.75, "step": 640 }, { "epoch": 0.5212510024057738, "grad_norm": 0.0009782700234550058, "learning_rate": 4.591503267973856e-07, "logits/chosen": 0.578125, "logits/rejected": 0.73046875, "logps/chosen": -396.0, "logps/rejected": -195.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.0625, "rewards/margins": 14.0, "rewards/rejected": -6.9375, "step": 650 }, { "epoch": 0.529270248596632, "grad_norm": 0.005795939556909752, "learning_rate": 4.5766488413547235e-07, "logits/chosen": 0.6484375, "logits/rejected": 0.76171875, "logps/chosen": -384.0, "logps/rejected": -203.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.8125, "rewards/margins": 14.8125, "rewards/rejected": -6.96875, "step": 660 }, { "epoch": 0.53728949478749, "grad_norm": 0.004469903767264327, "learning_rate": 4.5617944147355906e-07, "logits/chosen": 0.75390625, "logits/rejected": 0.81640625, "logps/chosen": -404.0, "logps/rejected": -206.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.5, "rewards/margins": 14.9375, "rewards/rejected": -7.4375, "step": 670 }, { "epoch": 0.545308740978348, "grad_norm": 0.0002444418964476635, "learning_rate": 4.546939988116458e-07, "logits/chosen": 0.33984375, "logits/rejected": 0.8984375, "logps/chosen": -406.0, "logps/rejected": -228.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.0625, "rewards/margins": 15.875, "rewards/rejected": -7.8125, "step": 680 }, { "epoch": 0.5533279871692061, "grad_norm": 0.00021397422587196623, "learning_rate": 4.532085561497326e-07, "logits/chosen": 0.435546875, "logits/rejected": 0.94140625, "logps/chosen": -402.0, "logps/rejected": -207.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 7.46875, "rewards/margins": 14.8125, "rewards/rejected": -7.34375, "step": 690 }, { "epoch": 0.5613472333600642, "grad_norm": 0.03890331539149767, "learning_rate": 4.5172311348781934e-07, "logits/chosen": 0.71484375, "logits/rejected": 0.84765625, "logps/chosen": -408.0, "logps/rejected": -208.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 7.21875, "rewards/margins": 14.8125, "rewards/rejected": -7.59375, "step": 700 }, { "epoch": 0.5693664795509222, "grad_norm": 6.504967952173011e-05, "learning_rate": 4.502376708259061e-07, "logits/chosen": 0.48046875, "logits/rejected": 0.74609375, "logps/chosen": -434.0, "logps/rejected": -214.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.3125, "rewards/margins": 16.5, "rewards/rejected": -8.1875, "step": 710 }, { "epoch": 0.5773857257417803, "grad_norm": 0.15367117541733433, "learning_rate": 4.487522281639928e-07, "logits/chosen": 0.4921875, "logits/rejected": 0.76953125, "logps/chosen": -392.0, "logps/rejected": -218.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 7.40625, "rewards/margins": 15.4375, "rewards/rejected": -8.0, "step": 720 }, { "epoch": 0.5854049719326383, "grad_norm": 0.00013398465564378202, "learning_rate": 4.4726678550207957e-07, "logits/chosen": 0.59375, "logits/rejected": 0.87890625, "logps/chosen": -384.0, "logps/rejected": -221.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.8125, "rewards/margins": 16.25, "rewards/rejected": -8.5, "step": 730 }, { "epoch": 0.5934242181234964, "grad_norm": 0.7640897104556196, "learning_rate": 4.457813428401663e-07, "logits/chosen": 0.39453125, "logits/rejected": 0.859375, "logps/chosen": -412.0, "logps/rejected": -222.0, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 7.4375, "rewards/margins": 15.5625, "rewards/rejected": -8.125, "step": 740 }, { "epoch": 0.6014434643143545, "grad_norm": 0.025607082461738897, "learning_rate": 4.442959001782531e-07, "logits/chosen": 0.453125, "logits/rejected": 0.4765625, "logps/chosen": -384.0, "logps/rejected": -219.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 7.0625, "rewards/margins": 14.0, "rewards/rejected": -6.9375, "step": 750 }, { "epoch": 0.6094627105052125, "grad_norm": 0.10951715152655012, "learning_rate": 4.4281045751633985e-07, "logits/chosen": 0.458984375, "logits/rejected": 0.88671875, "logps/chosen": -404.0, "logps/rejected": -218.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.84375, "rewards/margins": 15.25, "rewards/rejected": -7.40625, "step": 760 }, { "epoch": 0.6174819566960705, "grad_norm": 3.605400216321251e-05, "learning_rate": 4.413250148544266e-07, "logits/chosen": 0.5, "logits/rejected": 0.75, "logps/chosen": -400.0, "logps/rejected": -205.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 7.25, "rewards/margins": 14.6875, "rewards/rejected": -7.4375, "step": 770 }, { "epoch": 0.6255012028869287, "grad_norm": 0.02284775953703058, "learning_rate": 4.398395721925133e-07, "logits/chosen": 0.47265625, "logits/rejected": 0.85546875, "logps/chosen": -390.0, "logps/rejected": -235.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.0625, "rewards/margins": 16.0, "rewards/rejected": -7.90625, "step": 780 }, { "epoch": 0.6335204490777867, "grad_norm": 0.003285558623209297, "learning_rate": 4.3835412953060007e-07, "logits/chosen": 0.53515625, "logits/rejected": 0.99609375, "logps/chosen": -418.0, "logps/rejected": -213.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.125, "rewards/margins": 15.9375, "rewards/rejected": -7.8125, "step": 790 }, { "epoch": 0.6415396952686447, "grad_norm": 0.00037806485385265764, "learning_rate": 4.3686868686868683e-07, "logits/chosen": 0.484375, "logits/rejected": 0.82421875, "logps/chosen": -400.0, "logps/rejected": -226.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.6875, "rewards/margins": 15.75, "rewards/rejected": -8.0625, "step": 800 }, { "epoch": 0.6495589414595028, "grad_norm": 0.0044268725320260015, "learning_rate": 4.353832442067736e-07, "logits/chosen": 0.498046875, "logits/rejected": 0.7734375, "logps/chosen": -390.0, "logps/rejected": -205.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.8125, "rewards/margins": 15.4375, "rewards/rejected": -7.65625, "step": 810 }, { "epoch": 0.6575781876503609, "grad_norm": 0.019362152024566967, "learning_rate": 4.3389780154486035e-07, "logits/chosen": 0.40234375, "logits/rejected": 0.82421875, "logps/chosen": -428.0, "logps/rejected": -231.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.5, "rewards/margins": 16.625, "rewards/rejected": -8.1875, "step": 820 }, { "epoch": 0.6655974338412189, "grad_norm": 0.0007741766064915356, "learning_rate": 4.3241235888294706e-07, "logits/chosen": 0.462890625, "logits/rejected": 0.74609375, "logps/chosen": -390.0, "logps/rejected": -210.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.0, "rewards/margins": 16.125, "rewards/rejected": -8.0625, "step": 830 }, { "epoch": 0.673616680032077, "grad_norm": 0.0011565706889433063, "learning_rate": 4.309269162210338e-07, "logits/chosen": 0.486328125, "logits/rejected": 1.03125, "logps/chosen": -420.0, "logps/rejected": -230.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.4375, "rewards/margins": 17.375, "rewards/rejected": -8.875, "step": 840 }, { "epoch": 0.681635926222935, "grad_norm": 0.0007694710671328706, "learning_rate": 4.294414735591206e-07, "logits/chosen": 0.470703125, "logits/rejected": 0.69921875, "logps/chosen": -394.0, "logps/rejected": -222.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 7.6875, "rewards/margins": 16.125, "rewards/rejected": -8.4375, "step": 850 }, { "epoch": 0.6896551724137931, "grad_norm": 0.0008513724514433051, "learning_rate": 4.2795603089720734e-07, "logits/chosen": 0.345703125, "logits/rejected": 0.9921875, "logps/chosen": -390.0, "logps/rejected": -230.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.15625, "rewards/margins": 15.6875, "rewards/rejected": -8.5625, "step": 860 }, { "epoch": 0.6976744186046512, "grad_norm": 0.00016335436820339935, "learning_rate": 4.264705882352941e-07, "logits/chosen": 0.44921875, "logits/rejected": 0.7265625, "logps/chosen": -408.0, "logps/rejected": -213.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.65625, "rewards/margins": 15.8125, "rewards/rejected": -8.125, "step": 870 }, { "epoch": 0.7056936647955092, "grad_norm": 0.0031554420832968864, "learning_rate": 4.249851455733808e-07, "logits/chosen": 0.486328125, "logits/rejected": 0.9765625, "logps/chosen": -400.0, "logps/rejected": -212.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.25, "rewards/margins": 16.75, "rewards/rejected": -8.5, "step": 880 }, { "epoch": 0.7137129109863672, "grad_norm": 0.0011806212384073626, "learning_rate": 4.2349970291146757e-07, "logits/chosen": 0.40234375, "logits/rejected": 0.90625, "logps/chosen": -396.0, "logps/rejected": -229.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.0, "rewards/margins": 17.125, "rewards/rejected": -9.125, "step": 890 }, { "epoch": 0.7217321571772254, "grad_norm": 0.0021136250169822507, "learning_rate": 4.2201426024955433e-07, "logits/chosen": 0.6640625, "logits/rejected": 0.953125, "logps/chosen": -384.0, "logps/rejected": -222.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.125, "rewards/margins": 16.75, "rewards/rejected": -8.5625, "step": 900 }, { "epoch": 0.7297514033680834, "grad_norm": 0.00012742628464020592, "learning_rate": 4.205288175876411e-07, "logits/chosen": 0.3359375, "logits/rejected": 0.703125, "logps/chosen": -408.0, "logps/rejected": -226.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.625, "rewards/margins": 16.375, "rewards/rejected": -8.75, "step": 910 }, { "epoch": 0.7377706495589414, "grad_norm": 0.0011593350470986772, "learning_rate": 4.1904337492572785e-07, "logits/chosen": 0.59375, "logits/rejected": 1.0078125, "logps/chosen": -408.0, "logps/rejected": -217.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.0625, "rewards/margins": 16.75, "rewards/rejected": -8.6875, "step": 920 }, { "epoch": 0.7457898957497995, "grad_norm": 0.7054163177066007, "learning_rate": 4.175579322638146e-07, "logits/chosen": 0.384765625, "logits/rejected": 0.78515625, "logps/chosen": -376.0, "logps/rejected": -232.0, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 8.0625, "rewards/margins": 17.125, "rewards/rejected": -9.125, "step": 930 }, { "epoch": 0.7538091419406576, "grad_norm": 0.0021867642029836532, "learning_rate": 4.160724896019013e-07, "logits/chosen": 0.5703125, "logits/rejected": 0.81640625, "logps/chosen": -386.0, "logps/rejected": -220.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.84375, "rewards/margins": 16.75, "rewards/rejected": -8.9375, "step": 940 }, { "epoch": 0.7618283881315157, "grad_norm": 0.0007974623102976589, "learning_rate": 4.145870469399881e-07, "logits/chosen": 0.53515625, "logits/rejected": 0.84375, "logps/chosen": -386.0, "logps/rejected": -224.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.0625, "rewards/margins": 17.25, "rewards/rejected": -9.25, "step": 950 }, { "epoch": 0.7698476343223737, "grad_norm": 0.00020167233308354896, "learning_rate": 4.1310160427807484e-07, "logits/chosen": 0.50390625, "logits/rejected": 0.91015625, "logps/chosen": -398.0, "logps/rejected": -220.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.1875, "rewards/margins": 17.25, "rewards/rejected": -9.0, "step": 960 }, { "epoch": 0.7778668805132317, "grad_norm": 0.00033416513303796653, "learning_rate": 4.116161616161616e-07, "logits/chosen": 0.3828125, "logits/rejected": 0.87109375, "logps/chosen": -384.0, "logps/rejected": -230.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.25, "rewards/margins": 17.125, "rewards/rejected": -8.9375, "step": 970 }, { "epoch": 0.7858861267040899, "grad_norm": 0.0003195054657717697, "learning_rate": 4.1013071895424836e-07, "logits/chosen": 0.439453125, "logits/rejected": 1.0, "logps/chosen": -402.0, "logps/rejected": -221.0, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 7.8125, "rewards/margins": 17.0, "rewards/rejected": -9.125, "step": 980 }, { "epoch": 0.7939053728949479, "grad_norm": 8.827637734836567e-05, "learning_rate": 4.0864527629233506e-07, "logits/chosen": 0.6640625, "logits/rejected": 0.75390625, "logps/chosen": -394.0, "logps/rejected": -207.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.4375, "rewards/margins": 17.125, "rewards/rejected": -8.625, "step": 990 }, { "epoch": 0.8019246190858059, "grad_norm": 0.0002351272511052529, "learning_rate": 4.071598336304218e-07, "logits/chosen": 0.6171875, "logits/rejected": 0.6328125, "logps/chosen": -388.0, "logps/rejected": -222.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.125, "rewards/margins": 17.375, "rewards/rejected": -9.1875, "step": 1000 }, { "epoch": 0.809943865276664, "grad_norm": 3.055010069287546e-05, "learning_rate": 4.056743909685086e-07, "logits/chosen": 0.283203125, "logits/rejected": 0.734375, "logps/chosen": -396.0, "logps/rejected": -222.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.40625, "rewards/margins": 16.5, "rewards/rejected": -9.125, "step": 1010 }, { "epoch": 0.8179631114675221, "grad_norm": 0.004053992461842295, "learning_rate": 4.0418894830659534e-07, "logits/chosen": 0.390625, "logits/rejected": 0.8125, "logps/chosen": -404.0, "logps/rejected": -253.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.8125, "rewards/margins": 17.5, "rewards/rejected": -9.6875, "step": 1020 }, { "epoch": 0.8259823576583801, "grad_norm": 0.00012457177912010068, "learning_rate": 4.027035056446821e-07, "logits/chosen": 0.27734375, "logits/rejected": 0.6796875, "logps/chosen": -418.0, "logps/rejected": -233.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.875, "rewards/margins": 16.5, "rewards/rejected": -8.625, "step": 1030 }, { "epoch": 0.8340016038492382, "grad_norm": 0.0005837368196038175, "learning_rate": 4.012180629827688e-07, "logits/chosen": 0.6328125, "logits/rejected": 1.0546875, "logps/chosen": -374.0, "logps/rejected": -236.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.25, "rewards/margins": 17.875, "rewards/rejected": -9.625, "step": 1040 }, { "epoch": 0.8420208500400962, "grad_norm": 0.0038079612988684247, "learning_rate": 3.9973262032085557e-07, "logits/chosen": 0.5703125, "logits/rejected": 0.83203125, "logps/chosen": -402.0, "logps/rejected": -223.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.5625, "rewards/margins": 17.875, "rewards/rejected": -9.25, "step": 1050 }, { "epoch": 0.8500400962309543, "grad_norm": 0.0006197180775587829, "learning_rate": 3.9824717765894233e-07, "logits/chosen": 0.609375, "logits/rejected": 0.8671875, "logps/chosen": -382.0, "logps/rejected": -221.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 7.6875, "rewards/margins": 17.0, "rewards/rejected": -9.25, "step": 1060 }, { "epoch": 0.8580593424218124, "grad_norm": 0.0010398146438408034, "learning_rate": 3.967617349970291e-07, "logits/chosen": 0.27734375, "logits/rejected": 0.80859375, "logps/chosen": -404.0, "logps/rejected": -230.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.5, "rewards/margins": 16.625, "rewards/rejected": -9.125, "step": 1070 }, { "epoch": 0.8660785886126704, "grad_norm": 0.0014882861906958015, "learning_rate": 3.9527629233511585e-07, "logits/chosen": 0.390625, "logits/rejected": 0.5390625, "logps/chosen": -382.0, "logps/rejected": -228.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.0625, "rewards/margins": 17.375, "rewards/rejected": -9.3125, "step": 1080 }, { "epoch": 0.8740978348035284, "grad_norm": 0.0003714249634479452, "learning_rate": 3.9379084967320256e-07, "logits/chosen": 0.337890625, "logits/rejected": 0.48046875, "logps/chosen": -394.0, "logps/rejected": -248.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.125, "rewards/margins": 18.0, "rewards/rejected": -9.8125, "step": 1090 }, { "epoch": 0.8821170809943866, "grad_norm": 0.24656906073483573, "learning_rate": 3.923054070112893e-07, "logits/chosen": 0.341796875, "logits/rejected": 0.79296875, "logps/chosen": -416.0, "logps/rejected": -227.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.5, "rewards/margins": 17.875, "rewards/rejected": -9.375, "step": 1100 }, { "epoch": 0.8901363271852446, "grad_norm": 0.0003431643964367433, "learning_rate": 3.908199643493761e-07, "logits/chosen": 0.50390625, "logits/rejected": 0.94921875, "logps/chosen": -390.0, "logps/rejected": -229.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.5, "rewards/margins": 18.125, "rewards/rejected": -9.625, "step": 1110 }, { "epoch": 0.8981555733761026, "grad_norm": 4.200754050552462e-05, "learning_rate": 3.8933452168746284e-07, "logits/chosen": 0.259765625, "logits/rejected": 0.8828125, "logps/chosen": -408.0, "logps/rejected": -227.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.125, "rewards/margins": 18.25, "rewards/rejected": -9.0625, "step": 1120 }, { "epoch": 0.9061748195669607, "grad_norm": 2.6484090760862323e-05, "learning_rate": 3.878490790255496e-07, "logits/chosen": 0.52734375, "logits/rejected": 0.96484375, "logps/chosen": -390.0, "logps/rejected": -233.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.1875, "rewards/margins": 17.875, "rewards/rejected": -9.6875, "step": 1130 }, { "epoch": 0.9141940657578188, "grad_norm": 0.00030545280148334747, "learning_rate": 3.8636363636363636e-07, "logits/chosen": 0.3515625, "logits/rejected": 0.7265625, "logps/chosen": -402.0, "logps/rejected": -246.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.6875, "rewards/margins": 18.75, "rewards/rejected": -10.0625, "step": 1140 }, { "epoch": 0.9222133119486768, "grad_norm": 0.3631150495907317, "learning_rate": 3.8487819370172307e-07, "logits/chosen": 0.5625, "logits/rejected": 0.7265625, "logps/chosen": -398.0, "logps/rejected": -218.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.3125, "rewards/margins": 17.625, "rewards/rejected": -9.3125, "step": 1150 }, { "epoch": 0.9302325581395349, "grad_norm": 0.018167290811332105, "learning_rate": 3.8339275103980983e-07, "logits/chosen": 0.61328125, "logits/rejected": 0.75, "logps/chosen": -402.0, "logps/rejected": -231.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.4375, "rewards/margins": 17.875, "rewards/rejected": -9.5, "step": 1160 }, { "epoch": 0.9382518043303929, "grad_norm": 1.5931081911049097e-05, "learning_rate": 3.819073083778966e-07, "logits/chosen": 0.439453125, "logits/rejected": 0.7265625, "logps/chosen": -366.0, "logps/rejected": -251.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.5, "rewards/margins": 18.875, "rewards/rejected": -10.375, "step": 1170 }, { "epoch": 0.946271050521251, "grad_norm": 0.0005972393678641138, "learning_rate": 3.8042186571598335e-07, "logits/chosen": 0.38671875, "logits/rejected": 0.89453125, "logps/chosen": -378.0, "logps/rejected": -218.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 7.59375, "rewards/margins": 16.5, "rewards/rejected": -8.9375, "step": 1180 }, { "epoch": 0.9542902967121091, "grad_norm": 0.0036918944045519976, "learning_rate": 3.789364230540701e-07, "logits/chosen": 0.419921875, "logits/rejected": 1.0234375, "logps/chosen": -402.0, "logps/rejected": -228.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.6875, "rewards/margins": 18.125, "rewards/rejected": -9.4375, "step": 1190 }, { "epoch": 0.9623095429029671, "grad_norm": 0.00011481458292727216, "learning_rate": 3.774509803921568e-07, "logits/chosen": 0.5625, "logits/rejected": 1.078125, "logps/chosen": -386.0, "logps/rejected": -234.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.625, "rewards/margins": 18.0, "rewards/rejected": -9.375, "step": 1200 }, { "epoch": 0.9703287890938251, "grad_norm": 0.273417562935222, "learning_rate": 3.759655377302436e-07, "logits/chosen": 0.5078125, "logits/rejected": 0.9140625, "logps/chosen": -384.0, "logps/rejected": -233.0, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 8.5625, "rewards/margins": 18.25, "rewards/rejected": -9.6875, "step": 1210 }, { "epoch": 0.9783480352846833, "grad_norm": 0.0015624788569434155, "learning_rate": 3.7448009506833034e-07, "logits/chosen": 0.31640625, "logits/rejected": 0.78125, "logps/chosen": -400.0, "logps/rejected": -237.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 7.875, "rewards/margins": 17.75, "rewards/rejected": -9.875, "step": 1220 }, { "epoch": 0.9863672814755413, "grad_norm": 1.1353066929259035, "learning_rate": 3.729946524064171e-07, "logits/chosen": 0.5390625, "logits/rejected": 0.96484375, "logps/chosen": -390.0, "logps/rejected": -226.0, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 8.4375, "rewards/margins": 18.25, "rewards/rejected": -9.75, "step": 1230 }, { "epoch": 0.9943865276663993, "grad_norm": 1.2277868461143306e-05, "learning_rate": 3.7150920974450386e-07, "logits/chosen": 0.64453125, "logits/rejected": 0.87109375, "logps/chosen": -366.0, "logps/rejected": -242.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.4375, "rewards/margins": 19.125, "rewards/rejected": -10.625, "step": 1240 }, { "epoch": 1.0, "eval_logits/chosen": 0.5078125, "eval_logits/rejected": 0.984375, "eval_logps/chosen": -388.0, "eval_logps/rejected": -243.0, "eval_loss": 9.374080946145114e-06, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 8.25, "eval_rewards/margins": 18.25, "eval_rewards/rejected": -9.9375, "eval_runtime": 25.6003, "eval_samples_per_second": 7.773, "eval_steps_per_second": 0.977, "step": 1247 }, { "epoch": 1.0024057738572574, "grad_norm": 0.0005061416601184525, "learning_rate": 3.7002376708259056e-07, "logits/chosen": 0.47265625, "logits/rejected": 0.96875, "logps/chosen": -392.0, "logps/rejected": -250.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.0625, "rewards/margins": 19.5, "rewards/rejected": -10.4375, "step": 1250 }, { "epoch": 1.0104250200481155, "grad_norm": 0.00016074576570914342, "learning_rate": 3.685383244206773e-07, "logits/chosen": 0.37890625, "logits/rejected": 0.8203125, "logps/chosen": -396.0, "logps/rejected": -232.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.875, "rewards/margins": 20.0, "rewards/rejected": -10.1875, "step": 1260 }, { "epoch": 1.0184442662389734, "grad_norm": 0.0002972094679375377, "learning_rate": 3.670528817587641e-07, "logits/chosen": 0.306640625, "logits/rejected": 0.83984375, "logps/chosen": -396.0, "logps/rejected": -242.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.875, "rewards/margins": 19.125, "rewards/rejected": -10.1875, "step": 1270 }, { "epoch": 1.0264635124298316, "grad_norm": 0.014475735455727648, "learning_rate": 3.6556743909685084e-07, "logits/chosen": 0.455078125, "logits/rejected": 0.6171875, "logps/chosen": -388.0, "logps/rejected": -221.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.4375, "rewards/margins": 18.0, "rewards/rejected": -9.5625, "step": 1280 }, { "epoch": 1.0344827586206897, "grad_norm": 4.0588485153068495e-06, "learning_rate": 3.640819964349376e-07, "logits/chosen": 0.423828125, "logits/rejected": 0.73046875, "logps/chosen": -400.0, "logps/rejected": -231.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.4375, "rewards/margins": 18.875, "rewards/rejected": -10.4375, "step": 1290 }, { "epoch": 1.0425020048115476, "grad_norm": 0.0001086010066631998, "learning_rate": 3.6259655377302436e-07, "logits/chosen": 0.349609375, "logits/rejected": 0.5234375, "logps/chosen": -386.0, "logps/rejected": -228.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.5, "rewards/margins": 18.375, "rewards/rejected": -9.8125, "step": 1300 }, { "epoch": 1.0505212510024058, "grad_norm": 0.00014674974754107502, "learning_rate": 3.6111111111111107e-07, "logits/chosen": 0.28515625, "logits/rejected": 0.71875, "logps/chosen": -412.0, "logps/rejected": -230.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.0, "rewards/margins": 19.25, "rewards/rejected": -10.1875, "step": 1310 }, { "epoch": 1.058540497193264, "grad_norm": 4.898432349816205e-05, "learning_rate": 3.5962566844919783e-07, "logits/chosen": 0.50390625, "logits/rejected": 0.84765625, "logps/chosen": -374.0, "logps/rejected": -233.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.3125, "rewards/margins": 18.125, "rewards/rejected": -9.875, "step": 1320 }, { "epoch": 1.0665597433841218, "grad_norm": 5.3980399655467655e-06, "learning_rate": 3.581402257872846e-07, "logits/chosen": 0.470703125, "logits/rejected": 1.0, "logps/chosen": -422.0, "logps/rejected": -240.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.0625, "rewards/margins": 19.625, "rewards/rejected": -10.5, "step": 1330 }, { "epoch": 1.07457898957498, "grad_norm": 2.0067736112942238e-05, "learning_rate": 3.5665478312537135e-07, "logits/chosen": 0.345703125, "logits/rejected": 1.0078125, "logps/chosen": -406.0, "logps/rejected": -251.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.5625, "rewards/margins": 19.375, "rewards/rejected": -10.875, "step": 1340 }, { "epoch": 1.082598235765838, "grad_norm": 5.427404417650856e-06, "learning_rate": 3.551693404634581e-07, "logits/chosen": 0.439453125, "logits/rejected": 0.55859375, "logps/chosen": -396.0, "logps/rejected": -238.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.5, "rewards/margins": 18.25, "rewards/rejected": -9.75, "step": 1350 }, { "epoch": 1.090617481956696, "grad_norm": 0.0002553159939305877, "learning_rate": 3.536838978015448e-07, "logits/chosen": 0.2578125, "logits/rejected": 0.474609375, "logps/chosen": -374.0, "logps/rejected": -236.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.1875, "rewards/margins": 18.125, "rewards/rejected": -10.0, "step": 1360 }, { "epoch": 1.0986367281475542, "grad_norm": 3.0482699774886166e-05, "learning_rate": 3.521984551396316e-07, "logits/chosen": 0.3828125, "logits/rejected": 0.87890625, "logps/chosen": -374.0, "logps/rejected": -220.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.625, "rewards/margins": 18.625, "rewards/rejected": -10.0, "step": 1370 }, { "epoch": 1.1066559743384121, "grad_norm": 0.0002243930919712017, "learning_rate": 3.5071301247771834e-07, "logits/chosen": 0.302734375, "logits/rejected": 0.8828125, "logps/chosen": -424.0, "logps/rejected": -233.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.25, "rewards/margins": 19.5, "rewards/rejected": -10.25, "step": 1380 }, { "epoch": 1.1146752205292703, "grad_norm": 0.01694638510819178, "learning_rate": 3.492275698158051e-07, "logits/chosen": 0.62890625, "logits/rejected": 1.0390625, "logps/chosen": -372.0, "logps/rejected": -246.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.5625, "rewards/margins": 19.375, "rewards/rejected": -10.8125, "step": 1390 }, { "epoch": 1.1226944667201284, "grad_norm": 8.079206777894219e-05, "learning_rate": 3.4774212715389186e-07, "logits/chosen": 0.298828125, "logits/rejected": 0.66015625, "logps/chosen": -352.0, "logps/rejected": -240.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.25, "rewards/margins": 18.5, "rewards/rejected": -10.25, "step": 1400 }, { "epoch": 1.1307137129109863, "grad_norm": 0.0012896331488338052, "learning_rate": 3.4625668449197857e-07, "logits/chosen": 0.375, "logits/rejected": 0.703125, "logps/chosen": -420.0, "logps/rejected": -241.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.875, "rewards/margins": 18.875, "rewards/rejected": -10.0, "step": 1410 }, { "epoch": 1.1387329591018445, "grad_norm": 2.4342092717802892e-05, "learning_rate": 3.4477124183006533e-07, "logits/chosen": 0.46484375, "logits/rejected": 1.0390625, "logps/chosen": -402.0, "logps/rejected": -246.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.25, "rewards/margins": 18.625, "rewards/rejected": -10.375, "step": 1420 }, { "epoch": 1.1467522052927026, "grad_norm": 1.3946534411696136e-05, "learning_rate": 3.432857991681521e-07, "logits/chosen": 0.30859375, "logits/rejected": 0.8203125, "logps/chosen": -404.0, "logps/rejected": -242.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.125, "rewards/margins": 19.5, "rewards/rejected": -10.4375, "step": 1430 }, { "epoch": 1.1547714514835605, "grad_norm": 9.254504564581852e-05, "learning_rate": 3.4180035650623885e-07, "logits/chosen": 0.51171875, "logits/rejected": 1.0625, "logps/chosen": -402.0, "logps/rejected": -248.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.9375, "rewards/margins": 19.5, "rewards/rejected": -10.5625, "step": 1440 }, { "epoch": 1.1627906976744187, "grad_norm": 6.574270710956317e-06, "learning_rate": 3.403149138443256e-07, "logits/chosen": 0.59765625, "logits/rejected": 0.9453125, "logps/chosen": -372.0, "logps/rejected": -229.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.5, "rewards/margins": 18.375, "rewards/rejected": -9.875, "step": 1450 }, { "epoch": 1.1708099438652766, "grad_norm": 0.0001855015890081949, "learning_rate": 3.388294711824123e-07, "logits/chosen": 0.330078125, "logits/rejected": 0.78515625, "logps/chosen": -384.0, "logps/rejected": -232.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.0625, "rewards/margins": 17.5, "rewards/rejected": -9.5, "step": 1460 }, { "epoch": 1.1788291900561347, "grad_norm": 7.59498115519318e-05, "learning_rate": 3.373440285204991e-07, "logits/chosen": 0.359375, "logits/rejected": 0.83984375, "logps/chosen": -404.0, "logps/rejected": -239.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.5625, "rewards/margins": 18.75, "rewards/rejected": -10.25, "step": 1470 }, { "epoch": 1.1868484362469929, "grad_norm": 0.003961941559574146, "learning_rate": 3.3585858585858583e-07, "logits/chosen": 0.5703125, "logits/rejected": 1.0859375, "logps/chosen": -392.0, "logps/rejected": -230.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.5625, "rewards/margins": 18.75, "rewards/rejected": -10.25, "step": 1480 }, { "epoch": 1.1948676824378508, "grad_norm": 0.00047457157622335985, "learning_rate": 3.343731431966726e-07, "logits/chosen": 0.166015625, "logits/rejected": 0.7578125, "logps/chosen": -422.0, "logps/rejected": -245.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.0625, "rewards/margins": 19.625, "rewards/rejected": -10.5625, "step": 1490 }, { "epoch": 1.202886928628709, "grad_norm": 0.00044385507290575576, "learning_rate": 3.3288770053475936e-07, "logits/chosen": 0.427734375, "logits/rejected": 0.96484375, "logps/chosen": -408.0, "logps/rejected": -244.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.75, "rewards/margins": 19.0, "rewards/rejected": -10.25, "step": 1500 }, { "epoch": 1.2109061748195669, "grad_norm": 6.329712868660368e-05, "learning_rate": 3.314022578728461e-07, "logits/chosen": 0.3828125, "logits/rejected": 0.9140625, "logps/chosen": -370.0, "logps/rejected": -236.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.71875, "rewards/margins": 18.0, "rewards/rejected": -10.25, "step": 1510 }, { "epoch": 1.218925421010425, "grad_norm": 0.00012653295555518, "learning_rate": 3.299168152109328e-07, "logits/chosen": 0.416015625, "logits/rejected": 0.609375, "logps/chosen": -388.0, "logps/rejected": -231.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.875, "rewards/margins": 18.0, "rewards/rejected": -10.125, "step": 1520 }, { "epoch": 1.2269446672012831, "grad_norm": 0.0018177067729794561, "learning_rate": 3.284313725490196e-07, "logits/chosen": 0.412109375, "logits/rejected": 0.82421875, "logps/chosen": -384.0, "logps/rejected": -246.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.5, "rewards/margins": 18.75, "rewards/rejected": -10.25, "step": 1530 }, { "epoch": 1.234963913392141, "grad_norm": 0.00013718475555098635, "learning_rate": 3.2694592988710634e-07, "logits/chosen": 0.318359375, "logits/rejected": 0.84375, "logps/chosen": -392.0, "logps/rejected": -227.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.5, "rewards/margins": 19.125, "rewards/rejected": -9.6875, "step": 1540 }, { "epoch": 1.2429831595829992, "grad_norm": 2.7566037146109023e-06, "learning_rate": 3.254604872251931e-07, "logits/chosen": 0.462890625, "logits/rejected": 0.7578125, "logps/chosen": -396.0, "logps/rejected": -235.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.5625, "rewards/margins": 19.0, "rewards/rejected": -10.4375, "step": 1550 }, { "epoch": 1.2510024057738574, "grad_norm": 0.0004394883265400964, "learning_rate": 3.2397504456327986e-07, "logits/chosen": 0.423828125, "logits/rejected": 0.9765625, "logps/chosen": -362.0, "logps/rejected": -232.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.25, "rewards/margins": 18.5, "rewards/rejected": -10.3125, "step": 1560 }, { "epoch": 1.2590216519647153, "grad_norm": 7.507168237443349e-05, "learning_rate": 3.2248960190136657e-07, "logits/chosen": 0.361328125, "logits/rejected": 0.73046875, "logps/chosen": -376.0, "logps/rejected": -241.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.3125, "rewards/margins": 18.625, "rewards/rejected": -10.25, "step": 1570 }, { "epoch": 1.2670408981555734, "grad_norm": 4.1924229859954286e-05, "learning_rate": 3.2100415923945333e-07, "logits/chosen": 0.48828125, "logits/rejected": 0.7578125, "logps/chosen": -384.0, "logps/rejected": -230.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.8125, "rewards/margins": 19.375, "rewards/rejected": -10.5625, "step": 1580 }, { "epoch": 1.2750601443464316, "grad_norm": 0.0003813917253797437, "learning_rate": 3.195187165775401e-07, "logits/chosen": 0.57421875, "logits/rejected": 0.796875, "logps/chosen": -366.0, "logps/rejected": -241.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.75, "rewards/margins": 19.25, "rewards/rejected": -10.5625, "step": 1590 }, { "epoch": 1.2830793905372895, "grad_norm": 0.0004909355049526477, "learning_rate": 3.1803327391562685e-07, "logits/chosen": 0.388671875, "logits/rejected": 1.03125, "logps/chosen": -390.0, "logps/rejected": -233.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.4375, "rewards/margins": 19.75, "rewards/rejected": -10.3125, "step": 1600 }, { "epoch": 1.2910986367281476, "grad_norm": 2.0193325849591732e-05, "learning_rate": 3.165478312537136e-07, "logits/chosen": 0.6796875, "logits/rejected": 0.984375, "logps/chosen": -364.0, "logps/rejected": -242.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.6875, "rewards/margins": 18.75, "rewards/rejected": -10.0625, "step": 1610 }, { "epoch": 1.2991178829190055, "grad_norm": 0.006471324277883983, "learning_rate": 3.150623885918003e-07, "logits/chosen": 0.5078125, "logits/rejected": 0.63671875, "logps/chosen": -404.0, "logps/rejected": -226.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.125, "rewards/margins": 19.25, "rewards/rejected": -10.125, "step": 1620 }, { "epoch": 1.3071371291098637, "grad_norm": 5.8435573919274625e-05, "learning_rate": 3.135769459298871e-07, "logits/chosen": 0.416015625, "logits/rejected": 0.8984375, "logps/chosen": -388.0, "logps/rejected": -229.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.6875, "rewards/margins": 19.0, "rewards/rejected": -10.25, "step": 1630 }, { "epoch": 1.3151563753007216, "grad_norm": 2.141384886609053e-05, "learning_rate": 3.1209150326797384e-07, "logits/chosen": 0.421875, "logits/rejected": 1.0546875, "logps/chosen": -384.0, "logps/rejected": -252.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.875, "rewards/margins": 19.625, "rewards/rejected": -10.75, "step": 1640 }, { "epoch": 1.3231756214915797, "grad_norm": 5.913545427504495e-05, "learning_rate": 3.106060606060606e-07, "logits/chosen": 0.68359375, "logits/rejected": 1.21875, "logps/chosen": -414.0, "logps/rejected": -232.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.875, "rewards/margins": 21.0, "rewards/rejected": -11.125, "step": 1650 }, { "epoch": 1.3311948676824379, "grad_norm": 8.399920187635036e-05, "learning_rate": 3.0912061794414736e-07, "logits/chosen": 0.26953125, "logits/rejected": 0.83984375, "logps/chosen": -374.0, "logps/rejected": -247.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.6875, "rewards/margins": 18.875, "rewards/rejected": -10.25, "step": 1660 }, { "epoch": 1.3392141138732958, "grad_norm": 3.3877183487283596e-05, "learning_rate": 3.076351752822341e-07, "logits/chosen": 0.458984375, "logits/rejected": 0.90234375, "logps/chosen": -388.0, "logps/rejected": -249.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.5, "rewards/margins": 20.375, "rewards/rejected": -10.875, "step": 1670 }, { "epoch": 1.347233360064154, "grad_norm": 1.2628205173518488e-05, "learning_rate": 3.061497326203208e-07, "logits/chosen": 0.296875, "logits/rejected": 0.9375, "logps/chosen": -392.0, "logps/rejected": -253.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.1875, "rewards/margins": 19.75, "rewards/rejected": -10.5, "step": 1680 }, { "epoch": 1.355252606255012, "grad_norm": 1.1397682072314274e-05, "learning_rate": 3.046642899584076e-07, "logits/chosen": 0.43359375, "logits/rejected": 0.87890625, "logps/chosen": -410.0, "logps/rejected": -249.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.0, "rewards/margins": 20.0, "rewards/rejected": -11.0, "step": 1690 }, { "epoch": 1.36327185244587, "grad_norm": 2.227653145645693e-06, "learning_rate": 3.0317884729649435e-07, "logits/chosen": 0.361328125, "logits/rejected": 0.7890625, "logps/chosen": -384.0, "logps/rejected": -231.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.875, "rewards/margins": 18.625, "rewards/rejected": -9.75, "step": 1700 }, { "epoch": 1.3712910986367282, "grad_norm": 2.7638642411381305e-05, "learning_rate": 3.016934046345811e-07, "logits/chosen": 0.734375, "logits/rejected": 1.125, "logps/chosen": -350.0, "logps/rejected": -243.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.125, "rewards/margins": 19.875, "rewards/rejected": -10.6875, "step": 1710 }, { "epoch": 1.3793103448275863, "grad_norm": 9.133968652306276e-06, "learning_rate": 3.0020796197266787e-07, "logits/chosen": 0.1826171875, "logits/rejected": 0.78515625, "logps/chosen": -412.0, "logps/rejected": -240.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.875, "rewards/margins": 20.0, "rewards/rejected": -11.0625, "step": 1720 }, { "epoch": 1.3873295910184442, "grad_norm": 4.170093626403422e-06, "learning_rate": 2.987225193107546e-07, "logits/chosen": 0.41796875, "logits/rejected": 0.70703125, "logps/chosen": -392.0, "logps/rejected": -232.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.75, "rewards/margins": 19.0, "rewards/rejected": -10.3125, "step": 1730 }, { "epoch": 1.3953488372093024, "grad_norm": 1.695990731710641e-05, "learning_rate": 2.9723707664884133e-07, "logits/chosen": 0.57421875, "logits/rejected": 0.8828125, "logps/chosen": -372.0, "logps/rejected": -258.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.3125, "rewards/margins": 20.125, "rewards/rejected": -10.875, "step": 1740 }, { "epoch": 1.4033680834001605, "grad_norm": 0.001672585532407839, "learning_rate": 2.957516339869281e-07, "logits/chosen": 0.259765625, "logits/rejected": 0.6171875, "logps/chosen": -386.0, "logps/rejected": -256.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.6875, "rewards/margins": 19.5, "rewards/rejected": -10.875, "step": 1750 }, { "epoch": 1.4113873295910184, "grad_norm": 2.1129681054156512e-05, "learning_rate": 2.9426619132501485e-07, "logits/chosen": 0.330078125, "logits/rejected": 0.78515625, "logps/chosen": -392.0, "logps/rejected": -242.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.5, "rewards/margins": 20.5, "rewards/rejected": -11.0625, "step": 1760 }, { "epoch": 1.4194065757818766, "grad_norm": 0.024296230451070665, "learning_rate": 2.927807486631016e-07, "logits/chosen": 0.333984375, "logits/rejected": 0.7421875, "logps/chosen": -404.0, "logps/rejected": -236.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.5, "rewards/margins": 19.75, "rewards/rejected": -10.3125, "step": 1770 }, { "epoch": 1.4274258219727345, "grad_norm": 4.772022717362016e-06, "learning_rate": 2.912953060011883e-07, "logits/chosen": 0.4140625, "logits/rejected": 0.9140625, "logps/chosen": -378.0, "logps/rejected": -254.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.4375, "rewards/margins": 19.5, "rewards/rejected": -11.0625, "step": 1780 }, { "epoch": 1.4354450681635926, "grad_norm": 0.00037550214797630075, "learning_rate": 2.898098633392751e-07, "logits/chosen": 0.42578125, "logits/rejected": 1.078125, "logps/chosen": -378.0, "logps/rejected": -246.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.875, "rewards/margins": 19.625, "rewards/rejected": -10.75, "step": 1790 }, { "epoch": 1.4434643143544506, "grad_norm": 4.072332860339327e-05, "learning_rate": 2.8832442067736184e-07, "logits/chosen": 0.376953125, "logits/rejected": 0.76171875, "logps/chosen": -374.0, "logps/rejected": -237.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.75, "rewards/margins": 19.375, "rewards/rejected": -10.625, "step": 1800 }, { "epoch": 1.4514835605453087, "grad_norm": 3.3540716532805943e-06, "learning_rate": 2.868389780154486e-07, "logits/chosen": 0.6171875, "logits/rejected": 1.0078125, "logps/chosen": -390.0, "logps/rejected": -232.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.125, "rewards/margins": 19.5, "rewards/rejected": -10.375, "step": 1810 }, { "epoch": 1.4595028067361668, "grad_norm": 8.477589527252758e-05, "learning_rate": 2.8535353535353536e-07, "logits/chosen": 0.296875, "logits/rejected": 0.69921875, "logps/chosen": -358.0, "logps/rejected": -232.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.125, "rewards/margins": 18.5, "rewards/rejected": -10.375, "step": 1820 }, { "epoch": 1.4675220529270248, "grad_norm": 0.02575278269084806, "learning_rate": 2.838680926916221e-07, "logits/chosen": 0.56640625, "logits/rejected": 0.703125, "logps/chosen": -380.0, "logps/rejected": -229.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.1875, "rewards/margins": 19.375, "rewards/rejected": -10.1875, "step": 1830 }, { "epoch": 1.475541299117883, "grad_norm": 6.688710715052394e-05, "learning_rate": 2.8238265002970883e-07, "logits/chosen": 0.46875, "logits/rejected": 1.0703125, "logps/chosen": -380.0, "logps/rejected": -242.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.125, "rewards/margins": 20.25, "rewards/rejected": -11.125, "step": 1840 }, { "epoch": 1.483560545308741, "grad_norm": 1.4788277479310367e-05, "learning_rate": 2.808972073677956e-07, "logits/chosen": 0.51171875, "logits/rejected": 1.078125, "logps/chosen": -380.0, "logps/rejected": -240.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.9375, "rewards/margins": 19.875, "rewards/rejected": -10.9375, "step": 1850 }, { "epoch": 1.491579791499599, "grad_norm": 2.2018485912245277e-05, "learning_rate": 2.7941176470588235e-07, "logits/chosen": 0.1494140625, "logits/rejected": 0.828125, "logps/chosen": -404.0, "logps/rejected": -255.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.25, "rewards/margins": 19.5, "rewards/rejected": -11.1875, "step": 1860 }, { "epoch": 1.499599037690457, "grad_norm": 2.269628180164056e-06, "learning_rate": 2.779263220439691e-07, "logits/chosen": 0.376953125, "logits/rejected": 0.8671875, "logps/chosen": -368.0, "logps/rejected": -256.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.375, "rewards/margins": 19.625, "rewards/rejected": -11.25, "step": 1870 }, { "epoch": 1.5076182838813152, "grad_norm": 0.00030107674595158094, "learning_rate": 2.7644087938205587e-07, "logits/chosen": 0.3671875, "logits/rejected": 0.9296875, "logps/chosen": -404.0, "logps/rejected": -242.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.4375, "rewards/margins": 20.375, "rewards/rejected": -10.875, "step": 1880 }, { "epoch": 1.5156375300721732, "grad_norm": 0.00010821940947119307, "learning_rate": 2.749554367201426e-07, "logits/chosen": 0.3984375, "logits/rejected": 0.91796875, "logps/chosen": -398.0, "logps/rejected": -237.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.8125, "rewards/margins": 20.75, "rewards/rejected": -10.875, "step": 1890 }, { "epoch": 1.5236567762630313, "grad_norm": 0.0008639968704052882, "learning_rate": 2.7346999405822934e-07, "logits/chosen": 0.5078125, "logits/rejected": 0.74609375, "logps/chosen": -388.0, "logps/rejected": -236.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.625, "rewards/margins": 19.25, "rewards/rejected": -10.6875, "step": 1900 }, { "epoch": 1.5316760224538895, "grad_norm": 1.4709571592931431e-05, "learning_rate": 2.719845513963161e-07, "logits/chosen": 0.375, "logits/rejected": 0.80078125, "logps/chosen": -390.0, "logps/rejected": -230.0, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 9.5625, "rewards/margins": 19.875, "rewards/rejected": -10.3125, "step": 1910 }, { "epoch": 1.5396952686447474, "grad_norm": 1.5039372347577695e-06, "learning_rate": 2.7049910873440286e-07, "logits/chosen": 0.4140625, "logits/rejected": 0.74609375, "logps/chosen": -370.0, "logps/rejected": -248.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.0625, "rewards/margins": 20.0, "rewards/rejected": -11.0, "step": 1920 }, { "epoch": 1.5477145148356053, "grad_norm": 3.96951436655122e-05, "learning_rate": 2.690136660724896e-07, "logits/chosen": 0.65625, "logits/rejected": 1.1015625, "logps/chosen": -366.0, "logps/rejected": -234.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.0625, "rewards/margins": 19.75, "rewards/rejected": -10.6875, "step": 1930 }, { "epoch": 1.5557337610264637, "grad_norm": 1.860137855263675e-05, "learning_rate": 2.675282234105763e-07, "logits/chosen": 0.39453125, "logits/rejected": 1.0078125, "logps/chosen": -370.0, "logps/rejected": -241.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.375, "rewards/margins": 18.75, "rewards/rejected": -10.3125, "step": 1940 }, { "epoch": 1.5637530072173216, "grad_norm": 0.00027363277179593646, "learning_rate": 2.660427807486631e-07, "logits/chosen": 0.36328125, "logits/rejected": 0.78125, "logps/chosen": -372.0, "logps/rejected": -228.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.875, "rewards/margins": 19.25, "rewards/rejected": -10.375, "step": 1950 }, { "epoch": 1.5717722534081795, "grad_norm": 0.0005257066720049141, "learning_rate": 2.6455733808674985e-07, "logits/chosen": 0.37890625, "logits/rejected": 0.67578125, "logps/chosen": -398.0, "logps/rejected": -274.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.875, "rewards/margins": 19.75, "rewards/rejected": -10.875, "step": 1960 }, { "epoch": 1.5797914995990376, "grad_norm": 4.880586680662074e-06, "learning_rate": 2.630718954248366e-07, "logits/chosen": 0.47265625, "logits/rejected": 0.7578125, "logps/chosen": -372.0, "logps/rejected": -235.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.375, "rewards/margins": 19.875, "rewards/rejected": -10.5, "step": 1970 }, { "epoch": 1.5878107457898958, "grad_norm": 5.382804738940797e-06, "learning_rate": 2.6158645276292337e-07, "logits/chosen": 0.42578125, "logits/rejected": 1.109375, "logps/chosen": -402.0, "logps/rejected": -226.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.875, "rewards/margins": 20.25, "rewards/rejected": -10.375, "step": 1980 }, { "epoch": 1.5958299919807537, "grad_norm": 3.883029815959265e-06, "learning_rate": 2.6010101010101007e-07, "logits/chosen": 0.515625, "logits/rejected": 0.69921875, "logps/chosen": -386.0, "logps/rejected": -233.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.9375, "rewards/margins": 19.0, "rewards/rejected": -10.0625, "step": 1990 }, { "epoch": 1.6038492381716118, "grad_norm": 8.745385138334475e-06, "learning_rate": 2.5861556743909683e-07, "logits/chosen": 0.5625, "logits/rejected": 0.8515625, "logps/chosen": -404.0, "logps/rejected": -250.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.0, "rewards/margins": 19.875, "rewards/rejected": -10.8125, "step": 2000 }, { "epoch": 1.61186848436247, "grad_norm": 2.000482023267285e-06, "learning_rate": 2.571301247771836e-07, "logits/chosen": 0.265625, "logits/rejected": 1.1015625, "logps/chosen": -390.0, "logps/rejected": -256.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.8125, "rewards/margins": 19.625, "rewards/rejected": -10.875, "step": 2010 }, { "epoch": 1.619887730553328, "grad_norm": 2.3073094687767326e-05, "learning_rate": 2.5564468211527035e-07, "logits/chosen": 0.48046875, "logits/rejected": 0.87890625, "logps/chosen": -376.0, "logps/rejected": -249.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.1875, "rewards/margins": 20.0, "rewards/rejected": -10.8125, "step": 2020 }, { "epoch": 1.627906976744186, "grad_norm": 2.1892602162734023e-05, "learning_rate": 2.541592394533571e-07, "logits/chosen": 0.3984375, "logits/rejected": 0.72265625, "logps/chosen": -392.0, "logps/rejected": -248.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.125, "rewards/margins": 20.125, "rewards/rejected": -11.0, "step": 2030 }, { "epoch": 1.6359262229350442, "grad_norm": 0.00030393477684378023, "learning_rate": 2.5267379679144387e-07, "logits/chosen": 0.51953125, "logits/rejected": 0.90234375, "logps/chosen": -382.0, "logps/rejected": -258.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.3125, "rewards/margins": 20.5, "rewards/rejected": -11.25, "step": 2040 }, { "epoch": 1.6439454691259021, "grad_norm": 0.00028315849570995473, "learning_rate": 2.511883541295306e-07, "logits/chosen": 0.390625, "logits/rejected": 0.65625, "logps/chosen": -376.0, "logps/rejected": -255.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.5, "rewards/margins": 19.625, "rewards/rejected": -11.125, "step": 2050 }, { "epoch": 1.6519647153167603, "grad_norm": 2.0000514775877407e-05, "learning_rate": 2.4970291146761734e-07, "logits/chosen": 0.66015625, "logits/rejected": 0.61328125, "logps/chosen": -384.0, "logps/rejected": -222.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.25, "rewards/margins": 18.5, "rewards/rejected": -10.3125, "step": 2060 }, { "epoch": 1.6599839615076184, "grad_norm": 2.990137885470394e-05, "learning_rate": 2.482174688057041e-07, "logits/chosen": 0.61328125, "logits/rejected": 0.9609375, "logps/chosen": -362.0, "logps/rejected": -255.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.375, "rewards/margins": 19.375, "rewards/rejected": -11.0, "step": 2070 }, { "epoch": 1.6680032076984763, "grad_norm": 4.0442043383218975e-06, "learning_rate": 2.4673202614379086e-07, "logits/chosen": 0.341796875, "logits/rejected": 0.7890625, "logps/chosen": -386.0, "logps/rejected": -249.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.9375, "rewards/margins": 20.0, "rewards/rejected": -11.125, "step": 2080 }, { "epoch": 1.6760224538893342, "grad_norm": 6.173823267071295e-05, "learning_rate": 2.452465834818776e-07, "logits/chosen": 0.53125, "logits/rejected": 1.125, "logps/chosen": -370.0, "logps/rejected": -245.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.9375, "rewards/margins": 20.125, "rewards/rejected": -11.1875, "step": 2090 }, { "epoch": 1.6840417000801926, "grad_norm": 1.169291319171789e-05, "learning_rate": 2.4376114081996433e-07, "logits/chosen": 0.3046875, "logits/rejected": 0.8671875, "logps/chosen": -396.0, "logps/rejected": -244.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.625, "rewards/margins": 21.0, "rewards/rejected": -11.375, "step": 2100 }, { "epoch": 1.6920609462710505, "grad_norm": 0.0004824313229502819, "learning_rate": 2.422756981580511e-07, "logits/chosen": 0.498046875, "logits/rejected": 0.8984375, "logps/chosen": -388.0, "logps/rejected": -248.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.375, "rewards/margins": 21.0, "rewards/rejected": -11.5625, "step": 2110 }, { "epoch": 1.7000801924619084, "grad_norm": 1.8179124875058412e-06, "learning_rate": 2.4079025549613785e-07, "logits/chosen": 0.2021484375, "logits/rejected": 0.51171875, "logps/chosen": -376.0, "logps/rejected": -246.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.75, "rewards/margins": 19.5, "rewards/rejected": -10.75, "step": 2120 }, { "epoch": 1.7080994386527666, "grad_norm": 3.876588485318874e-07, "learning_rate": 2.393048128342246e-07, "logits/chosen": 0.400390625, "logits/rejected": 0.88671875, "logps/chosen": -390.0, "logps/rejected": -252.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.9375, "rewards/margins": 21.0, "rewards/rejected": -11.0625, "step": 2130 }, { "epoch": 1.7161186848436247, "grad_norm": 0.000957144565159327, "learning_rate": 2.3781937017231134e-07, "logits/chosen": 0.5546875, "logits/rejected": 0.83203125, "logps/chosen": -408.0, "logps/rejected": -239.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.8125, "rewards/margins": 20.625, "rewards/rejected": -10.8125, "step": 2140 }, { "epoch": 1.7241379310344827, "grad_norm": 1.3221019649908978e-06, "learning_rate": 2.363339275103981e-07, "logits/chosen": 0.60546875, "logits/rejected": 0.94140625, "logps/chosen": -372.0, "logps/rejected": -232.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.8125, "rewards/margins": 19.75, "rewards/rejected": -11.0, "step": 2150 }, { "epoch": 1.7321571772253408, "grad_norm": 0.008117553842250269, "learning_rate": 2.3484848484848486e-07, "logits/chosen": 0.421875, "logits/rejected": 1.0625, "logps/chosen": -380.0, "logps/rejected": -250.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.1875, "rewards/margins": 21.0, "rewards/rejected": -11.8125, "step": 2160 }, { "epoch": 1.740176423416199, "grad_norm": 0.006750730850535208, "learning_rate": 2.333630421865716e-07, "logits/chosen": 0.48046875, "logits/rejected": 0.875, "logps/chosen": -360.0, "logps/rejected": -258.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.625, "rewards/margins": 20.25, "rewards/rejected": -11.5625, "step": 2170 }, { "epoch": 1.7481956696070569, "grad_norm": 2.0690956749746336e-05, "learning_rate": 2.3187759952465833e-07, "logits/chosen": 0.66015625, "logits/rejected": 0.9296875, "logps/chosen": -366.0, "logps/rejected": -239.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.5, "rewards/margins": 20.25, "rewards/rejected": -10.8125, "step": 2180 }, { "epoch": 1.756214915797915, "grad_norm": 3.41067760967297e-05, "learning_rate": 2.3039215686274506e-07, "logits/chosen": 0.439453125, "logits/rejected": 1.015625, "logps/chosen": -364.0, "logps/rejected": -258.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.375, "rewards/margins": 19.75, "rewards/rejected": -11.375, "step": 2190 }, { "epoch": 1.7642341619887731, "grad_norm": 1.6827307687831508e-06, "learning_rate": 2.2890671420083182e-07, "logits/chosen": 0.4609375, "logits/rejected": 0.96875, "logps/chosen": -364.0, "logps/rejected": -244.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.6875, "rewards/margins": 19.75, "rewards/rejected": -11.0625, "step": 2200 }, { "epoch": 1.772253408179631, "grad_norm": 1.5910820694258549e-06, "learning_rate": 2.2742127153891858e-07, "logits/chosen": 0.55078125, "logits/rejected": 1.0078125, "logps/chosen": -394.0, "logps/rejected": -258.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.8125, "rewards/margins": 21.5, "rewards/rejected": -11.6875, "step": 2210 }, { "epoch": 1.7802726543704892, "grad_norm": 0.0005192527879960267, "learning_rate": 2.2593582887700532e-07, "logits/chosen": 0.46484375, "logits/rejected": 1.1328125, "logps/chosen": -368.0, "logps/rejected": -239.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.0625, "rewards/margins": 20.0, "rewards/rejected": -10.9375, "step": 2220 }, { "epoch": 1.7882919005613473, "grad_norm": 9.899401699126356e-07, "learning_rate": 2.2445038621509208e-07, "logits/chosen": 0.447265625, "logits/rejected": 0.9296875, "logps/chosen": -412.0, "logps/rejected": -243.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.875, "rewards/margins": 21.125, "rewards/rejected": -11.3125, "step": 2230 }, { "epoch": 1.7963111467522053, "grad_norm": 0.00016904700633269654, "learning_rate": 2.2296494355317884e-07, "logits/chosen": 0.5546875, "logits/rejected": 0.7734375, "logps/chosen": -390.0, "logps/rejected": -244.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.5625, "rewards/margins": 20.5, "rewards/rejected": -10.875, "step": 2240 }, { "epoch": 1.8043303929430632, "grad_norm": 1.0640028907646635e-05, "learning_rate": 2.2147950089126557e-07, "logits/chosen": 0.224609375, "logits/rejected": 1.0234375, "logps/chosen": -378.0, "logps/rejected": -270.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.875, "rewards/margins": 20.75, "rewards/rejected": -11.875, "step": 2250 }, { "epoch": 1.8123496391339216, "grad_norm": 0.0018345519481934851, "learning_rate": 2.1999405822935233e-07, "logits/chosen": 0.431640625, "logits/rejected": 1.1328125, "logps/chosen": -376.0, "logps/rejected": -255.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.25, "rewards/margins": 19.875, "rewards/rejected": -11.5625, "step": 2260 }, { "epoch": 1.8203688853247795, "grad_norm": 0.00016019329942054905, "learning_rate": 2.1850861556743907e-07, "logits/chosen": 0.62109375, "logits/rejected": 0.90234375, "logps/chosen": -370.0, "logps/rejected": -244.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.8125, "rewards/margins": 19.875, "rewards/rejected": -11.125, "step": 2270 }, { "epoch": 1.8283881315156374, "grad_norm": 8.358716588615397e-05, "learning_rate": 2.1702317290552583e-07, "logits/chosen": 0.2734375, "logits/rejected": 1.0859375, "logps/chosen": -428.0, "logps/rejected": -286.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.0, "rewards/margins": 22.125, "rewards/rejected": -13.0625, "step": 2280 }, { "epoch": 1.8364073777064955, "grad_norm": 2.027336971624459e-05, "learning_rate": 2.1553773024361259e-07, "logits/chosen": 0.337890625, "logits/rejected": 0.71875, "logps/chosen": -396.0, "logps/rejected": -253.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.5, "rewards/margins": 21.75, "rewards/rejected": -12.1875, "step": 2290 }, { "epoch": 1.8444266238973537, "grad_norm": 8.055713765919762e-05, "learning_rate": 2.1405228758169932e-07, "logits/chosen": 0.5625, "logits/rejected": 1.3203125, "logps/chosen": -368.0, "logps/rejected": -246.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.375, "rewards/margins": 19.75, "rewards/rejected": -11.4375, "step": 2300 }, { "epoch": 1.8524458700882116, "grad_norm": 0.0007049283377848298, "learning_rate": 2.1256684491978608e-07, "logits/chosen": 0.1435546875, "logits/rejected": 0.81640625, "logps/chosen": -402.0, "logps/rejected": -268.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.0625, "rewards/margins": 21.125, "rewards/rejected": -12.0, "step": 2310 }, { "epoch": 1.8604651162790697, "grad_norm": 3.792337960411242e-05, "learning_rate": 2.110814022578728e-07, "logits/chosen": 0.349609375, "logits/rejected": 0.921875, "logps/chosen": -396.0, "logps/rejected": -254.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.25, "rewards/margins": 21.0, "rewards/rejected": -11.6875, "step": 2320 }, { "epoch": 1.8684843624699279, "grad_norm": 1.7716540922831983e-05, "learning_rate": 2.0959595959595957e-07, "logits/chosen": 0.341796875, "logits/rejected": 0.76171875, "logps/chosen": -390.0, "logps/rejected": -268.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.125, "rewards/margins": 21.0, "rewards/rejected": -11.875, "step": 2330 }, { "epoch": 1.8765036086607858, "grad_norm": 2.216884967833355e-06, "learning_rate": 2.0811051693404633e-07, "logits/chosen": 0.51171875, "logits/rejected": 1.0234375, "logps/chosen": -398.0, "logps/rejected": -258.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.9375, "rewards/margins": 21.75, "rewards/rejected": -11.875, "step": 2340 }, { "epoch": 1.884522854851644, "grad_norm": 4.545770122381447e-06, "learning_rate": 2.0662507427213307e-07, "logits/chosen": 0.462890625, "logits/rejected": 0.890625, "logps/chosen": -386.0, "logps/rejected": -250.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.0625, "rewards/margins": 20.625, "rewards/rejected": -11.5625, "step": 2350 }, { "epoch": 1.892542101042502, "grad_norm": 4.904088370802712e-05, "learning_rate": 2.0513963161021983e-07, "logits/chosen": 0.61328125, "logits/rejected": 1.203125, "logps/chosen": -358.0, "logps/rejected": -260.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.4375, "rewards/margins": 21.75, "rewards/rejected": -12.3125, "step": 2360 }, { "epoch": 1.90056134723336, "grad_norm": 0.00042270249184112594, "learning_rate": 2.036541889483066e-07, "logits/chosen": 0.72265625, "logits/rejected": 1.203125, "logps/chosen": -388.0, "logps/rejected": -255.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.875, "rewards/margins": 20.25, "rewards/rejected": -11.375, "step": 2370 }, { "epoch": 1.9085805934242182, "grad_norm": 3.586115046168339e-05, "learning_rate": 2.0216874628639332e-07, "logits/chosen": 0.57421875, "logits/rejected": 0.83203125, "logps/chosen": -362.0, "logps/rejected": -241.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.75, "rewards/margins": 19.625, "rewards/rejected": -10.875, "step": 2380 }, { "epoch": 1.9165998396150763, "grad_norm": 0.00010212705651715115, "learning_rate": 2.0068330362448008e-07, "logits/chosen": 0.546875, "logits/rejected": 0.9296875, "logps/chosen": -376.0, "logps/rejected": -262.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.875, "rewards/margins": 20.625, "rewards/rejected": -11.75, "step": 2390 }, { "epoch": 1.9246190858059342, "grad_norm": 5.825547101059951e-06, "learning_rate": 1.9919786096256681e-07, "logits/chosen": 0.3828125, "logits/rejected": 0.77734375, "logps/chosen": -380.0, "logps/rejected": -250.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.5, "rewards/margins": 20.75, "rewards/rejected": -11.1875, "step": 2400 }, { "epoch": 1.9326383319967921, "grad_norm": 7.67522513044559e-06, "learning_rate": 1.9771241830065358e-07, "logits/chosen": 0.578125, "logits/rejected": 0.9921875, "logps/chosen": -368.0, "logps/rejected": -256.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.875, "rewards/margins": 21.0, "rewards/rejected": -12.0625, "step": 2410 }, { "epoch": 1.9406575781876505, "grad_norm": 2.154465642818756e-05, "learning_rate": 1.9622697563874034e-07, "logits/chosen": 0.25, "logits/rejected": 1.0234375, "logps/chosen": -388.0, "logps/rejected": -255.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.9375, "rewards/margins": 20.875, "rewards/rejected": -11.9375, "step": 2420 }, { "epoch": 1.9486768243785084, "grad_norm": 8.356031770709064e-06, "learning_rate": 1.9474153297682707e-07, "logits/chosen": 0.6484375, "logits/rejected": 0.828125, "logps/chosen": -388.0, "logps/rejected": -241.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.5, "rewards/margins": 21.0, "rewards/rejected": -11.5, "step": 2430 }, { "epoch": 1.9566960705693663, "grad_norm": 0.00015387145297822115, "learning_rate": 1.9325609031491383e-07, "logits/chosen": 0.283203125, "logits/rejected": 0.8046875, "logps/chosen": -412.0, "logps/rejected": -262.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.5, "rewards/margins": 21.875, "rewards/rejected": -12.375, "step": 2440 }, { "epoch": 1.9647153167602245, "grad_norm": 0.0007004537181760268, "learning_rate": 1.917706476530006e-07, "logits/chosen": 0.2109375, "logits/rejected": 0.703125, "logps/chosen": -396.0, "logps/rejected": -250.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.5625, "rewards/margins": 20.25, "rewards/rejected": -11.625, "step": 2450 }, { "epoch": 1.9727345629510826, "grad_norm": 7.567606643626648e-07, "learning_rate": 1.9028520499108732e-07, "logits/chosen": 0.6953125, "logits/rejected": 1.1484375, "logps/chosen": -374.0, "logps/rejected": -260.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.8125, "rewards/margins": 22.375, "rewards/rejected": -12.5625, "step": 2460 }, { "epoch": 1.9807538091419405, "grad_norm": 0.0016717929387466024, "learning_rate": 1.8879976232917408e-07, "logits/chosen": 0.462890625, "logits/rejected": 0.78515625, "logps/chosen": -374.0, "logps/rejected": -244.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.3125, "rewards/margins": 20.5, "rewards/rejected": -11.25, "step": 2470 }, { "epoch": 1.9887730553327987, "grad_norm": 4.48690443797418e-05, "learning_rate": 1.8731431966726082e-07, "logits/chosen": 0.38671875, "logits/rejected": 0.97265625, "logps/chosen": -392.0, "logps/rejected": -252.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 10.0, "rewards/margins": 21.75, "rewards/rejected": -11.6875, "step": 2480 }, { "epoch": 1.9967923015236568, "grad_norm": 0.0006633147667953635, "learning_rate": 1.8582887700534758e-07, "logits/chosen": 0.5234375, "logits/rejected": 1.0078125, "logps/chosen": -360.0, "logps/rejected": -241.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.3125, "rewards/margins": 19.375, "rewards/rejected": -11.0625, "step": 2490 }, { "epoch": 2.0, "eval_logits/chosen": 0.546875, "eval_logits/rejected": 1.078125, "eval_logps/chosen": -384.0, "eval_logps/rejected": -260.0, "eval_loss": 2.2745707610738464e-06, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 8.75, "eval_rewards/margins": 20.375, "eval_rewards/rejected": -11.625, "eval_runtime": 25.7597, "eval_samples_per_second": 7.725, "eval_steps_per_second": 0.971, "step": 2494 }, { "epoch": 2.0048115477145148, "grad_norm": 2.930090264143551e-06, "learning_rate": 1.8434343434343434e-07, "logits/chosen": 0.28515625, "logits/rejected": 0.9140625, "logps/chosen": -372.0, "logps/rejected": -268.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.75, "rewards/margins": 21.125, "rewards/rejected": -12.4375, "step": 2500 }, { "epoch": 2.0128307939053727, "grad_norm": 0.0012222821898789798, "learning_rate": 1.8285799168152107e-07, "logits/chosen": 0.37109375, "logits/rejected": 1.0859375, "logps/chosen": -382.0, "logps/rejected": -270.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.0625, "rewards/margins": 21.625, "rewards/rejected": -12.5, "step": 2510 }, { "epoch": 2.020850040096231, "grad_norm": 3.467185219486767e-05, "learning_rate": 1.8137254901960783e-07, "logits/chosen": 0.5625, "logits/rejected": 0.953125, "logps/chosen": -368.0, "logps/rejected": -252.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.5, "rewards/margins": 20.5, "rewards/rejected": -11.9375, "step": 2520 }, { "epoch": 2.028869286287089, "grad_norm": 0.000711027954800088, "learning_rate": 1.798871063576946e-07, "logits/chosen": 0.36328125, "logits/rejected": 0.87890625, "logps/chosen": -390.0, "logps/rejected": -246.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.875, "rewards/margins": 20.0, "rewards/rejected": -11.125, "step": 2530 }, { "epoch": 2.036888532477947, "grad_norm": 2.962969641819805e-05, "learning_rate": 1.7840166369578132e-07, "logits/chosen": 0.298828125, "logits/rejected": 1.0234375, "logps/chosen": -392.0, "logps/rejected": -266.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.0625, "rewards/margins": 21.5, "rewards/rejected": -12.375, "step": 2540 }, { "epoch": 2.0449077786688052, "grad_norm": 4.571963961456707e-05, "learning_rate": 1.7691622103386808e-07, "logits/chosen": 0.1376953125, "logits/rejected": 0.9375, "logps/chosen": -394.0, "logps/rejected": -266.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.5625, "rewards/margins": 20.75, "rewards/rejected": -12.25, "step": 2550 }, { "epoch": 2.052927024859663, "grad_norm": 1.5456516913511285e-05, "learning_rate": 1.7543077837195482e-07, "logits/chosen": 0.515625, "logits/rejected": 0.96484375, "logps/chosen": -358.0, "logps/rejected": -258.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.5, "rewards/margins": 20.5, "rewards/rejected": -11.9375, "step": 2560 }, { "epoch": 2.060946271050521, "grad_norm": 2.026658098801818e-05, "learning_rate": 1.7394533571004158e-07, "logits/chosen": 0.451171875, "logits/rejected": 0.8359375, "logps/chosen": -410.0, "logps/rejected": -253.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.625, "rewards/margins": 21.5, "rewards/rejected": -11.875, "step": 2570 }, { "epoch": 2.0689655172413794, "grad_norm": 0.00013971817574957832, "learning_rate": 1.7245989304812834e-07, "logits/chosen": 0.369140625, "logits/rejected": 1.046875, "logps/chosen": -392.0, "logps/rejected": -251.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.1875, "rewards/margins": 21.0, "rewards/rejected": -11.75, "step": 2580 }, { "epoch": 2.0769847634322374, "grad_norm": 3.8096711927239275e-06, "learning_rate": 1.7097445038621507e-07, "logits/chosen": 0.2451171875, "logits/rejected": 0.79296875, "logps/chosen": -392.0, "logps/rejected": -250.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.125, "rewards/margins": 20.875, "rewards/rejected": -11.75, "step": 2590 }, { "epoch": 2.0850040096230953, "grad_norm": 2.3978271037030293e-07, "learning_rate": 1.6948900772430183e-07, "logits/chosen": 0.294921875, "logits/rejected": 0.64453125, "logps/chosen": -394.0, "logps/rejected": -247.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.0, "rewards/margins": 20.375, "rewards/rejected": -11.375, "step": 2600 }, { "epoch": 2.0930232558139537, "grad_norm": 0.005810234748535011, "learning_rate": 1.680035650623886e-07, "logits/chosen": 0.357421875, "logits/rejected": 1.21875, "logps/chosen": -386.0, "logps/rejected": -256.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.5, "rewards/margins": 21.5, "rewards/rejected": -12.0625, "step": 2610 }, { "epoch": 2.1010425020048116, "grad_norm": 9.09682895575849e-07, "learning_rate": 1.6651812240047533e-07, "logits/chosen": 0.375, "logits/rejected": 0.92578125, "logps/chosen": -384.0, "logps/rejected": -255.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.625, "rewards/margins": 21.125, "rewards/rejected": -11.5, "step": 2620 }, { "epoch": 2.1090617481956695, "grad_norm": 5.971953148626958e-07, "learning_rate": 1.6503267973856209e-07, "logits/chosen": 0.27734375, "logits/rejected": 1.15625, "logps/chosen": -408.0, "logps/rejected": -272.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.3125, "rewards/margins": 22.125, "rewards/rejected": -12.75, "step": 2630 }, { "epoch": 2.117080994386528, "grad_norm": 2.0793033534612072e-06, "learning_rate": 1.6354723707664882e-07, "logits/chosen": 0.478515625, "logits/rejected": 1.0546875, "logps/chosen": -394.0, "logps/rejected": -258.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.1875, "rewards/margins": 21.5, "rewards/rejected": -12.375, "step": 2640 }, { "epoch": 2.125100240577386, "grad_norm": 0.0030768018768204852, "learning_rate": 1.6206179441473558e-07, "logits/chosen": 0.64453125, "logits/rejected": 1.0859375, "logps/chosen": -370.0, "logps/rejected": -244.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.5625, "rewards/margins": 21.0, "rewards/rejected": -11.375, "step": 2650 }, { "epoch": 2.1331194867682437, "grad_norm": 9.956962271570327e-06, "learning_rate": 1.6057635175282234e-07, "logits/chosen": 0.41015625, "logits/rejected": 1.125, "logps/chosen": -388.0, "logps/rejected": -242.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.3125, "rewards/margins": 21.125, "rewards/rejected": -11.8125, "step": 2660 }, { "epoch": 2.141138732959102, "grad_norm": 1.874167890211698e-06, "learning_rate": 1.5909090909090907e-07, "logits/chosen": 0.48046875, "logits/rejected": 0.84765625, "logps/chosen": -374.0, "logps/rejected": -246.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.75, "rewards/margins": 20.875, "rewards/rejected": -12.125, "step": 2670 }, { "epoch": 2.14915797914996, "grad_norm": 2.7147945967304163e-05, "learning_rate": 1.5760546642899583e-07, "logits/chosen": 0.255859375, "logits/rejected": 0.9609375, "logps/chosen": -408.0, "logps/rejected": -272.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.375, "rewards/margins": 21.75, "rewards/rejected": -12.3125, "step": 2680 }, { "epoch": 2.157177225340818, "grad_norm": 0.004731880278356305, "learning_rate": 1.561200237670826e-07, "logits/chosen": 0.408203125, "logits/rejected": 0.9140625, "logps/chosen": -406.0, "logps/rejected": -254.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.9375, "rewards/margins": 20.875, "rewards/rejected": -11.875, "step": 2690 }, { "epoch": 2.165196471531676, "grad_norm": 1.2844771507454833e-06, "learning_rate": 1.5463458110516933e-07, "logits/chosen": 0.40234375, "logits/rejected": 0.984375, "logps/chosen": -370.0, "logps/rejected": -266.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.1875, "rewards/margins": 21.625, "rewards/rejected": -12.375, "step": 2700 }, { "epoch": 2.173215717722534, "grad_norm": 3.8636635265402716e-07, "learning_rate": 1.531491384432561e-07, "logits/chosen": 0.40234375, "logits/rejected": 0.8046875, "logps/chosen": -392.0, "logps/rejected": -258.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.9375, "rewards/margins": 20.75, "rewards/rejected": -11.8125, "step": 2710 }, { "epoch": 2.181234963913392, "grad_norm": 0.0004271074824786973, "learning_rate": 1.5166369578134282e-07, "logits/chosen": 0.6640625, "logits/rejected": 1.1015625, "logps/chosen": -392.0, "logps/rejected": -284.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.9375, "rewards/margins": 22.875, "rewards/rejected": -12.875, "step": 2720 }, { "epoch": 2.18925421010425, "grad_norm": 1.1332362527734326e-06, "learning_rate": 1.5017825311942958e-07, "logits/chosen": 0.435546875, "logits/rejected": 0.89453125, "logps/chosen": -382.0, "logps/rejected": -258.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.25, "rewards/margins": 21.25, "rewards/rejected": -11.9375, "step": 2730 }, { "epoch": 2.1972734562951084, "grad_norm": 5.2099471369239204e-05, "learning_rate": 1.4869281045751634e-07, "logits/chosen": 0.7109375, "logits/rejected": 0.9609375, "logps/chosen": -380.0, "logps/rejected": -248.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.6875, "rewards/margins": 20.375, "rewards/rejected": -11.75, "step": 2740 }, { "epoch": 2.2052927024859663, "grad_norm": 1.0659060231880619e-05, "learning_rate": 1.4720736779560308e-07, "logits/chosen": 0.47265625, "logits/rejected": 0.8359375, "logps/chosen": -378.0, "logps/rejected": -254.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.3125, "rewards/margins": 21.5, "rewards/rejected": -12.1875, "step": 2750 }, { "epoch": 2.2133119486768242, "grad_norm": 4.767031677190736e-05, "learning_rate": 1.4572192513368984e-07, "logits/chosen": 0.66796875, "logits/rejected": 0.890625, "logps/chosen": -408.0, "logps/rejected": -266.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.0, "rewards/margins": 21.5, "rewards/rejected": -12.5, "step": 2760 }, { "epoch": 2.2213311948676826, "grad_norm": 2.4249197890785813e-06, "learning_rate": 1.4423648247177657e-07, "logits/chosen": 0.40234375, "logits/rejected": 0.85546875, "logps/chosen": -374.0, "logps/rejected": -262.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.375, "rewards/margins": 20.375, "rewards/rejected": -12.0625, "step": 2770 }, { "epoch": 2.2293504410585405, "grad_norm": 0.00012434476274443069, "learning_rate": 1.4275103980986333e-07, "logits/chosen": 0.369140625, "logits/rejected": 0.99609375, "logps/chosen": -410.0, "logps/rejected": -264.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 10.0625, "rewards/margins": 22.75, "rewards/rejected": -12.6875, "step": 2780 }, { "epoch": 2.2373696872493984, "grad_norm": 5.50417015882791e-06, "learning_rate": 1.412655971479501e-07, "logits/chosen": 0.58984375, "logits/rejected": 0.94140625, "logps/chosen": -354.0, "logps/rejected": -270.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.1875, "rewards/margins": 22.0, "rewards/rejected": -12.8125, "step": 2790 }, { "epoch": 2.245388933440257, "grad_norm": 0.00035135368638597, "learning_rate": 1.3978015448603682e-07, "logits/chosen": 0.291015625, "logits/rejected": 0.796875, "logps/chosen": -370.0, "logps/rejected": -236.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.9375, "rewards/margins": 20.25, "rewards/rejected": -11.25, "step": 2800 }, { "epoch": 2.2534081796311147, "grad_norm": 2.2112703403823823e-06, "learning_rate": 1.3829471182412358e-07, "logits/chosen": 0.640625, "logits/rejected": 0.875, "logps/chosen": -372.0, "logps/rejected": -251.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.5625, "rewards/margins": 21.375, "rewards/rejected": -11.875, "step": 2810 }, { "epoch": 2.2614274258219726, "grad_norm": 4.31175569129312e-06, "learning_rate": 1.3680926916221034e-07, "logits/chosen": 0.38671875, "logits/rejected": 0.77734375, "logps/chosen": -402.0, "logps/rejected": -247.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.5, "rewards/margins": 21.125, "rewards/rejected": -11.5625, "step": 2820 }, { "epoch": 2.2694466720128306, "grad_norm": 1.8597590930975931e-06, "learning_rate": 1.3532382650029708e-07, "logits/chosen": 0.48046875, "logits/rejected": 0.80859375, "logps/chosen": -384.0, "logps/rejected": -252.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.0625, "rewards/margins": 20.25, "rewards/rejected": -11.1875, "step": 2830 }, { "epoch": 2.277465918203689, "grad_norm": 8.122915968587665e-06, "learning_rate": 1.3383838383838384e-07, "logits/chosen": 0.4140625, "logits/rejected": 0.91796875, "logps/chosen": -402.0, "logps/rejected": -249.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.875, "rewards/margins": 21.5, "rewards/rejected": -11.625, "step": 2840 }, { "epoch": 2.285485164394547, "grad_norm": 1.0821622455340657e-05, "learning_rate": 1.3235294117647057e-07, "logits/chosen": 0.55859375, "logits/rejected": 0.9921875, "logps/chosen": -394.0, "logps/rejected": -239.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.625, "rewards/margins": 21.25, "rewards/rejected": -11.625, "step": 2850 }, { "epoch": 2.293504410585405, "grad_norm": 9.888049706350434e-06, "learning_rate": 1.3086749851455733e-07, "logits/chosen": 0.376953125, "logits/rejected": 0.5859375, "logps/chosen": -380.0, "logps/rejected": -256.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.3125, "rewards/margins": 21.25, "rewards/rejected": -11.875, "step": 2860 }, { "epoch": 2.301523656776263, "grad_norm": 4.1407728850157235e-05, "learning_rate": 1.293820558526441e-07, "logits/chosen": 0.431640625, "logits/rejected": 1.0546875, "logps/chosen": -354.0, "logps/rejected": -266.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.25, "rewards/margins": 22.0, "rewards/rejected": -12.75, "step": 2870 }, { "epoch": 2.309542902967121, "grad_norm": 3.7003650267654836e-07, "learning_rate": 1.2789661319073083e-07, "logits/chosen": 0.3125, "logits/rejected": 0.75, "logps/chosen": -392.0, "logps/rejected": -270.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.0625, "rewards/margins": 21.125, "rewards/rejected": -12.125, "step": 2880 }, { "epoch": 2.317562149157979, "grad_norm": 5.1045287381407886e-05, "learning_rate": 1.2641117052881759e-07, "logits/chosen": 0.41015625, "logits/rejected": 0.828125, "logps/chosen": -370.0, "logps/rejected": -268.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.75, "rewards/margins": 21.5, "rewards/rejected": -12.6875, "step": 2890 }, { "epoch": 2.3255813953488373, "grad_norm": 5.5539305897761475e-06, "learning_rate": 1.2492572786690435e-07, "logits/chosen": 0.36328125, "logits/rejected": 0.875, "logps/chosen": -366.0, "logps/rejected": -276.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.875, "rewards/margins": 21.75, "rewards/rejected": -12.8125, "step": 2900 }, { "epoch": 2.3336006415396953, "grad_norm": 7.544982020529351e-05, "learning_rate": 1.2344028520499108e-07, "logits/chosen": 0.6328125, "logits/rejected": 0.703125, "logps/chosen": -380.0, "logps/rejected": -258.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.375, "rewards/margins": 20.875, "rewards/rejected": -12.4375, "step": 2910 }, { "epoch": 2.341619887730553, "grad_norm": 2.1523795485161937e-07, "learning_rate": 1.2195484254307784e-07, "logits/chosen": 0.302734375, "logits/rejected": 0.91015625, "logps/chosen": -372.0, "logps/rejected": -272.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.8125, "rewards/margins": 21.5, "rewards/rejected": -12.625, "step": 2920 }, { "epoch": 2.3496391339214115, "grad_norm": 0.0016463592546248687, "learning_rate": 1.2046939988116457e-07, "logits/chosen": 0.40234375, "logits/rejected": 0.87109375, "logps/chosen": -416.0, "logps/rejected": -264.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.5, "rewards/margins": 21.5, "rewards/rejected": -12.0, "step": 2930 }, { "epoch": 2.3576583801122695, "grad_norm": 1.163619554978786e-05, "learning_rate": 1.1898395721925133e-07, "logits/chosen": 0.609375, "logits/rejected": 1.1953125, "logps/chosen": -392.0, "logps/rejected": -256.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.25, "rewards/margins": 21.5, "rewards/rejected": -12.25, "step": 2940 }, { "epoch": 2.3656776263031274, "grad_norm": 5.504163793859823e-06, "learning_rate": 1.1749851455733808e-07, "logits/chosen": 0.51953125, "logits/rejected": 0.8828125, "logps/chosen": -378.0, "logps/rejected": -278.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.875, "rewards/margins": 21.625, "rewards/rejected": -12.75, "step": 2950 }, { "epoch": 2.3736968724939858, "grad_norm": 2.584190088627908e-07, "learning_rate": 1.1601307189542484e-07, "logits/chosen": 0.51171875, "logits/rejected": 1.1484375, "logps/chosen": -390.0, "logps/rejected": -245.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.125, "rewards/margins": 20.75, "rewards/rejected": -11.625, "step": 2960 }, { "epoch": 2.3817161186848437, "grad_norm": 0.0005444095930180087, "learning_rate": 1.1452762923351159e-07, "logits/chosen": 0.482421875, "logits/rejected": 1.1953125, "logps/chosen": -384.0, "logps/rejected": -266.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.25, "rewards/margins": 21.875, "rewards/rejected": -12.625, "step": 2970 }, { "epoch": 2.3897353648757016, "grad_norm": 1.0323931081299204e-05, "learning_rate": 1.1304218657159833e-07, "logits/chosen": 0.431640625, "logits/rejected": 1.1015625, "logps/chosen": -374.0, "logps/rejected": -258.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.75, "rewards/margins": 21.375, "rewards/rejected": -12.5625, "step": 2980 }, { "epoch": 2.39775461106656, "grad_norm": 0.0011515689097892994, "learning_rate": 1.1155674390968508e-07, "logits/chosen": 0.439453125, "logits/rejected": 1.078125, "logps/chosen": -380.0, "logps/rejected": -253.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.875, "rewards/margins": 20.75, "rewards/rejected": -11.9375, "step": 2990 }, { "epoch": 2.405773857257418, "grad_norm": 6.647792332301594e-07, "learning_rate": 1.1007130124777184e-07, "logits/chosen": 0.11572265625, "logits/rejected": 0.921875, "logps/chosen": -406.0, "logps/rejected": -270.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.0625, "rewards/margins": 21.875, "rewards/rejected": -12.8125, "step": 3000 }, { "epoch": 2.413793103448276, "grad_norm": 3.338246923828892e-07, "learning_rate": 1.0858585858585859e-07, "logits/chosen": 0.431640625, "logits/rejected": 0.9765625, "logps/chosen": -374.0, "logps/rejected": -260.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.0, "rewards/margins": 20.875, "rewards/rejected": -11.875, "step": 3010 }, { "epoch": 2.4218123496391337, "grad_norm": 1.7008814387731784e-07, "learning_rate": 1.0710041592394533e-07, "logits/chosen": 0.30078125, "logits/rejected": 0.9609375, "logps/chosen": -396.0, "logps/rejected": -280.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.625, "rewards/margins": 22.875, "rewards/rejected": -13.1875, "step": 3020 }, { "epoch": 2.429831595829992, "grad_norm": 0.00026660135538851586, "learning_rate": 1.0561497326203208e-07, "logits/chosen": 0.486328125, "logits/rejected": 0.94140625, "logps/chosen": -402.0, "logps/rejected": -268.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 10.125, "rewards/margins": 22.625, "rewards/rejected": -12.5, "step": 3030 }, { "epoch": 2.43785084202085, "grad_norm": 0.00037991243371585304, "learning_rate": 1.0412953060011884e-07, "logits/chosen": 0.5, "logits/rejected": 0.625, "logps/chosen": -376.0, "logps/rejected": -258.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.875, "rewards/margins": 21.0, "rewards/rejected": -12.1875, "step": 3040 }, { "epoch": 2.445870088211708, "grad_norm": 0.002144888490302767, "learning_rate": 1.0264408793820559e-07, "logits/chosen": 0.35546875, "logits/rejected": 0.5546875, "logps/chosen": -372.0, "logps/rejected": -246.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.1875, "rewards/margins": 20.75, "rewards/rejected": -11.5, "step": 3050 }, { "epoch": 2.4538893344025663, "grad_norm": 0.00015094228915987385, "learning_rate": 1.0115864527629234e-07, "logits/chosen": 0.50390625, "logits/rejected": 0.8671875, "logps/chosen": -390.0, "logps/rejected": -270.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.0625, "rewards/margins": 22.25, "rewards/rejected": -13.25, "step": 3060 }, { "epoch": 2.461908580593424, "grad_norm": 1.0510215595911879e-05, "learning_rate": 9.967320261437908e-08, "logits/chosen": 0.37890625, "logits/rejected": 0.9765625, "logps/chosen": -374.0, "logps/rejected": -264.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.4375, "rewards/margins": 22.25, "rewards/rejected": -12.8125, "step": 3070 }, { "epoch": 2.469927826784282, "grad_norm": 3.0881932584160404e-06, "learning_rate": 9.818775995246583e-08, "logits/chosen": 0.4765625, "logits/rejected": 1.046875, "logps/chosen": -374.0, "logps/rejected": -264.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.1875, "rewards/margins": 21.875, "rewards/rejected": -12.75, "step": 3080 }, { "epoch": 2.4779470729751405, "grad_norm": 1.5127855568024035e-06, "learning_rate": 9.670231729055258e-08, "logits/chosen": 0.267578125, "logits/rejected": 0.94140625, "logps/chosen": -376.0, "logps/rejected": -278.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.25, "rewards/margins": 22.5, "rewards/rejected": -13.25, "step": 3090 }, { "epoch": 2.4859663191659984, "grad_norm": 2.6937979394640244e-07, "learning_rate": 9.521687462863932e-08, "logits/chosen": 0.4765625, "logits/rejected": 1.1953125, "logps/chosen": -384.0, "logps/rejected": -262.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.0, "rewards/margins": 22.375, "rewards/rejected": -13.375, "step": 3100 }, { "epoch": 2.4939855653568563, "grad_norm": 1.4639795453144539e-06, "learning_rate": 9.373143196672607e-08, "logits/chosen": 0.361328125, "logits/rejected": 0.85546875, "logps/chosen": -388.0, "logps/rejected": -264.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.4375, "rewards/margins": 21.5, "rewards/rejected": -12.125, "step": 3110 }, { "epoch": 2.5020048115477147, "grad_norm": 0.02598580162597951, "learning_rate": 9.224598930481283e-08, "logits/chosen": 0.57421875, "logits/rejected": 0.9765625, "logps/chosen": -400.0, "logps/rejected": -270.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.5, "rewards/margins": 21.75, "rewards/rejected": -12.25, "step": 3120 }, { "epoch": 2.5100240577385726, "grad_norm": 2.338182859317021e-06, "learning_rate": 9.076054664289958e-08, "logits/chosen": 0.494140625, "logits/rejected": 0.875, "logps/chosen": -368.0, "logps/rejected": -256.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.0625, "rewards/margins": 21.375, "rewards/rejected": -12.25, "step": 3130 }, { "epoch": 2.5180433039294305, "grad_norm": 5.217029166192134e-06, "learning_rate": 8.927510398098632e-08, "logits/chosen": 0.33984375, "logits/rejected": 0.69140625, "logps/chosen": -374.0, "logps/rejected": -260.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.25, "rewards/margins": 21.5, "rewards/rejected": -12.25, "step": 3140 }, { "epoch": 2.5260625501202885, "grad_norm": 6.660516101190717e-07, "learning_rate": 8.778966131907307e-08, "logits/chosen": 0.4765625, "logits/rejected": 0.87109375, "logps/chosen": -410.0, "logps/rejected": -251.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.8125, "rewards/margins": 22.0, "rewards/rejected": -12.1875, "step": 3150 }, { "epoch": 2.534081796311147, "grad_norm": 1.0217524922551768e-05, "learning_rate": 8.630421865715982e-08, "logits/chosen": 0.400390625, "logits/rejected": 0.9453125, "logps/chosen": -384.0, "logps/rejected": -260.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.1875, "rewards/margins": 21.5, "rewards/rejected": -12.25, "step": 3160 }, { "epoch": 2.5421010425020047, "grad_norm": 2.9175213558233226e-08, "learning_rate": 8.481877599524658e-08, "logits/chosen": 0.37890625, "logits/rejected": 0.80859375, "logps/chosen": -388.0, "logps/rejected": -250.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.4375, "rewards/margins": 20.375, "rewards/rejected": -12.0, "step": 3170 }, { "epoch": 2.550120288692863, "grad_norm": 2.1164913634068363e-05, "learning_rate": 8.333333333333333e-08, "logits/chosen": 0.455078125, "logits/rejected": 0.92578125, "logps/chosen": -332.0, "logps/rejected": -272.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.875, "rewards/margins": 21.25, "rewards/rejected": -12.5, "step": 3180 }, { "epoch": 2.558139534883721, "grad_norm": 0.00036466161643565877, "learning_rate": 8.184789067142007e-08, "logits/chosen": 0.4375, "logits/rejected": 1.140625, "logps/chosen": -408.0, "logps/rejected": -268.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 10.25, "rewards/margins": 23.25, "rewards/rejected": -13.0, "step": 3190 }, { "epoch": 2.566158781074579, "grad_norm": 1.7479962306808482e-06, "learning_rate": 8.036244800950682e-08, "logits/chosen": 0.5390625, "logits/rejected": 0.8203125, "logps/chosen": -390.0, "logps/rejected": -236.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.8125, "rewards/margins": 21.0, "rewards/rejected": -11.25, "step": 3200 }, { "epoch": 2.574178027265437, "grad_norm": 1.2776292331020723e-05, "learning_rate": 7.887700534759358e-08, "logits/chosen": 0.388671875, "logits/rejected": 0.765625, "logps/chosen": -398.0, "logps/rejected": -264.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.0, "rewards/margins": 21.875, "rewards/rejected": -12.8125, "step": 3210 }, { "epoch": 2.5821972734562952, "grad_norm": 7.560714458642037e-06, "learning_rate": 7.739156268568033e-08, "logits/chosen": 0.427734375, "logits/rejected": 0.8046875, "logps/chosen": -372.0, "logps/rejected": -264.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.3125, "rewards/margins": 22.0, "rewards/rejected": -12.75, "step": 3220 }, { "epoch": 2.590216519647153, "grad_norm": 7.652848872587089e-07, "learning_rate": 7.590612002376707e-08, "logits/chosen": 0.359375, "logits/rejected": 1.0546875, "logps/chosen": -372.0, "logps/rejected": -278.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.5, "rewards/margins": 23.125, "rewards/rejected": -13.6875, "step": 3230 }, { "epoch": 2.598235765838011, "grad_norm": 5.810458717893015e-07, "learning_rate": 7.442067736185382e-08, "logits/chosen": 0.52734375, "logits/rejected": 0.9375, "logps/chosen": -350.0, "logps/rejected": -270.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.3125, "rewards/margins": 22.125, "rewards/rejected": -12.8125, "step": 3240 }, { "epoch": 2.6062550120288694, "grad_norm": 8.893199185209006e-06, "learning_rate": 7.293523469994058e-08, "logits/chosen": 0.392578125, "logits/rejected": 0.90625, "logps/chosen": -400.0, "logps/rejected": -272.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.75, "rewards/margins": 21.625, "rewards/rejected": -12.8125, "step": 3250 }, { "epoch": 2.6142742582197274, "grad_norm": 1.9346633350882866e-07, "learning_rate": 7.144979203802733e-08, "logits/chosen": 0.478515625, "logits/rejected": 0.890625, "logps/chosen": -356.0, "logps/rejected": -276.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.4375, "rewards/margins": 22.25, "rewards/rejected": -12.8125, "step": 3260 }, { "epoch": 2.6222935044105853, "grad_norm": 2.6875685735083364e-06, "learning_rate": 6.996434937611407e-08, "logits/chosen": 0.39453125, "logits/rejected": 0.92578125, "logps/chosen": -394.0, "logps/rejected": -254.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.25, "rewards/margins": 21.5, "rewards/rejected": -12.3125, "step": 3270 }, { "epoch": 2.630312750601443, "grad_norm": 5.903328755667033e-06, "learning_rate": 6.847890671420082e-08, "logits/chosen": 0.51171875, "logits/rejected": 0.98046875, "logps/chosen": -400.0, "logps/rejected": -266.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 10.125, "rewards/margins": 23.125, "rewards/rejected": -13.125, "step": 3280 }, { "epoch": 2.6383319967923016, "grad_norm": 1.6700008186742868e-06, "learning_rate": 6.699346405228758e-08, "logits/chosen": 0.6796875, "logits/rejected": 1.0546875, "logps/chosen": -408.0, "logps/rejected": -245.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 10.0, "rewards/margins": 22.0, "rewards/rejected": -12.125, "step": 3290 }, { "epoch": 2.6463512429831595, "grad_norm": 2.3651887723897224e-05, "learning_rate": 6.550802139037433e-08, "logits/chosen": 0.5390625, "logits/rejected": 0.66796875, "logps/chosen": -372.0, "logps/rejected": -268.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.4375, "rewards/margins": 22.0, "rewards/rejected": -12.5625, "step": 3300 }, { "epoch": 2.654370489174018, "grad_norm": 3.7298118325774334e-06, "learning_rate": 6.402257872846107e-08, "logits/chosen": 0.59765625, "logits/rejected": 1.078125, "logps/chosen": -408.0, "logps/rejected": -254.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.75, "rewards/margins": 22.375, "rewards/rejected": -12.625, "step": 3310 }, { "epoch": 2.6623897353648758, "grad_norm": 6.369386383206898e-06, "learning_rate": 6.253713606654782e-08, "logits/chosen": 0.365234375, "logits/rejected": 0.9140625, "logps/chosen": -378.0, "logps/rejected": -268.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.375, "rewards/margins": 22.125, "rewards/rejected": -12.75, "step": 3320 }, { "epoch": 2.6704089815557337, "grad_norm": 3.7738355336782367e-07, "learning_rate": 6.105169340463458e-08, "logits/chosen": 0.3046875, "logits/rejected": 1.0390625, "logps/chosen": -402.0, "logps/rejected": -262.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.3125, "rewards/margins": 21.875, "rewards/rejected": -12.5625, "step": 3330 }, { "epoch": 2.6784282277465916, "grad_norm": 0.0032856305622473957, "learning_rate": 5.956625074272133e-08, "logits/chosen": 0.3984375, "logits/rejected": 0.99609375, "logps/chosen": -370.0, "logps/rejected": -266.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.6875, "rewards/margins": 22.875, "rewards/rejected": -13.25, "step": 3340 }, { "epoch": 2.68644747393745, "grad_norm": 4.007349858622386e-06, "learning_rate": 5.8080808080808076e-08, "logits/chosen": 0.53125, "logits/rejected": 0.92578125, "logps/chosen": -394.0, "logps/rejected": -246.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.5625, "rewards/margins": 22.0, "rewards/rejected": -12.375, "step": 3350 }, { "epoch": 2.694466720128308, "grad_norm": 2.6895164734752446e-08, "learning_rate": 5.659536541889483e-08, "logits/chosen": 0.47265625, "logits/rejected": 0.90625, "logps/chosen": -408.0, "logps/rejected": -278.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 10.125, "rewards/margins": 23.5, "rewards/rejected": -13.4375, "step": 3360 }, { "epoch": 2.7024859663191663, "grad_norm": 3.8506192206445097e-07, "learning_rate": 5.5109922756981576e-08, "logits/chosen": 0.427734375, "logits/rejected": 0.71484375, "logps/chosen": -370.0, "logps/rejected": -253.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.875, "rewards/margins": 21.0, "rewards/rejected": -12.125, "step": 3370 }, { "epoch": 2.710505212510024, "grad_norm": 2.3682076740273767e-06, "learning_rate": 5.362448009506833e-08, "logits/chosen": 0.353515625, "logits/rejected": 0.7421875, "logps/chosen": -420.0, "logps/rejected": -234.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.125, "rewards/margins": 20.25, "rewards/rejected": -11.0625, "step": 3380 }, { "epoch": 2.718524458700882, "grad_norm": 6.426500767480769e-06, "learning_rate": 5.2139037433155076e-08, "logits/chosen": 0.3828125, "logits/rejected": 0.828125, "logps/chosen": -376.0, "logps/rejected": -268.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.125, "rewards/margins": 22.0, "rewards/rejected": -12.8125, "step": 3390 }, { "epoch": 2.72654370489174, "grad_norm": 3.9616135234325537e-05, "learning_rate": 5.065359477124183e-08, "logits/chosen": 0.353515625, "logits/rejected": 1.1796875, "logps/chosen": -368.0, "logps/rejected": -266.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.0625, "rewards/margins": 21.875, "rewards/rejected": -12.8125, "step": 3400 }, { "epoch": 2.7345629510825984, "grad_norm": 7.298702505810354e-07, "learning_rate": 4.916815210932858e-08, "logits/chosen": 0.251953125, "logits/rejected": 0.73046875, "logps/chosen": -406.0, "logps/rejected": -274.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.5625, "rewards/margins": 22.5, "rewards/rejected": -13.0, "step": 3410 }, { "epoch": 2.7425821972734563, "grad_norm": 3.2680032393659185e-05, "learning_rate": 4.768270944741533e-08, "logits/chosen": 0.43359375, "logits/rejected": 0.95703125, "logps/chosen": -388.0, "logps/rejected": -249.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.625, "rewards/margins": 21.75, "rewards/rejected": -12.125, "step": 3420 }, { "epoch": 2.7506014434643142, "grad_norm": 0.00010066931023876521, "learning_rate": 4.619726678550208e-08, "logits/chosen": 0.55078125, "logits/rejected": 0.69921875, "logps/chosen": -350.0, "logps/rejected": -245.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.125, "rewards/margins": 21.125, "rewards/rejected": -12.0, "step": 3430 }, { "epoch": 2.7586206896551726, "grad_norm": 0.01676696800833448, "learning_rate": 4.471182412358883e-08, "logits/chosen": 0.2001953125, "logits/rejected": 1.0, "logps/chosen": -418.0, "logps/rejected": -276.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.5, "rewards/margins": 22.5, "rewards/rejected": -13.0, "step": 3440 }, { "epoch": 2.7666399358460305, "grad_norm": 0.0001179658083699005, "learning_rate": 4.322638146167558e-08, "logits/chosen": 0.4140625, "logits/rejected": 0.7578125, "logps/chosen": -386.0, "logps/rejected": -268.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.625, "rewards/margins": 22.5, "rewards/rejected": -12.875, "step": 3450 }, { "epoch": 2.7746591820368884, "grad_norm": 4.5210379061747925e-07, "learning_rate": 4.174093879976233e-08, "logits/chosen": 0.609375, "logits/rejected": 0.97265625, "logps/chosen": -382.0, "logps/rejected": -272.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.25, "rewards/margins": 22.25, "rewards/rejected": -13.0, "step": 3460 }, { "epoch": 2.7826784282277464, "grad_norm": 1.877100041848091e-05, "learning_rate": 4.025549613784908e-08, "logits/chosen": 0.5703125, "logits/rejected": 0.8046875, "logps/chosen": -360.0, "logps/rejected": -258.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.375, "rewards/margins": 22.25, "rewards/rejected": -12.75, "step": 3470 }, { "epoch": 2.7906976744186047, "grad_norm": 2.73858514260584e-06, "learning_rate": 3.877005347593583e-08, "logits/chosen": 0.212890625, "logits/rejected": 0.7265625, "logps/chosen": -428.0, "logps/rejected": -264.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.5625, "rewards/margins": 22.5, "rewards/rejected": -13.0, "step": 3480 }, { "epoch": 2.7987169206094626, "grad_norm": 7.327862084123583e-05, "learning_rate": 3.728461081402258e-08, "logits/chosen": 0.5390625, "logits/rejected": 1.09375, "logps/chosen": -352.0, "logps/rejected": -272.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.875, "rewards/margins": 22.25, "rewards/rejected": -13.375, "step": 3490 }, { "epoch": 2.806736166800321, "grad_norm": 1.3168797706922126e-05, "learning_rate": 3.579916815210933e-08, "logits/chosen": 0.498046875, "logits/rejected": 1.1015625, "logps/chosen": -384.0, "logps/rejected": -298.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.75, "rewards/margins": 23.625, "rewards/rejected": -13.875, "step": 3500 }, { "epoch": 2.814755412991179, "grad_norm": 1.0655047288357234e-06, "learning_rate": 3.431372549019608e-08, "logits/chosen": 0.490234375, "logits/rejected": 0.72265625, "logps/chosen": -374.0, "logps/rejected": -258.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.625, "rewards/margins": 22.25, "rewards/rejected": -12.5625, "step": 3510 }, { "epoch": 2.822774659182037, "grad_norm": 4.308633518287558e-06, "learning_rate": 3.282828282828283e-08, "logits/chosen": 0.482421875, "logits/rejected": 1.03125, "logps/chosen": -368.0, "logps/rejected": -264.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.5, "rewards/margins": 22.375, "rewards/rejected": -12.8125, "step": 3520 }, { "epoch": 2.8307939053728948, "grad_norm": 3.2658861682776686e-06, "learning_rate": 3.134284016636958e-08, "logits/chosen": 0.74609375, "logits/rejected": 1.4921875, "logps/chosen": -374.0, "logps/rejected": -278.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.3125, "rewards/margins": 23.0, "rewards/rejected": -13.6875, "step": 3530 }, { "epoch": 2.838813151563753, "grad_norm": 0.00013016724592345414, "learning_rate": 2.9857397504456326e-08, "logits/chosen": 0.671875, "logits/rejected": 1.171875, "logps/chosen": -366.0, "logps/rejected": -248.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.4375, "rewards/margins": 21.25, "rewards/rejected": -12.75, "step": 3540 }, { "epoch": 2.846832397754611, "grad_norm": 1.4436758765119522e-06, "learning_rate": 2.8371954842543077e-08, "logits/chosen": 0.5625, "logits/rejected": 0.91015625, "logps/chosen": -388.0, "logps/rejected": -262.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.9375, "rewards/margins": 22.5, "rewards/rejected": -12.5625, "step": 3550 }, { "epoch": 2.854851643945469, "grad_norm": 5.294494355489266e-05, "learning_rate": 2.6886512180629827e-08, "logits/chosen": 0.255859375, "logits/rejected": 0.466796875, "logps/chosen": -390.0, "logps/rejected": -255.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.5, "rewards/margins": 21.625, "rewards/rejected": -12.0625, "step": 3560 }, { "epoch": 2.8628708901363273, "grad_norm": 5.493532537358418e-06, "learning_rate": 2.5401069518716577e-08, "logits/chosen": 0.41015625, "logits/rejected": 0.9453125, "logps/chosen": -392.0, "logps/rejected": -268.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.5625, "rewards/margins": 22.5, "rewards/rejected": -12.875, "step": 3570 }, { "epoch": 2.8708901363271853, "grad_norm": 9.69168414444799e-06, "learning_rate": 2.3915626856803327e-08, "logits/chosen": 0.3203125, "logits/rejected": 0.77734375, "logps/chosen": -374.0, "logps/rejected": -256.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.25, "rewards/margins": 21.625, "rewards/rejected": -12.375, "step": 3580 }, { "epoch": 2.878909382518043, "grad_norm": 6.944886149610538e-06, "learning_rate": 2.2430184194890077e-08, "logits/chosen": 0.42578125, "logits/rejected": 0.8671875, "logps/chosen": -396.0, "logps/rejected": -266.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.25, "rewards/margins": 22.375, "rewards/rejected": -13.125, "step": 3590 }, { "epoch": 2.886928628708901, "grad_norm": 1.605309525555966e-07, "learning_rate": 2.0944741532976828e-08, "logits/chosen": 0.53515625, "logits/rejected": 1.234375, "logps/chosen": -398.0, "logps/rejected": -254.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.875, "rewards/margins": 22.625, "rewards/rejected": -12.75, "step": 3600 }, { "epoch": 2.8949478748997595, "grad_norm": 3.1239359113218414e-06, "learning_rate": 1.9459298871063574e-08, "logits/chosen": 0.41796875, "logits/rejected": 0.8203125, "logps/chosen": -368.0, "logps/rejected": -251.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.6875, "rewards/margins": 22.375, "rewards/rejected": -12.6875, "step": 3610 }, { "epoch": 2.9029671210906174, "grad_norm": 0.00020408073165231446, "learning_rate": 1.7973856209150325e-08, "logits/chosen": 0.57421875, "logits/rejected": 1.0625, "logps/chosen": -364.0, "logps/rejected": -272.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.125, "rewards/margins": 22.25, "rewards/rejected": -13.125, "step": 3620 }, { "epoch": 2.9109863672814758, "grad_norm": 1.7032066972883364e-05, "learning_rate": 1.6488413547237075e-08, "logits/chosen": 0.50390625, "logits/rejected": 1.109375, "logps/chosen": -372.0, "logps/rejected": -272.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.8125, "rewards/margins": 21.75, "rewards/rejected": -13.0, "step": 3630 }, { "epoch": 2.9190056134723337, "grad_norm": 1.2106610546976946e-05, "learning_rate": 1.5002970885323825e-08, "logits/chosen": 0.41796875, "logits/rejected": 0.8828125, "logps/chosen": -398.0, "logps/rejected": -272.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.9375, "rewards/margins": 21.5, "rewards/rejected": -12.5625, "step": 3640 }, { "epoch": 2.9270248596631916, "grad_norm": 6.994562679603468e-07, "learning_rate": 1.3517528223410575e-08, "logits/chosen": 0.435546875, "logits/rejected": 0.63671875, "logps/chosen": -394.0, "logps/rejected": -256.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 10.0, "rewards/margins": 22.375, "rewards/rejected": -12.4375, "step": 3650 }, { "epoch": 2.9350441058540495, "grad_norm": 0.0398782081129029, "learning_rate": 1.2032085561497326e-08, "logits/chosen": 0.67578125, "logits/rejected": 0.9140625, "logps/chosen": -390.0, "logps/rejected": -266.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.4375, "rewards/margins": 22.0, "rewards/rejected": -12.5, "step": 3660 }, { "epoch": 2.943063352044908, "grad_norm": 8.670910404568822e-06, "learning_rate": 1.0546642899584076e-08, "logits/chosen": 0.39453125, "logits/rejected": 0.71875, "logps/chosen": -386.0, "logps/rejected": -258.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.0625, "rewards/margins": 21.75, "rewards/rejected": -12.6875, "step": 3670 }, { "epoch": 2.951082598235766, "grad_norm": 8.518374407427174e-07, "learning_rate": 9.061200237670826e-09, "logits/chosen": 0.40234375, "logits/rejected": 0.7734375, "logps/chosen": -378.0, "logps/rejected": -264.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.8125, "rewards/margins": 22.625, "rewards/rejected": -12.75, "step": 3680 }, { "epoch": 2.959101844426624, "grad_norm": 1.8146121899351994e-05, "learning_rate": 7.575757575757576e-09, "logits/chosen": 0.32421875, "logits/rejected": 1.0, "logps/chosen": -400.0, "logps/rejected": -280.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.1875, "rewards/margins": 21.875, "rewards/rejected": -12.6875, "step": 3690 }, { "epoch": 2.967121090617482, "grad_norm": 4.854810071256327e-06, "learning_rate": 6.090314913844325e-09, "logits/chosen": 0.55078125, "logits/rejected": 1.2265625, "logps/chosen": -378.0, "logps/rejected": -272.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.8125, "rewards/margins": 22.75, "rewards/rejected": -13.0, "step": 3700 }, { "epoch": 2.97514033680834, "grad_norm": 2.0632029539297254e-06, "learning_rate": 4.604872251931075e-09, "logits/chosen": 0.51953125, "logits/rejected": 1.1640625, "logps/chosen": -362.0, "logps/rejected": -260.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.75, "rewards/margins": 21.25, "rewards/rejected": -12.5, "step": 3710 }, { "epoch": 2.983159582999198, "grad_norm": 1.1327358687335727e-05, "learning_rate": 3.1194295900178252e-09, "logits/chosen": 0.490234375, "logits/rejected": 1.0546875, "logps/chosen": -384.0, "logps/rejected": -258.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.8125, "rewards/margins": 20.625, "rewards/rejected": -11.875, "step": 3720 }, { "epoch": 2.9911788291900563, "grad_norm": 7.64344859261732e-06, "learning_rate": 1.6339869281045752e-09, "logits/chosen": 0.4453125, "logits/rejected": 1.0, "logps/chosen": -370.0, "logps/rejected": -262.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.25, "rewards/margins": 21.875, "rewards/rejected": -12.6875, "step": 3730 }, { "epoch": 2.999198075380914, "grad_norm": 8.526960458549043e-07, "learning_rate": 1.4854426619132502e-10, "logits/chosen": 0.49609375, "logits/rejected": 0.70703125, "logps/chosen": -390.0, "logps/rejected": -241.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.4375, "rewards/margins": 21.375, "rewards/rejected": -12.0, "step": 3740 }, { "epoch": 3.0, "eval_logits/chosen": 0.53125, "eval_logits/rejected": 1.078125, "eval_logps/chosen": -380.0, "eval_logps/rejected": -268.0, "eval_loss": 1.4558563634636812e-06, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 8.9375, "eval_rewards/margins": 21.375, "eval_rewards/rejected": -12.4375, "eval_runtime": 32.7864, "eval_samples_per_second": 6.07, "eval_steps_per_second": 0.763, "step": 3741 } ], "logging_steps": 10, "max_steps": 3741, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }