{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.0,
  "eval_steps": 500,
  "global_step": 916,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.043668122270742356,
      "grad_norm": 2.421576976776123,
      "learning_rate": 1.0869565217391306e-06,
      "loss": 0.5167,
      "step": 10
    },
    {
      "epoch": 0.08733624454148471,
      "grad_norm": 2.160121440887451,
      "learning_rate": 2.173913043478261e-06,
      "loss": 0.5039,
      "step": 20
    },
    {
      "epoch": 0.13100436681222707,
      "grad_norm": 0.915518045425415,
      "learning_rate": 3.2608695652173914e-06,
      "loss": 0.4632,
      "step": 30
    },
    {
      "epoch": 0.17467248908296942,
      "grad_norm": 0.7904559969902039,
      "learning_rate": 4.347826086956522e-06,
      "loss": 0.3982,
      "step": 40
    },
    {
      "epoch": 0.2183406113537118,
      "grad_norm": 0.6008340716362,
      "learning_rate": 5.4347826086956525e-06,
      "loss": 0.4161,
      "step": 50
    },
    {
      "epoch": 0.26200873362445415,
      "grad_norm": 0.6038509011268616,
      "learning_rate": 6.521739130434783e-06,
      "loss": 0.438,
      "step": 60
    },
    {
      "epoch": 0.3056768558951965,
      "grad_norm": 0.609183132648468,
      "learning_rate": 7.608695652173914e-06,
      "loss": 0.3848,
      "step": 70
    },
    {
      "epoch": 0.34934497816593885,
      "grad_norm": 0.65712571144104,
      "learning_rate": 8.695652173913044e-06,
      "loss": 0.3993,
      "step": 80
    },
    {
      "epoch": 0.3930131004366812,
      "grad_norm": 0.6194190979003906,
      "learning_rate": 9.782608695652175e-06,
      "loss": 0.3769,
      "step": 90
    },
    {
      "epoch": 0.4366812227074236,
      "grad_norm": 0.4761613607406616,
      "learning_rate": 9.99947842870608e-06,
      "loss": 0.3808,
      "step": 100
    },
    {
      "epoch": 0.48034934497816595,
      "grad_norm": 0.4921339750289917,
      "learning_rate": 9.997359731816998e-06,
      "loss": 0.4205,
      "step": 110
    },
    {
      "epoch": 0.5240174672489083,
      "grad_norm": 0.48024195432662964,
      "learning_rate": 9.99361200124597e-06,
      "loss": 0.38,
      "step": 120
    },
    {
      "epoch": 0.5676855895196506,
      "grad_norm": 0.6233803629875183,
      "learning_rate": 9.988236458673974e-06,
      "loss": 0.3953,
      "step": 130
    },
    {
      "epoch": 0.611353711790393,
      "grad_norm": 0.5606607794761658,
      "learning_rate": 9.981234856414306e-06,
      "loss": 0.3865,
      "step": 140
    },
    {
      "epoch": 0.6550218340611353,
      "grad_norm": 0.49620741605758667,
      "learning_rate": 9.972609476841368e-06,
      "loss": 0.3899,
      "step": 150
    },
    {
      "epoch": 0.6986899563318777,
      "grad_norm": 0.5842658281326294,
      "learning_rate": 9.962363131646649e-06,
      "loss": 0.3792,
      "step": 160
    },
    {
      "epoch": 0.74235807860262,
      "grad_norm": 0.5468127727508545,
      "learning_rate": 9.950499160922184e-06,
      "loss": 0.4015,
      "step": 170
    },
    {
      "epoch": 0.7860262008733624,
      "grad_norm": 0.5464998483657837,
      "learning_rate": 9.937021432071754e-06,
      "loss": 0.3533,
      "step": 180
    },
    {
      "epoch": 0.8296943231441049,
      "grad_norm": 0.5048817992210388,
      "learning_rate": 9.921934338550187e-06,
      "loss": 0.3961,
      "step": 190
    },
    {
      "epoch": 0.8733624454148472,
      "grad_norm": 0.47697556018829346,
      "learning_rate": 9.905242798431196e-06,
      "loss": 0.3438,
      "step": 200
    },
    {
      "epoch": 0.9170305676855895,
      "grad_norm": 0.5746617913246155,
      "learning_rate": 9.886952252804177e-06,
      "loss": 0.4006,
      "step": 210
    },
    {
      "epoch": 0.9606986899563319,
      "grad_norm": 0.5081667304039001,
      "learning_rate": 9.867068664000538e-06,
      "loss": 0.3679,
      "step": 220
    },
    {
      "epoch": 1.0043668122270741,
      "grad_norm": 0.4806345999240875,
      "learning_rate": 9.845598513650104e-06,
      "loss": 0.4113,
      "step": 230
    },
    {
      "epoch": 1.0480349344978166,
      "grad_norm": 0.4791143238544464,
      "learning_rate": 9.822548800568238e-06,
      "loss": 0.341,
      "step": 240
    },
    {
      "epoch": 1.091703056768559,
      "grad_norm": 0.5520183444023132,
      "learning_rate": 9.797927038474383e-06,
      "loss": 0.298,
      "step": 250
    },
    {
      "epoch": 1.1353711790393013,
      "grad_norm": 0.486562579870224,
      "learning_rate": 9.771741253542742e-06,
      "loss": 0.2989,
      "step": 260
    },
    {
      "epoch": 1.1790393013100438,
      "grad_norm": 0.5037546753883362,
      "learning_rate": 9.743999981785914e-06,
      "loss": 0.3058,
      "step": 270
    },
    {
      "epoch": 1.222707423580786,
      "grad_norm": 0.5140413045883179,
      "learning_rate": 9.714712266272339e-06,
      "loss": 0.3164,
      "step": 280
    },
    {
      "epoch": 1.2663755458515285,
      "grad_norm": 0.4978218972682953,
      "learning_rate": 9.683887654178446e-06,
      "loss": 0.296,
      "step": 290
    },
    {
      "epoch": 1.3100436681222707,
      "grad_norm": 0.5410030484199524,
      "learning_rate": 9.651536193676476e-06,
      "loss": 0.2938,
      "step": 300
    },
    {
      "epoch": 1.3537117903930131,
      "grad_norm": 0.5140953063964844,
      "learning_rate": 9.617668430658991e-06,
      "loss": 0.3249,
      "step": 310
    },
    {
      "epoch": 1.3973799126637554,
      "grad_norm": 0.4528365433216095,
      "learning_rate": 9.582295405301131e-06,
      "loss": 0.3356,
      "step": 320
    },
    {
      "epoch": 1.4410480349344978,
      "grad_norm": 0.49946603178977966,
      "learning_rate": 9.545428648461756e-06,
      "loss": 0.3037,
      "step": 330
    },
    {
      "epoch": 1.48471615720524,
      "grad_norm": 0.48589998483657837,
      "learning_rate": 9.50708017792463e-06,
      "loss": 0.3116,
      "step": 340
    },
    {
      "epoch": 1.5283842794759825,
      "grad_norm": 0.46080437302589417,
      "learning_rate": 9.46726249448087e-06,
      "loss": 0.296,
      "step": 350
    },
    {
      "epoch": 1.572052401746725,
      "grad_norm": 0.4566941559314728,
      "learning_rate": 9.425988577853959e-06,
      "loss": 0.3079,
      "step": 360
    },
    {
      "epoch": 1.6157205240174672,
      "grad_norm": 0.5628035068511963,
      "learning_rate": 9.383271882468631e-06,
      "loss": 0.2906,
      "step": 370
    },
    {
      "epoch": 1.6593886462882095,
      "grad_norm": 0.4313275218009949,
      "learning_rate": 9.339126333065008e-06,
      "loss": 0.2879,
      "step": 380
    },
    {
      "epoch": 1.703056768558952,
      "grad_norm": 0.4829094409942627,
      "learning_rate": 9.293566320159432e-06,
      "loss": 0.3609,
      "step": 390
    },
    {
      "epoch": 1.7467248908296944,
      "grad_norm": 0.7929471135139465,
      "learning_rate": 9.24660669535346e-06,
      "loss": 0.3263,
      "step": 400
    },
    {
      "epoch": 1.7903930131004366,
      "grad_norm": 0.4220748841762543,
      "learning_rate": 9.198262766492554e-06,
      "loss": 0.3092,
      "step": 410
    },
    {
      "epoch": 1.8340611353711789,
      "grad_norm": 0.4901680648326874,
      "learning_rate": 9.14855029267605e-06,
      "loss": 0.3152,
      "step": 420
    },
    {
      "epoch": 1.8777292576419216,
      "grad_norm": 0.47252029180526733,
      "learning_rate": 9.097485479120027e-06,
      "loss": 0.3223,
      "step": 430
    },
    {
      "epoch": 1.9213973799126638,
      "grad_norm": 0.46700412034988403,
      "learning_rate": 9.045084971874738e-06,
      "loss": 0.3144,
      "step": 440
    },
    {
      "epoch": 1.965065502183406,
      "grad_norm": 0.46521633863449097,
      "learning_rate": 8.99136585239836e-06,
      "loss": 0.3179,
      "step": 450
    },
    {
      "epoch": 2.0087336244541483,
      "grad_norm": 0.48223376274108887,
      "learning_rate": 8.9363456319888e-06,
      "loss": 0.3021,
      "step": 460
    },
    {
      "epoch": 2.052401746724891,
      "grad_norm": 0.446074903011322,
      "learning_rate": 8.880042246075366e-06,
      "loss": 0.2441,
      "step": 470
    },
    {
      "epoch": 2.096069868995633,
      "grad_norm": 0.45803341269493103,
      "learning_rate": 8.82247404837222e-06,
      "loss": 0.2733,
      "step": 480
    },
    {
      "epoch": 2.1397379912663754,
      "grad_norm": 0.429457426071167,
      "learning_rate": 8.763659804895442e-06,
      "loss": 0.2563,
      "step": 490
    },
    {
      "epoch": 2.183406113537118,
      "grad_norm": 0.6878861784934998,
      "learning_rate": 8.703618687845697e-06,
      "loss": 0.2458,
      "step": 500
    },
    {
      "epoch": 2.2270742358078603,
      "grad_norm": 0.4481293261051178,
      "learning_rate": 8.64237026935852e-06,
      "loss": 0.2261,
      "step": 510
    },
    {
      "epoch": 2.2707423580786026,
      "grad_norm": 0.45138078927993774,
      "learning_rate": 8.579934515124202e-06,
      "loss": 0.2408,
      "step": 520
    },
    {
      "epoch": 2.314410480349345,
      "grad_norm": 0.5058510303497314,
      "learning_rate": 8.5163317778794e-06,
      "loss": 0.2386,
      "step": 530
    },
    {
      "epoch": 2.3580786026200875,
      "grad_norm": 0.5651599168777466,
      "learning_rate": 8.45158279077258e-06,
      "loss": 0.2035,
      "step": 540
    },
    {
      "epoch": 2.4017467248908297,
      "grad_norm": 0.4735155999660492,
      "learning_rate": 8.385708660605431e-06,
      "loss": 0.2106,
      "step": 550
    },
    {
      "epoch": 2.445414847161572,
      "grad_norm": 0.44301047921180725,
      "learning_rate": 8.318730860952523e-06,
      "loss": 0.2164,
      "step": 560
    },
    {
      "epoch": 2.489082969432314,
      "grad_norm": 0.38600876927375793,
      "learning_rate": 8.250671225161345e-06,
      "loss": 0.2275,
      "step": 570
    },
    {
      "epoch": 2.532751091703057,
      "grad_norm": 0.49234113097190857,
      "learning_rate": 8.181551939235115e-06,
      "loss": 0.2254,
      "step": 580
    },
    {
      "epoch": 2.576419213973799,
      "grad_norm": 0.4783915877342224,
      "learning_rate": 8.111395534600604e-06,
      "loss": 0.2253,
      "step": 590
    },
    {
      "epoch": 2.6200873362445414,
      "grad_norm": 0.4308622479438782,
      "learning_rate": 8.040224880763368e-06,
      "loss": 0.2202,
      "step": 600
    },
    {
      "epoch": 2.6637554585152836,
      "grad_norm": 0.4942546784877777,
      "learning_rate": 7.968063177852775e-06,
      "loss": 0.2512,
      "step": 610
    },
    {
      "epoch": 2.7074235807860263,
      "grad_norm": 0.4427158832550049,
      "learning_rate": 7.894933949059245e-06,
      "loss": 0.237,
      "step": 620
    },
    {
      "epoch": 2.7510917030567685,
      "grad_norm": 0.46294692158699036,
      "learning_rate": 7.820861032966199e-06,
      "loss": 0.226,
      "step": 630
    },
    {
      "epoch": 2.7947598253275108,
      "grad_norm": 0.42187586426734924,
      "learning_rate": 7.745868575779176e-06,
      "loss": 0.2362,
      "step": 640
    },
    {
      "epoch": 2.8384279475982535,
      "grad_norm": 0.4270602762699127,
      "learning_rate": 7.669981023454682e-06,
      "loss": 0.2159,
      "step": 650
    },
    {
      "epoch": 2.8820960698689957,
      "grad_norm": 0.49507179856300354,
      "learning_rate": 7.593223113731323e-06,
      "loss": 0.2566,
      "step": 660
    },
    {
      "epoch": 2.925764192139738,
      "grad_norm": 0.4554119408130646,
      "learning_rate": 7.515619868065833e-06,
      "loss": 0.2648,
      "step": 670
    },
    {
      "epoch": 2.96943231441048,
      "grad_norm": 0.42243942618370056,
      "learning_rate": 7.437196583476597e-06,
      "loss": 0.2426,
      "step": 680
    },
    {
      "epoch": 3.013100436681223,
      "grad_norm": 0.4137606620788574,
      "learning_rate": 7.357978824297362e-06,
      "loss": 0.225,
      "step": 690
    },
    {
      "epoch": 3.056768558951965,
      "grad_norm": 0.433912992477417,
      "learning_rate": 7.2779924138438065e-06,
      "loss": 0.1688,
      "step": 700
    },
    {
      "epoch": 3.1004366812227073,
      "grad_norm": 0.4669990539550781,
      "learning_rate": 7.197263425995682e-06,
      "loss": 0.1763,
      "step": 710
    },
    {
      "epoch": 3.14410480349345,
      "grad_norm": 0.4027640223503113,
      "learning_rate": 7.115818176697285e-06,
      "loss": 0.1805,
      "step": 720
    },
    {
      "epoch": 3.1877729257641922,
      "grad_norm": 0.577460765838623,
      "learning_rate": 7.033683215379002e-06,
      "loss": 0.1709,
      "step": 730
    },
    {
      "epoch": 3.2314410480349345,
      "grad_norm": 0.43528082966804504,
      "learning_rate": 6.950885316302773e-06,
      "loss": 0.1558,
      "step": 740
    },
    {
      "epoch": 3.2751091703056767,
      "grad_norm": 0.5665518641471863,
      "learning_rate": 6.867451469834237e-06,
      "loss": 0.1935,
      "step": 750
    },
    {
      "epoch": 3.3187772925764194,
      "grad_norm": 0.3636087477207184,
      "learning_rate": 6.7834088736444435e-06,
      "loss": 0.1779,
      "step": 760
    },
    {
      "epoch": 3.3624454148471616,
      "grad_norm": 0.4824029505252838,
      "learning_rate": 6.698784923843993e-06,
      "loss": 0.1748,
      "step": 770
    },
    {
      "epoch": 3.406113537117904,
      "grad_norm": 0.44956591725349426,
      "learning_rate": 6.613607206052476e-06,
      "loss": 0.1637,
      "step": 780
    },
    {
      "epoch": 3.449781659388646,
      "grad_norm": 0.4280209243297577,
      "learning_rate": 6.527903486406147e-06,
      "loss": 0.1618,
      "step": 790
    },
    {
      "epoch": 3.493449781659389,
      "grad_norm": 0.5125846862792969,
      "learning_rate": 6.441701702506755e-06,
      "loss": 0.2097,
      "step": 800
    },
    {
      "epoch": 3.537117903930131,
      "grad_norm": 0.4643654227256775,
      "learning_rate": 6.355029954314468e-06,
      "loss": 0.1765,
      "step": 810
    },
    {
      "epoch": 3.5807860262008733,
      "grad_norm": 0.3958646357059479,
      "learning_rate": 6.267916494987883e-06,
      "loss": 0.1716,
      "step": 820
    },
    {
      "epoch": 3.6244541484716155,
      "grad_norm": 0.3993144929409027,
      "learning_rate": 6.180389721674101e-06,
      "loss": 0.1763,
      "step": 830
    },
    {
      "epoch": 3.668122270742358,
      "grad_norm": 0.4378385841846466,
      "learning_rate": 6.092478166251839e-06,
      "loss": 0.1677,
      "step": 840
    },
    {
      "epoch": 3.7117903930131004,
      "grad_norm": 0.42186248302459717,
      "learning_rate": 6.00421048603066e-06,
      "loss": 0.1874,
      "step": 850
    },
    {
      "epoch": 3.7554585152838427,
      "grad_norm": 0.4455322027206421,
      "learning_rate": 5.915615454409281e-06,
      "loss": 0.1641,
      "step": 860
    },
    {
      "epoch": 3.7991266375545854,
      "grad_norm": 0.5567952990531921,
      "learning_rate": 5.8267219514960625e-06,
      "loss": 0.1714,
      "step": 870
    },
    {
      "epoch": 3.8427947598253276,
      "grad_norm": 0.463058739900589,
      "learning_rate": 5.737558954694698e-06,
      "loss": 0.1799,
      "step": 880
    },
    {
      "epoch": 3.88646288209607,
      "grad_norm": 0.4127854108810425,
      "learning_rate": 5.648155529258195e-06,
      "loss": 0.1721,
      "step": 890
    },
    {
      "epoch": 3.930131004366812,
      "grad_norm": 0.5975726246833801,
      "learning_rate": 5.558540818814213e-06,
      "loss": 0.1792,
      "step": 900
    },
    {
      "epoch": 3.9737991266375547,
      "grad_norm": 0.44827261567115784,
      "learning_rate": 5.468744035864867e-06,
      "loss": 0.1675,
      "step": 910
    }
  ],
  "logging_steps": 10,
  "max_steps": 1832,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 8,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 143371959500800.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}