{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 17125, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.029197080291970802, "grad_norm": 0.8470640182495117, "learning_rate": 3.891050583657588e-05, "loss": 4.8508, "step": 100 }, { "epoch": 0.058394160583941604, "grad_norm": 1.3564280271530151, "learning_rate": 7.782101167315176e-05, "loss": 4.4557, "step": 200 }, { "epoch": 0.08759124087591241, "grad_norm": 1.3530114889144897, "learning_rate": 0.00011673151750972763, "loss": 4.1905, "step": 300 }, { "epoch": 0.11678832116788321, "grad_norm": 1.328213095664978, "learning_rate": 0.0001556420233463035, "loss": 4.1219, "step": 400 }, { "epoch": 0.145985401459854, "grad_norm": 1.3318136930465698, "learning_rate": 0.0001945525291828794, "loss": 3.9169, "step": 500 }, { "epoch": 0.17518248175182483, "grad_norm": 1.7535721063613892, "learning_rate": 0.00019998677287306197, "loss": 3.8635, "step": 600 }, { "epoch": 0.20437956204379562, "grad_norm": 1.6469675302505493, "learning_rate": 0.00019993813296600207, "loss": 3.775, "step": 700 }, { "epoch": 0.23357664233576642, "grad_norm": 1.637732744216919, "learning_rate": 0.00019985374713924855, "loss": 3.6853, "step": 800 }, { "epoch": 0.26277372262773724, "grad_norm": 1.2182079553604126, "learning_rate": 0.00019973364557596465, "loss": 3.6172, "step": 900 }, { "epoch": 0.291970802919708, "grad_norm": 1.8510034084320068, "learning_rate": 0.00019957787123413558, "loss": 3.5044, "step": 1000 }, { "epoch": 0.32116788321167883, "grad_norm": 1.4988285303115845, "learning_rate": 0.00019938647983120316, "loss": 3.445, "step": 1100 }, { "epoch": 0.35036496350364965, "grad_norm": 2.2497406005859375, "learning_rate": 0.0001991595398241369, "loss": 3.4068, "step": 1200 }, { "epoch": 0.3795620437956204, "grad_norm": 2.1076061725616455, "learning_rate": 0.00019889713238494824, "loss": 3.275, "step": 1300 }, { "epoch": 0.40875912408759124, "grad_norm": 2.4178290367126465, "learning_rate": 0.0001985993513716568, "loss": 3.1974, "step": 1400 }, { "epoch": 0.43795620437956206, "grad_norm": 1.8352422714233398, "learning_rate": 0.00019826630329471928, "loss": 3.1553, "step": 1500 }, { "epoch": 0.46715328467153283, "grad_norm": 1.7513710260391235, "learning_rate": 0.00019789810727893284, "loss": 3.0717, "step": 1600 }, { "epoch": 0.49635036496350365, "grad_norm": 1.8882313966751099, "learning_rate": 0.00019749489502082632, "loss": 3.0037, "step": 1700 }, { "epoch": 0.5255474452554745, "grad_norm": 2.062272310256958, "learning_rate": 0.000197056810741555, "loss": 2.9222, "step": 1800 }, { "epoch": 0.5547445255474452, "grad_norm": 1.6502212285995483, "learning_rate": 0.00019658401113531565, "loss": 2.8871, "step": 1900 }, { "epoch": 0.583941605839416, "grad_norm": 1.6748732328414917, "learning_rate": 0.0001960766653132999, "loss": 2.8554, "step": 2000 }, { "epoch": 0.6131386861313869, "grad_norm": 1.9615834951400757, "learning_rate": 0.0001955349547432065, "loss": 2.7841, "step": 2100 }, { "epoch": 0.6423357664233577, "grad_norm": 3.604374885559082, "learning_rate": 0.000194959073184334, "loss": 2.6938, "step": 2200 }, { "epoch": 0.6715328467153284, "grad_norm": 2.6139862537384033, "learning_rate": 0.00019434922661827663, "loss": 2.634, "step": 2300 }, { "epoch": 0.7007299270072993, "grad_norm": 2.5986483097076416, "learning_rate": 0.00019370563317524882, "loss": 2.5901, "step": 2400 }, { "epoch": 0.7299270072992701, "grad_norm": 
2.7669074535369873, "learning_rate": 0.00019302852305606432, "loss": 2.5001, "step": 2500 }, { "epoch": 0.7591240875912408, "grad_norm": 2.4265894889831543, "learning_rate": 0.00019231813844979777, "loss": 2.4595, "step": 2600 }, { "epoch": 0.7883211678832117, "grad_norm": 2.589481830596924, "learning_rate": 0.0001915747334471584, "loss": 2.3496, "step": 2700 }, { "epoch": 0.8175182481751825, "grad_norm": 2.969148874282837, "learning_rate": 0.0001907985739496068, "loss": 2.3099, "step": 2800 }, { "epoch": 0.8467153284671532, "grad_norm": 2.7729787826538086, "learning_rate": 0.00018998993757424713, "loss": 2.2896, "step": 2900 }, { "epoch": 0.8759124087591241, "grad_norm": 3.3500335216522217, "learning_rate": 0.00018914911355452895, "loss": 2.2433, "step": 3000 }, { "epoch": 0.9051094890510949, "grad_norm": 2.8922252655029297, "learning_rate": 0.00018827640263679394, "loss": 2.1564, "step": 3100 }, { "epoch": 0.9343065693430657, "grad_norm": 2.678309679031372, "learning_rate": 0.0001873721169727048, "loss": 2.1654, "step": 3200 }, { "epoch": 0.9635036496350365, "grad_norm": 3.6411967277526855, "learning_rate": 0.00018643658000759493, "loss": 2.1212, "step": 3300 }, { "epoch": 0.9927007299270073, "grad_norm": 2.9299466609954834, "learning_rate": 0.0001854701263647781, "loss": 1.9858, "step": 3400 }, { "epoch": 1.0218978102189782, "grad_norm": 1.6852861642837524, "learning_rate": 0.0001844731017258603, "loss": 1.953, "step": 3500 }, { "epoch": 1.051094890510949, "grad_norm": 2.1536929607391357, "learning_rate": 0.00018344586270709613, "loss": 1.8913, "step": 3600 }, { "epoch": 1.0802919708029197, "grad_norm": 1.5574983358383179, "learning_rate": 0.00018238877673183428, "loss": 1.8142, "step": 3700 }, { "epoch": 1.1094890510948905, "grad_norm": 1.319333791732788, "learning_rate": 0.0001813022218990972, "loss": 1.7464, "step": 3800 }, { "epoch": 1.1386861313868613, "grad_norm": 1.4836809635162354, "learning_rate": 0.00018018658684834256, "loss": 1.7574, "step": 3900 }, { "epoch": 1.167883211678832, "grad_norm": 1.7886255979537964, "learning_rate": 0.00017904227062045437, "loss": 1.7255, "step": 4000 }, { "epoch": 1.197080291970803, "grad_norm": 1.7846744060516357, "learning_rate": 0.00017786968251501406, "loss": 1.7593, "step": 4100 }, { "epoch": 1.2262773722627738, "grad_norm": 1.5439305305480957, "learning_rate": 0.00017666924194390183, "loss": 1.7175, "step": 4200 }, { "epoch": 1.2554744525547445, "grad_norm": 1.6620721817016602, "learning_rate": 0.0001754413782812812, "loss": 1.6345, "step": 4300 }, { "epoch": 1.2846715328467153, "grad_norm": 1.7942287921905518, "learning_rate": 0.00017418653071002047, "loss": 1.6425, "step": 4400 }, { "epoch": 1.313868613138686, "grad_norm": 1.7287062406539917, "learning_rate": 0.0001729051480646052, "loss": 1.5037, "step": 4500 }, { "epoch": 1.343065693430657, "grad_norm": 1.340302586555481, "learning_rate": 0.00017159768867059936, "loss": 1.6512, "step": 4600 }, { "epoch": 1.3722627737226278, "grad_norm": 1.3054938316345215, "learning_rate": 0.0001702646201807107, "loss": 1.5379, "step": 4700 }, { "epoch": 1.4014598540145986, "grad_norm": 1.4920105934143066, "learning_rate": 0.00016890641940752095, "loss": 1.4965, "step": 4800 }, { "epoch": 1.4306569343065694, "grad_norm": 1.6876699924468994, "learning_rate": 0.00016752357215293897, "loss": 1.5866, "step": 4900 }, { "epoch": 1.4598540145985401, "grad_norm": 1.3976876735687256, "learning_rate": 0.00016611657303443903, "loss": 1.4894, "step": 5000 }, { "epoch": 1.489051094890511, "grad_norm": 
1.5993022918701172, "learning_rate": 0.0001646859253081458, "loss": 1.5393, "step": 5100 }, { "epoch": 1.5182481751824817, "grad_norm": 1.5950144529342651, "learning_rate": 0.00016323214068882935, "loss": 1.4881, "step": 5200 }, { "epoch": 1.5474452554744524, "grad_norm": 1.4699268341064453, "learning_rate": 0.00016175573916687484, "loss": 1.5266, "step": 5300 }, { "epoch": 1.5766423357664232, "grad_norm": 1.494767427444458, "learning_rate": 0.00016025724882229208, "loss": 1.4496, "step": 5400 }, { "epoch": 1.6058394160583942, "grad_norm": 1.589040756225586, "learning_rate": 0.00015873720563583165, "loss": 1.4994, "step": 5500 }, { "epoch": 1.635036496350365, "grad_norm": 1.1471683979034424, "learning_rate": 0.00015719615329727512, "loss": 1.4464, "step": 5600 }, { "epoch": 1.6642335766423357, "grad_norm": 1.5427780151367188, "learning_rate": 0.00015563464301096756, "loss": 1.3936, "step": 5700 }, { "epoch": 1.6934306569343067, "grad_norm": 1.4828369617462158, "learning_rate": 0.0001540532332986628, "loss": 1.3653, "step": 5800 }, { "epoch": 1.7226277372262775, "grad_norm": 1.3401451110839844, "learning_rate": 0.0001524524897997509, "loss": 1.4141, "step": 5900 }, { "epoch": 1.7518248175182483, "grad_norm": 1.289553165435791, "learning_rate": 0.00015083298506894015, "loss": 1.3119, "step": 6000 }, { "epoch": 1.781021897810219, "grad_norm": 1.2549211978912354, "learning_rate": 0.00014919529837146528, "loss": 1.3384, "step": 6100 }, { "epoch": 1.8102189781021898, "grad_norm": 1.71195387840271, "learning_rate": 0.00014754001547589564, "loss": 1.3231, "step": 6200 }, { "epoch": 1.8394160583941606, "grad_norm": 1.4366215467453003, "learning_rate": 0.0001458677284446172, "loss": 1.3196, "step": 6300 }, { "epoch": 1.8686131386861313, "grad_norm": 1.5663456916809082, "learning_rate": 0.00014417903542206342, "loss": 1.4379, "step": 6400 }, { "epoch": 1.897810218978102, "grad_norm": 2.2416155338287354, "learning_rate": 0.00014247454042077068, "loss": 1.3796, "step": 6500 }, { "epoch": 1.9270072992700729, "grad_norm": 1.9517488479614258, "learning_rate": 0.00014075485310533473, "loss": 1.3097, "step": 6600 }, { "epoch": 1.9562043795620438, "grad_norm": 1.7964544296264648, "learning_rate": 0.00013902058857434557, "loss": 1.3401, "step": 6700 }, { "epoch": 1.9854014598540146, "grad_norm": 1.0145196914672852, "learning_rate": 0.00013727236714037872, "loss": 1.161, "step": 6800 }, { "epoch": 2.0145985401459856, "grad_norm": 3.1186952590942383, "learning_rate": 0.00013551081410812147, "loss": 1.1588, "step": 6900 }, { "epoch": 2.0437956204379564, "grad_norm": 2.3698039054870605, "learning_rate": 0.0001337365595507137, "loss": 1.0564, "step": 7000 }, { "epoch": 2.072992700729927, "grad_norm": 3.22995924949646, "learning_rate": 0.0001319502380843829, "loss": 1.1415, "step": 7100 }, { "epoch": 2.102189781021898, "grad_norm": 3.0756795406341553, "learning_rate": 0.00013015248864145434, "loss": 1.3046, "step": 7200 }, { "epoch": 2.1313868613138687, "grad_norm": 3.943307399749756, "learning_rate": 0.00012834395424181748, "loss": 1.1222, "step": 7300 }, { "epoch": 2.1605839416058394, "grad_norm": 2.424041271209717, "learning_rate": 0.00012652528176293042, "loss": 1.1787, "step": 7400 }, { "epoch": 2.18978102189781, "grad_norm": 2.5731918811798096, "learning_rate": 0.0001246971217084443, "loss": 1.1214, "step": 7500 }, { "epoch": 2.218978102189781, "grad_norm": 1.8488351106643677, "learning_rate": 0.00012286012797553075, "loss": 1.0242, "step": 7600 }, { "epoch": 2.2481751824817517, "grad_norm": 3.49588680267334, 
"learning_rate": 0.0001210149576209959, "loss": 1.1995, "step": 7700 }, { "epoch": 2.2773722627737225, "grad_norm": 1.9914380311965942, "learning_rate": 0.00011916227062626388, "loss": 1.2218, "step": 7800 }, { "epoch": 2.3065693430656933, "grad_norm": 2.816720485687256, "learning_rate": 0.00011730272966131422, "loss": 1.12, "step": 7900 }, { "epoch": 2.335766423357664, "grad_norm": 1.5543715953826904, "learning_rate": 0.00011543699984765788, "loss": 1.0535, "step": 8000 }, { "epoch": 2.3649635036496353, "grad_norm": 2.867494821548462, "learning_rate": 0.00011356574852043617, "loss": 1.1174, "step": 8100 }, { "epoch": 2.394160583941606, "grad_norm": 3.366215944290161, "learning_rate": 0.00011168964498972818, "loss": 1.3649, "step": 8200 }, { "epoch": 2.423357664233577, "grad_norm": 5.301421165466309, "learning_rate": 0.00010980936030115132, "loss": 1.0221, "step": 8300 }, { "epoch": 2.4525547445255476, "grad_norm": 2.44589900970459, "learning_rate": 0.0001079255669958416, "loss": 1.1887, "step": 8400 }, { "epoch": 2.4817518248175183, "grad_norm": 1.8434127569198608, "learning_rate": 0.00010603893886989883, "loss": 1.0379, "step": 8500 }, { "epoch": 2.510948905109489, "grad_norm": 5.0544257164001465, "learning_rate": 0.00010415015073338286, "loss": 1.0467, "step": 8600 }, { "epoch": 2.54014598540146, "grad_norm": 2.266094446182251, "learning_rate": 0.00010225987816894698, "loss": 1.0284, "step": 8700 }, { "epoch": 2.5693430656934306, "grad_norm": 4.524600028991699, "learning_rate": 0.00010036879729019559, "loss": 1.0921, "step": 8800 }, { "epoch": 2.5985401459854014, "grad_norm": 3.9681153297424316, "learning_rate": 9.847758449985124e-05, "loss": 1.0259, "step": 8900 }, { "epoch": 2.627737226277372, "grad_norm": 4.628920078277588, "learning_rate": 9.658691624781866e-05, "loss": 1.1767, "step": 9000 }, { "epoch": 2.656934306569343, "grad_norm": 3.457036256790161, "learning_rate": 9.469746878923188e-05, "loss": 1.0662, "step": 9100 }, { "epoch": 2.686131386861314, "grad_norm": 3.112386703491211, "learning_rate": 9.280991794257103e-05, "loss": 1.0129, "step": 9200 }, { "epoch": 2.7153284671532845, "grad_norm": 2.2668728828430176, "learning_rate": 9.092493884793501e-05, "loss": 1.0139, "step": 9300 }, { "epoch": 2.7445255474452557, "grad_norm": 3.580484390258789, "learning_rate": 8.904320572555734e-05, "loss": 1.0144, "step": 9400 }, { "epoch": 2.7737226277372264, "grad_norm": 1.5489946603775024, "learning_rate": 8.71653916346505e-05, "loss": 0.9979, "step": 9500 }, { "epoch": 2.802919708029197, "grad_norm": 1.7365363836288452, "learning_rate": 8.529216823266606e-05, "loss": 1.0874, "step": 9600 }, { "epoch": 2.832116788321168, "grad_norm": 3.3894410133361816, "learning_rate": 8.342420553505559e-05, "loss": 1.1251, "step": 9700 }, { "epoch": 2.8613138686131387, "grad_norm": 4.5942912101745605, "learning_rate": 8.15621716756195e-05, "loss": 1.1814, "step": 9800 }, { "epoch": 2.8905109489051095, "grad_norm": 1.3230512142181396, "learning_rate": 7.970673266752838e-05, "loss": 1.1201, "step": 9900 }, { "epoch": 2.9197080291970803, "grad_norm": 1.1530183553695679, "learning_rate": 7.785855216510337e-05, "loss": 1.0758, "step": 10000 }, { "epoch": 2.948905109489051, "grad_norm": 2.9893460273742676, "learning_rate": 7.601829122643957e-05, "loss": 1.02, "step": 10100 }, { "epoch": 2.978102189781022, "grad_norm": 2.8600292205810547, "learning_rate": 7.418660807695897e-05, "loss": 1.1625, "step": 10200 }, { "epoch": 3.0072992700729926, "grad_norm": 0.6397629976272583, "learning_rate": 
7.236415787397548e-05, "loss": 1.2091, "step": 10300 }, { "epoch": 3.0364963503649633, "grad_norm": 0.5129945278167725, "learning_rate": 7.055159247235844e-05, "loss": 1.1668, "step": 10400 }, { "epoch": 3.065693430656934, "grad_norm": 0.49799928069114685, "learning_rate": 6.874956019137669e-05, "loss": 0.8265, "step": 10500 }, { "epoch": 3.094890510948905, "grad_norm": 0.8108125925064087, "learning_rate": 6.695870558280718e-05, "loss": 1.0216, "step": 10600 }, { "epoch": 3.124087591240876, "grad_norm": 0.45047125220298767, "learning_rate": 6.51796692003918e-05, "loss": 1.1076, "step": 10700 }, { "epoch": 3.153284671532847, "grad_norm": 0.700933039188385, "learning_rate": 6.341308737072349e-05, "loss": 0.9756, "step": 10800 }, { "epoch": 3.1824817518248176, "grad_norm": 0.5597018003463745, "learning_rate": 6.165959196564481e-05, "loss": 0.854, "step": 10900 }, { "epoch": 3.2116788321167884, "grad_norm": 0.5288468599319458, "learning_rate": 5.991981017623955e-05, "loss": 0.9882, "step": 11000 }, { "epoch": 3.240875912408759, "grad_norm": 0.7121827602386475, "learning_rate": 5.819436428849896e-05, "loss": 1.0701, "step": 11100 }, { "epoch": 3.27007299270073, "grad_norm": 0.7537636756896973, "learning_rate": 5.648387146074192e-05, "loss": 0.9367, "step": 11200 }, { "epoch": 3.2992700729927007, "grad_norm": 0.33666905760765076, "learning_rate": 5.478894350286965e-05, "loss": 1.0724, "step": 11300 }, { "epoch": 3.3284671532846715, "grad_norm": 0.4632696509361267, "learning_rate": 5.311018665753318e-05, "loss": 1.0971, "step": 11400 }, { "epoch": 3.3576642335766422, "grad_norm": 0.5160499811172485, "learning_rate": 5.144820138329223e-05, "loss": 1.0371, "step": 11500 }, { "epoch": 3.386861313868613, "grad_norm": 0.5635733008384705, "learning_rate": 4.980358213984282e-05, "loss": 0.978, "step": 11600 }, { "epoch": 3.4160583941605838, "grad_norm": 0.6345902681350708, "learning_rate": 4.8176917175390656e-05, "loss": 0.9646, "step": 11700 }, { "epoch": 3.445255474452555, "grad_norm": 0.6311865448951721, "learning_rate": 4.656878831624636e-05, "loss": 0.9127, "step": 11800 }, { "epoch": 3.4744525547445253, "grad_norm": 0.4736361503601074, "learning_rate": 4.497977075871738e-05, "loss": 1.2787, "step": 11900 }, { "epoch": 3.5036496350364965, "grad_norm": 0.3800007402896881, "learning_rate": 4.341043286337153e-05, "loss": 0.9448, "step": 12000 }, { "epoch": 3.5328467153284673, "grad_norm": 0.4484248161315918, "learning_rate": 4.1861335951745594e-05, "loss": 0.9813, "step": 12100 }, { "epoch": 3.562043795620438, "grad_norm": 0.4944378733634949, "learning_rate": 4.0333034105571565e-05, "loss": 1.1448, "step": 12200 }, { "epoch": 3.591240875912409, "grad_norm": 0.42330047488212585, "learning_rate": 3.882607396859229e-05, "loss": 1.0001, "step": 12300 }, { "epoch": 3.6204379562043796, "grad_norm": 0.492709219455719, "learning_rate": 3.734099455103779e-05, "loss": 1.1352, "step": 12400 }, { "epoch": 3.6496350364963503, "grad_norm": 0.42140600085258484, "learning_rate": 3.587832703683175e-05, "loss": 1.0103, "step": 12500 }, { "epoch": 3.678832116788321, "grad_norm": 0.393767774105072, "learning_rate": 3.4438594593597596e-05, "loss": 0.9709, "step": 12600 }, { "epoch": 3.708029197080292, "grad_norm": 0.5599704384803772, "learning_rate": 3.3022312185531214e-05, "loss": 0.9962, "step": 12700 }, { "epoch": 3.7372262773722627, "grad_norm": 0.3712032735347748, "learning_rate": 3.16299863892088e-05, "loss": 1.1135, "step": 12800 }, { "epoch": 3.7664233576642334, "grad_norm": 0.3675720989704132, "learning_rate": 
3.026211521239408e-05, "loss": 0.9385, "step": 12900 }, { "epoch": 3.795620437956204, "grad_norm": 0.3175857961177826, "learning_rate": 2.891918791591046e-05, "loss": 1.0536, "step": 13000 }, { "epoch": 3.8248175182481754, "grad_norm": 0.42325085401535034, "learning_rate": 2.7601684838642405e-05, "loss": 1.0483, "step": 13100 }, { "epoch": 3.8540145985401457, "grad_norm": 0.41798830032348633, "learning_rate": 2.6310077225727224e-05, "loss": 1.068, "step": 13200 }, { "epoch": 3.883211678832117, "grad_norm": 0.4633995294570923, "learning_rate": 2.5044827060000085e-05, "loss": 1.0305, "step": 13300 }, { "epoch": 3.9124087591240877, "grad_norm": 0.38265103101730347, "learning_rate": 2.380638689675164e-05, "loss": 0.9594, "step": 13400 }, { "epoch": 3.9416058394160585, "grad_norm": 0.3224177062511444, "learning_rate": 2.2595199701858026e-05, "loss": 0.9919, "step": 13500 }, { "epoch": 3.9708029197080292, "grad_norm": 0.44377565383911133, "learning_rate": 2.1411698693340355e-05, "loss": 1.0119, "step": 13600 }, { "epoch": 4.0, "grad_norm": 2.024462938308716, "learning_rate": 2.0256307186411295e-05, "loss": 0.8573, "step": 13700 }, { "epoch": 4.029197080291971, "grad_norm": 2.932596206665039, "learning_rate": 1.912943844206333e-05, "loss": 0.9969, "step": 13800 }, { "epoch": 4.0583941605839415, "grad_norm": 2.8395912647247314, "learning_rate": 1.803149551925356e-05, "loss": 0.8543, "step": 13900 }, { "epoch": 4.087591240875913, "grad_norm": 3.7360405921936035, "learning_rate": 1.6962871130737168e-05, "loss": 0.9961, "step": 14000 }, { "epoch": 4.116788321167883, "grad_norm": 1.6590383052825928, "learning_rate": 1.59239475026018e-05, "loss": 0.839, "step": 14100 }, { "epoch": 4.145985401459854, "grad_norm": 2.4709715843200684, "learning_rate": 1.4915096237552873e-05, "loss": 1.0494, "step": 14200 }, { "epoch": 4.175182481751825, "grad_norm": 3.2636585235595703, "learning_rate": 1.3936678181998374e-05, "loss": 1.0201, "step": 14300 }, { "epoch": 4.204379562043796, "grad_norm": 1.440502405166626, "learning_rate": 1.298904329698123e-05, "loss": 1.0966, "step": 14400 }, { "epoch": 4.233576642335766, "grad_norm": 2.1606736183166504, "learning_rate": 1.2072530533005012e-05, "loss": 1.0813, "step": 14500 }, { "epoch": 4.262773722627737, "grad_norm": 1.3628579378128052, "learning_rate": 1.1187467708798116e-05, "loss": 1.0362, "step": 14600 }, { "epoch": 4.291970802919708, "grad_norm": 2.560506820678711, "learning_rate": 1.0334171394059122e-05, "loss": 1.0291, "step": 14700 }, { "epoch": 4.321167883211679, "grad_norm": 1.9132288694381714, "learning_rate": 9.512946796226296e-06, "loss": 1.0084, "step": 14800 }, { "epoch": 4.350364963503649, "grad_norm": 2.191732406616211, "learning_rate": 8.724087651310609e-06, "loss": 0.9277, "step": 14900 }, { "epoch": 4.37956204379562, "grad_norm": 1.6962709426879883, "learning_rate": 7.967876118832229e-06, "loss": 0.9741, "step": 15000 }, { "epoch": 4.408759124087592, "grad_norm": 1.9875869750976562, "learning_rate": 7.244582680897527e-06, "loss": 0.9878, "step": 15100 }, { "epoch": 4.437956204379562, "grad_norm": 3.515268087387085, "learning_rate": 6.554466045452923e-06, "loss": 1.0505, "step": 15200 }, { "epoch": 4.467153284671533, "grad_norm": 3.007796287536621, "learning_rate": 5.897773053750066e-06, "loss": 1.0459, "step": 15300 }, { "epoch": 4.4963503649635035, "grad_norm": 1.8410799503326416, "learning_rate": 5.274738592055573e-06, "loss": 0.9201, "step": 15400 }, { "epoch": 4.525547445255475, "grad_norm": 2.346604824066162, "learning_rate": 
4.6855855076367804e-06, "loss": 1.0322, "step": 15500 }, { "epoch": 4.554744525547445, "grad_norm": 2.057304859161377, "learning_rate": 4.130524529053626e-06, "loss": 1.001, "step": 15600 }, { "epoch": 4.583941605839416, "grad_norm": 4.114916801452637, "learning_rate": 3.609754190785164e-06, "loss": 0.9103, "step": 15700 }, { "epoch": 4.613138686131387, "grad_norm": 3.7978780269622803, "learning_rate": 3.1234607622176227e-06, "loss": 1.052, "step": 15800 }, { "epoch": 4.642335766423358, "grad_norm": 1.945447564125061, "learning_rate": 2.6718181810195696e-06, "loss": 0.9763, "step": 15900 }, { "epoch": 4.671532846715328, "grad_norm": 2.225212812423706, "learning_rate": 2.2549879909276593e-06, "loss": 1.0564, "step": 16000 }, { "epoch": 4.700729927007299, "grad_norm": 1.6726568937301636, "learning_rate": 1.8731192839657407e-06, "loss": 1.042, "step": 16100 }, { "epoch": 4.7299270072992705, "grad_norm": 1.6171780824661255, "learning_rate": 1.5263486471174482e-06, "loss": 0.8253, "step": 16200 }, { "epoch": 4.759124087591241, "grad_norm": 3.288196086883545, "learning_rate": 1.2148001134717369e-06, "loss": 1.0111, "step": 16300 }, { "epoch": 4.788321167883212, "grad_norm": 3.768450975418091, "learning_rate": 9.385851178586924e-07, "loss": 0.8975, "step": 16400 }, { "epoch": 4.817518248175182, "grad_norm": 3.030186176300049, "learning_rate": 6.978024569914032e-07, "loss": 1.0036, "step": 16500 }, { "epoch": 4.846715328467154, "grad_norm": 0.843503475189209, "learning_rate": 4.92538254128383e-07, "loss": 1.011, "step": 16600 }, { "epoch": 4.875912408759124, "grad_norm": 2.515411376953125, "learning_rate": 3.2286592826888953e-07, "loss": 1.2158, "step": 16700 }, { "epoch": 4.905109489051095, "grad_norm": 1.0402921438217163, "learning_rate": 1.8884616789244248e-07, "loss": 0.9555, "step": 16800 }, { "epoch": 4.934306569343065, "grad_norm": 0.927361249923706, "learning_rate": 9.052690925168695e-08, "loss": 1.0264, "step": 16900 }, { "epoch": 4.963503649635037, "grad_norm": 1.8087859153747559, "learning_rate": 2.7943319226564346e-08, "loss": 1.0197, "step": 17000 }, { "epoch": 4.992700729927007, "grad_norm": 2.9732730388641357, "learning_rate": 1.1177827458075386e-09, "loss": 1.0122, "step": 17100 } ], "logging_steps": 100, "max_steps": 17125, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2724762270100429e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }
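The JSON above appears to be the `trainer_state.json` written by a Hugging Face `transformers` Trainer: a 5-epoch run of 17,125 optimizer steps with a loss/learning-rate entry logged every 100 steps under `log_history`. As a rough illustration of how such a log can be consumed, the sketch below loads the file and plots the recorded training loss and learning-rate schedule. The file path `trainer_state.json` and the use of matplotlib are assumptions for the example, not part of the log itself.

```python
# Minimal sketch (assumed setup): read a Trainer state file and plot the
# training-loss curve and learning-rate schedule recorded in log_history.
import json

import matplotlib.pyplot as plt

# Assumed path; point this at the checkpoint directory's trainer_state.json.
with open("trainer_state.json") as f:
    state = json.load(f)

# Every entry in this particular log carries step, loss, and learning_rate.
steps = [entry["step"] for entry in state["log_history"]]
losses = [entry["loss"] for entry in state["log_history"]]
lrs = [entry["learning_rate"] for entry in state["log_history"]]

fig, (ax_loss, ax_lr) = plt.subplots(1, 2, figsize=(10, 4))
ax_loss.plot(steps, losses)
ax_loss.set_xlabel("step")
ax_loss.set_ylabel("training loss")
ax_lr.plot(steps, lrs)
ax_lr.set_xlabel("step")
ax_lr.set_ylabel("learning rate")
fig.tight_layout()
plt.show()
```

Read this way, the log shows a short linear warmup to roughly 2e-4 over the first ~500 steps followed by a decay toward zero, with the training loss falling from about 4.85 at step 100 to about 1.0 by the end of epoch 5.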