{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 17125, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.029197080291970802, "grad_norm": 0.8470640182495117, "learning_rate": 3.891050583657588e-05, "loss": 4.8508, "step": 100 }, { "epoch": 0.058394160583941604, "grad_norm": 1.3564280271530151, "learning_rate": 7.782101167315176e-05, "loss": 4.4557, "step": 200 }, { "epoch": 0.08759124087591241, "grad_norm": 1.3530114889144897, "learning_rate": 0.00011673151750972763, "loss": 4.1905, "step": 300 }, { "epoch": 0.11678832116788321, "grad_norm": 1.328213095664978, "learning_rate": 0.0001556420233463035, "loss": 4.1219, "step": 400 }, { "epoch": 0.145985401459854, "grad_norm": 1.3318136930465698, "learning_rate": 0.0001945525291828794, "loss": 3.9169, "step": 500 }, { "epoch": 0.17518248175182483, "grad_norm": 1.7535721063613892, "learning_rate": 0.00019998677287306197, "loss": 3.8635, "step": 600 }, { "epoch": 0.20437956204379562, "grad_norm": 1.6469675302505493, "learning_rate": 0.00019993813296600207, "loss": 3.775, "step": 700 }, { "epoch": 0.23357664233576642, "grad_norm": 1.637732744216919, "learning_rate": 0.00019985374713924855, "loss": 3.6853, "step": 800 }, { "epoch": 0.26277372262773724, "grad_norm": 1.2182079553604126, "learning_rate": 0.00019973364557596465, "loss": 3.6172, "step": 900 }, { "epoch": 0.291970802919708, "grad_norm": 1.8510034084320068, "learning_rate": 0.00019957787123413558, "loss": 3.5044, "step": 1000 }, { "epoch": 0.32116788321167883, "grad_norm": 1.4988285303115845, "learning_rate": 0.00019938647983120316, "loss": 3.445, "step": 1100 }, { "epoch": 0.35036496350364965, "grad_norm": 2.2497406005859375, "learning_rate": 0.0001991595398241369, "loss": 3.4068, "step": 1200 }, { "epoch": 0.3795620437956204, "grad_norm": 2.1076061725616455, "learning_rate": 0.00019889713238494824, "loss": 3.275, "step": 1300 }, { "epoch": 0.40875912408759124, "grad_norm": 2.4178290367126465, "learning_rate": 0.0001985993513716568, "loss": 3.1974, "step": 1400 }, { "epoch": 0.43795620437956206, "grad_norm": 1.8352422714233398, "learning_rate": 0.00019826630329471928, "loss": 3.1553, "step": 1500 }, { "epoch": 0.46715328467153283, "grad_norm": 1.7513710260391235, "learning_rate": 0.00019789810727893284, "loss": 3.0717, "step": 1600 }, { "epoch": 0.49635036496350365, "grad_norm": 1.8882313966751099, "learning_rate": 0.00019749489502082632, "loss": 3.0037, "step": 1700 }, { "epoch": 0.5255474452554745, "grad_norm": 2.062272310256958, "learning_rate": 0.000197056810741555, "loss": 2.9222, "step": 1800 }, { "epoch": 0.5547445255474452, "grad_norm": 1.6502212285995483, "learning_rate": 0.00019658401113531565, "loss": 2.8871, "step": 1900 }, { "epoch": 0.583941605839416, "grad_norm": 1.6748732328414917, "learning_rate": 0.0001960766653132999, "loss": 2.8554, "step": 2000 }, { "epoch": 0.6131386861313869, "grad_norm": 1.9615834951400757, "learning_rate": 0.0001955349547432065, "loss": 2.7841, "step": 2100 }, { "epoch": 0.6423357664233577, "grad_norm": 3.604374885559082, "learning_rate": 0.000194959073184334, "loss": 2.6938, "step": 2200 }, { "epoch": 0.6715328467153284, "grad_norm": 2.6139862537384033, "learning_rate": 0.00019434922661827663, "loss": 2.634, "step": 2300 }, { "epoch": 0.7007299270072993, "grad_norm": 2.5986483097076416, "learning_rate": 0.00019370563317524882, "loss": 2.5901, "step": 2400 }, { "epoch": 0.7299270072992701, "grad_norm": 
2.7669074535369873, "learning_rate": 0.00019302852305606432, "loss": 2.5001, "step": 2500 }, { "epoch": 0.7591240875912408, "grad_norm": 2.4265894889831543, "learning_rate": 0.00019231813844979777, "loss": 2.4595, "step": 2600 }, { "epoch": 0.7883211678832117, "grad_norm": 2.589481830596924, "learning_rate": 0.0001915747334471584, "loss": 2.3496, "step": 2700 }, { "epoch": 0.8175182481751825, "grad_norm": 2.969148874282837, "learning_rate": 0.0001907985739496068, "loss": 2.3099, "step": 2800 }, { "epoch": 0.8467153284671532, "grad_norm": 2.7729787826538086, "learning_rate": 0.00018998993757424713, "loss": 2.2896, "step": 2900 }, { "epoch": 0.8759124087591241, "grad_norm": 3.3500335216522217, "learning_rate": 0.00018914911355452895, "loss": 2.2433, "step": 3000 }, { "epoch": 0.9051094890510949, "grad_norm": 2.8922252655029297, "learning_rate": 0.00018827640263679394, "loss": 2.1564, "step": 3100 }, { "epoch": 0.9343065693430657, "grad_norm": 2.678309679031372, "learning_rate": 0.0001873721169727048, "loss": 2.1654, "step": 3200 }, { "epoch": 0.9635036496350365, "grad_norm": 3.6411967277526855, "learning_rate": 0.00018643658000759493, "loss": 2.1212, "step": 3300 }, { "epoch": 0.9927007299270073, "grad_norm": 2.9299466609954834, "learning_rate": 0.0001854701263647781, "loss": 1.9858, "step": 3400 }, { "epoch": 1.0218978102189782, "grad_norm": 1.6852861642837524, "learning_rate": 0.0001844731017258603, "loss": 1.953, "step": 3500 }, { "epoch": 1.051094890510949, "grad_norm": 2.1536929607391357, "learning_rate": 0.00018344586270709613, "loss": 1.8913, "step": 3600 }, { "epoch": 1.0802919708029197, "grad_norm": 1.5574983358383179, "learning_rate": 0.00018238877673183428, "loss": 1.8142, "step": 3700 }, { "epoch": 1.1094890510948905, "grad_norm": 1.319333791732788, "learning_rate": 0.0001813022218990972, "loss": 1.7464, "step": 3800 }, { "epoch": 1.1386861313868613, "grad_norm": 1.4836809635162354, "learning_rate": 0.00018018658684834256, "loss": 1.7574, "step": 3900 }, { "epoch": 1.167883211678832, "grad_norm": 1.7886255979537964, "learning_rate": 0.00017904227062045437, "loss": 1.7255, "step": 4000 }, { "epoch": 1.197080291970803, "grad_norm": 1.7846744060516357, "learning_rate": 0.00017786968251501406, "loss": 1.7593, "step": 4100 }, { "epoch": 1.2262773722627738, "grad_norm": 1.5439305305480957, "learning_rate": 0.00017666924194390183, "loss": 1.7175, "step": 4200 }, { "epoch": 1.2554744525547445, "grad_norm": 1.6620721817016602, "learning_rate": 0.0001754413782812812, "loss": 1.6345, "step": 4300 }, { "epoch": 1.2846715328467153, "grad_norm": 1.7942287921905518, "learning_rate": 0.00017418653071002047, "loss": 1.6425, "step": 4400 }, { "epoch": 1.313868613138686, "grad_norm": 1.7287062406539917, "learning_rate": 0.0001729051480646052, "loss": 1.5037, "step": 4500 }, { "epoch": 1.343065693430657, "grad_norm": 1.340302586555481, "learning_rate": 0.00017159768867059936, "loss": 1.6512, "step": 4600 }, { "epoch": 1.3722627737226278, "grad_norm": 1.3054938316345215, "learning_rate": 0.0001702646201807107, "loss": 1.5379, "step": 4700 }, { "epoch": 1.4014598540145986, "grad_norm": 1.4920105934143066, "learning_rate": 0.00016890641940752095, "loss": 1.4965, "step": 4800 }, { "epoch": 1.4306569343065694, "grad_norm": 1.6876699924468994, "learning_rate": 0.00016752357215293897, "loss": 1.5866, "step": 4900 }, { "epoch": 1.4598540145985401, "grad_norm": 1.3976876735687256, "learning_rate": 0.00016611657303443903, "loss": 1.4894, "step": 5000 }, { "epoch": 1.489051094890511, "grad_norm": 
1.5993022918701172, "learning_rate": 0.0001646859253081458, "loss": 1.5393, "step": 5100 }, { "epoch": 1.5182481751824817, "grad_norm": 1.5950144529342651, "learning_rate": 0.00016323214068882935, "loss": 1.4881, "step": 5200 }, { "epoch": 1.5474452554744524, "grad_norm": 1.4699268341064453, "learning_rate": 0.00016175573916687484, "loss": 1.5266, "step": 5300 }, { "epoch": 1.5766423357664232, "grad_norm": 1.494767427444458, "learning_rate": 0.00016025724882229208, "loss": 1.4496, "step": 5400 }, { "epoch": 1.6058394160583942, "grad_norm": 1.589040756225586, "learning_rate": 0.00015873720563583165, "loss": 1.4994, "step": 5500 }, { "epoch": 1.635036496350365, "grad_norm": 1.1471683979034424, "learning_rate": 0.00015719615329727512, "loss": 1.4464, "step": 5600 }, { "epoch": 1.6642335766423357, "grad_norm": 1.5427780151367188, "learning_rate": 0.00015563464301096756, "loss": 1.3936, "step": 5700 }, { "epoch": 1.6934306569343067, "grad_norm": 1.4828369617462158, "learning_rate": 0.0001540532332986628, "loss": 1.3653, "step": 5800 }, { "epoch": 1.7226277372262775, "grad_norm": 1.3401451110839844, "learning_rate": 0.0001524524897997509, "loss": 1.4141, "step": 5900 }, { "epoch": 1.7518248175182483, "grad_norm": 1.289553165435791, "learning_rate": 0.00015083298506894015, "loss": 1.3119, "step": 6000 }, { "epoch": 1.781021897810219, "grad_norm": 1.2549211978912354, "learning_rate": 0.00014919529837146528, "loss": 1.3384, "step": 6100 }, { "epoch": 1.8102189781021898, "grad_norm": 1.71195387840271, "learning_rate": 0.00014754001547589564, "loss": 1.3231, "step": 6200 }, { "epoch": 1.8394160583941606, "grad_norm": 1.4366215467453003, "learning_rate": 0.0001458677284446172, "loss": 1.3196, "step": 6300 }, { "epoch": 1.8686131386861313, "grad_norm": 1.5663456916809082, "learning_rate": 0.00014417903542206342, "loss": 1.4379, "step": 6400 }, { "epoch": 1.897810218978102, "grad_norm": 2.2416155338287354, "learning_rate": 0.00014247454042077068, "loss": 1.3796, "step": 6500 }, { "epoch": 1.9270072992700729, "grad_norm": 1.9517488479614258, "learning_rate": 0.00014075485310533473, "loss": 1.3097, "step": 6600 }, { "epoch": 1.9562043795620438, "grad_norm": 1.7964544296264648, "learning_rate": 0.00013902058857434557, "loss": 1.3401, "step": 6700 }, { "epoch": 1.9854014598540146, "grad_norm": 1.0145196914672852, "learning_rate": 0.00013727236714037872, "loss": 1.161, "step": 6800 }, { "epoch": 2.0145985401459856, "grad_norm": 3.1186952590942383, "learning_rate": 0.00013551081410812147, "loss": 1.1588, "step": 6900 }, { "epoch": 2.0437956204379564, "grad_norm": 2.3698039054870605, "learning_rate": 0.0001337365595507137, "loss": 1.0564, "step": 7000 }, { "epoch": 2.072992700729927, "grad_norm": 3.22995924949646, "learning_rate": 0.0001319502380843829, "loss": 1.1415, "step": 7100 }, { "epoch": 2.102189781021898, "grad_norm": 3.0756795406341553, "learning_rate": 0.00013015248864145434, "loss": 1.3046, "step": 7200 }, { "epoch": 2.1313868613138687, "grad_norm": 3.943307399749756, "learning_rate": 0.00012834395424181748, "loss": 1.1222, "step": 7300 }, { "epoch": 2.1605839416058394, "grad_norm": 2.424041271209717, "learning_rate": 0.00012652528176293042, "loss": 1.1787, "step": 7400 }, { "epoch": 2.18978102189781, "grad_norm": 2.5731918811798096, "learning_rate": 0.0001246971217084443, "loss": 1.1214, "step": 7500 }, { "epoch": 2.218978102189781, "grad_norm": 1.8488351106643677, "learning_rate": 0.00012286012797553075, "loss": 1.0242, "step": 7600 }, { "epoch": 2.2481751824817517, "grad_norm": 3.49588680267334, 
"learning_rate": 0.0001210149576209959, "loss": 1.1995, "step": 7700 }, { "epoch": 2.2773722627737225, "grad_norm": 1.9914380311965942, "learning_rate": 0.00011916227062626388, "loss": 1.2218, "step": 7800 }, { "epoch": 2.3065693430656933, "grad_norm": 2.816720485687256, "learning_rate": 0.00011730272966131422, "loss": 1.12, "step": 7900 }, { "epoch": 2.335766423357664, "grad_norm": 1.5543715953826904, "learning_rate": 0.00011543699984765788, "loss": 1.0535, "step": 8000 }, { "epoch": 2.3649635036496353, "grad_norm": 2.867494821548462, "learning_rate": 0.00011356574852043617, "loss": 1.1174, "step": 8100 }, { "epoch": 2.394160583941606, "grad_norm": 3.366215944290161, "learning_rate": 0.00011168964498972818, "loss": 1.3649, "step": 8200 }, { "epoch": 2.423357664233577, "grad_norm": 5.301421165466309, "learning_rate": 0.00010980936030115132, "loss": 1.0221, "step": 8300 }, { "epoch": 2.4525547445255476, "grad_norm": 2.44589900970459, "learning_rate": 0.0001079255669958416, "loss": 1.1887, "step": 8400 }, { "epoch": 2.4817518248175183, "grad_norm": 1.8434127569198608, "learning_rate": 0.00010603893886989883, "loss": 1.0379, "step": 8500 }, { "epoch": 2.510948905109489, "grad_norm": 5.0544257164001465, "learning_rate": 0.00010415015073338286, "loss": 1.0467, "step": 8600 }, { "epoch": 2.54014598540146, "grad_norm": 2.266094446182251, "learning_rate": 0.00010225987816894698, "loss": 1.0284, "step": 8700 }, { "epoch": 2.5693430656934306, "grad_norm": 4.524600028991699, "learning_rate": 0.00010036879729019559, "loss": 1.0921, "step": 8800 }, { "epoch": 2.5985401459854014, "grad_norm": 3.9681153297424316, "learning_rate": 9.847758449985124e-05, "loss": 1.0259, "step": 8900 }, { "epoch": 2.627737226277372, "grad_norm": 4.628920078277588, "learning_rate": 9.658691624781866e-05, "loss": 1.1767, "step": 9000 }, { "epoch": 2.656934306569343, "grad_norm": 3.457036256790161, "learning_rate": 9.469746878923188e-05, "loss": 1.0662, "step": 9100 }, { "epoch": 2.686131386861314, "grad_norm": 3.112386703491211, "learning_rate": 9.280991794257103e-05, "loss": 1.0129, "step": 9200 }, { "epoch": 2.7153284671532845, "grad_norm": 2.2668728828430176, "learning_rate": 9.092493884793501e-05, "loss": 1.0139, "step": 9300 }, { "epoch": 2.7445255474452557, "grad_norm": 3.580484390258789, "learning_rate": 8.904320572555734e-05, "loss": 1.0144, "step": 9400 }, { "epoch": 2.7737226277372264, "grad_norm": 1.5489946603775024, "learning_rate": 8.71653916346505e-05, "loss": 0.9979, "step": 9500 }, { "epoch": 2.802919708029197, "grad_norm": 1.7365363836288452, "learning_rate": 8.529216823266606e-05, "loss": 1.0874, "step": 9600 }, { "epoch": 2.832116788321168, "grad_norm": 3.3894410133361816, "learning_rate": 8.342420553505559e-05, "loss": 1.1251, "step": 9700 }, { "epoch": 2.8613138686131387, "grad_norm": 4.5942912101745605, "learning_rate": 8.15621716756195e-05, "loss": 1.1814, "step": 9800 }, { "epoch": 2.8905109489051095, "grad_norm": 1.3230512142181396, "learning_rate": 7.970673266752838e-05, "loss": 1.1201, "step": 9900 }, { "epoch": 2.9197080291970803, "grad_norm": 1.1530183553695679, "learning_rate": 7.785855216510337e-05, "loss": 1.0758, "step": 10000 }, { "epoch": 2.948905109489051, "grad_norm": 2.9893460273742676, "learning_rate": 7.601829122643957e-05, "loss": 1.02, "step": 10100 }, { "epoch": 2.978102189781022, "grad_norm": 2.8600292205810547, "learning_rate": 7.418660807695897e-05, "loss": 1.1625, "step": 10200 }, { "epoch": 3.0072992700729926, "grad_norm": 0.6397629976272583, "learning_rate": 
7.236415787397548e-05, "loss": 1.2091, "step": 10300 }, { "epoch": 3.0364963503649633, "grad_norm": 0.5129945278167725, "learning_rate": 7.055159247235844e-05, "loss": 1.1668, "step": 10400 }, { "epoch": 3.065693430656934, "grad_norm": 0.49799928069114685, "learning_rate": 6.874956019137669e-05, "loss": 0.8265, "step": 10500 }, { "epoch": 3.094890510948905, "grad_norm": 0.8108125925064087, "learning_rate": 6.695870558280718e-05, "loss": 1.0216, "step": 10600 }, { "epoch": 3.124087591240876, "grad_norm": 0.45047125220298767, "learning_rate": 6.51796692003918e-05, "loss": 1.1076, "step": 10700 }, { "epoch": 3.153284671532847, "grad_norm": 0.700933039188385, "learning_rate": 6.341308737072349e-05, "loss": 0.9756, "step": 10800 }, { "epoch": 3.1824817518248176, "grad_norm": 0.5597018003463745, "learning_rate": 6.165959196564481e-05, "loss": 0.854, "step": 10900 }, { "epoch": 3.2116788321167884, "grad_norm": 0.5288468599319458, "learning_rate": 5.991981017623955e-05, "loss": 0.9882, "step": 11000 }, { "epoch": 3.240875912408759, "grad_norm": 0.7121827602386475, "learning_rate": 5.819436428849896e-05, "loss": 1.0701, "step": 11100 }, { "epoch": 3.27007299270073, "grad_norm": 0.7537636756896973, "learning_rate": 5.648387146074192e-05, "loss": 0.9367, "step": 11200 }, { "epoch": 3.2992700729927007, "grad_norm": 0.33666905760765076, "learning_rate": 5.478894350286965e-05, "loss": 1.0724, "step": 11300 }, { "epoch": 3.3284671532846715, "grad_norm": 0.4632696509361267, "learning_rate": 5.311018665753318e-05, "loss": 1.0971, "step": 11400 }, { "epoch": 3.3576642335766422, "grad_norm": 0.5160499811172485, "learning_rate": 5.144820138329223e-05, "loss": 1.0371, "step": 11500 }, { "epoch": 3.386861313868613, "grad_norm": 0.5635733008384705, "learning_rate": 4.980358213984282e-05, "loss": 0.978, "step": 11600 }, { "epoch": 3.4160583941605838, "grad_norm": 0.6345902681350708, "learning_rate": 4.8176917175390656e-05, "loss": 0.9646, "step": 11700 }, { "epoch": 3.445255474452555, "grad_norm": 0.6311865448951721, "learning_rate": 4.656878831624636e-05, "loss": 0.9127, "step": 11800 }, { "epoch": 3.4744525547445253, "grad_norm": 0.4736361503601074, "learning_rate": 4.497977075871738e-05, "loss": 1.2787, "step": 11900 }, { "epoch": 3.5036496350364965, "grad_norm": 0.3800007402896881, "learning_rate": 4.341043286337153e-05, "loss": 0.9448, "step": 12000 }, { "epoch": 3.5328467153284673, "grad_norm": 0.4484248161315918, "learning_rate": 4.1861335951745594e-05, "loss": 0.9813, "step": 12100 }, { "epoch": 3.562043795620438, "grad_norm": 0.4944378733634949, "learning_rate": 4.0333034105571565e-05, "loss": 1.1448, "step": 12200 }, { "epoch": 3.591240875912409, "grad_norm": 0.42330047488212585, "learning_rate": 3.882607396859229e-05, "loss": 1.0001, "step": 12300 }, { "epoch": 3.6204379562043796, "grad_norm": 0.492709219455719, "learning_rate": 3.734099455103779e-05, "loss": 1.1352, "step": 12400 }, { "epoch": 3.6496350364963503, "grad_norm": 0.42140600085258484, "learning_rate": 3.587832703683175e-05, "loss": 1.0103, "step": 12500 }, { "epoch": 3.678832116788321, "grad_norm": 0.393767774105072, "learning_rate": 3.4438594593597596e-05, "loss": 0.9709, "step": 12600 }, { "epoch": 3.708029197080292, "grad_norm": 0.5599704384803772, "learning_rate": 3.3022312185531214e-05, "loss": 0.9962, "step": 12700 }, { "epoch": 3.7372262773722627, "grad_norm": 0.3712032735347748, "learning_rate": 3.16299863892088e-05, "loss": 1.1135, "step": 12800 }, { "epoch": 3.7664233576642334, "grad_norm": 0.3675720989704132, "learning_rate": 
3.026211521239408e-05, "loss": 0.9385, "step": 12900 }, { "epoch": 3.795620437956204, "grad_norm": 0.3175857961177826, "learning_rate": 2.891918791591046e-05, "loss": 1.0536, "step": 13000 }, { "epoch": 3.8248175182481754, "grad_norm": 0.42325085401535034, "learning_rate": 2.7601684838642405e-05, "loss": 1.0483, "step": 13100 }, { "epoch": 3.8540145985401457, "grad_norm": 0.41798830032348633, "learning_rate": 2.6310077225727224e-05, "loss": 1.068, "step": 13200 }, { "epoch": 3.883211678832117, "grad_norm": 0.4633995294570923, "learning_rate": 2.5044827060000085e-05, "loss": 1.0305, "step": 13300 }, { "epoch": 3.9124087591240877, "grad_norm": 0.38265103101730347, "learning_rate": 2.380638689675164e-05, "loss": 0.9594, "step": 13400 }, { "epoch": 3.9416058394160585, "grad_norm": 0.3224177062511444, "learning_rate": 2.2595199701858026e-05, "loss": 0.9919, "step": 13500 }, { "epoch": 3.9708029197080292, "grad_norm": 0.44377565383911133, "learning_rate": 2.1411698693340355e-05, "loss": 1.0119, "step": 13600 }, { "epoch": 4.0, "grad_norm": 2.024462938308716, "learning_rate": 2.0256307186411295e-05, "loss": 0.8573, "step": 13700 }, { "epoch": 4.029197080291971, "grad_norm": 2.932596206665039, "learning_rate": 1.912943844206333e-05, "loss": 0.9969, "step": 13800 }, { "epoch": 4.0583941605839415, "grad_norm": 2.8395912647247314, "learning_rate": 1.803149551925356e-05, "loss": 0.8543, "step": 13900 }, { "epoch": 4.087591240875913, "grad_norm": 3.7360405921936035, "learning_rate": 1.6962871130737168e-05, "loss": 0.9961, "step": 14000 }, { "epoch": 4.116788321167883, "grad_norm": 1.6590383052825928, "learning_rate": 1.59239475026018e-05, "loss": 0.839, "step": 14100 }, { "epoch": 4.145985401459854, "grad_norm": 2.4709715843200684, "learning_rate": 1.4915096237552873e-05, "loss": 1.0494, "step": 14200 }, { "epoch": 4.175182481751825, "grad_norm": 3.2636585235595703, "learning_rate": 1.3936678181998374e-05, "loss": 1.0201, "step": 14300 }, { "epoch": 4.204379562043796, "grad_norm": 1.440502405166626, "learning_rate": 1.298904329698123e-05, "loss": 1.0966, "step": 14400 }, { "epoch": 4.233576642335766, "grad_norm": 2.1606736183166504, "learning_rate": 1.2072530533005012e-05, "loss": 1.0813, "step": 14500 }, { "epoch": 4.262773722627737, "grad_norm": 1.3628579378128052, "learning_rate": 1.1187467708798116e-05, "loss": 1.0362, "step": 14600 }, { "epoch": 4.291970802919708, "grad_norm": 2.560506820678711, "learning_rate": 1.0334171394059122e-05, "loss": 1.0291, "step": 14700 }, { "epoch": 4.321167883211679, "grad_norm": 1.9132288694381714, "learning_rate": 9.512946796226296e-06, "loss": 1.0084, "step": 14800 }, { "epoch": 4.350364963503649, "grad_norm": 2.191732406616211, "learning_rate": 8.724087651310609e-06, "loss": 0.9277, "step": 14900 }, { "epoch": 4.37956204379562, "grad_norm": 1.6962709426879883, "learning_rate": 7.967876118832229e-06, "loss": 0.9741, "step": 15000 }, { "epoch": 4.408759124087592, "grad_norm": 1.9875869750976562, "learning_rate": 7.244582680897527e-06, "loss": 0.9878, "step": 15100 }, { "epoch": 4.437956204379562, "grad_norm": 3.515268087387085, "learning_rate": 6.554466045452923e-06, "loss": 1.0505, "step": 15200 }, { "epoch": 4.467153284671533, "grad_norm": 3.007796287536621, "learning_rate": 5.897773053750066e-06, "loss": 1.0459, "step": 15300 }, { "epoch": 4.4963503649635035, "grad_norm": 1.8410799503326416, "learning_rate": 5.274738592055573e-06, "loss": 0.9201, "step": 15400 }, { "epoch": 4.525547445255475, "grad_norm": 2.346604824066162, "learning_rate": 
4.6855855076367804e-06, "loss": 1.0322, "step": 15500 }, { "epoch": 4.554744525547445, "grad_norm": 2.057304859161377, "learning_rate": 4.130524529053626e-06, "loss": 1.001, "step": 15600 }, { "epoch": 4.583941605839416, "grad_norm": 4.114916801452637, "learning_rate": 3.609754190785164e-06, "loss": 0.9103, "step": 15700 }, { "epoch": 4.613138686131387, "grad_norm": 3.7978780269622803, "learning_rate": 3.1234607622176227e-06, "loss": 1.052, "step": 15800 }, { "epoch": 4.642335766423358, "grad_norm": 1.945447564125061, "learning_rate": 2.6718181810195696e-06, "loss": 0.9763, "step": 15900 }, { "epoch": 4.671532846715328, "grad_norm": 2.225212812423706, "learning_rate": 2.2549879909276593e-06, "loss": 1.0564, "step": 16000 }, { "epoch": 4.700729927007299, "grad_norm": 1.6726568937301636, "learning_rate": 1.8731192839657407e-06, "loss": 1.042, "step": 16100 }, { "epoch": 4.7299270072992705, "grad_norm": 1.6171780824661255, "learning_rate": 1.5263486471174482e-06, "loss": 0.8253, "step": 16200 }, { "epoch": 4.759124087591241, "grad_norm": 3.288196086883545, "learning_rate": 1.2148001134717369e-06, "loss": 1.0111, "step": 16300 }, { "epoch": 4.788321167883212, "grad_norm": 3.768450975418091, "learning_rate": 9.385851178586924e-07, "loss": 0.8975, "step": 16400 }, { "epoch": 4.817518248175182, "grad_norm": 3.030186176300049, "learning_rate": 6.978024569914032e-07, "loss": 1.0036, "step": 16500 }, { "epoch": 4.846715328467154, "grad_norm": 0.843503475189209, "learning_rate": 4.92538254128383e-07, "loss": 1.011, "step": 16600 }, { "epoch": 4.875912408759124, "grad_norm": 2.515411376953125, "learning_rate": 3.2286592826888953e-07, "loss": 1.2158, "step": 16700 }, { "epoch": 4.905109489051095, "grad_norm": 1.0402921438217163, "learning_rate": 1.8884616789244248e-07, "loss": 0.9555, "step": 16800 }, { "epoch": 4.934306569343065, "grad_norm": 0.927361249923706, "learning_rate": 9.052690925168695e-08, "loss": 1.0264, "step": 16900 }, { "epoch": 4.963503649635037, "grad_norm": 1.8087859153747559, "learning_rate": 2.7943319226564346e-08, "loss": 1.0197, "step": 17000 }, { "epoch": 4.992700729927007, "grad_norm": 2.9732730388641357, "learning_rate": 1.1177827458075386e-09, "loss": 1.0122, "step": 17100 } ], "logging_steps": 100, "max_steps": 17125, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2724762270100429e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }
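The JSON above appears to be the `trainer_state.json` written by a Hugging Face `transformers` Trainer: a 5-epoch run of 17,125 optimizer steps with a loss/learning-rate entry logged every 100 steps under `log_history`. As a rough illustration of how such a log can be consumed, the sketch below loads the file and plots the recorded training loss and learning-rate schedule. The file path `trainer_state.json` and the use of matplotlib are assumptions for the example, not part of the log itself.

```python
# Minimal sketch (assumed setup): read a Trainer state file and plot the
# training-loss curve and learning-rate schedule recorded in log_history.
import json

import matplotlib.pyplot as plt

# Assumed path; point this at the checkpoint directory's trainer_state.json.
with open("trainer_state.json") as f:
    state = json.load(f)

# Every entry in this particular log carries step, loss, and learning_rate.
steps = [entry["step"] for entry in state["log_history"]]
losses = [entry["loss"] for entry in state["log_history"]]
lrs = [entry["learning_rate"] for entry in state["log_history"]]

fig, (ax_loss, ax_lr) = plt.subplots(1, 2, figsize=(10, 4))
ax_loss.plot(steps, losses)
ax_loss.set_xlabel("step")
ax_loss.set_ylabel("training loss")
ax_lr.plot(steps, lrs)
ax_lr.set_xlabel("step")
ax_lr.set_ylabel("learning rate")
fig.tight_layout()
plt.show()
```

Read this way, the log shows a short linear warmup to roughly 2e-4 over the first ~500 steps followed by a decay toward zero, with the training loss falling from about 4.85 at step 100 to about 1.0 by the end of epoch 5.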