{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9964556962025316,
"eval_steps": 500,
"global_step": 246,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004050632911392405,
"grad_norm": 13.497359275817871,
"learning_rate": 2.5e-06,
"loss": 1.2803,
"step": 1
},
{
"epoch": 0.00810126582278481,
"grad_norm": 11.971280097961426,
"learning_rate": 5e-06,
"loss": 1.2781,
"step": 2
},
{
"epoch": 0.012151898734177215,
"grad_norm": 10.184432029724121,
"learning_rate": 7.500000000000001e-06,
"loss": 1.2722,
"step": 3
},
{
"epoch": 0.01620253164556962,
"grad_norm": 6.801725387573242,
"learning_rate": 1e-05,
"loss": 1.1279,
"step": 4
},
{
"epoch": 0.020253164556962026,
"grad_norm": 4.426397800445557,
"learning_rate": 1.25e-05,
"loss": 1.1123,
"step": 5
},
{
"epoch": 0.02430379746835443,
"grad_norm": 5.871100425720215,
"learning_rate": 1.5000000000000002e-05,
"loss": 1.1242,
"step": 6
},
{
"epoch": 0.028354430379746835,
"grad_norm": 4.175433158874512,
"learning_rate": 1.7500000000000002e-05,
"loss": 1.0718,
"step": 7
},
{
"epoch": 0.03240506329113924,
"grad_norm": 5.5522990226745605,
"learning_rate": 2e-05,
"loss": 1.0707,
"step": 8
},
{
"epoch": 0.03645569620253165,
"grad_norm": 5.652365207672119,
"learning_rate": 1.999912881672411e-05,
"loss": 1.1061,
"step": 9
},
{
"epoch": 0.04050632911392405,
"grad_norm": 5.447442531585693,
"learning_rate": 1.9996515418688493e-05,
"loss": 1.0846,
"step": 10
},
{
"epoch": 0.044556962025316456,
"grad_norm": 2.7070164680480957,
"learning_rate": 1.999216026124288e-05,
"loss": 1.086,
"step": 11
},
{
"epoch": 0.04860759493670886,
"grad_norm": 2.1301090717315674,
"learning_rate": 1.998606410321534e-05,
"loss": 1.0542,
"step": 12
},
{
"epoch": 0.052658227848101265,
"grad_norm": 2.466780185699463,
"learning_rate": 1.9978228006780056e-05,
"loss": 1.0618,
"step": 13
},
{
"epoch": 0.05670886075949367,
"grad_norm": 3.075913667678833,
"learning_rate": 1.9968653337272262e-05,
"loss": 1.0487,
"step": 14
},
{
"epoch": 0.060759493670886074,
"grad_norm": 2.206223249435425,
"learning_rate": 1.9957341762950346e-05,
"loss": 1.0237,
"step": 15
},
{
"epoch": 0.06481012658227848,
"grad_norm": 1.7991361618041992,
"learning_rate": 1.9944295254705187e-05,
"loss": 1.0212,
"step": 16
},
{
"epoch": 0.06886075949367089,
"grad_norm": 1.9748821258544922,
"learning_rate": 1.9929516085716736e-05,
"loss": 1.0243,
"step": 17
},
{
"epoch": 0.0729113924050633,
"grad_norm": 2.207519054412842,
"learning_rate": 1.9913006831057967e-05,
"loss": 1.0105,
"step": 18
},
{
"epoch": 0.0769620253164557,
"grad_norm": 1.6344636678695679,
"learning_rate": 1.9894770367246197e-05,
"loss": 1.002,
"step": 19
},
{
"epoch": 0.0810126582278481,
"grad_norm": 1.5756722688674927,
"learning_rate": 1.9874809871741877e-05,
"loss": 0.9825,
"step": 20
},
{
"epoch": 0.08506329113924051,
"grad_norm": 1.6223676204681396,
"learning_rate": 1.9853128822394976e-05,
"loss": 0.982,
"step": 21
},
{
"epoch": 0.08911392405063291,
"grad_norm": 1.5844409465789795,
"learning_rate": 1.982973099683902e-05,
"loss": 1.024,
"step": 22
},
{
"epoch": 0.09316455696202532,
"grad_norm": 1.8773488998413086,
"learning_rate": 1.980462047183287e-05,
"loss": 0.9737,
"step": 23
},
{
"epoch": 0.09721518987341772,
"grad_norm": 1.8542094230651855,
"learning_rate": 1.977780162255041e-05,
"loss": 1.0221,
"step": 24
},
{
"epoch": 0.10126582278481013,
"grad_norm": 2.02828049659729,
"learning_rate": 1.9749279121818235e-05,
"loss": 0.996,
"step": 25
},
{
"epoch": 0.10531645569620253,
"grad_norm": 1.464856743812561,
"learning_rate": 1.9719057939301477e-05,
"loss": 0.9812,
"step": 26
},
{
"epoch": 0.10936708860759493,
"grad_norm": 1.8954442739486694,
"learning_rate": 1.9687143340637885e-05,
"loss": 0.9721,
"step": 27
},
{
"epoch": 0.11341772151898734,
"grad_norm": 1.6294549703598022,
"learning_rate": 1.9653540886520387e-05,
"loss": 1.0003,
"step": 28
},
{
"epoch": 0.11746835443037974,
"grad_norm": 1.55502188205719,
"learning_rate": 1.961825643172819e-05,
"loss": 0.9727,
"step": 29
},
{
"epoch": 0.12151898734177215,
"grad_norm": 1.4686181545257568,
"learning_rate": 1.9581296124106682e-05,
"loss": 0.9658,
"step": 30
},
{
"epoch": 0.12556962025316457,
"grad_norm": 1.47079598903656,
"learning_rate": 1.9542666403496232e-05,
"loss": 0.9884,
"step": 31
},
{
"epoch": 0.12962025316455697,
"grad_norm": 1.6131620407104492,
"learning_rate": 1.9502374000610152e-05,
"loss": 1.0191,
"step": 32
},
{
"epoch": 0.13367088607594937,
"grad_norm": 1.5124179124832153,
"learning_rate": 1.946042593586195e-05,
"loss": 0.9668,
"step": 33
},
{
"epoch": 0.13772151898734178,
"grad_norm": 1.4320518970489502,
"learning_rate": 1.941682951814212e-05,
"loss": 0.9631,
"step": 34
},
{
"epoch": 0.14177215189873418,
"grad_norm": 1.4760117530822754,
"learning_rate": 1.9371592343544655e-05,
"loss": 0.9843,
"step": 35
},
{
"epoch": 0.1458227848101266,
"grad_norm": 1.2786954641342163,
"learning_rate": 1.932472229404356e-05,
"loss": 0.9281,
"step": 36
},
{
"epoch": 0.149873417721519,
"grad_norm": 1.5036900043487549,
"learning_rate": 1.927622753611948e-05,
"loss": 0.9629,
"step": 37
},
{
"epoch": 0.1539240506329114,
"grad_norm": 1.4190330505371094,
"learning_rate": 1.922611651933683e-05,
"loss": 0.9404,
"step": 38
},
{
"epoch": 0.1579746835443038,
"grad_norm": 1.4339755773544312,
"learning_rate": 1.9174397974871563e-05,
"loss": 0.9955,
"step": 39
},
{
"epoch": 0.1620253164556962,
"grad_norm": 1.4856185913085938,
"learning_rate": 1.912108091398988e-05,
"loss": 0.9969,
"step": 40
},
{
"epoch": 0.1660759493670886,
"grad_norm": 1.5168850421905518,
"learning_rate": 1.906617462647813e-05,
"loss": 0.9656,
"step": 41
},
{
"epoch": 0.17012658227848101,
"grad_norm": 1.40033757686615,
"learning_rate": 1.900968867902419e-05,
"loss": 1.0061,
"step": 42
},
{
"epoch": 0.17417721518987342,
"grad_norm": 1.3259391784667969,
"learning_rate": 1.8951632913550625e-05,
"loss": 0.9666,
"step": 43
},
{
"epoch": 0.17822784810126582,
"grad_norm": 1.432612657546997,
"learning_rate": 1.8892017445499812e-05,
"loss": 0.9565,
"step": 44
},
{
"epoch": 0.18227848101265823,
"grad_norm": 1.4262988567352295,
"learning_rate": 1.8830852662071507e-05,
"loss": 0.939,
"step": 45
},
{
"epoch": 0.18632911392405063,
"grad_norm": 1.369807243347168,
"learning_rate": 1.876814922041299e-05,
"loss": 0.9547,
"step": 46
},
{
"epoch": 0.19037974683544304,
"grad_norm": 1.2927881479263306,
"learning_rate": 1.8703918045762197e-05,
"loss": 0.9698,
"step": 47
},
{
"epoch": 0.19443037974683544,
"grad_norm": 1.3403329849243164,
"learning_rate": 1.8638170329544164e-05,
"loss": 0.9405,
"step": 48
},
{
"epoch": 0.19848101265822785,
"grad_norm": 1.388960599899292,
"learning_rate": 1.857091752742105e-05,
"loss": 1.0083,
"step": 49
},
{
"epoch": 0.20253164556962025,
"grad_norm": 1.2685046195983887,
"learning_rate": 1.8502171357296144e-05,
"loss": 0.9702,
"step": 50
},
{
"epoch": 0.20658227848101265,
"grad_norm": 1.3383150100708008,
"learning_rate": 1.8431943797272187e-05,
"loss": 0.9709,
"step": 51
},
{
"epoch": 0.21063291139240506,
"grad_norm": 1.3614344596862793,
"learning_rate": 1.8360247083564343e-05,
"loss": 0.977,
"step": 52
},
{
"epoch": 0.21468354430379746,
"grad_norm": 1.3981597423553467,
"learning_rate": 1.828709370836819e-05,
"loss": 0.9734,
"step": 53
},
{
"epoch": 0.21873417721518987,
"grad_norm": 1.3510750532150269,
"learning_rate": 1.8212496417683135e-05,
"loss": 0.9746,
"step": 54
},
{
"epoch": 0.22278481012658227,
"grad_norm": 1.703658103942871,
"learning_rate": 1.81364682090916e-05,
"loss": 0.9822,
"step": 55
},
{
"epoch": 0.22683544303797468,
"grad_norm": 1.1477118730545044,
"learning_rate": 1.805902232949435e-05,
"loss": 0.9598,
"step": 56
},
{
"epoch": 0.23088607594936708,
"grad_norm": 1.4172189235687256,
"learning_rate": 1.7980172272802398e-05,
"loss": 0.917,
"step": 57
},
{
"epoch": 0.23493670886075949,
"grad_norm": 1.4359899759292603,
"learning_rate": 1.789993177758588e-05,
"loss": 0.9716,
"step": 58
},
{
"epoch": 0.2389873417721519,
"grad_norm": 1.2954188585281372,
"learning_rate": 1.78183148246803e-05,
"loss": 0.9889,
"step": 59
},
{
"epoch": 0.2430379746835443,
"grad_norm": 1.4424415826797485,
"learning_rate": 1.773533563475053e-05,
"loss": 1.0034,
"step": 60
},
{
"epoch": 0.2470886075949367,
"grad_norm": 1.2044004201889038,
"learning_rate": 1.7651008665813083e-05,
"loss": 0.9816,
"step": 61
},
{
"epoch": 0.25113924050632913,
"grad_norm": 1.3561129570007324,
"learning_rate": 1.7565348610716963e-05,
"loss": 0.9711,
"step": 62
},
{
"epoch": 0.25518987341772154,
"grad_norm": 1.4384827613830566,
"learning_rate": 1.7478370394583647e-05,
"loss": 1.0251,
"step": 63
},
{
"epoch": 0.25924050632911394,
"grad_norm": 1.306980848312378,
"learning_rate": 1.7390089172206594e-05,
"loss": 0.9603,
"step": 64
},
{
"epoch": 0.26329113924050634,
"grad_norm": 1.357668399810791,
"learning_rate": 1.73005203254107e-05,
"loss": 0.9714,
"step": 65
},
{
"epoch": 0.26734177215189875,
"grad_norm": 1.4257886409759521,
"learning_rate": 1.720967946037225e-05,
"loss": 1.0092,
"step": 66
},
{
"epoch": 0.27139240506329115,
"grad_norm": 1.218770980834961,
"learning_rate": 1.7117582404899714e-05,
"loss": 0.9515,
"step": 67
},
{
"epoch": 0.27544303797468356,
"grad_norm": 1.0931731462478638,
"learning_rate": 1.7024245205675986e-05,
"loss": 0.9759,
"step": 68
},
{
"epoch": 0.27949367088607596,
"grad_norm": 1.2003381252288818,
"learning_rate": 1.692968412546247e-05,
"loss": 0.9399,
"step": 69
},
{
"epoch": 0.28354430379746837,
"grad_norm": 1.1908068656921387,
"learning_rate": 1.6833915640265485e-05,
"loss": 0.9535,
"step": 70
},
{
"epoch": 0.28759493670886077,
"grad_norm": 1.533835530281067,
"learning_rate": 1.6736956436465573e-05,
"loss": 0.9673,
"step": 71
},
{
"epoch": 0.2916455696202532,
"grad_norm": 1.2078437805175781,
"learning_rate": 1.6638823407910085e-05,
"loss": 0.9868,
"step": 72
},
{
"epoch": 0.2956962025316456,
"grad_norm": 1.244404673576355,
"learning_rate": 1.6539533652969683e-05,
"loss": 0.9662,
"step": 73
},
{
"epoch": 0.299746835443038,
"grad_norm": 1.4124116897583008,
"learning_rate": 1.6439104471559157e-05,
"loss": 1.0016,
"step": 74
},
{
"epoch": 0.3037974683544304,
"grad_norm": 1.2600699663162231,
"learning_rate": 1.6337553362123165e-05,
"loss": 0.9885,
"step": 75
},
{
"epoch": 0.3078481012658228,
"grad_norm": 1.258617639541626,
"learning_rate": 1.6234898018587336e-05,
"loss": 0.9284,
"step": 76
},
{
"epoch": 0.3118987341772152,
"grad_norm": 1.3986912965774536,
"learning_rate": 1.6131156327275372e-05,
"loss": 0.9733,
"step": 77
},
{
"epoch": 0.3159493670886076,
"grad_norm": 1.2392029762268066,
"learning_rate": 1.6026346363792565e-05,
"loss": 0.9788,
"step": 78
},
{
"epoch": 0.32,
"grad_norm": 1.2097805738449097,
"learning_rate": 1.5920486389876383e-05,
"loss": 0.9405,
"step": 79
},
{
"epoch": 0.3240506329113924,
"grad_norm": 1.2511652708053589,
"learning_rate": 1.58135948502146e-05,
"loss": 0.942,
"step": 80
},
{
"epoch": 0.3281012658227848,
"grad_norm": 1.1774235963821411,
"learning_rate": 1.5705690369231552e-05,
"loss": 0.9778,
"step": 81
},
{
"epoch": 0.3321518987341772,
"grad_norm": 1.2575836181640625,
"learning_rate": 1.5596791747843083e-05,
"loss": 0.9466,
"step": 82
},
{
"epoch": 0.3362025316455696,
"grad_norm": 1.2601759433746338,
"learning_rate": 1.5486917960180742e-05,
"loss": 0.9295,
"step": 83
},
{
"epoch": 0.34025316455696203,
"grad_norm": 1.4311736822128296,
"learning_rate": 1.5376088150285777e-05,
"loss": 0.9894,
"step": 84
},
{
"epoch": 0.34430379746835443,
"grad_norm": 1.393281102180481,
"learning_rate": 1.526432162877356e-05,
"loss": 0.9497,
"step": 85
},
{
"epoch": 0.34835443037974684,
"grad_norm": 1.2661269903182983,
"learning_rate": 1.515163786946896e-05,
"loss": 0.9634,
"step": 86
},
{
"epoch": 0.35240506329113924,
"grad_norm": 1.4130759239196777,
"learning_rate": 1.5038056506013297e-05,
"loss": 0.9384,
"step": 87
},
{
"epoch": 0.35645569620253165,
"grad_norm": 1.3350815773010254,
"learning_rate": 1.4923597328443423e-05,
"loss": 0.9385,
"step": 88
},
{
"epoch": 0.36050632911392405,
"grad_norm": 1.3958266973495483,
"learning_rate": 1.4808280279743594e-05,
"loss": 0.9596,
"step": 89
},
{
"epoch": 0.36455696202531646,
"grad_norm": 1.2324460744857788,
"learning_rate": 1.4692125452370664e-05,
"loss": 0.9333,
"step": 90
},
{
"epoch": 0.36860759493670886,
"grad_norm": 1.3735852241516113,
"learning_rate": 1.4575153084753233e-05,
"loss": 0.9148,
"step": 91
},
{
"epoch": 0.37265822784810126,
"grad_norm": 1.1630451679229736,
"learning_rate": 1.4457383557765385e-05,
"loss": 0.9479,
"step": 92
},
{
"epoch": 0.37670886075949367,
"grad_norm": 1.1744204759597778,
"learning_rate": 1.4338837391175582e-05,
"loss": 0.9433,
"step": 93
},
{
"epoch": 0.3807594936708861,
"grad_norm": 1.1749955415725708,
"learning_rate": 1.4219535240071378e-05,
"loss": 0.9097,
"step": 94
},
{
"epoch": 0.3848101265822785,
"grad_norm": 1.2120801210403442,
"learning_rate": 1.4099497891260538e-05,
"loss": 0.9623,
"step": 95
},
{
"epoch": 0.3888607594936709,
"grad_norm": 1.3005883693695068,
"learning_rate": 1.397874625964921e-05,
"loss": 0.9562,
"step": 96
},
{
"epoch": 0.3929113924050633,
"grad_norm": 1.2138869762420654,
"learning_rate": 1.3857301384597796e-05,
"loss": 0.9524,
"step": 97
},
{
"epoch": 0.3969620253164557,
"grad_norm": 1.355625033378601,
"learning_rate": 1.3735184426255117e-05,
"loss": 0.9434,
"step": 98
},
{
"epoch": 0.4010126582278481,
"grad_norm": 1.1441434621810913,
"learning_rate": 1.3612416661871532e-05,
"loss": 0.9922,
"step": 99
},
{
"epoch": 0.4050632911392405,
"grad_norm": 1.3381670713424683,
"learning_rate": 1.348901948209167e-05,
"loss": 0.9179,
"step": 100
},
{
"epoch": 0.4091139240506329,
"grad_norm": 1.169856071472168,
"learning_rate": 1.3365014387227393e-05,
"loss": 0.9251,
"step": 101
},
{
"epoch": 0.4131645569620253,
"grad_norm": 1.215080976486206,
"learning_rate": 1.324042298351166e-05,
"loss": 0.9315,
"step": 102
},
{
"epoch": 0.4172151898734177,
"grad_norm": 1.1357917785644531,
"learning_rate": 1.3115266979333917e-05,
"loss": 0.9354,
"step": 103
},
{
"epoch": 0.4212658227848101,
"grad_norm": 1.1942713260650635,
"learning_rate": 1.2989568181457704e-05,
"loss": 0.9249,
"step": 104
},
{
"epoch": 0.4253164556962025,
"grad_norm": 1.1736984252929688,
"learning_rate": 1.2863348491221129e-05,
"loss": 0.9138,
"step": 105
},
{
"epoch": 0.4293670886075949,
"grad_norm": 1.187155842781067,
"learning_rate": 1.2736629900720832e-05,
"loss": 0.9255,
"step": 106
},
{
"epoch": 0.43341772151898733,
"grad_norm": 1.084222674369812,
"learning_rate": 1.2609434488980168e-05,
"loss": 0.9086,
"step": 107
},
{
"epoch": 0.43746835443037974,
"grad_norm": 1.1785520315170288,
"learning_rate": 1.248178441810224e-05,
"loss": 0.9428,
"step": 108
},
{
"epoch": 0.44151898734177214,
"grad_norm": 1.1602153778076172,
"learning_rate": 1.2353701929408425e-05,
"loss": 0.9178,
"step": 109
},
{
"epoch": 0.44556962025316454,
"grad_norm": 1.2142413854599,
"learning_rate": 1.2225209339563144e-05,
"loss": 0.914,
"step": 110
},
{
"epoch": 0.44962025316455695,
"grad_norm": 1.134624719619751,
"learning_rate": 1.2096329036685469e-05,
"loss": 0.9093,
"step": 111
},
{
"epoch": 0.45367088607594935,
"grad_norm": 1.1606262922286987,
"learning_rate": 1.1967083476448282e-05,
"loss": 0.9109,
"step": 112
},
{
"epoch": 0.45772151898734176,
"grad_norm": 1.1459376811981201,
"learning_rate": 1.1837495178165706e-05,
"loss": 0.9197,
"step": 113
},
{
"epoch": 0.46177215189873416,
"grad_norm": 1.2801434993743896,
"learning_rate": 1.1707586720869375e-05,
"loss": 0.9557,
"step": 114
},
{
"epoch": 0.46582278481012657,
"grad_norm": 1.2383016347885132,
"learning_rate": 1.1577380739374376e-05,
"loss": 0.9287,
"step": 115
},
{
"epoch": 0.46987341772151897,
"grad_norm": 1.2320594787597656,
"learning_rate": 1.1446899920335407e-05,
"loss": 0.9559,
"step": 116
},
{
"epoch": 0.4739240506329114,
"grad_norm": 1.0938814878463745,
"learning_rate": 1.1316166998293937e-05,
"loss": 0.969,
"step": 117
},
{
"epoch": 0.4779746835443038,
"grad_norm": 1.168281078338623,
"learning_rate": 1.118520475171703e-05,
"loss": 0.9374,
"step": 118
},
{
"epoch": 0.4820253164556962,
"grad_norm": 1.163573145866394,
"learning_rate": 1.1054035999028478e-05,
"loss": 0.9503,
"step": 119
},
{
"epoch": 0.4860759493670886,
"grad_norm": 1.064112901687622,
"learning_rate": 1.092268359463302e-05,
"loss": 0.9221,
"step": 120
},
{
"epoch": 0.490126582278481,
"grad_norm": 1.1465892791748047,
"learning_rate": 1.0791170424934248e-05,
"loss": 0.9178,
"step": 121
},
{
"epoch": 0.4941772151898734,
"grad_norm": 1.2368698120117188,
"learning_rate": 1.0659519404346955e-05,
"loss": 0.9571,
"step": 122
},
{
"epoch": 0.4982278481012658,
"grad_norm": 1.028232455253601,
"learning_rate": 1.0527753471304625e-05,
"loss": 0.9567,
"step": 123
},
{
"epoch": 0.5022784810126583,
"grad_norm": 1.2025481462478638,
"learning_rate": 1.0395895584262696e-05,
"loss": 0.9406,
"step": 124
},
{
"epoch": 0.5063291139240507,
"grad_norm": 1.0962940454483032,
"learning_rate": 1.0263968717698365e-05,
"loss": 0.9053,
"step": 125
},
{
"epoch": 0.5103797468354431,
"grad_norm": 1.102229356765747,
"learning_rate": 1.013199585810759e-05,
"loss": 0.9523,
"step": 126
},
{
"epoch": 0.5144303797468355,
"grad_norm": 1.2116892337799072,
"learning_rate": 1e-05,
"loss": 0.898,
"step": 127
},
{
"epoch": 0.5184810126582279,
"grad_norm": 1.0647423267364502,
"learning_rate": 9.868004141892412e-06,
"loss": 0.946,
"step": 128
},
{
"epoch": 0.5225316455696203,
"grad_norm": 1.1740139722824097,
"learning_rate": 9.73603128230164e-06,
"loss": 0.9145,
"step": 129
},
{
"epoch": 0.5265822784810127,
"grad_norm": 1.168250560760498,
"learning_rate": 9.604104415737309e-06,
"loss": 0.9757,
"step": 130
},
{
"epoch": 0.5306329113924051,
"grad_norm": 1.1168098449707031,
"learning_rate": 9.472246528695377e-06,
"loss": 0.9328,
"step": 131
},
{
"epoch": 0.5346835443037975,
"grad_norm": 1.1775215864181519,
"learning_rate": 9.340480595653047e-06,
"loss": 0.9239,
"step": 132
},
{
"epoch": 0.5387341772151899,
"grad_norm": 1.2135863304138184,
"learning_rate": 9.208829575065754e-06,
"loss": 0.908,
"step": 133
},
{
"epoch": 0.5427848101265823,
"grad_norm": 1.0800964832305908,
"learning_rate": 9.07731640536698e-06,
"loss": 0.9478,
"step": 134
},
{
"epoch": 0.5468354430379747,
"grad_norm": 1.0159581899642944,
"learning_rate": 8.945964000971525e-06,
"loss": 0.9377,
"step": 135
},
{
"epoch": 0.5508860759493671,
"grad_norm": 1.1772390604019165,
"learning_rate": 8.814795248282974e-06,
"loss": 0.9329,
"step": 136
},
{
"epoch": 0.5549367088607595,
"grad_norm": 1.163148283958435,
"learning_rate": 8.683833001706068e-06,
"loss": 0.9317,
"step": 137
},
{
"epoch": 0.5589873417721519,
"grad_norm": 1.167102575302124,
"learning_rate": 8.553100079664598e-06,
"loss": 0.9015,
"step": 138
},
{
"epoch": 0.5630379746835443,
"grad_norm": 1.2055310010910034,
"learning_rate": 8.422619260625626e-06,
"loss": 0.9573,
"step": 139
},
{
"epoch": 0.5670886075949367,
"grad_norm": 1.287394642829895,
"learning_rate": 8.292413279130625e-06,
"loss": 0.9177,
"step": 140
},
{
"epoch": 0.5711392405063291,
"grad_norm": 1.08267343044281,
"learning_rate": 8.162504821834296e-06,
"loss": 0.9502,
"step": 141
},
{
"epoch": 0.5751898734177215,
"grad_norm": 1.1666055917739868,
"learning_rate": 8.03291652355172e-06,
"loss": 0.8894,
"step": 142
},
{
"epoch": 0.579240506329114,
"grad_norm": 1.1575030088424683,
"learning_rate": 7.903670963314536e-06,
"loss": 0.9206,
"step": 143
},
{
"epoch": 0.5832911392405064,
"grad_norm": 1.148395299911499,
"learning_rate": 7.774790660436857e-06,
"loss": 0.9333,
"step": 144
},
{
"epoch": 0.5873417721518988,
"grad_norm": 1.0939373970031738,
"learning_rate": 7.646298070591578e-06,
"loss": 0.9334,
"step": 145
},
{
"epoch": 0.5913924050632912,
"grad_norm": 1.0633858442306519,
"learning_rate": 7.518215581897763e-06,
"loss": 0.911,
"step": 146
},
{
"epoch": 0.5954430379746836,
"grad_norm": 1.0724154710769653,
"learning_rate": 7.390565511019834e-06,
"loss": 0.94,
"step": 147
},
{
"epoch": 0.599493670886076,
"grad_norm": 1.059989333152771,
"learning_rate": 7.263370099279173e-06,
"loss": 0.9125,
"step": 148
},
{
"epoch": 0.6035443037974684,
"grad_norm": 1.0825798511505127,
"learning_rate": 7.136651508778876e-06,
"loss": 0.9218,
"step": 149
},
{
"epoch": 0.6075949367088608,
"grad_norm": 1.1126887798309326,
"learning_rate": 7.010431818542298e-06,
"loss": 0.9038,
"step": 150
},
{
"epoch": 0.6116455696202532,
"grad_norm": 1.0414491891860962,
"learning_rate": 6.884733020666086e-06,
"loss": 0.9031,
"step": 151
},
{
"epoch": 0.6156962025316456,
"grad_norm": 1.1232143640518188,
"learning_rate": 6.759577016488343e-06,
"loss": 0.8803,
"step": 152
},
{
"epoch": 0.619746835443038,
"grad_norm": 1.111212968826294,
"learning_rate": 6.634985612772611e-06,
"loss": 0.9066,
"step": 153
},
{
"epoch": 0.6237974683544304,
"grad_norm": 1.0482438802719116,
"learning_rate": 6.510980517908334e-06,
"loss": 0.9245,
"step": 154
},
{
"epoch": 0.6278481012658228,
"grad_norm": 1.1926794052124023,
"learning_rate": 6.387583338128471e-06,
"loss": 0.8985,
"step": 155
},
{
"epoch": 0.6318987341772152,
"grad_norm": 1.089269995689392,
"learning_rate": 6.264815573744884e-06,
"loss": 0.9339,
"step": 156
},
{
"epoch": 0.6359493670886076,
"grad_norm": 1.158069372177124,
"learning_rate": 6.142698615402205e-06,
"loss": 0.9193,
"step": 157
},
{
"epoch": 0.64,
"grad_norm": 1.1409528255462646,
"learning_rate": 6.021253740350793e-06,
"loss": 0.9499,
"step": 158
},
{
"epoch": 0.6440506329113924,
"grad_norm": 1.042941689491272,
"learning_rate": 5.900502108739466e-06,
"loss": 0.9067,
"step": 159
},
{
"epoch": 0.6481012658227848,
"grad_norm": 1.202627420425415,
"learning_rate": 5.780464759928623e-06,
"loss": 0.8872,
"step": 160
},
{
"epoch": 0.6521518987341772,
"grad_norm": 1.089460015296936,
"learning_rate": 5.66116260882442e-06,
"loss": 0.8848,
"step": 161
},
{
"epoch": 0.6562025316455696,
"grad_norm": 1.09579336643219,
"learning_rate": 5.542616442234618e-06,
"loss": 0.8871,
"step": 162
},
{
"epoch": 0.660253164556962,
"grad_norm": 1.073123574256897,
"learning_rate": 5.42484691524677e-06,
"loss": 0.9174,
"step": 163
},
{
"epoch": 0.6643037974683544,
"grad_norm": 1.0642234086990356,
"learning_rate": 5.307874547629339e-06,
"loss": 0.9088,
"step": 164
},
{
"epoch": 0.6683544303797468,
"grad_norm": 1.055198073387146,
"learning_rate": 5.191719720256407e-06,
"loss": 0.8979,
"step": 165
},
{
"epoch": 0.6724050632911392,
"grad_norm": 1.0669524669647217,
"learning_rate": 5.076402671556578e-06,
"loss": 0.9151,
"step": 166
},
{
"epoch": 0.6764556962025317,
"grad_norm": 1.0371671915054321,
"learning_rate": 4.961943493986709e-06,
"loss": 0.8848,
"step": 167
},
{
"epoch": 0.6805063291139241,
"grad_norm": 1.0839803218841553,
"learning_rate": 4.848362130531039e-06,
"loss": 0.9173,
"step": 168
},
{
"epoch": 0.6845569620253165,
"grad_norm": 1.0065879821777344,
"learning_rate": 4.7356783712264405e-06,
"loss": 0.8986,
"step": 169
},
{
"epoch": 0.6886075949367089,
"grad_norm": 1.0394333600997925,
"learning_rate": 4.623911849714226e-06,
"loss": 0.8955,
"step": 170
},
{
"epoch": 0.6926582278481013,
"grad_norm": 1.0068820714950562,
"learning_rate": 4.5130820398192645e-06,
"loss": 0.8658,
"step": 171
},
{
"epoch": 0.6967088607594937,
"grad_norm": 1.0753767490386963,
"learning_rate": 4.403208252156921e-06,
"loss": 0.8855,
"step": 172
},
{
"epoch": 0.7007594936708861,
"grad_norm": 1.0549858808517456,
"learning_rate": 4.294309630768452e-06,
"loss": 0.9062,
"step": 173
},
{
"epoch": 0.7048101265822785,
"grad_norm": 1.1192678213119507,
"learning_rate": 4.186405149785403e-06,
"loss": 0.8991,
"step": 174
},
{
"epoch": 0.7088607594936709,
"grad_norm": 1.0334043502807617,
"learning_rate": 4.079513610123619e-06,
"loss": 0.9067,
"step": 175
},
{
"epoch": 0.7129113924050633,
"grad_norm": 1.1050986051559448,
"learning_rate": 3.973653636207437e-06,
"loss": 0.8982,
"step": 176
},
{
"epoch": 0.7169620253164557,
"grad_norm": 1.1419743299484253,
"learning_rate": 3.86884367272463e-06,
"loss": 0.9532,
"step": 177
},
{
"epoch": 0.7210126582278481,
"grad_norm": 1.052414894104004,
"learning_rate": 3.7651019814126656e-06,
"loss": 0.87,
"step": 178
},
{
"epoch": 0.7250632911392405,
"grad_norm": 1.1054644584655762,
"learning_rate": 3.6624466378768387e-06,
"loss": 0.9101,
"step": 179
},
{
"epoch": 0.7291139240506329,
"grad_norm": 1.0414938926696777,
"learning_rate": 3.560895528440844e-06,
"loss": 0.882,
"step": 180
},
{
"epoch": 0.7331645569620253,
"grad_norm": 1.041466474533081,
"learning_rate": 3.460466347030319e-06,
"loss": 0.8985,
"step": 181
},
{
"epoch": 0.7372151898734177,
"grad_norm": 1.0860273838043213,
"learning_rate": 3.361176592089919e-06,
"loss": 0.9046,
"step": 182
},
{
"epoch": 0.7412658227848101,
"grad_norm": 1.0401769876480103,
"learning_rate": 3.2630435635344283e-06,
"loss": 0.915,
"step": 183
},
{
"epoch": 0.7453164556962025,
"grad_norm": 1.0929770469665527,
"learning_rate": 3.1660843597345137e-06,
"loss": 0.8906,
"step": 184
},
{
"epoch": 0.7493670886075949,
"grad_norm": 1.0812665224075317,
"learning_rate": 3.0703158745375316e-06,
"loss": 0.9205,
"step": 185
},
{
"epoch": 0.7534177215189873,
"grad_norm": 1.010650873184204,
"learning_rate": 2.975754794324015e-06,
"loss": 0.8729,
"step": 186
},
{
"epoch": 0.7574683544303797,
"grad_norm": 1.110718846321106,
"learning_rate": 2.8824175951002918e-06,
"loss": 0.8899,
"step": 187
},
{
"epoch": 0.7615189873417721,
"grad_norm": 1.1400407552719116,
"learning_rate": 2.7903205396277546e-06,
"loss": 0.904,
"step": 188
},
{
"epoch": 0.7655696202531646,
"grad_norm": 0.9677958488464355,
"learning_rate": 2.6994796745893e-06,
"loss": 0.9165,
"step": 189
},
{
"epoch": 0.769620253164557,
"grad_norm": 1.0478403568267822,
"learning_rate": 2.6099108277934105e-06,
"loss": 0.8733,
"step": 190
},
{
"epoch": 0.7736708860759494,
"grad_norm": 1.0226112604141235,
"learning_rate": 2.5216296054163547e-06,
"loss": 0.9155,
"step": 191
},
{
"epoch": 0.7777215189873418,
"grad_norm": 1.0125083923339844,
"learning_rate": 2.4346513892830427e-06,
"loss": 0.9454,
"step": 192
},
{
"epoch": 0.7817721518987342,
"grad_norm": 1.0066049098968506,
"learning_rate": 2.34899133418692e-06,
"loss": 0.9016,
"step": 193
},
{
"epoch": 0.7858227848101266,
"grad_norm": 1.0303598642349243,
"learning_rate": 2.2646643652494693e-06,
"loss": 0.8988,
"step": 194
},
{
"epoch": 0.789873417721519,
"grad_norm": 0.9669736623764038,
"learning_rate": 2.1816851753197023e-06,
"loss": 0.8754,
"step": 195
},
{
"epoch": 0.7939240506329114,
"grad_norm": 1.039135217666626,
"learning_rate": 2.100068222414121e-06,
"loss": 0.8954,
"step": 196
},
{
"epoch": 0.7979746835443038,
"grad_norm": 1.032785415649414,
"learning_rate": 2.019827727197605e-06,
"loss": 0.8981,
"step": 197
},
{
"epoch": 0.8020253164556962,
"grad_norm": 0.9627246260643005,
"learning_rate": 1.9409776705056514e-06,
"loss": 0.8895,
"step": 198
},
{
"epoch": 0.8060759493670886,
"grad_norm": 0.9815201163291931,
"learning_rate": 1.8635317909083983e-06,
"loss": 0.903,
"step": 199
},
{
"epoch": 0.810126582278481,
"grad_norm": 1.0689142942428589,
"learning_rate": 1.7875035823168641e-06,
"loss": 0.9047,
"step": 200
},
{
"epoch": 0.8141772151898734,
"grad_norm": 1.0282758474349976,
"learning_rate": 1.712906291631814e-06,
"loss": 0.8979,
"step": 201
},
{
"epoch": 0.8182278481012658,
"grad_norm": 1.0624666213989258,
"learning_rate": 1.6397529164356606e-06,
"loss": 0.8986,
"step": 202
},
{
"epoch": 0.8222784810126582,
"grad_norm": 0.9882222414016724,
"learning_rate": 1.5680562027278156e-06,
"loss": 0.9011,
"step": 203
},
{
"epoch": 0.8263291139240506,
"grad_norm": 1.0419288873672485,
"learning_rate": 1.4978286427038602e-06,
"loss": 0.8815,
"step": 204
},
{
"epoch": 0.830379746835443,
"grad_norm": 1.0580886602401733,
"learning_rate": 1.4290824725789542e-06,
"loss": 0.8613,
"step": 205
},
{
"epoch": 0.8344303797468354,
"grad_norm": 0.9857713580131531,
"learning_rate": 1.3618296704558364e-06,
"loss": 0.8646,
"step": 206
},
{
"epoch": 0.8384810126582278,
"grad_norm": 1.0579966306686401,
"learning_rate": 1.2960819542378055e-06,
"loss": 0.9152,
"step": 207
},
{
"epoch": 0.8425316455696202,
"grad_norm": 1.0483579635620117,
"learning_rate": 1.2318507795870138e-06,
"loss": 0.9258,
"step": 208
},
{
"epoch": 0.8465822784810126,
"grad_norm": 1.0411852598190308,
"learning_rate": 1.1691473379284945e-06,
"loss": 0.8713,
"step": 209
},
{
"epoch": 0.850632911392405,
"grad_norm": 0.9969512820243835,
"learning_rate": 1.1079825545001887e-06,
"loss": 0.8882,
"step": 210
},
{
"epoch": 0.8546835443037974,
"grad_norm": 0.9896324872970581,
"learning_rate": 1.0483670864493777e-06,
"loss": 0.8773,
"step": 211
},
{
"epoch": 0.8587341772151899,
"grad_norm": 0.9491327404975891,
"learning_rate": 9.903113209758098e-07,
"loss": 0.8902,
"step": 212
},
{
"epoch": 0.8627848101265823,
"grad_norm": 1.1105666160583496,
"learning_rate": 9.33825373521875e-07,
"loss": 0.8639,
"step": 213
},
{
"epoch": 0.8668354430379747,
"grad_norm": 1.0579783916473389,
"learning_rate": 8.789190860101226e-07,
"loss": 0.8608,
"step": 214
},
{
"epoch": 0.8708860759493671,
"grad_norm": 1.0631372928619385,
"learning_rate": 8.256020251284381e-07,
"loss": 0.8739,
"step": 215
},
{
"epoch": 0.8749367088607595,
"grad_norm": 1.0193462371826172,
"learning_rate": 7.738834806631712e-07,
"loss": 0.8666,
"step": 216
},
{
"epoch": 0.8789873417721519,
"grad_norm": 0.9283171892166138,
"learning_rate": 7.237724638805221e-07,
"loss": 0.9192,
"step": 217
},
{
"epoch": 0.8830379746835443,
"grad_norm": 0.9990526437759399,
"learning_rate": 6.752777059564431e-07,
"loss": 0.8985,
"step": 218
},
{
"epoch": 0.8870886075949367,
"grad_norm": 0.9724840521812439,
"learning_rate": 6.284076564553465e-07,
"loss": 0.8909,
"step": 219
},
{
"epoch": 0.8911392405063291,
"grad_norm": 0.9561246633529663,
"learning_rate": 5.831704818578842e-07,
"loss": 0.8756,
"step": 220
},
{
"epoch": 0.8951898734177215,
"grad_norm": 0.9968512654304504,
"learning_rate": 5.395740641380532e-07,
"loss": 0.924,
"step": 221
},
{
"epoch": 0.8992405063291139,
"grad_norm": 1.0275803804397583,
"learning_rate": 4.976259993898503e-07,
"loss": 0.8724,
"step": 222
},
{
"epoch": 0.9032911392405063,
"grad_norm": 1.1665242910385132,
"learning_rate": 4.573335965037706e-07,
"loss": 0.8611,
"step": 223
},
{
"epoch": 0.9073417721518987,
"grad_norm": 1.0668385028839111,
"learning_rate": 4.187038758933204e-07,
"loss": 0.8902,
"step": 224
},
{
"epoch": 0.9113924050632911,
"grad_norm": 1.0450729131698608,
"learning_rate": 3.817435682718096e-07,
"loss": 0.8827,
"step": 225
},
{
"epoch": 0.9154430379746835,
"grad_norm": 0.950907826423645,
"learning_rate": 3.4645911347961357e-07,
"loss": 0.8826,
"step": 226
},
{
"epoch": 0.9194936708860759,
"grad_norm": 1.0657615661621094,
"learning_rate": 3.128566593621152e-07,
"loss": 0.9196,
"step": 227
},
{
"epoch": 0.9235443037974683,
"grad_norm": 1.054658055305481,
"learning_rate": 2.809420606985236e-07,
"loss": 0.8949,
"step": 228
},
{
"epoch": 0.9275949367088607,
"grad_norm": 0.9942110776901245,
"learning_rate": 2.507208781817638e-07,
"loss": 0.8986,
"step": 229
},
{
"epoch": 0.9316455696202531,
"grad_norm": 1.0271327495574951,
"learning_rate": 2.2219837744959284e-07,
"loss": 0.9016,
"step": 230
},
{
"epoch": 0.9356962025316455,
"grad_norm": 1.0039608478546143,
"learning_rate": 1.9537952816713334e-07,
"loss": 0.8706,
"step": 231
},
{
"epoch": 0.9397468354430379,
"grad_norm": 0.9501670598983765,
"learning_rate": 1.7026900316098217e-07,
"loss": 0.9061,
"step": 232
},
{
"epoch": 0.9437974683544303,
"grad_norm": 1.018248200416565,
"learning_rate": 1.4687117760502579e-07,
"loss": 0.8873,
"step": 233
},
{
"epoch": 0.9478481012658228,
"grad_norm": 0.9783368110656738,
"learning_rate": 1.2519012825812804e-07,
"loss": 0.9005,
"step": 234
},
{
"epoch": 0.9518987341772152,
"grad_norm": 0.9772806167602539,
"learning_rate": 1.0522963275380494e-07,
"loss": 0.9059,
"step": 235
},
{
"epoch": 0.9559493670886076,
"grad_norm": 0.985298216342926,
"learning_rate": 8.699316894203225e-08,
"loss": 0.8724,
"step": 236
},
{
"epoch": 0.96,
"grad_norm": 1.0296227931976318,
"learning_rate": 7.048391428326585e-08,
"loss": 0.8971,
"step": 237
},
{
"epoch": 0.9640506329113924,
"grad_norm": 1.0708626508712769,
"learning_rate": 5.5704745294815624e-08,
"loss": 0.9268,
"step": 238
},
{
"epoch": 0.9681012658227848,
"grad_norm": 0.9980558156967163,
"learning_rate": 4.2658237049655325e-08,
"loss": 0.874,
"step": 239
},
{
"epoch": 0.9721518987341772,
"grad_norm": 0.9491819739341736,
"learning_rate": 3.134666272774034e-08,
"loss": 0.8829,
"step": 240
},
{
"epoch": 0.9762025316455696,
"grad_norm": 1.0668548345565796,
"learning_rate": 2.177199321994672e-08,
"loss": 0.8689,
"step": 241
},
{
"epoch": 0.980253164556962,
"grad_norm": 1.0019922256469727,
"learning_rate": 1.3935896784663671e-08,
"loss": 0.8823,
"step": 242
},
{
"epoch": 0.9843037974683544,
"grad_norm": 1.0567593574523926,
"learning_rate": 7.83973875712385e-09,
"loss": 0.9175,
"step": 243
},
{
"epoch": 0.9883544303797468,
"grad_norm": 1.0056061744689941,
"learning_rate": 3.4845813115114147e-09,
"loss": 0.8979,
"step": 244
},
{
"epoch": 0.9924050632911392,
"grad_norm": 0.9826939702033997,
"learning_rate": 8.711832758934169e-10,
"loss": 0.8757,
"step": 245
},
{
"epoch": 0.9964556962025316,
"grad_norm": 0.9738645553588867,
"learning_rate": 0.0,
"loss": 0.9027,
"step": 246
},
{
"epoch": 0.9964556962025316,
"step": 246,
"total_flos": 2.1359726465573192e+18,
"train_loss": 0.9420681825982846,
"train_runtime": 3147.8466,
"train_samples_per_second": 20.072,
"train_steps_per_second": 0.078
}
],
"logging_steps": 1.0,
"max_steps": 246,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 400,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.1359726465573192e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}