| { | |
| "best_metric": 1.3063520193099976, | |
| "best_model_checkpoint": "/mnt/users/n3thakur/vectara/huggingface-dpo/trained_models/v3/Meta-Llama-3-8B-Instruct-miracl-mix-raft-sft-25th-apr-v1.0/checkpoint-2000", | |
| "epoch": 0.9996544972935621, | |
| "eval_steps": 200, | |
| "global_step": 2170, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 1.0262953847325709, | |
| "learning_rate": 4.608294930875576e-08, | |
| "loss": 1.7621, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 0.954880939596983, | |
| "learning_rate": 2.3041474654377884e-07, | |
| "loss": 1.7602, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 1.0004644202041775, | |
| "learning_rate": 4.608294930875577e-07, | |
| "loss": 1.8162, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.0280115397490373, | |
| "learning_rate": 6.912442396313365e-07, | |
| "loss": 1.7724, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 0.8732665912953524, | |
| "learning_rate": 9.216589861751154e-07, | |
| "loss": 1.7378, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 0.9441885280056963, | |
| "learning_rate": 1.1520737327188942e-06, | |
| "loss": 1.7507, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 0.9845213443275476, | |
| "learning_rate": 1.382488479262673e-06, | |
| "loss": 1.7561, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 0.9521516823054887, | |
| "learning_rate": 1.6129032258064516e-06, | |
| "loss": 1.7479, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.0047303931298321, | |
| "learning_rate": 1.8433179723502307e-06, | |
| "loss": 1.7773, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 0.9230639224993269, | |
| "learning_rate": 2.0737327188940094e-06, | |
| "loss": 1.7752, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 0.9112137084493539, | |
| "learning_rate": 2.3041474654377884e-06, | |
| "loss": 1.7213, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 0.8909029043612109, | |
| "learning_rate": 2.5345622119815673e-06, | |
| "loss": 1.7881, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 0.7189360071641709, | |
| "learning_rate": 2.764976958525346e-06, | |
| "loss": 1.6636, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 0.6692633878715751, | |
| "learning_rate": 2.9953917050691243e-06, | |
| "loss": 1.6508, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 0.6188511577538577, | |
| "learning_rate": 3.225806451612903e-06, | |
| "loss": 1.7011, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 0.6279410712564498, | |
| "learning_rate": 3.4562211981566825e-06, | |
| "loss": 1.6403, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 0.5342697084988228, | |
| "learning_rate": 3.6866359447004615e-06, | |
| "loss": 1.656, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 0.44402512139023, | |
| "learning_rate": 3.91705069124424e-06, | |
| "loss": 1.6617, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 0.4644619191475398, | |
| "learning_rate": 4.147465437788019e-06, | |
| "loss": 1.6303, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 0.371289417426217, | |
| "learning_rate": 4.377880184331797e-06, | |
| "loss": 1.6453, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 0.3309787275557066, | |
| "learning_rate": 4.608294930875577e-06, | |
| "loss": 1.5851, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 0.31477252449769144, | |
| "learning_rate": 4.838709677419355e-06, | |
| "loss": 1.604, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 0.2612544823481869, | |
| "learning_rate": 5.0691244239631346e-06, | |
| "loss": 1.5725, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 0.2605484332821398, | |
| "learning_rate": 5.299539170506913e-06, | |
| "loss": 1.5644, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 0.25148232016385397, | |
| "learning_rate": 5.529953917050692e-06, | |
| "loss": 1.5467, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 0.2601998287824473, | |
| "learning_rate": 5.76036866359447e-06, | |
| "loss": 1.589, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 0.2385121609938763, | |
| "learning_rate": 5.9907834101382485e-06, | |
| "loss": 1.6055, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 0.24629649793790628, | |
| "learning_rate": 6.221198156682028e-06, | |
| "loss": 1.5373, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 0.2468655185850127, | |
| "learning_rate": 6.451612903225806e-06, | |
| "loss": 1.5614, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 0.22223391970377232, | |
| "learning_rate": 6.682027649769586e-06, | |
| "loss": 1.5624, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 0.2062049955510628, | |
| "learning_rate": 6.912442396313365e-06, | |
| "loss": 1.5013, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 0.19876212655323225, | |
| "learning_rate": 7.1428571428571436e-06, | |
| "loss": 1.575, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 0.19765238479942285, | |
| "learning_rate": 7.373271889400923e-06, | |
| "loss": 1.5167, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 0.19240215611308686, | |
| "learning_rate": 7.603686635944701e-06, | |
| "loss": 1.5071, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 0.2001053295395004, | |
| "learning_rate": 7.83410138248848e-06, | |
| "loss": 1.4932, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 0.20863957442778325, | |
| "learning_rate": 8.064516129032258e-06, | |
| "loss": 1.5371, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 0.1880073967302754, | |
| "learning_rate": 8.294930875576038e-06, | |
| "loss": 1.492, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 0.18052519991071567, | |
| "learning_rate": 8.525345622119815e-06, | |
| "loss": 1.5039, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 0.17252733686762506, | |
| "learning_rate": 8.755760368663595e-06, | |
| "loss": 1.492, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 0.1754856007551659, | |
| "learning_rate": 8.986175115207374e-06, | |
| "loss": 1.4926, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 0.16521189205999245, | |
| "learning_rate": 9.216589861751153e-06, | |
| "loss": 1.4903, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "eval_loss": 1.3960996866226196, | |
| "eval_runtime": 1753.0374, | |
| "eval_samples_per_second": 2.162, | |
| "eval_steps_per_second": 0.27, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 0.1618648599795676, | |
| "learning_rate": 9.447004608294931e-06, | |
| "loss": 1.4499, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 0.1609921643942943, | |
| "learning_rate": 9.67741935483871e-06, | |
| "loss": 1.4775, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 0.161807092383862, | |
| "learning_rate": 9.90783410138249e-06, | |
| "loss": 1.4779, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 0.15789342984068674, | |
| "learning_rate": 9.999941779365509e-06, | |
| "loss": 1.4064, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 0.17953421611953463, | |
| "learning_rate": 9.99958599150926e-06, | |
| "loss": 1.4216, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 0.17033046165043145, | |
| "learning_rate": 9.998906783581494e-06, | |
| "loss": 1.4872, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 0.15935058975437466, | |
| "learning_rate": 9.997904199519748e-06, | |
| "loss": 1.4473, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 0.16188090301066688, | |
| "learning_rate": 9.996578304180551e-06, | |
| "loss": 1.4484, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 0.1692826467439708, | |
| "learning_rate": 9.994929183335237e-06, | |
| "loss": 1.4576, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 0.16233371930510018, | |
| "learning_rate": 9.992956943664401e-06, | |
| "loss": 1.4674, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 0.1601701723529579, | |
| "learning_rate": 9.99066171275098e-06, | |
| "loss": 1.434, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 0.1515934744211598, | |
| "learning_rate": 9.988043639072021e-06, | |
| "loss": 1.469, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 0.1558360755989006, | |
| "learning_rate": 9.985102891989063e-06, | |
| "loss": 1.4688, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 0.20565866787405954, | |
| "learning_rate": 9.98183966173718e-06, | |
| "loss": 1.4794, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 0.16589082756750548, | |
| "learning_rate": 9.97825415941269e-06, | |
| "loss": 1.4514, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 0.16179243903344054, | |
| "learning_rate": 9.974346616959476e-06, | |
| "loss": 1.4802, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 0.15782689090886812, | |
| "learning_rate": 9.970117287154004e-06, | |
| "loss": 1.4356, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 0.15954652235182576, | |
| "learning_rate": 9.965566443588956e-06, | |
| "loss": 1.3886, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 0.16253218390477409, | |
| "learning_rate": 9.960694380655539e-06, | |
| "loss": 1.456, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 0.15571215758010262, | |
| "learning_rate": 9.955501413524438e-06, | |
| "loss": 1.4038, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 0.15764668416666167, | |
| "learning_rate": 9.949987878125427e-06, | |
| "loss": 1.4292, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 0.15565461047093965, | |
| "learning_rate": 9.944154131125643e-06, | |
| "loss": 1.3845, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 0.16274505869667716, | |
| "learning_rate": 9.938000549906509e-06, | |
| "loss": 1.4143, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 0.15771377018169355, | |
| "learning_rate": 9.93152753253932e-06, | |
| "loss": 1.414, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 0.15819595549860918, | |
| "learning_rate": 9.924735497759497e-06, | |
| "loss": 1.398, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 0.16836842547624103, | |
| "learning_rate": 9.917624884939495e-06, | |
| "loss": 1.415, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 0.16454346246919038, | |
| "learning_rate": 9.910196154060381e-06, | |
| "loss": 1.5025, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.15952712351687925, | |
| "learning_rate": 9.902449785682084e-06, | |
| "loss": 1.4602, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.15913354058980336, | |
| "learning_rate": 9.894386280912298e-06, | |
| "loss": 1.4437, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.16526633029775475, | |
| "learning_rate": 9.88600616137407e-06, | |
| "loss": 1.443, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.1570973278965336, | |
| "learning_rate": 9.877309969172065e-06, | |
| "loss": 1.4001, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 0.16323401596231638, | |
| "learning_rate": 9.868298266857477e-06, | |
| "loss": 1.4115, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 0.16999252145703372, | |
| "learning_rate": 9.858971637391662e-06, | |
| "loss": 1.431, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 0.1598876979960763, | |
| "learning_rate": 9.849330684108409e-06, | |
| "loss": 1.3925, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 0.16716694688842804, | |
| "learning_rate": 9.83937603067492e-06, | |
| "loss": 1.4369, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 0.17160997265475317, | |
| "learning_rate": 9.829108321051461e-06, | |
| "loss": 1.4236, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 0.1709419213466886, | |
| "learning_rate": 9.818528219449705e-06, | |
| "loss": 1.4156, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 0.1706660498176678, | |
| "learning_rate": 9.807636410289767e-06, | |
| "loss": 1.3531, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 0.1654207117166432, | |
| "learning_rate": 9.796433598155928e-06, | |
| "loss": 1.4282, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 0.17005569486504588, | |
| "learning_rate": 9.784920507751052e-06, | |
| "loss": 1.465, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "eval_loss": 1.3499208688735962, | |
| "eval_runtime": 1761.5948, | |
| "eval_samples_per_second": 2.151, | |
| "eval_steps_per_second": 0.269, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 0.16465910018813473, | |
| "learning_rate": 9.773097883849715e-06, | |
| "loss": 1.4856, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 0.1687241699251977, | |
| "learning_rate": 9.760966491250018e-06, | |
| "loss": 1.4448, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 0.173826161389088, | |
| "learning_rate": 9.748527114724111e-06, | |
| "loss": 1.4588, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 0.1664583424134419, | |
| "learning_rate": 9.735780558967434e-06, | |
| "loss": 1.3651, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 0.16305809613976227, | |
| "learning_rate": 9.72272764854666e-06, | |
| "loss": 1.386, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 0.16712240916605367, | |
| "learning_rate": 9.709369227846346e-06, | |
| "loss": 1.4249, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 0.1685889272801495, | |
| "learning_rate": 9.695706161014322e-06, | |
| "loss": 1.4629, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 0.16213828032523545, | |
| "learning_rate": 9.681739331905784e-06, | |
| "loss": 1.4633, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 0.1722162760562697, | |
| "learning_rate": 9.667469644026118e-06, | |
| "loss": 1.4147, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 0.16634245789106444, | |
| "learning_rate": 9.652898020472449e-06, | |
| "loss": 1.4254, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 0.17414333030226264, | |
| "learning_rate": 9.638025403873939e-06, | |
| "loss": 1.3734, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 0.17078620575685172, | |
| "learning_rate": 9.622852756330797e-06, | |
| "loss": 1.4313, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 0.17201073443780537, | |
| "learning_rate": 9.60738105935204e-06, | |
| "loss": 1.4412, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 0.1781118635582674, | |
| "learning_rate": 9.59161131379201e-06, | |
| "loss": 1.4102, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 0.17417835939253798, | |
| "learning_rate": 9.575544539785626e-06, | |
| "loss": 1.4311, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 0.17050610923384396, | |
| "learning_rate": 9.559181776682387e-06, | |
| "loss": 1.4627, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 0.176092363473546, | |
| "learning_rate": 9.542524082979138e-06, | |
| "loss": 1.4517, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 0.17498193119749225, | |
| "learning_rate": 9.525572536251608e-06, | |
| "loss": 1.3956, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 0.17003085925682157, | |
| "learning_rate": 9.50832823308468e-06, | |
| "loss": 1.4012, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 0.17064790853058265, | |
| "learning_rate": 9.490792289001476e-06, | |
| "loss": 1.3523, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 0.1799693810618531, | |
| "learning_rate": 9.472965838391187e-06, | |
| "loss": 1.4446, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 0.16868170325096435, | |
| "learning_rate": 9.454850034435679e-06, | |
| "loss": 1.3912, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 0.16981991686336434, | |
| "learning_rate": 9.436446049034913e-06, | |
| "loss": 1.3986, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 0.1755120644725739, | |
| "learning_rate": 9.417755072731121e-06, | |
| "loss": 1.4117, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 0.1757934575464805, | |
| "learning_rate": 9.398778314631801e-06, | |
| "loss": 1.3587, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 0.173876963970309, | |
| "learning_rate": 9.379517002331489e-06, | |
| "loss": 1.3862, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 0.17746114421749437, | |
| "learning_rate": 9.359972381832358e-06, | |
| "loss": 1.4309, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 0.16869333613216586, | |
| "learning_rate": 9.340145717463609e-06, | |
| "loss": 1.4118, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 0.17746743334088458, | |
| "learning_rate": 9.320038291799679e-06, | |
| "loss": 1.4433, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 0.17587914404814756, | |
| "learning_rate": 9.299651405577286e-06, | |
| "loss": 1.4421, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 0.18048857244310448, | |
| "learning_rate": 9.278986377611266e-06, | |
| "loss": 1.4221, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 0.17371693231140117, | |
| "learning_rate": 9.258044544709276e-06, | |
| "loss": 1.4131, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 0.17693122935797964, | |
| "learning_rate": 9.236827261585306e-06, | |
| "loss": 1.4205, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 0.18630601640696606, | |
| "learning_rate": 9.215335900772048e-06, | |
| "loss": 1.4067, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 0.17220695844147543, | |
| "learning_rate": 9.193571852532112e-06, | |
| "loss": 1.3834, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 0.18435801453629633, | |
| "learning_rate": 9.17153652476808e-06, | |
| "loss": 1.3485, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 0.1719982815834864, | |
| "learning_rate": 9.14923134293144e-06, | |
| "loss": 1.4265, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 0.17622887749418029, | |
| "learning_rate": 9.126657749930365e-06, | |
| "loss": 1.4242, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 0.1772303960409497, | |
| "learning_rate": 9.103817206036383e-06, | |
| "loss": 1.3901, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.18039291853040396, | |
| "learning_rate": 9.080711188789903e-06, | |
| "loss": 1.4193, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "eval_loss": 1.333003282546997, | |
| "eval_runtime": 1759.7499, | |
| "eval_samples_per_second": 2.154, | |
| "eval_steps_per_second": 0.269, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.18468507528075692, | |
| "learning_rate": 9.057341192904641e-06, | |
| "loss": 1.4663, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.17990669625911423, | |
| "learning_rate": 9.033708730170925e-06, | |
| "loss": 1.4289, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.17925115015014306, | |
| "learning_rate": 9.009815329357893e-06, | |
| "loss": 1.4337, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 0.1742038025383068, | |
| "learning_rate": 8.985662536114614e-06, | |
| "loss": 1.4156, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 0.18266353155608991, | |
| "learning_rate": 8.961251912870077e-06, | |
| "loss": 1.3896, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 0.18057511349721395, | |
| "learning_rate": 8.936585038732143e-06, | |
| "loss": 1.3764, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 0.18596859871805246, | |
| "learning_rate": 8.91166350938537e-06, | |
| "loss": 1.4193, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 0.18986971270095016, | |
| "learning_rate": 8.886488936987817e-06, | |
| "loss": 1.3955, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.17418454140586195, | |
| "learning_rate": 8.861062950066723e-06, | |
| "loss": 1.427, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.18149732001872015, | |
| "learning_rate": 8.835387193413185e-06, | |
| "loss": 1.4046, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.18147198520857166, | |
| "learning_rate": 8.809463327975741e-06, | |
| "loss": 1.4058, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.18099108296498378, | |
| "learning_rate": 8.783293030752932e-06, | |
| "loss": 1.4066, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 0.1794240687483909, | |
| "learning_rate": 8.756877994684818e-06, | |
| "loss": 1.3921, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 0.1839307276635119, | |
| "learning_rate": 8.730219928543458e-06, | |
| "loss": 1.4054, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 0.18145863325096048, | |
| "learning_rate": 8.703320556822375e-06, | |
| "loss": 1.4053, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 0.1808487553528171, | |
| "learning_rate": 8.676181619624996e-06, | |
| "loss": 1.4055, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.1862476804387168, | |
| "learning_rate": 8.648804872552092e-06, | |
| "loss": 1.3841, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.19563150015616995, | |
| "learning_rate": 8.6211920865882e-06, | |
| "loss": 1.371, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.18923848385153544, | |
| "learning_rate": 8.593345047987069e-06, | |
| "loss": 1.3988, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.18078263758464272, | |
| "learning_rate": 8.565265558156101e-06, | |
| "loss": 1.4024, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.18258637947544226, | |
| "learning_rate": 8.536955433539824e-06, | |
| "loss": 1.371, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 0.19116331272141834, | |
| "learning_rate": 8.508416505502383e-06, | |
| "loss": 1.4456, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 0.17469959227839357, | |
| "learning_rate": 8.479650620209072e-06, | |
| "loss": 1.385, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 0.18410741548679613, | |
| "learning_rate": 8.450659638506908e-06, | |
| "loss": 1.4095, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 0.19634261946282605, | |
| "learning_rate": 8.421445435804255e-06, | |
| "loss": 1.3513, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 0.17826721350047323, | |
| "learning_rate": 8.3920099019495e-06, | |
| "loss": 1.3792, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 0.1826653979119606, | |
| "learning_rate": 8.362354941108803e-06, | |
| "loss": 1.4448, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 0.18664731594802075, | |
| "learning_rate": 8.33248247164292e-06, | |
| "loss": 1.3751, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 0.18231556377003602, | |
| "learning_rate": 8.3023944259831e-06, | |
| "loss": 1.3773, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 0.18711137034484868, | |
| "learning_rate": 8.272092750506084e-06, | |
| "loss": 1.4096, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 0.1877955269203901, | |
| "learning_rate": 8.241579405408192e-06, | |
| "loss": 1.3902, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 0.18482019451091206, | |
| "learning_rate": 8.21085636457851e-06, | |
| "loss": 1.3734, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 0.19891330231660218, | |
| "learning_rate": 8.179925615471218e-06, | |
| "loss": 1.4061, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 0.18663983192529415, | |
| "learning_rate": 8.148789158977012e-06, | |
| "loss": 1.3326, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.1874487096476331, | |
| "learning_rate": 8.117449009293668e-06, | |
| "loss": 1.3384, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.18710305973456598, | |
| "learning_rate": 8.085907193795745e-06, | |
| "loss": 1.3828, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.18416014945175566, | |
| "learning_rate": 8.05416575290344e-06, | |
| "loss": 1.3737, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.18615555988464447, | |
| "learning_rate": 8.022226739950587e-06, | |
| "loss": 1.4359, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 0.18594902983475312, | |
| "learning_rate": 7.990092221051835e-06, | |
| "loss": 1.389, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 0.18537070852284854, | |
| "learning_rate": 7.95776427496899e-06, | |
| "loss": 1.3593, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "eval_loss": 1.323183298110962, | |
| "eval_runtime": 1742.9319, | |
| "eval_samples_per_second": 2.174, | |
| "eval_steps_per_second": 0.272, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 0.1908169492182471, | |
| "learning_rate": 7.925244992976538e-06, | |
| "loss": 1.3406, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 0.18784810075497232, | |
| "learning_rate": 7.89253647872637e-06, | |
| "loss": 1.3842, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 0.19406647113841424, | |
| "learning_rate": 7.859640848111686e-06, | |
| "loss": 1.4286, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 0.19197603494160256, | |
| "learning_rate": 7.826560229130132e-06, | |
| "loss": 1.3928, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 0.19099716433921685, | |
| "learning_rate": 7.793296761746126e-06, | |
| "loss": 1.362, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 0.18788356013356616, | |
| "learning_rate": 7.759852597752447e-06, | |
| "loss": 1.4034, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 0.1921444320557867, | |
| "learning_rate": 7.726229900631015e-06, | |
| "loss": 1.3793, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 0.20734046130350145, | |
| "learning_rate": 7.692430845412946e-06, | |
| "loss": 1.4203, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 0.19179256662995678, | |
| "learning_rate": 7.658457618537853e-06, | |
| "loss": 1.4021, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 0.18555040743415147, | |
| "learning_rate": 7.624312417712403e-06, | |
| "loss": 1.423, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 0.19398963612254347, | |
| "learning_rate": 7.58999745176815e-06, | |
| "loss": 1.4367, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.19074855950766817, | |
| "learning_rate": 7.555514940518647e-06, | |
| "loss": 1.3695, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.1893550054117395, | |
| "learning_rate": 7.520867114615844e-06, | |
| "loss": 1.3939, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.19196726666071628, | |
| "learning_rate": 7.486056215405797e-06, | |
| "loss": 1.3964, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.2100904294893222, | |
| "learning_rate": 7.451084494783668e-06, | |
| "loss": 1.3753, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 0.1870023533707271, | |
| "learning_rate": 7.415954215048057e-06, | |
| "loss": 1.379, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 0.19635898208120364, | |
| "learning_rate": 7.38066764875465e-06, | |
| "loss": 1.4329, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 0.1896615635850299, | |
| "learning_rate": 7.345227078569218e-06, | |
| "loss": 1.357, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 0.19424130426015207, | |
| "learning_rate": 7.309634797119941e-06, | |
| "loss": 1.3774, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 0.22888693201104138, | |
| "learning_rate": 7.273893106849108e-06, | |
| "loss": 1.3976, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 0.1919456484613934, | |
| "learning_rate": 7.23800431986417e-06, | |
| "loss": 1.378, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 0.19241540158105003, | |
| "learning_rate": 7.201970757788172e-06, | |
| "loss": 1.4094, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 0.19333437065467562, | |
| "learning_rate": 7.165794751609569e-06, | |
| "loss": 1.3971, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 0.19460601771644864, | |
| "learning_rate": 7.1294786415314336e-06, | |
| "loss": 1.3879, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 0.18754355788787921, | |
| "learning_rate": 7.093024776820076e-06, | |
| "loss": 1.3534, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 0.18648196033472134, | |
| "learning_rate": 7.056435515653059e-06, | |
| "loss": 1.3969, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 0.18580702737411495, | |
| "learning_rate": 7.019713224966664e-06, | |
| "loss": 1.4416, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 0.24918096880464727, | |
| "learning_rate": 6.9828602803027664e-06, | |
| "loss": 1.3814, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.19003799001704857, | |
| "learning_rate": 6.945879065655164e-06, | |
| "loss": 1.3581, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.19777328354162663, | |
| "learning_rate": 6.90877197331536e-06, | |
| "loss": 1.3883, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.1978053982444075, | |
| "learning_rate": 6.871541403717808e-06, | |
| "loss": 1.4298, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.19360877663534273, | |
| "learning_rate": 6.83418976528462e-06, | |
| "loss": 1.3623, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.19124472077513613, | |
| "learning_rate": 6.7967194742697866e-06, | |
| "loss": 1.3965, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.2026314484251062, | |
| "learning_rate": 6.759132954602852e-06, | |
| "loss": 1.3889, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.20177996042623167, | |
| "learning_rate": 6.721432637732117e-06, | |
| "loss": 1.3987, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.19199095023199664, | |
| "learning_rate": 6.6836209624673575e-06, | |
| "loss": 1.3658, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.19092380360827313, | |
| "learning_rate": 6.64570037482205e-06, | |
| "loss": 1.3601, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 0.21156775391897173, | |
| "learning_rate": 6.607673327855149e-06, | |
| "loss": 1.4427, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 0.19213500215723073, | |
| "learning_rate": 6.569542281512388e-06, | |
| "loss": 1.3934, | |
| "step": 995 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 0.19675677797230362, | |
| "learning_rate": 6.531309702467159e-06, | |
| "loss": 1.3552, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "eval_loss": 1.3166489601135254, | |
| "eval_runtime": 1748.0429, | |
| "eval_samples_per_second": 2.168, | |
| "eval_steps_per_second": 0.271, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 0.18781337537819495, | |
| "learning_rate": 6.492978063960942e-06, | |
| "loss": 1.3937, | |
| "step": 1005 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 0.1935060574506341, | |
| "learning_rate": 6.45454984564331e-06, | |
| "loss": 1.4284, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 0.1936429054806515, | |
| "learning_rate": 6.41602753341152e-06, | |
| "loss": 1.3618, | |
| "step": 1015 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 0.19582370428755932, | |
| "learning_rate": 6.377413619249713e-06, | |
| "loss": 1.3822, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 0.18931877193304708, | |
| "learning_rate": 6.338710601067691e-06, | |
| "loss": 1.3473, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 0.1952618362908433, | |
| "learning_rate": 6.2999209825393445e-06, | |
| "loss": 1.369, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.196265797323174, | |
| "learning_rate": 6.2610472729406905e-06, | |
| "loss": 1.3679, | |
| "step": 1035 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.1909001830769802, | |
| "learning_rate": 6.222091986987534e-06, | |
| "loss": 1.3939, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.1974314278029084, | |
| "learning_rate": 6.18305764467281e-06, | |
| "loss": 1.4111, | |
| "step": 1045 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.19874201405315123, | |
| "learning_rate": 6.143946771103561e-06, | |
| "loss": 1.383, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 0.20109125948229767, | |
| "learning_rate": 6.104761896337581e-06, | |
| "loss": 1.3548, | |
| "step": 1055 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 0.18937099603698346, | |
| "learning_rate": 6.0655055552197616e-06, | |
| "loss": 1.4427, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 0.20257668978871882, | |
| "learning_rate": 6.026180287218106e-06, | |
| "loss": 1.3773, | |
| "step": 1065 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 0.19836764097355777, | |
| "learning_rate": 5.986788636259453e-06, | |
| "loss": 1.3945, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.19345413549116036, | |
| "learning_rate": 5.9473331505649125e-06, | |
| "loss": 1.4439, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.19576415671480885, | |
| "learning_rate": 5.907816382485026e-06, | |
| "loss": 1.3432, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.19643295445396305, | |
| "learning_rate": 5.8682408883346535e-06, | |
| "loss": 1.3459, | |
| "step": 1085 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.19760654138918068, | |
| "learning_rate": 5.828609228227603e-06, | |
| "loss": 1.4334, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.1907622356589435, | |
| "learning_rate": 5.788923965911028e-06, | |
| "loss": 1.3195, | |
| "step": 1095 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 0.19974010922162466, | |
| "learning_rate": 5.749187668599574e-06, | |
| "loss": 1.3973, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 0.1899811113208983, | |
| "learning_rate": 5.709402906809307e-06, | |
| "loss": 1.3788, | |
| "step": 1105 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 0.19125136139691074, | |
| "learning_rate": 5.669572254191431e-06, | |
| "loss": 1.3749, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 0.19773999696793723, | |
| "learning_rate": 5.6296982873658e-06, | |
| "loss": 1.3812, | |
| "step": 1115 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 0.19522230435252472, | |
| "learning_rate": 5.5897835857542315e-06, | |
| "loss": 1.3639, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 0.19203158758627778, | |
| "learning_rate": 5.549830731413655e-06, | |
| "loss": 1.3988, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 0.19785365288187984, | |
| "learning_rate": 5.509842308869075e-06, | |
| "loss": 1.4031, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 0.20110593532241236, | |
| "learning_rate": 5.469820904946383e-06, | |
| "loss": 1.3447, | |
| "step": 1135 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 0.20730230846403253, | |
| "learning_rate": 5.429769108605013e-06, | |
| "loss": 1.433, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 0.19538915157122386, | |
| "learning_rate": 5.389689510770462e-06, | |
| "loss": 1.3751, | |
| "step": 1145 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 0.19657731321379315, | |
| "learning_rate": 5.3495847041666935e-06, | |
| "loss": 1.4427, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 0.19885598934336826, | |
| "learning_rate": 5.30945728314841e-06, | |
| "loss": 1.3526, | |
| "step": 1155 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 0.19763223830130308, | |
| "learning_rate": 5.269309843533222e-06, | |
| "loss": 1.3792, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 0.1934600019166271, | |
| "learning_rate": 5.229144982433736e-06, | |
| "loss": 1.3827, | |
| "step": 1165 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 0.19215817298202406, | |
| "learning_rate": 5.188965298089538e-06, | |
| "loss": 1.3609, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 0.1950665854099098, | |
| "learning_rate": 5.148773389699123e-06, | |
| "loss": 1.3728, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 0.20163181813335146, | |
| "learning_rate": 5.108571857251754e-06, | |
| "loss": 1.3937, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 0.19492701026725848, | |
| "learning_rate": 5.068363301359263e-06, | |
| "loss": 1.3976, | |
| "step": 1185 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 0.1915934216230785, | |
| "learning_rate": 5.0281503230878304e-06, | |
| "loss": 1.3778, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 0.19100537033272963, | |
| "learning_rate": 4.98793552378971e-06, | |
| "loss": 1.4221, | |
| "step": 1195 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 0.19909784027087774, | |
| "learning_rate": 4.947721504934966e-06, | |
| "loss": 1.3685, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "eval_loss": 1.3122756481170654, | |
| "eval_runtime": 1747.9099, | |
| "eval_samples_per_second": 2.168, | |
| "eval_steps_per_second": 0.271, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.19577442673853007, | |
| "learning_rate": 4.907510867943167e-06, | |
| "loss": 1.3595, | |
| "step": 1205 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.2063936713267001, | |
| "learning_rate": 4.867306214015117e-06, | |
| "loss": 1.4202, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.19879562103497236, | |
| "learning_rate": 4.8271101439645765e-06, | |
| "loss": 1.3934, | |
| "step": 1215 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.19945548173818886, | |
| "learning_rate": 4.786925258050024e-06, | |
| "loss": 1.3395, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.209259697791073, | |
| "learning_rate": 4.746754155806437e-06, | |
| "loss": 1.4072, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 0.19280116611929857, | |
| "learning_rate": 4.706599435877143e-06, | |
| "loss": 1.3976, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 0.1990941406891665, | |
| "learning_rate": 4.666463695845701e-06, | |
| "loss": 1.3912, | |
| "step": 1235 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 0.19455600992105357, | |
| "learning_rate": 4.626349532067879e-06, | |
| "loss": 1.4003, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 0.20315382252326522, | |
| "learning_rate": 4.586259539503687e-06, | |
| "loss": 1.3876, | |
| "step": 1245 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 0.19085048019709233, | |
| "learning_rate": 4.546196311549515e-06, | |
| "loss": 1.415, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 0.19155602915410036, | |
| "learning_rate": 4.506162439870366e-06, | |
| "loss": 1.388, | |
| "step": 1255 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 0.1962993457750995, | |
| "learning_rate": 4.466160514232206e-06, | |
| "loss": 1.4069, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 0.19440064625069065, | |
| "learning_rate": 4.426193122334433e-06, | |
| "loss": 1.3625, | |
| "step": 1265 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 0.19907155435638502, | |
| "learning_rate": 4.386262849642474e-06, | |
| "loss": 1.3621, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 0.19846401001306227, | |
| "learning_rate": 4.346372279220543e-06, | |
| "loss": 1.3438, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 0.19990637435356196, | |
| "learning_rate": 4.306523991564536e-06, | |
| "loss": 1.3857, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 0.1983045043565906, | |
| "learning_rate": 4.266720564435105e-06, | |
| "loss": 1.3477, | |
| "step": 1285 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 0.2008086079053878, | |
| "learning_rate": 4.226964572690905e-06, | |
| "loss": 1.4032, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.2394515243143434, | |
| "learning_rate": 4.187258588122019e-06, | |
| "loss": 1.3757, | |
| "step": 1295 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.1971799914885616, | |
| "learning_rate": 4.147605179283604e-06, | |
| "loss": 1.4156, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.2073477392149783, | |
| "learning_rate": 4.108006911329722e-06, | |
| "loss": 1.3881, | |
| "step": 1305 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.2073153166108959, | |
| "learning_rate": 4.068466345847409e-06, | |
| "loss": 1.3687, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 0.20054177344121227, | |
| "learning_rate": 4.028986040690963e-06, | |
| "loss": 1.3785, | |
| "step": 1315 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 0.20604849012426923, | |
| "learning_rate": 3.989568549816479e-06, | |
| "loss": 1.4169, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 0.19467062948633831, | |
| "learning_rate": 3.9502164231166354e-06, | |
| "loss": 1.4168, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 0.21219636801396732, | |
| "learning_rate": 3.910932206255742e-06, | |
| "loss": 1.3772, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 0.20051941796299297, | |
| "learning_rate": 3.87171844050507e-06, | |
| "loss": 1.3864, | |
| "step": 1335 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 0.20033592892185176, | |
| "learning_rate": 3.8325776625784464e-06, | |
| "loss": 1.3984, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 0.19559628194598214, | |
| "learning_rate": 3.793512404468162e-06, | |
| "loss": 1.3954, | |
| "step": 1345 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 0.21009377333677687, | |
| "learning_rate": 3.7545251932811824e-06, | |
| "loss": 1.3799, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 0.2058943301550432, | |
| "learning_rate": 3.7156185510756613e-06, | |
| "loss": 1.3763, | |
| "step": 1355 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 0.20090178955938068, | |
| "learning_rate": 3.6767949946978026e-06, | |
| "loss": 1.4162, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 0.2034374577879598, | |
| "learning_rate": 3.6380570356190346e-06, | |
| "loss": 1.402, | |
| "step": 1365 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 0.19652020002950302, | |
| "learning_rate": 3.5994071797735513e-06, | |
| "loss": 1.3667, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 0.20049219114904884, | |
| "learning_rate": 3.560847927396206e-06, | |
| "loss": 1.419, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.20857502981558484, | |
| "learning_rate": 3.5223817728607675e-06, | |
| "loss": 1.4082, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.19767809519926405, | |
| "learning_rate": 3.484011204518568e-06, | |
| "loss": 1.3947, | |
| "step": 1385 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.1948689279113259, | |
| "learning_rate": 3.4457387045375255e-06, | |
| "loss": 1.3625, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.20429495021474384, | |
| "learning_rate": 3.4075667487415785e-06, | |
| "loss": 1.3978, | |
| "step": 1395 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.2022436664727946, | |
| "learning_rate": 3.3694978064505258e-06, | |
| "loss": 1.3487, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_loss": 1.3093819618225098, | |
| "eval_runtime": 1769.0297, | |
| "eval_samples_per_second": 2.142, | |
| "eval_steps_per_second": 0.268, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.2010813579540422, | |
| "learning_rate": 3.331534340320287e-06, | |
| "loss": 1.3582, | |
| "step": 1405 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.19489960246084403, | |
| "learning_rate": 3.293678806183596e-06, | |
| "loss": 1.42, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.20100567658267351, | |
| "learning_rate": 3.255933652891133e-06, | |
| "loss": 1.3887, | |
| "step": 1415 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.198604846014806, | |
| "learning_rate": 3.218301322153111e-06, | |
| "loss": 1.3543, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 0.20073641523853392, | |
| "learning_rate": 3.180784248381322e-06, | |
| "loss": 1.3513, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 0.19424293274594104, | |
| "learning_rate": 3.1433848585316607e-06, | |
| "loss": 1.3885, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 0.2028571716247717, | |
| "learning_rate": 3.10610557194712e-06, | |
| "loss": 1.3824, | |
| "step": 1435 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 0.1973272586923241, | |
| "learning_rate": 3.068948800201289e-06, | |
| "loss": 1.3332, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.20459650028827572, | |
| "learning_rate": 3.0319169469423487e-06, | |
| "loss": 1.3715, | |
| "step": 1445 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.1966290880736998, | |
| "learning_rate": 2.995012407737581e-06, | |
| "loss": 1.3985, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.20394208158347749, | |
| "learning_rate": 2.958237569918404e-06, | |
| "loss": 1.3867, | |
| "step": 1455 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.1971397504636877, | |
| "learning_rate": 2.9215948124259343e-06, | |
| "loss": 1.3739, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.1984132646647921, | |
| "learning_rate": 2.885086505657094e-06, | |
| "loss": 1.4459, | |
| "step": 1465 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.20375542452519896, | |
| "learning_rate": 2.848715011311271e-06, | |
| "loss": 1.3606, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.19875885515941272, | |
| "learning_rate": 2.8124826822375473e-06, | |
| "loss": 1.4034, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.20571233168262154, | |
| "learning_rate": 2.7763918622824903e-06, | |
| "loss": 1.4358, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.20325983672817444, | |
| "learning_rate": 2.7404448861385293e-06, | |
| "loss": 1.3271, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 0.204674596724046, | |
| "learning_rate": 2.7046440791929306e-06, | |
| "loss": 1.3656, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 0.19864331126229412, | |
| "learning_rate": 2.6689917573773615e-06, | |
| "loss": 1.3712, | |
| "step": 1495 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 0.19759957217116436, | |
| "learning_rate": 2.633490227018092e-06, | |
| "loss": 1.4061, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 0.1963001810018751, | |
| "learning_rate": 2.5981417846867753e-06, | |
| "loss": 1.3753, | |
| "step": 1505 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.2037326390652298, | |
| "learning_rate": 2.5629487170518974e-06, | |
| "loss": 1.3468, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.20092378895163732, | |
| "learning_rate": 2.527913300730863e-06, | |
| "loss": 1.3831, | |
| "step": 1515 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.2075661167474541, | |
| "learning_rate": 2.4930378021426977e-06, | |
| "loss": 1.3786, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.2002833079588797, | |
| "learning_rate": 2.4583244773614675e-06, | |
| "loss": 1.4058, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.20306778592495606, | |
| "learning_rate": 2.423775571970301e-06, | |
| "loss": 1.3704, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 0.22057110661612167, | |
| "learning_rate": 2.3893933209161465e-06, | |
| "loss": 1.3965, | |
| "step": 1535 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 0.20413023154970353, | |
| "learning_rate": 2.3551799483651894e-06, | |
| "loss": 1.3935, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 0.2013070052195424, | |
| "learning_rate": 2.321137667558965e-06, | |
| "loss": 1.3757, | |
| "step": 1545 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 0.20133662802149876, | |
| "learning_rate": 2.2872686806712037e-06, | |
| "loss": 1.3533, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.19816355686283976, | |
| "learning_rate": 2.2535751786653476e-06, | |
| "loss": 1.4014, | |
| "step": 1555 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.20020195119951492, | |
| "learning_rate": 2.220059341152837e-06, | |
| "loss": 1.3721, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.20298393289470038, | |
| "learning_rate": 2.1867233362521127e-06, | |
| "loss": 1.3255, | |
| "step": 1565 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.20399564011279728, | |
| "learning_rate": 2.153569320448348e-06, | |
| "loss": 1.3928, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 0.20016486527706043, | |
| "learning_rate": 2.120599438453968e-06, | |
| "loss": 1.3769, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 0.20169522873545517, | |
| "learning_rate": 2.087815823069886e-06, | |
| "loss": 1.3745, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 0.1991770376256046, | |
| "learning_rate": 2.055220595047551e-06, | |
| "loss": 1.3542, | |
| "step": 1585 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 0.2083925008192503, | |
| "learning_rate": 2.022815862951751e-06, | |
| "loss": 1.4182, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 0.20086735172689546, | |
| "learning_rate": 1.990603723024213e-06, | |
| "loss": 1.3524, | |
| "step": 1595 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.20621921540546848, | |
| "learning_rate": 1.9585862590480005e-06, | |
| "loss": 1.3891, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "eval_loss": 1.3076461553573608, | |
| "eval_runtime": 1777.0113, | |
| "eval_samples_per_second": 2.133, | |
| "eval_steps_per_second": 0.267, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.20007769280547363, | |
| "learning_rate": 1.926765542212707e-06, | |
| "loss": 1.3856, | |
| "step": 1605 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.1980821730251063, | |
| "learning_rate": 1.8951436309804766e-06, | |
| "loss": 1.383, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.20314121257047116, | |
| "learning_rate": 1.8637225709528506e-06, | |
| "loss": 1.3752, | |
| "step": 1615 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.2000738503413009, | |
| "learning_rate": 1.832504394738428e-06, | |
| "loss": 1.3501, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.19626992677384789, | |
| "learning_rate": 1.8014911218213832e-06, | |
| "loss": 1.3776, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.2132143728962325, | |
| "learning_rate": 1.770684758430824e-06, | |
| "loss": 1.3641, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.20230145941791186, | |
| "learning_rate": 1.7400872974110088e-06, | |
| "loss": 1.3714, | |
| "step": 1635 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.20322390315184813, | |
| "learning_rate": 1.7097007180924375e-06, | |
| "loss": 1.3559, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.2097270533204938, | |
| "learning_rate": 1.6795269861638041e-06, | |
| "loss": 1.3555, | |
| "step": 1645 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.20409576963271045, | |
| "learning_rate": 1.6495680535448405e-06, | |
| "loss": 1.3376, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.209992582333707, | |
| "learning_rate": 1.6198258582600418e-06, | |
| "loss": 1.3393, | |
| "step": 1655 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.20186485511811675, | |
| "learning_rate": 1.590302324313303e-06, | |
| "loss": 1.3476, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 0.2039959334961091, | |
| "learning_rate": 1.5609993615634578e-06, | |
| "loss": 1.4172, | |
| "step": 1665 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 0.2032115658244104, | |
| "learning_rate": 1.531918865600725e-06, | |
| "loss": 1.3866, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 0.20663125873556282, | |
| "learning_rate": 1.5030627176240903e-06, | |
| "loss": 1.3413, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 0.21419285282068773, | |
| "learning_rate": 1.4744327843196043e-06, | |
| "loss": 1.3685, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.20427116148089472, | |
| "learning_rate": 1.446030917739633e-06, | |
| "loss": 1.3864, | |
| "step": 1685 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.21267201464638189, | |
| "learning_rate": 1.4178589551830585e-06, | |
| "loss": 1.3578, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.2021417320485247, | |
| "learning_rate": 1.3899187190764062e-06, | |
| "loss": 1.4034, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.19953428976865786, | |
| "learning_rate": 1.3622120168559656e-06, | |
| "loss": 1.3378, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 0.20322168013048264, | |
| "learning_rate": 1.3347406408508695e-06, | |
| "loss": 1.4032, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 0.19617963419360818, | |
| "learning_rate": 1.3075063681671408e-06, | |
| "loss": 1.3815, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 0.20338294719666272, | |
| "learning_rate": 1.280510960572745e-06, | |
| "loss": 1.376, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 0.20462364560864363, | |
| "learning_rate": 1.2537561643836087e-06, | |
| "loss": 1.3866, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 0.1963149280229846, | |
| "learning_rate": 1.2272437103506596e-06, | |
| "loss": 1.372, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.19716113021249304, | |
| "learning_rate": 1.200975313547867e-06, | |
| "loss": 1.3599, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.20573469916350295, | |
| "learning_rate": 1.1749526732612842e-06, | |
| "loss": 1.3562, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.20684676012413714, | |
| "learning_rate": 1.1491774728791416e-06, | |
| "loss": 1.3296, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.2090768120512491, | |
| "learning_rate": 1.1236513797829285e-06, | |
| "loss": 1.4248, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 0.20571837930886522, | |
| "learning_rate": 1.0983760452395415e-06, | |
| "loss": 1.3609, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 0.20867562597707268, | |
| "learning_rate": 1.07335310429447e-06, | |
| "loss": 1.3848, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 0.1988423228103918, | |
| "learning_rate": 1.048584175666012e-06, | |
| "loss": 1.3712, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 0.21063039041467455, | |
| "learning_rate": 1.0240708616405788e-06, | |
| "loss": 1.3611, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 0.20306439311302277, | |
| "learning_rate": 9.998147479690251e-07, | |
| "loss": 1.3478, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 0.2020964557722793, | |
| "learning_rate": 9.75817403764079e-07, | |
| "loss": 1.3433, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 0.20597190432737983, | |
| "learning_rate": 9.520803813988366e-07, | |
| "loss": 1.4058, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 0.20380209329114748, | |
| "learning_rate": 9.286052164063369e-07, | |
| "loss": 1.4028, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 0.2041857004062742, | |
| "learning_rate": 9.053934273802312e-07, | |
| "loss": 1.383, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 0.20684812528414637, | |
| "learning_rate": 8.824465158765433e-07, | |
| "loss": 1.3512, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 0.20386591712113425, | |
| "learning_rate": 8.597659663165364e-07, | |
| "loss": 1.3858, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "eval_loss": 1.306676983833313, | |
| "eval_runtime": 1760.6924, | |
| "eval_samples_per_second": 2.153, | |
| "eval_steps_per_second": 0.269, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 0.19511280510666812, | |
| "learning_rate": 8.373532458906897e-07, | |
| "loss": 1.3261, | |
| "step": 1805 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 0.20356348959673148, | |
| "learning_rate": 8.15209804463783e-07, | |
| "loss": 1.3288, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 0.2103018153717413, | |
| "learning_rate": 7.93337074481108e-07, | |
| "loss": 1.4425, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 0.20468220080909677, | |
| "learning_rate": 7.717364708758024e-07, | |
| "loss": 1.406, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 0.2040164153992187, | |
| "learning_rate": 7.504093909773174e-07, | |
| "loss": 1.3601, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 0.19961500414001193, | |
| "learning_rate": 7.293572144210332e-07, | |
| "loss": 1.3777, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 0.20035042813237278, | |
| "learning_rate": 7.085813030590022e-07, | |
| "loss": 1.3944, | |
| "step": 1835 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 0.20685428154054034, | |
| "learning_rate": 6.880830008718564e-07, | |
| "loss": 1.3778, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 0.19642667708352796, | |
| "learning_rate": 6.678636338818645e-07, | |
| "loss": 1.3458, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 0.20222774080494071, | |
| "learning_rate": 6.47924510067151e-07, | |
| "loss": 1.3655, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 0.20419539962584285, | |
| "learning_rate": 6.282669192770896e-07, | |
| "loss": 1.424, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 0.1975487856167091, | |
| "learning_rate": 6.088921331488568e-07, | |
| "loss": 1.3424, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 0.2094178118159778, | |
| "learning_rate": 5.898014050251765e-07, | |
| "loss": 1.3611, | |
| "step": 1865 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 0.20636955576818874, | |
| "learning_rate": 5.709959698732359e-07, | |
| "loss": 1.3779, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 0.21747927560303068, | |
| "learning_rate": 5.524770442047978e-07, | |
| "loss": 1.3308, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 0.20050302654738292, | |
| "learning_rate": 5.342458259975147e-07, | |
| "loss": 1.3865, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 0.2392412132808601, | |
| "learning_rate": 5.163034946174161e-07, | |
| "loss": 1.3792, | |
| "step": 1885 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 0.20914070749895322, | |
| "learning_rate": 4.986512107426283e-07, | |
| "loss": 1.3812, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 0.21320420478322508, | |
| "learning_rate": 4.812901162882871e-07, | |
| "loss": 1.443, | |
| "step": 1895 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.20606583697965636, | |
| "learning_rate": 4.6422133433266513e-07, | |
| "loss": 1.3546, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.20597370559054526, | |
| "learning_rate": 4.474459690445293e-07, | |
| "loss": 1.3803, | |
| "step": 1905 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.20609624124174958, | |
| "learning_rate": 4.309651056117009e-07, | |
| "loss": 1.3806, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.21011536963816732, | |
| "learning_rate": 4.1477981017086387e-07, | |
| "loss": 1.3857, | |
| "step": 1915 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.20861991251552262, | |
| "learning_rate": 3.9889112973859554e-07, | |
| "loss": 1.4178, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 0.20565429912809985, | |
| "learning_rate": 3.8330009214363197e-07, | |
| "loss": 1.3485, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 0.213262370271027, | |
| "learning_rate": 3.680077059603876e-07, | |
| "loss": 1.3857, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 0.21006993775062202, | |
| "learning_rate": 3.530149604436983e-07, | |
| "loss": 1.3718, | |
| "step": 1935 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 0.20971631358735854, | |
| "learning_rate": 3.3832282546483686e-07, | |
| "loss": 1.3401, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 0.20675053040706537, | |
| "learning_rate": 3.239322514487686e-07, | |
| "loss": 1.3976, | |
| "step": 1945 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 0.2070605402743848, | |
| "learning_rate": 3.098441693126719e-07, | |
| "loss": 1.3801, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 0.2036473853339382, | |
| "learning_rate": 2.9605949040571456e-07, | |
| "loss": 1.3975, | |
| "step": 1955 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 0.20446499651024982, | |
| "learning_rate": 2.8257910645009935e-07, | |
| "loss": 1.3932, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 0.20745728206982056, | |
| "learning_rate": 2.6940388948338057e-07, | |
| "loss": 1.4214, | |
| "step": 1965 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 0.20628378675298625, | |
| "learning_rate": 2.565346918020534e-07, | |
| "loss": 1.3234, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 0.19884308459081187, | |
| "learning_rate": 2.4397234590641696e-07, | |
| "loss": 1.4086, | |
| "step": 1975 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 0.196985413218799, | |
| "learning_rate": 2.3171766444672227e-07, | |
| "loss": 1.4203, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 0.20225024562843938, | |
| "learning_rate": 2.1977144017060027e-07, | |
| "loss": 1.3859, | |
| "step": 1985 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 0.20030763530819898, | |
| "learning_rate": 2.0813444587178156e-07, | |
| "loss": 1.3889, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 0.20327002619311596, | |
| "learning_rate": 1.9680743434010385e-07, | |
| "loss": 1.3745, | |
| "step": 1995 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 0.20095518840823687, | |
| "learning_rate": 1.8579113831281525e-07, | |
| "loss": 1.3635, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "eval_loss": 1.3063520193099976, | |
| "eval_runtime": 2013.9504, | |
| "eval_samples_per_second": 1.882, | |
| "eval_steps_per_second": 0.235, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 0.20459938594623875, | |
| "learning_rate": 1.7508627042717387e-07, | |
| "loss": 1.4269, | |
| "step": 2005 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 0.2004705031433909, | |
| "learning_rate": 1.6469352317434627e-07, | |
| "loss": 1.3789, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 0.19922047944368607, | |
| "learning_rate": 1.5461356885461077e-07, | |
| "loss": 1.3811, | |
| "step": 2015 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 0.20506581374899654, | |
| "learning_rate": 1.4484705953386968e-07, | |
| "loss": 1.3677, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 0.20196871274072786, | |
| "learning_rate": 1.35394627001465e-07, | |
| "loss": 1.3871, | |
| "step": 2025 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 0.21302376799020897, | |
| "learning_rate": 1.2625688272930925e-07, | |
| "loss": 1.3673, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 0.20437076323448575, | |
| "learning_rate": 1.174344178323289e-07, | |
| "loss": 1.3701, | |
| "step": 2035 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 0.20259273082335125, | |
| "learning_rate": 1.0892780303022377e-07, | |
| "loss": 1.4004, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 0.19927961571470354, | |
| "learning_rate": 1.007375886105555e-07, | |
| "loss": 1.3781, | |
| "step": 2045 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 0.19648191268958623, | |
| "learning_rate": 9.286430439313876e-08, | |
| "loss": 1.3719, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 0.2081842753213948, | |
| "learning_rate": 8.530845969577594e-08, | |
| "loss": 1.3347, | |
| "step": 2055 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 0.19777453793305202, | |
| "learning_rate": 7.80705433013046e-08, | |
| "loss": 1.3645, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 0.2001901456526902, | |
| "learning_rate": 7.115102342598101e-08, | |
| "loss": 1.3549, | |
| "step": 2065 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 0.2085779076020987, | |
| "learning_rate": 6.455034768919288e-08, | |
| "loss": 1.395, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.2007001569199437, | |
| "learning_rate": 5.826894308449904e-08, | |
| "loss": 1.3418, | |
| "step": 2075 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.20019336918988379, | |
| "learning_rate": 5.230721595201049e-08, | |
| "loss": 1.3808, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.20532075618369505, | |
| "learning_rate": 4.666555195210365e-08, | |
| "loss": 1.3624, | |
| "step": 2085 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.19955856360806487, | |
| "learning_rate": 4.134431604047195e-08, | |
| "loss": 1.3851, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 0.19995218707394016, | |
| "learning_rate": 3.63438524445181e-08, | |
| "loss": 1.404, | |
| "step": 2095 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 0.2027473195079258, | |
| "learning_rate": 3.166448464108629e-08, | |
| "loss": 1.3654, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 0.21316411475600813, | |
| "learning_rate": 2.7306515335532857e-08, | |
| "loss": 1.4004, | |
| "step": 2105 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 0.19985677477921807, | |
| "learning_rate": 2.327022644215193e-08, | |
| "loss": 1.3813, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 0.20946530787894166, | |
| "learning_rate": 1.9555879065930038e-08, | |
| "loss": 1.4226, | |
| "step": 2115 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 0.1975351975591122, | |
| "learning_rate": 1.6163713485662923e-08, | |
| "loss": 1.3792, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 0.20571768755825784, | |
| "learning_rate": 1.3093949138406892e-08, | |
| "loss": 1.3918, | |
| "step": 2125 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 0.44915439425778153, | |
| "learning_rate": 1.03467846052846e-08, | |
| "loss": 1.3642, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 0.20214902180431968, | |
| "learning_rate": 7.922397598642551e-09, | |
| "loss": 1.3599, | |
| "step": 2135 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 0.19767842515824202, | |
| "learning_rate": 5.820944950549745e-09, | |
| "loss": 1.3599, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 0.1991438801798082, | |
| "learning_rate": 4.042562602655231e-09, | |
| "loss": 1.3446, | |
| "step": 2145 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 0.20173015884228035, | |
| "learning_rate": 2.5873655973945864e-09, | |
| "loss": 1.3461, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 0.20119946240562925, | |
| "learning_rate": 1.4554480705458729e-09, | |
| "loss": 1.3474, | |
| "step": 2155 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.20468648298029762, | |
| "learning_rate": 6.468832451417273e-10, | |
| "loss": 1.3649, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.19998165788056904, | |
| "learning_rate": 1.617234267320411e-10, | |
| "loss": 1.3677, | |
| "step": 2165 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.21026892718522863, | |
| "learning_rate": 0.0, | |
| "loss": 1.3748, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "step": 2170, | |
| "total_flos": 7055767844683776.0, | |
| "train_loss": 1.4133363889659056, | |
| "train_runtime": 112943.9974, | |
| "train_samples_per_second": 0.615, | |
| "train_steps_per_second": 0.019 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 2170, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 200, | |
| "total_flos": 7055767844683776.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |