SearchSimulation_3B / trainer_state.json
Anonymous-adhj2383's picture
Upload folder using huggingface_hub
922d5e2 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.2953367875647668,
"eval_steps": 10000000,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.012953367875647668,
"grad_norm": 25.608994557453922,
"learning_rate": 6.476683937823834e-09,
"loss": 2.9207,
"step": 10
},
{
"epoch": 0.025906735751295335,
"grad_norm": 25.006509435749546,
"learning_rate": 1.2953367875647667e-08,
"loss": 2.8035,
"step": 20
},
{
"epoch": 0.038860103626943004,
"grad_norm": 26.07092571978953,
"learning_rate": 1.9430051813471502e-08,
"loss": 2.8884,
"step": 30
},
{
"epoch": 0.05181347150259067,
"grad_norm": 24.9115069323374,
"learning_rate": 2.5906735751295334e-08,
"loss": 2.8786,
"step": 40
},
{
"epoch": 0.06476683937823834,
"grad_norm": 26.35515887725646,
"learning_rate": 3.238341968911917e-08,
"loss": 2.8856,
"step": 50
},
{
"epoch": 0.07772020725388601,
"grad_norm": 26.802778574551475,
"learning_rate": 3.8860103626943005e-08,
"loss": 2.8532,
"step": 60
},
{
"epoch": 0.09067357512953368,
"grad_norm": 26.534962747282748,
"learning_rate": 4.533678756476684e-08,
"loss": 2.8952,
"step": 70
},
{
"epoch": 0.10362694300518134,
"grad_norm": 23.051599328506548,
"learning_rate": 5.181347150259067e-08,
"loss": 2.8437,
"step": 80
},
{
"epoch": 0.11658031088082901,
"grad_norm": 24.471329875902512,
"learning_rate": 5.8290155440414504e-08,
"loss": 2.8237,
"step": 90
},
{
"epoch": 0.12953367875647667,
"grad_norm": 24.33959816120034,
"learning_rate": 6.476683937823834e-08,
"loss": 2.8199,
"step": 100
},
{
"epoch": 0.14248704663212436,
"grad_norm": 25.214396561848275,
"learning_rate": 7.124352331606218e-08,
"loss": 2.8259,
"step": 110
},
{
"epoch": 0.15544041450777202,
"grad_norm": 20.508728927761148,
"learning_rate": 7.772020725388601e-08,
"loss": 2.7907,
"step": 120
},
{
"epoch": 0.16839378238341968,
"grad_norm": 21.133956193530185,
"learning_rate": 8.419689119170984e-08,
"loss": 2.7776,
"step": 130
},
{
"epoch": 0.18134715025906736,
"grad_norm": 20.216670897828962,
"learning_rate": 9.067357512953368e-08,
"loss": 2.6967,
"step": 140
},
{
"epoch": 0.19430051813471502,
"grad_norm": 19.577892832244164,
"learning_rate": 9.715025906735751e-08,
"loss": 2.6823,
"step": 150
},
{
"epoch": 0.20725388601036268,
"grad_norm": 14.26088592523892,
"learning_rate": 1.0362694300518134e-07,
"loss": 2.6147,
"step": 160
},
{
"epoch": 0.22020725388601037,
"grad_norm": 12.389614421651768,
"learning_rate": 1.1010362694300518e-07,
"loss": 2.5705,
"step": 170
},
{
"epoch": 0.23316062176165803,
"grad_norm": 10.537108203822184,
"learning_rate": 1.1658031088082901e-07,
"loss": 2.5779,
"step": 180
},
{
"epoch": 0.24611398963730569,
"grad_norm": 8.566097835293071,
"learning_rate": 1.2305699481865284e-07,
"loss": 2.5464,
"step": 190
},
{
"epoch": 0.25906735751295334,
"grad_norm": 7.5406588821241884,
"learning_rate": 1.2953367875647668e-07,
"loss": 2.5462,
"step": 200
},
{
"epoch": 0.27202072538860106,
"grad_norm": 6.406776125251227,
"learning_rate": 1.3601036269430052e-07,
"loss": 2.4201,
"step": 210
},
{
"epoch": 0.2849740932642487,
"grad_norm": 5.0333340242378135,
"learning_rate": 1.4248704663212436e-07,
"loss": 2.4404,
"step": 220
},
{
"epoch": 0.2979274611398964,
"grad_norm": 4.96460074385302,
"learning_rate": 1.4896373056994818e-07,
"loss": 2.4234,
"step": 230
},
{
"epoch": 0.31088082901554404,
"grad_norm": 4.718132167499178,
"learning_rate": 1.5544041450777202e-07,
"loss": 2.447,
"step": 240
},
{
"epoch": 0.3238341968911917,
"grad_norm": 4.473414832247284,
"learning_rate": 1.6191709844559583e-07,
"loss": 2.3905,
"step": 250
},
{
"epoch": 0.33678756476683935,
"grad_norm": 4.231641138749174,
"learning_rate": 1.6839378238341968e-07,
"loss": 2.3678,
"step": 260
},
{
"epoch": 0.34974093264248707,
"grad_norm": 3.950230234878744,
"learning_rate": 1.7487046632124352e-07,
"loss": 2.3378,
"step": 270
},
{
"epoch": 0.3626943005181347,
"grad_norm": 3.789360959290258,
"learning_rate": 1.8134715025906736e-07,
"loss": 2.3507,
"step": 280
},
{
"epoch": 0.3756476683937824,
"grad_norm": 3.752329492676831,
"learning_rate": 1.8782383419689118e-07,
"loss": 2.3752,
"step": 290
},
{
"epoch": 0.38860103626943004,
"grad_norm": 3.8201206496833313,
"learning_rate": 1.9430051813471502e-07,
"loss": 2.3437,
"step": 300
},
{
"epoch": 0.4015544041450777,
"grad_norm": 3.8776779124718175,
"learning_rate": 2.0077720207253883e-07,
"loss": 2.3889,
"step": 310
},
{
"epoch": 0.41450777202072536,
"grad_norm": 3.989802925727435,
"learning_rate": 2.0725388601036267e-07,
"loss": 2.3129,
"step": 320
},
{
"epoch": 0.4274611398963731,
"grad_norm": 3.651464736452137,
"learning_rate": 2.1373056994818652e-07,
"loss": 2.3276,
"step": 330
},
{
"epoch": 0.44041450777202074,
"grad_norm": 3.784690196455203,
"learning_rate": 2.2020725388601036e-07,
"loss": 2.31,
"step": 340
},
{
"epoch": 0.4533678756476684,
"grad_norm": 3.5900123693564683,
"learning_rate": 2.2668393782383417e-07,
"loss": 2.3023,
"step": 350
},
{
"epoch": 0.46632124352331605,
"grad_norm": 3.616764531573499,
"learning_rate": 2.3316062176165802e-07,
"loss": 2.2977,
"step": 360
},
{
"epoch": 0.4792746113989637,
"grad_norm": 3.4970049253402076,
"learning_rate": 2.3963730569948183e-07,
"loss": 2.32,
"step": 370
},
{
"epoch": 0.49222797927461137,
"grad_norm": 3.7715349943050733,
"learning_rate": 2.4611398963730567e-07,
"loss": 2.274,
"step": 380
},
{
"epoch": 0.5051813471502591,
"grad_norm": 3.7123935945294897,
"learning_rate": 2.525906735751295e-07,
"loss": 2.3086,
"step": 390
},
{
"epoch": 0.5181347150259067,
"grad_norm": 3.372291772174901,
"learning_rate": 2.5906735751295336e-07,
"loss": 2.2496,
"step": 400
},
{
"epoch": 0.5310880829015544,
"grad_norm": 3.784294595936631,
"learning_rate": 2.655440414507772e-07,
"loss": 2.2858,
"step": 410
},
{
"epoch": 0.5440414507772021,
"grad_norm": 3.4351612072380675,
"learning_rate": 2.7202072538860104e-07,
"loss": 2.28,
"step": 420
},
{
"epoch": 0.5569948186528497,
"grad_norm": 3.3411063107189753,
"learning_rate": 2.7849740932642483e-07,
"loss": 2.2764,
"step": 430
},
{
"epoch": 0.5699481865284974,
"grad_norm": 3.3765768734179993,
"learning_rate": 2.849740932642487e-07,
"loss": 2.3278,
"step": 440
},
{
"epoch": 0.582901554404145,
"grad_norm": 3.501479583740947,
"learning_rate": 2.914507772020725e-07,
"loss": 2.275,
"step": 450
},
{
"epoch": 0.5958549222797928,
"grad_norm": 3.4545990517473895,
"learning_rate": 2.9792746113989635e-07,
"loss": 2.292,
"step": 460
},
{
"epoch": 0.6088082901554405,
"grad_norm": 3.4921131143644137,
"learning_rate": 3.044041450777202e-07,
"loss": 2.306,
"step": 470
},
{
"epoch": 0.6217616580310881,
"grad_norm": 3.3569942808113202,
"learning_rate": 3.1088082901554404e-07,
"loss": 2.2818,
"step": 480
},
{
"epoch": 0.6347150259067358,
"grad_norm": 3.473735761418456,
"learning_rate": 3.173575129533679e-07,
"loss": 2.292,
"step": 490
},
{
"epoch": 0.6476683937823834,
"grad_norm": 3.4098418845884506,
"learning_rate": 3.2383419689119167e-07,
"loss": 2.2388,
"step": 500
},
{
"epoch": 0.6606217616580311,
"grad_norm": 3.2884572160731103,
"learning_rate": 3.303108808290155e-07,
"loss": 2.2485,
"step": 510
},
{
"epoch": 0.6735751295336787,
"grad_norm": 3.49059998854233,
"learning_rate": 3.3678756476683935e-07,
"loss": 2.2546,
"step": 520
},
{
"epoch": 0.6865284974093264,
"grad_norm": 3.228603206545892,
"learning_rate": 3.432642487046632e-07,
"loss": 2.2611,
"step": 530
},
{
"epoch": 0.6994818652849741,
"grad_norm": 3.201214883425356,
"learning_rate": 3.4974093264248704e-07,
"loss": 2.2423,
"step": 540
},
{
"epoch": 0.7124352331606217,
"grad_norm": 3.174572440904334,
"learning_rate": 3.562176165803109e-07,
"loss": 2.2575,
"step": 550
},
{
"epoch": 0.7253886010362695,
"grad_norm": 3.6637540799374086,
"learning_rate": 3.626943005181347e-07,
"loss": 2.2703,
"step": 560
},
{
"epoch": 0.7383419689119171,
"grad_norm": 3.2144805242394456,
"learning_rate": 3.691709844559585e-07,
"loss": 2.2536,
"step": 570
},
{
"epoch": 0.7512953367875648,
"grad_norm": 3.2832998982931647,
"learning_rate": 3.7564766839378235e-07,
"loss": 2.2923,
"step": 580
},
{
"epoch": 0.7642487046632125,
"grad_norm": 3.339591881411431,
"learning_rate": 3.8212435233160625e-07,
"loss": 2.2582,
"step": 590
},
{
"epoch": 0.7772020725388601,
"grad_norm": 3.1793659057987114,
"learning_rate": 3.8860103626943004e-07,
"loss": 2.2725,
"step": 600
},
{
"epoch": 0.7901554404145078,
"grad_norm": 3.105051267095931,
"learning_rate": 3.950777202072539e-07,
"loss": 2.2609,
"step": 610
},
{
"epoch": 0.8031088082901554,
"grad_norm": 3.1058601287467837,
"learning_rate": 4.0155440414507767e-07,
"loss": 2.2564,
"step": 620
},
{
"epoch": 0.8160621761658031,
"grad_norm": 3.2145397711050374,
"learning_rate": 4.0803108808290156e-07,
"loss": 2.2643,
"step": 630
},
{
"epoch": 0.8290155440414507,
"grad_norm": 2.9574287184508896,
"learning_rate": 4.1450777202072535e-07,
"loss": 2.2198,
"step": 640
},
{
"epoch": 0.8419689119170984,
"grad_norm": 3.385438951564467,
"learning_rate": 4.209844559585492e-07,
"loss": 2.2482,
"step": 650
},
{
"epoch": 0.8549222797927462,
"grad_norm": 3.232841492939655,
"learning_rate": 4.2746113989637303e-07,
"loss": 2.2553,
"step": 660
},
{
"epoch": 0.8678756476683938,
"grad_norm": 3.2498023529603226,
"learning_rate": 4.339378238341969e-07,
"loss": 2.2091,
"step": 670
},
{
"epoch": 0.8808290155440415,
"grad_norm": 3.387272764637681,
"learning_rate": 4.404145077720207e-07,
"loss": 2.2803,
"step": 680
},
{
"epoch": 0.8937823834196891,
"grad_norm": 3.205923523135513,
"learning_rate": 4.468911917098445e-07,
"loss": 2.2416,
"step": 690
},
{
"epoch": 0.9067357512953368,
"grad_norm": 3.328638392908686,
"learning_rate": 4.5336787564766835e-07,
"loss": 2.2512,
"step": 700
},
{
"epoch": 0.9196891191709845,
"grad_norm": 3.223397811767207,
"learning_rate": 4.5984455958549224e-07,
"loss": 2.2233,
"step": 710
},
{
"epoch": 0.9326424870466321,
"grad_norm": 2.903434123937875,
"learning_rate": 4.6632124352331603e-07,
"loss": 2.2221,
"step": 720
},
{
"epoch": 0.9455958549222798,
"grad_norm": 3.167214093551616,
"learning_rate": 4.7279792746113987e-07,
"loss": 2.1797,
"step": 730
},
{
"epoch": 0.9585492227979274,
"grad_norm": 2.8228184362789936,
"learning_rate": 4.792746113989637e-07,
"loss": 2.2113,
"step": 740
},
{
"epoch": 0.9715025906735751,
"grad_norm": 3.2547987666473506,
"learning_rate": 4.857512953367875e-07,
"loss": 2.2603,
"step": 750
},
{
"epoch": 0.9844559585492227,
"grad_norm": 3.2819941887944317,
"learning_rate": 4.922279792746113e-07,
"loss": 2.2197,
"step": 760
},
{
"epoch": 0.9974093264248705,
"grad_norm": 3.1793621366904765,
"learning_rate": 4.987046632124352e-07,
"loss": 2.1802,
"step": 770
},
{
"epoch": 1.0103626943005182,
"grad_norm": 3.129077964136437,
"learning_rate": 5.05181347150259e-07,
"loss": 2.2115,
"step": 780
},
{
"epoch": 1.0233160621761659,
"grad_norm": 3.624077927910293,
"learning_rate": 5.116580310880829e-07,
"loss": 2.2505,
"step": 790
},
{
"epoch": 1.0362694300518134,
"grad_norm": 3.001991950399265,
"learning_rate": 5.181347150259067e-07,
"loss": 2.2186,
"step": 800
},
{
"epoch": 1.049222797927461,
"grad_norm": 3.2056710194420077,
"learning_rate": 5.246113989637306e-07,
"loss": 2.2441,
"step": 810
},
{
"epoch": 1.0621761658031088,
"grad_norm": 3.3095386913147307,
"learning_rate": 5.310880829015544e-07,
"loss": 2.2022,
"step": 820
},
{
"epoch": 1.0751295336787565,
"grad_norm": 3.2748460342108276,
"learning_rate": 5.375647668393782e-07,
"loss": 2.2102,
"step": 830
},
{
"epoch": 1.0880829015544042,
"grad_norm": 3.1482065886570108,
"learning_rate": 5.440414507772021e-07,
"loss": 2.186,
"step": 840
},
{
"epoch": 1.1010362694300517,
"grad_norm": 3.2541206943048016,
"learning_rate": 5.505181347150258e-07,
"loss": 2.2447,
"step": 850
},
{
"epoch": 1.1139896373056994,
"grad_norm": 3.35168061407981,
"learning_rate": 5.569948186528497e-07,
"loss": 2.287,
"step": 860
},
{
"epoch": 1.1269430051813472,
"grad_norm": 3.324702715900074,
"learning_rate": 5.634715025906735e-07,
"loss": 2.2381,
"step": 870
},
{
"epoch": 1.1398963730569949,
"grad_norm": 2.981057376298506,
"learning_rate": 5.699481865284974e-07,
"loss": 2.253,
"step": 880
},
{
"epoch": 1.1528497409326426,
"grad_norm": 3.2319048665014734,
"learning_rate": 5.764248704663213e-07,
"loss": 2.234,
"step": 890
},
{
"epoch": 1.16580310880829,
"grad_norm": 3.197206460323895,
"learning_rate": 5.82901554404145e-07,
"loss": 2.2401,
"step": 900
},
{
"epoch": 1.1787564766839378,
"grad_norm": 3.128653204841994,
"learning_rate": 5.893782383419689e-07,
"loss": 2.2044,
"step": 910
},
{
"epoch": 1.1917098445595855,
"grad_norm": 3.194111726652451,
"learning_rate": 5.958549222797927e-07,
"loss": 2.1694,
"step": 920
},
{
"epoch": 1.2046632124352332,
"grad_norm": 3.07095427002542,
"learning_rate": 6.023316062176166e-07,
"loss": 2.2416,
"step": 930
},
{
"epoch": 1.2176165803108807,
"grad_norm": 3.0432092192539777,
"learning_rate": 6.088082901554404e-07,
"loss": 2.2327,
"step": 940
},
{
"epoch": 1.2305699481865284,
"grad_norm": 3.045795113156715,
"learning_rate": 6.152849740932642e-07,
"loss": 2.1957,
"step": 950
},
{
"epoch": 1.2435233160621761,
"grad_norm": 3.363436323137103,
"learning_rate": 6.217616580310881e-07,
"loss": 2.2182,
"step": 960
},
{
"epoch": 1.2564766839378239,
"grad_norm": 3.0067901045334633,
"learning_rate": 6.282383419689119e-07,
"loss": 2.1986,
"step": 970
},
{
"epoch": 1.2694300518134716,
"grad_norm": 3.047870744487349,
"learning_rate": 6.347150259067358e-07,
"loss": 2.2217,
"step": 980
},
{
"epoch": 1.2823834196891193,
"grad_norm": 3.2325772640891395,
"learning_rate": 6.411917098445595e-07,
"loss": 2.2146,
"step": 990
},
{
"epoch": 1.2953367875647668,
"grad_norm": 3.2882251454495006,
"learning_rate": 6.476683937823833e-07,
"loss": 2.203,
"step": 1000
}
],
"logging_steps": 10,
"max_steps": 15440,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 15311042248704.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}