gemma-270m / checkpoint-3600 / trainer_state.json
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.144167758846658,
"eval_steps": 500,
"global_step": 3600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00436871996505024,
"grad_norm": 9.839941024780273,
"learning_rate": 8e-05,
"loss": 2.5246,
"step": 5
},
{
"epoch": 0.00873743993010048,
"grad_norm": 13.773455619812012,
"learning_rate": 0.00018,
"loss": 1.1343,
"step": 10
},
{
"epoch": 0.01310615989515072,
"grad_norm": 5.6580424308776855,
"learning_rate": 0.0001999997582552296,
"loss": 0.7712,
"step": 15
},
{
"epoch": 0.01747487986020096,
"grad_norm": 5.294467926025391,
"learning_rate": 0.0001999987761691029,
"loss": 0.73,
"step": 20
},
{
"epoch": 0.021843599825251202,
"grad_norm": 2.8633503913879395,
"learning_rate": 0.00019999703863998527,
"loss": 0.7289,
"step": 25
},
{
"epoch": 0.02621231979030144,
"grad_norm": 3.2836177349090576,
"learning_rate": 0.00019999454568100293,
"loss": 0.4686,
"step": 30
},
{
"epoch": 0.03058103975535168,
"grad_norm": 4.878258228302002,
"learning_rate": 0.00019999129731098898,
"loss": 0.6629,
"step": 35
},
{
"epoch": 0.03494975972040192,
"grad_norm": 2.899914026260376,
"learning_rate": 0.00019998729355448326,
"loss": 0.6038,
"step": 40
},
{
"epoch": 0.039318479685452164,
"grad_norm": 3.289844274520874,
"learning_rate": 0.00019998253444173235,
"loss": 0.4573,
"step": 45
},
{
"epoch": 0.043687199650502405,
"grad_norm": 2.957254648208618,
"learning_rate": 0.00019997702000868896,
"loss": 0.594,
"step": 50
},
{
"epoch": 0.048055919615552646,
"grad_norm": 3.171276807785034,
"learning_rate": 0.00019997075029701207,
"loss": 0.5719,
"step": 55
},
{
"epoch": 0.05242463958060288,
"grad_norm": 2.55605149269104,
"learning_rate": 0.0001999637253540663,
"loss": 0.5971,
"step": 60
},
{
"epoch": 0.05679335954565312,
"grad_norm": 2.127289295196533,
"learning_rate": 0.00019995594523292178,
"loss": 0.5712,
"step": 65
},
{
"epoch": 0.06116207951070336,
"grad_norm": 3.3928685188293457,
"learning_rate": 0.00019994740999235359,
"loss": 0.5712,
"step": 70
},
{
"epoch": 0.0655307994757536,
"grad_norm": 2.6700279712677,
"learning_rate": 0.00019993811969684142,
"loss": 0.427,
"step": 75
},
{
"epoch": 0.06989951944080385,
"grad_norm": 2.6936633586883545,
"learning_rate": 0.00019992807441656898,
"loss": 0.5321,
"step": 80
},
{
"epoch": 0.07426823940585409,
"grad_norm": 3.9897687435150146,
"learning_rate": 0.00019991727422742362,
"loss": 0.6025,
"step": 85
},
{
"epoch": 0.07863695937090433,
"grad_norm": 2.3496663570404053,
"learning_rate": 0.00019990571921099553,
"loss": 0.5975,
"step": 90
},
{
"epoch": 0.08300567933595457,
"grad_norm": 3.3796467781066895,
"learning_rate": 0.0001998934094545774,
"loss": 0.5255,
"step": 95
},
{
"epoch": 0.08737439930100481,
"grad_norm": 3.1103007793426514,
"learning_rate": 0.00019988034505116352,
"loss": 0.4946,
"step": 100
},
{
"epoch": 0.09174311926605505,
"grad_norm": 2.002304792404175,
"learning_rate": 0.00019986652609944926,
"loss": 0.425,
"step": 105
},
{
"epoch": 0.09611183923110529,
"grad_norm": 1.7572168111801147,
"learning_rate": 0.00019985195270383018,
"loss": 0.6073,
"step": 110
},
{
"epoch": 0.10048055919615553,
"grad_norm": 2.745215654373169,
"learning_rate": 0.00019983662497440133,
"loss": 0.586,
"step": 115
},
{
"epoch": 0.10484927916120576,
"grad_norm": 1.8170915842056274,
"learning_rate": 0.0001998205430269564,
"loss": 0.5255,
"step": 120
},
{
"epoch": 0.109217999126256,
"grad_norm": 1.4944056272506714,
"learning_rate": 0.00019980370698298677,
"loss": 0.4219,
"step": 125
},
{
"epoch": 0.11358671909130624,
"grad_norm": 1.6616989374160767,
"learning_rate": 0.00019978611696968074,
"loss": 0.4231,
"step": 130
},
{
"epoch": 0.11795543905635648,
"grad_norm": 2.0523645877838135,
"learning_rate": 0.00019976777311992247,
"loss": 0.5298,
"step": 135
},
{
"epoch": 0.12232415902140673,
"grad_norm": 2.065765619277954,
"learning_rate": 0.00019974867557229098,
"loss": 0.5228,
"step": 140
},
{
"epoch": 0.12669287898645698,
"grad_norm": 1.7283438444137573,
"learning_rate": 0.00019972882447105912,
"loss": 0.3452,
"step": 145
},
{
"epoch": 0.1310615989515072,
"grad_norm": 2.655750274658203,
"learning_rate": 0.00019970821996619244,
"loss": 0.508,
"step": 150
},
{
"epoch": 0.13543031891655744,
"grad_norm": 2.67799973487854,
"learning_rate": 0.0001996868622133482,
"loss": 0.4359,
"step": 155
},
{
"epoch": 0.1397990388816077,
"grad_norm": 1.6298809051513672,
"learning_rate": 0.00019966475137387396,
"loss": 0.5447,
"step": 160
},
{
"epoch": 0.14416775884665792,
"grad_norm": 1.4772286415100098,
"learning_rate": 0.00019964188761480657,
"loss": 0.4105,
"step": 165
},
{
"epoch": 0.14853647881170817,
"grad_norm": 2.2986271381378174,
"learning_rate": 0.00019961827110887083,
"loss": 0.603,
"step": 170
},
{
"epoch": 0.1529051987767584,
"grad_norm": 2.8261911869049072,
"learning_rate": 0.00019959390203447817,
"loss": 0.4649,
"step": 175
},
{
"epoch": 0.15727391874180865,
"grad_norm": 1.7771011590957642,
"learning_rate": 0.00019956878057572524,
"loss": 0.4394,
"step": 180
},
{
"epoch": 0.16164263870685888,
"grad_norm": 1.7315421104431152,
"learning_rate": 0.00019954290692239274,
"loss": 0.5289,
"step": 185
},
{
"epoch": 0.16601135867190914,
"grad_norm": 1.6124423742294312,
"learning_rate": 0.00019951628126994373,
"loss": 0.4173,
"step": 190
},
{
"epoch": 0.17038007863695936,
"grad_norm": 1.792577862739563,
"learning_rate": 0.00019948890381952232,
"loss": 0.4331,
"step": 195
},
{
"epoch": 0.17474879860200962,
"grad_norm": 1.9038774967193604,
"learning_rate": 0.000199460774777952,
"loss": 0.4247,
"step": 200
},
{
"epoch": 0.17911751856705985,
"grad_norm": 2.457122802734375,
"learning_rate": 0.00019943189435773432,
"loss": 0.4519,
"step": 205
},
{
"epoch": 0.1834862385321101,
"grad_norm": 1.97683584690094,
"learning_rate": 0.00019940226277704706,
"loss": 0.4761,
"step": 210
},
{
"epoch": 0.18785495849716033,
"grad_norm": 2.1646862030029297,
"learning_rate": 0.0001993718802597426,
"loss": 0.5294,
"step": 215
},
{
"epoch": 0.19222367846221058,
"grad_norm": 1.565412998199463,
"learning_rate": 0.00019934074703534637,
"loss": 0.3999,
"step": 220
},
{
"epoch": 0.1965923984272608,
"grad_norm": 2.4315876960754395,
"learning_rate": 0.00019930886333905504,
"loss": 0.378,
"step": 225
},
{
"epoch": 0.20096111839231107,
"grad_norm": 2.7567529678344727,
"learning_rate": 0.00019927622941173467,
"loss": 0.5075,
"step": 230
},
{
"epoch": 0.2053298383573613,
"grad_norm": 1.8640387058258057,
"learning_rate": 0.00019924284549991902,
"loss": 0.4749,
"step": 235
},
{
"epoch": 0.20969855832241152,
"grad_norm": 2.090924024581909,
"learning_rate": 0.00019920871185580757,
"loss": 0.4353,
"step": 240
},
{
"epoch": 0.21406727828746178,
"grad_norm": 1.9691081047058105,
"learning_rate": 0.00019917382873726376,
"loss": 0.4051,
"step": 245
},
{
"epoch": 0.218435998252512,
"grad_norm": 1.8130213022232056,
"learning_rate": 0.0001991381964078128,
"loss": 0.526,
"step": 250
},
{
"epoch": 0.22280471821756226,
"grad_norm": 2.078805923461914,
"learning_rate": 0.00019910181513664,
"loss": 0.5654,
"step": 255
},
{
"epoch": 0.22717343818261249,
"grad_norm": 2.0686287879943848,
"learning_rate": 0.0001990646851985884,
"loss": 0.43,
"step": 260
},
{
"epoch": 0.23154215814766274,
"grad_norm": 1.475821614265442,
"learning_rate": 0.00019902680687415705,
"loss": 0.355,
"step": 265
},
{
"epoch": 0.23591087811271297,
"grad_norm": 1.901236891746521,
"learning_rate": 0.0001989881804494985,
"loss": 0.4522,
"step": 270
},
{
"epoch": 0.24027959807776322,
"grad_norm": 1.2583553791046143,
"learning_rate": 0.00019894880621641704,
"loss": 0.3869,
"step": 275
},
{
"epoch": 0.24464831804281345,
"grad_norm": 1.712336540222168,
"learning_rate": 0.00019890868447236613,
"loss": 0.454,
"step": 280
},
{
"epoch": 0.2490170380078637,
"grad_norm": 2.3967206478118896,
"learning_rate": 0.00019886781552044634,
"loss": 0.4074,
"step": 285
},
{
"epoch": 0.25338575797291396,
"grad_norm": 2.0578925609588623,
"learning_rate": 0.0001988261996694032,
"loss": 0.4268,
"step": 290
},
{
"epoch": 0.2577544779379642,
"grad_norm": 1.7411088943481445,
"learning_rate": 0.0001987838372336245,
"loss": 0.334,
"step": 295
},
{
"epoch": 0.2621231979030144,
"grad_norm": 1.8145533800125122,
"learning_rate": 0.0001987407285331382,
"loss": 0.4019,
"step": 300
},
{
"epoch": 0.26649191786806464,
"grad_norm": 1.3501653671264648,
"learning_rate": 0.00019869687389361,
"loss": 0.32,
"step": 305
},
{
"epoch": 0.27086063783311487,
"grad_norm": 1.208422303199768,
"learning_rate": 0.00019865227364634073,
"loss": 0.4548,
"step": 310
},
{
"epoch": 0.27522935779816515,
"grad_norm": 1.521690011024475,
"learning_rate": 0.00019860692812826396,
"loss": 0.3572,
"step": 315
},
{
"epoch": 0.2795980777632154,
"grad_norm": 2.2849714756011963,
"learning_rate": 0.0001985608376819434,
"loss": 0.4555,
"step": 320
},
{
"epoch": 0.2839667977282656,
"grad_norm": 2.7733798027038574,
"learning_rate": 0.00019851400265557037,
"loss": 0.4726,
"step": 325
},
{
"epoch": 0.28833551769331583,
"grad_norm": 1.973522424697876,
"learning_rate": 0.00019846642340296114,
"loss": 0.4585,
"step": 330
},
{
"epoch": 0.2927042376583661,
"grad_norm": 1.7133642435073853,
"learning_rate": 0.0001984181002835542,
"loss": 0.4679,
"step": 335
},
{
"epoch": 0.29707295762341634,
"grad_norm": 2.8383235931396484,
"learning_rate": 0.00019836903366240768,
"loss": 0.4119,
"step": 340
},
{
"epoch": 0.30144167758846657,
"grad_norm": 2.798276901245117,
"learning_rate": 0.00019831922391019645,
"loss": 0.3665,
"step": 345
},
{
"epoch": 0.3058103975535168,
"grad_norm": 2.171276569366455,
"learning_rate": 0.00019826867140320938,
"loss": 0.5691,
"step": 350
},
{
"epoch": 0.3101791175185671,
"grad_norm": 2.0866177082061768,
"learning_rate": 0.00019821737652334653,
"loss": 0.4074,
"step": 355
},
{
"epoch": 0.3145478374836173,
"grad_norm": 1.3713918924331665,
"learning_rate": 0.0001981653396581162,
"loss": 0.3379,
"step": 360
},
{
"epoch": 0.31891655744866754,
"grad_norm": 1.6086684465408325,
"learning_rate": 0.0001981125612006321,
"loss": 0.3563,
"step": 365
},
{
"epoch": 0.32328527741371776,
"grad_norm": 2.655686378479004,
"learning_rate": 0.0001980590415496102,
"loss": 0.3988,
"step": 370
},
{
"epoch": 0.32765399737876805,
"grad_norm": 1.5271559953689575,
"learning_rate": 0.00019800478110936596,
"loss": 0.5784,
"step": 375
},
{
"epoch": 0.3320227173438183,
"grad_norm": 1.3043195009231567,
"learning_rate": 0.00019794978028981106,
"loss": 0.2637,
"step": 380
},
{
"epoch": 0.3363914373088685,
"grad_norm": 2.539109706878662,
"learning_rate": 0.0001978940395064504,
"loss": 0.4658,
"step": 385
},
{
"epoch": 0.34076015727391873,
"grad_norm": 1.7521268129348755,
"learning_rate": 0.00019783755918037903,
"loss": 0.4253,
"step": 390
},
{
"epoch": 0.34512887723896896,
"grad_norm": 1.5679692029953003,
"learning_rate": 0.00019778033973827882,
"loss": 0.4528,
"step": 395
},
{
"epoch": 0.34949759720401924,
"grad_norm": 1.670640468597412,
"learning_rate": 0.00019772238161241528,
"loss": 0.3724,
"step": 400
},
{
"epoch": 0.35386631716906947,
"grad_norm": 1.520856261253357,
"learning_rate": 0.00019766368524063438,
"loss": 0.4141,
"step": 405
},
{
"epoch": 0.3582350371341197,
"grad_norm": 1.0802158117294312,
"learning_rate": 0.00019760425106635926,
"loss": 0.3268,
"step": 410
},
{
"epoch": 0.3626037570991699,
"grad_norm": 1.7306379079818726,
"learning_rate": 0.0001975440795385866,
"loss": 0.3654,
"step": 415
},
{
"epoch": 0.3669724770642202,
"grad_norm": 1.5037274360656738,
"learning_rate": 0.0001974831711118836,
"loss": 0.4285,
"step": 420
},
{
"epoch": 0.37134119702927043,
"grad_norm": 1.4654844999313354,
"learning_rate": 0.00019742152624638437,
"loss": 0.2548,
"step": 425
},
{
"epoch": 0.37570991699432066,
"grad_norm": 2.6770753860473633,
"learning_rate": 0.00019735914540778638,
"loss": 0.4238,
"step": 430
},
{
"epoch": 0.3800786369593709,
"grad_norm": 1.1864055395126343,
"learning_rate": 0.00019729602906734704,
"loss": 0.3959,
"step": 435
},
{
"epoch": 0.38444735692442117,
"grad_norm": 1.904876708984375,
"learning_rate": 0.00019723217770188024,
"loss": 0.3603,
"step": 440
},
{
"epoch": 0.3888160768894714,
"grad_norm": 1.7086598873138428,
"learning_rate": 0.0001971675917937525,
"loss": 0.551,
"step": 445
},
{
"epoch": 0.3931847968545216,
"grad_norm": 1.4635995626449585,
"learning_rate": 0.00019710227183087947,
"loss": 0.3738,
"step": 450
},
{
"epoch": 0.39755351681957185,
"grad_norm": 1.6047295331954956,
"learning_rate": 0.00019703621830672238,
"loss": 0.475,
"step": 455
},
{
"epoch": 0.40192223678462213,
"grad_norm": 1.4741933345794678,
"learning_rate": 0.00019696943172028394,
"loss": 0.4021,
"step": 460
},
{
"epoch": 0.40629095674967236,
"grad_norm": 2.8138020038604736,
"learning_rate": 0.00019690191257610497,
"loss": 0.3665,
"step": 465
},
{
"epoch": 0.4106596767147226,
"grad_norm": 1.6264874935150146,
"learning_rate": 0.00019683366138426034,
"loss": 0.3598,
"step": 470
},
{
"epoch": 0.4150283966797728,
"grad_norm": 1.6185061931610107,
"learning_rate": 0.00019676467866035525,
"loss": 0.5003,
"step": 475
},
{
"epoch": 0.41939711664482304,
"grad_norm": 1.8654040098190308,
"learning_rate": 0.00019669496492552113,
"loss": 0.397,
"step": 480
},
{
"epoch": 0.4237658366098733,
"grad_norm": 1.2525237798690796,
"learning_rate": 0.00019662452070641205,
"loss": 0.3235,
"step": 485
},
{
"epoch": 0.42813455657492355,
"grad_norm": 1.7755401134490967,
"learning_rate": 0.00019655334653520036,
"loss": 0.2978,
"step": 490
},
{
"epoch": 0.4325032765399738,
"grad_norm": 1.6025470495224,
"learning_rate": 0.00019648144294957297,
"loss": 0.4436,
"step": 495
},
{
"epoch": 0.436871996505024,
"grad_norm": 1.085461974143982,
"learning_rate": 0.00019640881049272713,
"loss": 0.22,
"step": 500
},
{
"epoch": 0.4412407164700743,
"grad_norm": 1.491818904876709,
"learning_rate": 0.00019633544971336636,
"loss": 0.2714,
"step": 505
},
{
"epoch": 0.4456094364351245,
"grad_norm": 0.9479840993881226,
"learning_rate": 0.0001962613611656963,
"loss": 0.3735,
"step": 510
},
{
"epoch": 0.44997815640017474,
"grad_norm": 3.0529448986053467,
"learning_rate": 0.0001961865454094205,
"loss": 0.4779,
"step": 515
},
{
"epoch": 0.45434687636522497,
"grad_norm": 2.831089973449707,
"learning_rate": 0.00019611100300973635,
"loss": 0.469,
"step": 520
},
{
"epoch": 0.45871559633027525,
"grad_norm": 2.1834311485290527,
"learning_rate": 0.00019603473453733052,
"loss": 0.4163,
"step": 525
},
{
"epoch": 0.4630843162953255,
"grad_norm": 1.3152204751968384,
"learning_rate": 0.00019595774056837493,
"loss": 0.3744,
"step": 530
},
{
"epoch": 0.4674530362603757,
"grad_norm": 1.4493387937545776,
"learning_rate": 0.00019588002168452223,
"loss": 0.3117,
"step": 535
},
{
"epoch": 0.47182175622542594,
"grad_norm": 1.1412076950073242,
"learning_rate": 0.00019580157847290147,
"loss": 0.3152,
"step": 540
},
{
"epoch": 0.47619047619047616,
"grad_norm": 1.5004645586013794,
"learning_rate": 0.00019572241152611365,
"loss": 0.3271,
"step": 545
},
{
"epoch": 0.48055919615552645,
"grad_norm": 2.3333992958068848,
"learning_rate": 0.0001956425214422272,
"loss": 0.3626,
"step": 550
},
{
"epoch": 0.4849279161205767,
"grad_norm": 1.5423107147216797,
"learning_rate": 0.0001955619088247736,
"loss": 0.4588,
"step": 555
},
{
"epoch": 0.4892966360856269,
"grad_norm": 3.008280038833618,
"learning_rate": 0.00019548057428274266,
"loss": 0.5275,
"step": 560
},
{
"epoch": 0.49366535605067713,
"grad_norm": 1.0968583822250366,
"learning_rate": 0.00019539851843057798,
"loss": 0.3233,
"step": 565
},
{
"epoch": 0.4980340760157274,
"grad_norm": 1.265228271484375,
"learning_rate": 0.00019531574188817234,
"loss": 0.2743,
"step": 570
},
{
"epoch": 0.5024027959807776,
"grad_norm": 1.9382916688919067,
"learning_rate": 0.000195232245280863,
"loss": 0.3189,
"step": 575
},
{
"epoch": 0.5067715159458279,
"grad_norm": 1.6710058450698853,
"learning_rate": 0.00019514802923942687,
"loss": 0.345,
"step": 580
},
{
"epoch": 0.5111402359108781,
"grad_norm": 1.8377633094787598,
"learning_rate": 0.000195063094400076,
"loss": 0.4441,
"step": 585
},
{
"epoch": 0.5155089558759284,
"grad_norm": 1.432173728942871,
"learning_rate": 0.0001949774414044525,
"loss": 0.3277,
"step": 590
},
{
"epoch": 0.5198776758409785,
"grad_norm": 1.096330165863037,
"learning_rate": 0.0001948910708996239,
"loss": 0.3821,
"step": 595
},
{
"epoch": 0.5242463958060288,
"grad_norm": 1.1951391696929932,
"learning_rate": 0.00019480398353807798,
"loss": 0.4303,
"step": 600
},
{
"epoch": 0.5286151157710791,
"grad_norm": 0.9764880537986755,
"learning_rate": 0.0001947161799777183,
"loss": 0.2693,
"step": 605
},
{
"epoch": 0.5329838357361293,
"grad_norm": 1.2566354274749756,
"learning_rate": 0.00019462766088185874,
"loss": 0.2851,
"step": 610
},
{
"epoch": 0.5373525557011796,
"grad_norm": 1.494903802871704,
"learning_rate": 0.0001945384269192188,
"loss": 0.36,
"step": 615
},
{
"epoch": 0.5417212756662297,
"grad_norm": 1.5508995056152344,
"learning_rate": 0.00019444847876391844,
"loss": 0.3682,
"step": 620
},
{
"epoch": 0.54608999563128,
"grad_norm": 2.227889060974121,
"learning_rate": 0.00019435781709547305,
"loss": 0.3889,
"step": 625
},
{
"epoch": 0.5504587155963303,
"grad_norm": 0.9221494197845459,
"learning_rate": 0.0001942664425987882,
"loss": 0.3375,
"step": 630
},
{
"epoch": 0.5548274355613805,
"grad_norm": 1.3386973142623901,
"learning_rate": 0.00019417435596415458,
"loss": 0.4833,
"step": 635
},
{
"epoch": 0.5591961555264308,
"grad_norm": 1.9686752557754517,
"learning_rate": 0.00019408155788724272,
"loss": 0.4739,
"step": 640
},
{
"epoch": 0.563564875491481,
"grad_norm": 2.3978073596954346,
"learning_rate": 0.00019398804906909777,
"loss": 0.4681,
"step": 645
},
{
"epoch": 0.5679335954565312,
"grad_norm": 1.536699652671814,
"learning_rate": 0.0001938938302161342,
"loss": 0.2684,
"step": 650
},
{
"epoch": 0.5723023154215815,
"grad_norm": 1.691787600517273,
"learning_rate": 0.00019379890204013043,
"loss": 0.3512,
"step": 655
},
{
"epoch": 0.5766710353866317,
"grad_norm": 1.7557870149612427,
"learning_rate": 0.0001937032652582235,
"loss": 0.3423,
"step": 660
},
{
"epoch": 0.581039755351682,
"grad_norm": 1.7950220108032227,
"learning_rate": 0.0001936069205929036,
"loss": 0.2831,
"step": 665
},
{
"epoch": 0.5854084753167322,
"grad_norm": 1.928232192993164,
"learning_rate": 0.00019350986877200867,
"loss": 0.323,
"step": 670
},
{
"epoch": 0.5897771952817824,
"grad_norm": 1.86429762840271,
"learning_rate": 0.00019341211052871887,
"loss": 0.4248,
"step": 675
},
{
"epoch": 0.5941459152468327,
"grad_norm": 2.022738456726074,
"learning_rate": 0.00019331364660155103,
"loss": 0.3411,
"step": 680
},
{
"epoch": 0.598514635211883,
"grad_norm": 1.2337995767593384,
"learning_rate": 0.00019321447773435306,
"loss": 0.2368,
"step": 685
},
{
"epoch": 0.6028833551769331,
"grad_norm": 2.015075445175171,
"learning_rate": 0.00019311460467629843,
"loss": 0.5116,
"step": 690
},
{
"epoch": 0.6072520751419834,
"grad_norm": 1.2344030141830444,
"learning_rate": 0.00019301402818188036,
"loss": 0.3313,
"step": 695
},
{
"epoch": 0.6116207951070336,
"grad_norm": 1.129764437675476,
"learning_rate": 0.00019291274901090625,
"loss": 0.408,
"step": 700
},
{
"epoch": 0.6159895150720839,
"grad_norm": 1.4350385665893555,
"learning_rate": 0.00019281076792849184,
"loss": 0.3729,
"step": 705
},
{
"epoch": 0.6203582350371342,
"grad_norm": 1.9586119651794434,
"learning_rate": 0.00019270808570505553,
"loss": 0.4315,
"step": 710
},
{
"epoch": 0.6247269550021843,
"grad_norm": 1.0157238245010376,
"learning_rate": 0.00019260470311631243,
"loss": 0.2861,
"step": 715
},
{
"epoch": 0.6290956749672346,
"grad_norm": 1.3841652870178223,
"learning_rate": 0.00019250062094326864,
"loss": 0.4037,
"step": 720
},
{
"epoch": 0.6334643949322848,
"grad_norm": 1.848821997642517,
"learning_rate": 0.00019239583997221525,
"loss": 0.3665,
"step": 725
},
{
"epoch": 0.6378331148973351,
"grad_norm": 0.9416481256484985,
"learning_rate": 0.0001922903609947225,
"loss": 0.339,
"step": 730
},
{
"epoch": 0.6422018348623854,
"grad_norm": 1.0696804523468018,
"learning_rate": 0.0001921841848076336,
"loss": 0.2783,
"step": 735
},
{
"epoch": 0.6465705548274355,
"grad_norm": 1.9199622869491577,
"learning_rate": 0.00019207731221305903,
"loss": 0.2904,
"step": 740
},
{
"epoch": 0.6509392747924858,
"grad_norm": 1.347430944442749,
"learning_rate": 0.00019196974401837008,
"loss": 0.2719,
"step": 745
},
{
"epoch": 0.6553079947575361,
"grad_norm": 0.9743670225143433,
"learning_rate": 0.0001918614810361932,
"loss": 0.2748,
"step": 750
},
{
"epoch": 0.6596767147225863,
"grad_norm": 1.4043099880218506,
"learning_rate": 0.00019175252408440343,
"loss": 0.3285,
"step": 755
},
{
"epoch": 0.6640454346876365,
"grad_norm": 2.9343338012695312,
"learning_rate": 0.0001916428739861185,
"loss": 0.4962,
"step": 760
},
{
"epoch": 0.6684141546526867,
"grad_norm": 2.3201515674591064,
"learning_rate": 0.0001915325315696926,
"loss": 0.3243,
"step": 765
},
{
"epoch": 0.672782874617737,
"grad_norm": 1.675564169883728,
"learning_rate": 0.00019142149766870992,
"loss": 0.4596,
"step": 770
},
{
"epoch": 0.6771515945827873,
"grad_norm": 1.664604663848877,
"learning_rate": 0.00019130977312197854,
"loss": 0.3024,
"step": 775
},
{
"epoch": 0.6815203145478375,
"grad_norm": 1.8358148336410522,
"learning_rate": 0.00019119735877352412,
"loss": 0.3862,
"step": 780
},
{
"epoch": 0.6858890345128877,
"grad_norm": 1.3632128238677979,
"learning_rate": 0.00019108425547258328,
"loss": 0.2374,
"step": 785
},
{
"epoch": 0.6902577544779379,
"grad_norm": 2.0279934406280518,
"learning_rate": 0.0001909704640735975,
"loss": 0.4392,
"step": 790
},
{
"epoch": 0.6946264744429882,
"grad_norm": 1.2824902534484863,
"learning_rate": 0.0001908559854362064,
"loss": 0.2782,
"step": 795
},
{
"epoch": 0.6989951944080385,
"grad_norm": 1.3477047681808472,
"learning_rate": 0.00019074082042524145,
"loss": 0.3631,
"step": 800
},
{
"epoch": 0.7033639143730887,
"grad_norm": 1.8478046655654907,
"learning_rate": 0.00019062496991071928,
"loss": 0.3788,
"step": 805
},
{
"epoch": 0.7077326343381389,
"grad_norm": 1.470382571220398,
"learning_rate": 0.0001905084347678352,
"loss": 0.3825,
"step": 810
},
{
"epoch": 0.7121013543031892,
"grad_norm": 2.4951813220977783,
"learning_rate": 0.00019039121587695652,
"loss": 0.3359,
"step": 815
},
{
"epoch": 0.7164700742682394,
"grad_norm": 2.3441359996795654,
"learning_rate": 0.000190273314123616,
"loss": 0.32,
"step": 820
},
{
"epoch": 0.7208387942332897,
"grad_norm": 2.372884750366211,
"learning_rate": 0.00019015473039850513,
"loss": 0.3651,
"step": 825
},
{
"epoch": 0.7252075141983398,
"grad_norm": 2.4474101066589355,
"learning_rate": 0.0001900354655974672,
"loss": 0.4401,
"step": 830
},
{
"epoch": 0.7295762341633901,
"grad_norm": 1.4031054973602295,
"learning_rate": 0.0001899155206214909,
"loss": 0.308,
"step": 835
},
{
"epoch": 0.7339449541284404,
"grad_norm": 1.6008141040802002,
"learning_rate": 0.00018979489637670322,
"loss": 0.2937,
"step": 840
},
{
"epoch": 0.7383136740934906,
"grad_norm": 0.9202178120613098,
"learning_rate": 0.0001896735937743627,
"loss": 0.3157,
"step": 845
},
{
"epoch": 0.7426823940585409,
"grad_norm": 1.024746298789978,
"learning_rate": 0.00018955161373085253,
"loss": 0.2934,
"step": 850
},
{
"epoch": 0.747051114023591,
"grad_norm": 1.1573566198349,
"learning_rate": 0.00018942895716767374,
"loss": 0.3617,
"step": 855
},
{
"epoch": 0.7514198339886413,
"grad_norm": 1.227409839630127,
"learning_rate": 0.00018930562501143805,
"loss": 0.3581,
"step": 860
},
{
"epoch": 0.7557885539536916,
"grad_norm": 1.5460100173950195,
"learning_rate": 0.00018918161819386095,
"loss": 0.3393,
"step": 865
},
{
"epoch": 0.7601572739187418,
"grad_norm": 1.688852310180664,
"learning_rate": 0.0001890569376517548,
"loss": 0.4389,
"step": 870
},
{
"epoch": 0.764525993883792,
"grad_norm": 1.5271598100662231,
"learning_rate": 0.00018893158432702149,
"loss": 0.2915,
"step": 875
},
{
"epoch": 0.7688947138488423,
"grad_norm": 1.695788860321045,
"learning_rate": 0.00018880555916664555,
"loss": 0.4026,
"step": 880
},
{
"epoch": 0.7732634338138925,
"grad_norm": 1.6879792213439941,
"learning_rate": 0.00018867886312268683,
"loss": 0.2857,
"step": 885
},
{
"epoch": 0.7776321537789428,
"grad_norm": 2.0718719959259033,
"learning_rate": 0.00018855149715227344,
"loss": 0.4236,
"step": 890
},
{
"epoch": 0.782000873743993,
"grad_norm": 1.5112775564193726,
"learning_rate": 0.00018842346221759448,
"loss": 0.325,
"step": 895
},
{
"epoch": 0.7863695937090432,
"grad_norm": 1.2844749689102173,
"learning_rate": 0.00018829475928589271,
"loss": 0.3782,
"step": 900
},
{
"epoch": 0.7907383136740935,
"grad_norm": 2.150299072265625,
"learning_rate": 0.00018816538932945728,
"loss": 0.3726,
"step": 905
},
{
"epoch": 0.7951070336391437,
"grad_norm": 1.7050650119781494,
"learning_rate": 0.00018803535332561646,
"loss": 0.3824,
"step": 910
},
{
"epoch": 0.799475753604194,
"grad_norm": 1.8164982795715332,
"learning_rate": 0.00018790465225673012,
"loss": 0.3664,
"step": 915
},
{
"epoch": 0.8038444735692443,
"grad_norm": 1.1102941036224365,
"learning_rate": 0.00018777328711018244,
"loss": 0.3166,
"step": 920
},
{
"epoch": 0.8082131935342944,
"grad_norm": 1.4220764636993408,
"learning_rate": 0.0001876412588783743,
"loss": 0.3049,
"step": 925
},
{
"epoch": 0.8125819134993447,
"grad_norm": 2.11336088180542,
"learning_rate": 0.000187508568558716,
"loss": 0.3076,
"step": 930
},
{
"epoch": 0.8169506334643949,
"grad_norm": 1.9948710203170776,
"learning_rate": 0.00018737521715361948,
"loss": 0.3846,
"step": 935
},
{
"epoch": 0.8213193534294452,
"grad_norm": 1.8913676738739014,
"learning_rate": 0.00018724120567049094,
"loss": 0.4296,
"step": 940
},
{
"epoch": 0.8256880733944955,
"grad_norm": 1.3633447885513306,
"learning_rate": 0.0001871065351217231,
"loss": 0.3569,
"step": 945
},
{
"epoch": 0.8300567933595456,
"grad_norm": 1.4957417249679565,
"learning_rate": 0.00018697120652468762,
"loss": 0.3085,
"step": 950
},
{
"epoch": 0.8344255133245959,
"grad_norm": 2.076399803161621,
"learning_rate": 0.0001868352209017275,
"loss": 0.3331,
"step": 955
},
{
"epoch": 0.8387942332896461,
"grad_norm": 1.1817855834960938,
"learning_rate": 0.00018669857928014906,
"loss": 0.3414,
"step": 960
},
{
"epoch": 0.8431629532546964,
"grad_norm": 1.4255414009094238,
"learning_rate": 0.00018656128269221454,
"loss": 0.2782,
"step": 965
},
{
"epoch": 0.8475316732197467,
"grad_norm": 1.326687216758728,
"learning_rate": 0.0001864233321751341,
"loss": 0.2998,
"step": 970
},
{
"epoch": 0.8519003931847968,
"grad_norm": 2.222280263900757,
"learning_rate": 0.00018628472877105793,
"loss": 0.3348,
"step": 975
},
{
"epoch": 0.8562691131498471,
"grad_norm": 1.518401026725769,
"learning_rate": 0.00018614547352706863,
"loss": 0.3816,
"step": 980
},
{
"epoch": 0.8606378331148974,
"grad_norm": 1.1030207872390747,
"learning_rate": 0.00018600556749517305,
"loss": 0.3222,
"step": 985
},
{
"epoch": 0.8650065530799476,
"grad_norm": 2.406994104385376,
"learning_rate": 0.00018586501173229437,
"loss": 0.3754,
"step": 990
},
{
"epoch": 0.8693752730449978,
"grad_norm": 1.2401646375656128,
"learning_rate": 0.00018572380730026434,
"loss": 0.4402,
"step": 995
},
{
"epoch": 0.873743993010048,
"grad_norm": 2.0233402252197266,
"learning_rate": 0.0001855819552658149,
"loss": 0.3323,
"step": 1000
},
{
"epoch": 0.8781127129750983,
"grad_norm": 1.5329450368881226,
"learning_rate": 0.00018543945670057045,
"loss": 0.235,
"step": 1005
},
{
"epoch": 0.8824814329401486,
"grad_norm": 1.8849459886550903,
"learning_rate": 0.00018529631268103964,
"loss": 0.357,
"step": 1010
},
{
"epoch": 0.8868501529051988,
"grad_norm": 2.016646146774292,
"learning_rate": 0.0001851525242886071,
"loss": 0.2663,
"step": 1015
},
{
"epoch": 0.891218872870249,
"grad_norm": 2.3272440433502197,
"learning_rate": 0.0001850080926095255,
"loss": 0.2926,
"step": 1020
},
{
"epoch": 0.8955875928352992,
"grad_norm": 1.7760261297225952,
"learning_rate": 0.00018486301873490713,
"loss": 0.4155,
"step": 1025
},
{
"epoch": 0.8999563128003495,
"grad_norm": 1.4679979085922241,
"learning_rate": 0.0001847173037607159,
"loss": 0.2877,
"step": 1030
},
{
"epoch": 0.9043250327653998,
"grad_norm": 1.8398054838180542,
"learning_rate": 0.0001845709487877588,
"loss": 0.2856,
"step": 1035
},
{
"epoch": 0.9086937527304499,
"grad_norm": 3.05880069732666,
"learning_rate": 0.00018442395492167775,
"loss": 0.3373,
"step": 1040
},
{
"epoch": 0.9130624726955002,
"grad_norm": 1.2527328729629517,
"learning_rate": 0.0001842763232729412,
"loss": 0.2412,
"step": 1045
},
{
"epoch": 0.9174311926605505,
"grad_norm": 1.7745814323425293,
"learning_rate": 0.00018412805495683575,
"loss": 0.3955,
"step": 1050
},
{
"epoch": 0.9217999126256007,
"grad_norm": 3.2864468097686768,
"learning_rate": 0.0001839791510934577,
"loss": 0.333,
"step": 1055
},
{
"epoch": 0.926168632590651,
"grad_norm": 2.0274927616119385,
"learning_rate": 0.0001838296128077046,
"loss": 0.4004,
"step": 1060
},
{
"epoch": 0.9305373525557011,
"grad_norm": 1.9851633310317993,
"learning_rate": 0.0001836794412292668,
"loss": 0.3132,
"step": 1065
},
{
"epoch": 0.9349060725207514,
"grad_norm": 1.3309999704360962,
"learning_rate": 0.00018352863749261883,
"loss": 0.2645,
"step": 1070
},
{
"epoch": 0.9392747924858017,
"grad_norm": 2.0173072814941406,
"learning_rate": 0.00018337720273701088,
"loss": 0.4376,
"step": 1075
},
{
"epoch": 0.9436435124508519,
"grad_norm": 1.815408706665039,
"learning_rate": 0.00018322513810646024,
"loss": 0.2851,
"step": 1080
},
{
"epoch": 0.9480122324159022,
"grad_norm": 1.1190584897994995,
"learning_rate": 0.00018307244474974254,
"loss": 0.4664,
"step": 1085
},
{
"epoch": 0.9523809523809523,
"grad_norm": 0.9746566414833069,
"learning_rate": 0.00018291912382038317,
"loss": 0.3816,
"step": 1090
},
{
"epoch": 0.9567496723460026,
"grad_norm": 1.9062715768814087,
"learning_rate": 0.0001827651764766485,
"loss": 0.3031,
"step": 1095
},
{
"epoch": 0.9611183923110529,
"grad_norm": 1.027502417564392,
"learning_rate": 0.00018261060388153718,
"loss": 0.2657,
"step": 1100
},
{
"epoch": 0.9654871122761031,
"grad_norm": 2.239164352416992,
"learning_rate": 0.00018245540720277135,
"loss": 0.3367,
"step": 1105
},
{
"epoch": 0.9698558322411533,
"grad_norm": 1.5922635793685913,
"learning_rate": 0.0001822995876127878,
"loss": 0.3044,
"step": 1110
},
{
"epoch": 0.9742245522062036,
"grad_norm": 1.9189236164093018,
"learning_rate": 0.00018214314628872905,
"loss": 0.3326,
"step": 1115
},
{
"epoch": 0.9785932721712538,
"grad_norm": 1.1626375913619995,
"learning_rate": 0.00018198608441243467,
"loss": 0.2761,
"step": 1120
},
{
"epoch": 0.9829619921363041,
"grad_norm": 1.805367112159729,
"learning_rate": 0.00018182840317043202,
"loss": 0.3337,
"step": 1125
},
{
"epoch": 0.9873307121013543,
"grad_norm": 1.5879418849945068,
"learning_rate": 0.0001816701037539277,
"loss": 0.3242,
"step": 1130
},
{
"epoch": 0.9916994320664045,
"grad_norm": 1.3560898303985596,
"learning_rate": 0.00018151118735879805,
"loss": 0.2794,
"step": 1135
},
{
"epoch": 0.9960681520314548,
"grad_norm": 1.0656763315200806,
"learning_rate": 0.0001813516551855806,
"loss": 0.3336,
"step": 1140
},
{
"epoch": 1.0,
"grad_norm": 3.2105913162231445,
"learning_rate": 0.00018119150843946472,
"loss": 0.3753,
"step": 1145
},
{
"epoch": 1.0043687199650502,
"grad_norm": 1.2890548706054688,
"learning_rate": 0.00018103074833028258,
"loss": 0.2943,
"step": 1150
},
{
"epoch": 1.0087374399301006,
"grad_norm": 1.8480075597763062,
"learning_rate": 0.00018086937607250002,
"loss": 0.3057,
"step": 1155
},
{
"epoch": 1.0131061598951507,
"grad_norm": 1.3526337146759033,
"learning_rate": 0.00018070739288520736,
"loss": 0.2328,
"step": 1160
},
{
"epoch": 1.017474879860201,
"grad_norm": 1.0462696552276611,
"learning_rate": 0.00018054479999211025,
"loss": 0.1896,
"step": 1165
},
{
"epoch": 1.0218435998252513,
"grad_norm": 0.9586630463600159,
"learning_rate": 0.00018038159862152027,
"loss": 0.2567,
"step": 1170
},
{
"epoch": 1.0262123197903015,
"grad_norm": 1.0181586742401123,
"learning_rate": 0.0001802177900063459,
"loss": 0.2653,
"step": 1175
},
{
"epoch": 1.0305810397553516,
"grad_norm": 1.2034084796905518,
"learning_rate": 0.0001800533753840829,
"loss": 0.3953,
"step": 1180
},
{
"epoch": 1.0349497597204018,
"grad_norm": 2.6563191413879395,
"learning_rate": 0.0001798883559968053,
"loss": 0.185,
"step": 1185
},
{
"epoch": 1.0393184796854522,
"grad_norm": 1.2556034326553345,
"learning_rate": 0.00017972273309115568,
"loss": 0.2452,
"step": 1190
},
{
"epoch": 1.0436871996505024,
"grad_norm": 1.4751702547073364,
"learning_rate": 0.00017955650791833604,
"loss": 0.2827,
"step": 1195
},
{
"epoch": 1.0480559196155526,
"grad_norm": 3.8620717525482178,
"learning_rate": 0.00017938968173409811,
"loss": 0.2953,
"step": 1200
},
{
"epoch": 1.052424639580603,
"grad_norm": 1.2123383283615112,
"learning_rate": 0.00017922225579873407,
"loss": 0.2165,
"step": 1205
},
{
"epoch": 1.0567933595456531,
"grad_norm": 1.911566972732544,
"learning_rate": 0.0001790542313770669,
"loss": 0.2444,
"step": 1210
},
{
"epoch": 1.0611620795107033,
"grad_norm": 1.9949162006378174,
"learning_rate": 0.00017888560973844083,
"loss": 0.255,
"step": 1215
},
{
"epoch": 1.0655307994757537,
"grad_norm": 0.9666941165924072,
"learning_rate": 0.0001787163921567118,
"loss": 0.1913,
"step": 1220
},
{
"epoch": 1.0698995194408039,
"grad_norm": 0.7195447087287903,
"learning_rate": 0.0001785465799102378,
"loss": 0.2541,
"step": 1225
},
{
"epoch": 1.074268239405854,
"grad_norm": 1.5414377450942993,
"learning_rate": 0.0001783761742818693,
"loss": 0.3626,
"step": 1230
},
{
"epoch": 1.0786369593709044,
"grad_norm": 1.2173676490783691,
"learning_rate": 0.0001782051765589394,
"loss": 0.2913,
"step": 1235
},
{
"epoch": 1.0830056793359546,
"grad_norm": 1.6966580152511597,
"learning_rate": 0.00017803358803325416,
"loss": 0.2613,
"step": 1240
},
{
"epoch": 1.0873743993010048,
"grad_norm": 1.8033946752548218,
"learning_rate": 0.00017786141000108302,
"loss": 0.2734,
"step": 1245
},
{
"epoch": 1.091743119266055,
"grad_norm": 1.1834598779678345,
"learning_rate": 0.00017768864376314873,
"loss": 0.2548,
"step": 1250
},
{
"epoch": 1.0961118392311053,
"grad_norm": 1.441835641860962,
"learning_rate": 0.00017751529062461777,
"loss": 0.3404,
"step": 1255
},
{
"epoch": 1.1004805591961555,
"grad_norm": 1.443575382232666,
"learning_rate": 0.0001773413518950902,
"loss": 0.312,
"step": 1260
},
{
"epoch": 1.1048492791612057,
"grad_norm": 1.2344982624053955,
"learning_rate": 0.0001771668288885901,
"loss": 0.2594,
"step": 1265
},
{
"epoch": 1.109217999126256,
"grad_norm": 1.3508884906768799,
"learning_rate": 0.0001769917229235554,
"loss": 0.3467,
"step": 1270
},
{
"epoch": 1.1135867190913062,
"grad_norm": 1.3105831146240234,
"learning_rate": 0.00017681603532282805,
"loss": 0.2393,
"step": 1275
},
{
"epoch": 1.1179554390563564,
"grad_norm": 1.3476370573043823,
"learning_rate": 0.00017663976741364394,
"loss": 0.3318,
"step": 1280
},
{
"epoch": 1.1223241590214068,
"grad_norm": 1.9715685844421387,
"learning_rate": 0.00017646292052762296,
"loss": 0.2808,
"step": 1285
},
{
"epoch": 1.126692878986457,
"grad_norm": 1.2339669466018677,
"learning_rate": 0.00017628549600075884,
"loss": 0.2753,
"step": 1290
},
{
"epoch": 1.1310615989515072,
"grad_norm": 1.3921184539794922,
"learning_rate": 0.00017610749517340914,
"loss": 0.2096,
"step": 1295
},
{
"epoch": 1.1354303189165575,
"grad_norm": 1.3537594079971313,
"learning_rate": 0.0001759289193902851,
"loss": 0.2232,
"step": 1300
},
{
"epoch": 1.1397990388816077,
"grad_norm": 2.207932472229004,
"learning_rate": 0.00017574977000044147,
"loss": 0.4179,
"step": 1305
},
{
"epoch": 1.144167758846658,
"grad_norm": 2.4464988708496094,
"learning_rate": 0.0001755700483572663,
"loss": 0.3863,
"step": 1310
},
{
"epoch": 1.148536478811708,
"grad_norm": 1.2169779539108276,
"learning_rate": 0.00017538975581847077,
"loss": 0.2131,
"step": 1315
},
{
"epoch": 1.1529051987767585,
"grad_norm": 1.2162644863128662,
"learning_rate": 0.00017520889374607893,
"loss": 0.2299,
"step": 1320
},
{
"epoch": 1.1572739187418086,
"grad_norm": 1.7051069736480713,
"learning_rate": 0.0001750274635064173,
"loss": 0.2994,
"step": 1325
},
{
"epoch": 1.1616426387068588,
"grad_norm": 1.814450740814209,
"learning_rate": 0.00017484546647010473,
"loss": 0.2948,
"step": 1330
},
{
"epoch": 1.1660113586719092,
"grad_norm": 1.3929287195205688,
"learning_rate": 0.00017466290401204186,
"loss": 0.3837,
"step": 1335
},
{
"epoch": 1.1703800786369594,
"grad_norm": 1.1973505020141602,
"learning_rate": 0.00017447977751140086,
"loss": 0.2335,
"step": 1340
},
{
"epoch": 1.1747487986020095,
"grad_norm": 1.4987421035766602,
"learning_rate": 0.00017429608835161506,
"loss": 0.2484,
"step": 1345
},
{
"epoch": 1.17911751856706,
"grad_norm": 1.3741153478622437,
"learning_rate": 0.00017411183792036822,
"loss": 0.2475,
"step": 1350
},
{
"epoch": 1.18348623853211,
"grad_norm": 1.8292722702026367,
"learning_rate": 0.0001739270276095844,
"loss": 0.2898,
"step": 1355
},
{
"epoch": 1.1878549584971603,
"grad_norm": 1.854209065437317,
"learning_rate": 0.00017374165881541717,
"loss": 0.3992,
"step": 1360
},
{
"epoch": 1.1922236784622107,
"grad_norm": 1.6196980476379395,
"learning_rate": 0.0001735557329382393,
"loss": 0.3053,
"step": 1365
},
{
"epoch": 1.1965923984272608,
"grad_norm": 1.6935441493988037,
"learning_rate": 0.00017336925138263195,
"loss": 0.239,
"step": 1370
},
{
"epoch": 1.200961118392311,
"grad_norm": 1.7809889316558838,
"learning_rate": 0.00017318221555737422,
"loss": 0.2152,
"step": 1375
},
{
"epoch": 1.2053298383573612,
"grad_norm": 2.3215832710266113,
"learning_rate": 0.0001729946268754324,
"loss": 0.3185,
"step": 1380
},
{
"epoch": 1.2096985583224116,
"grad_norm": 1.347947120666504,
"learning_rate": 0.00017280648675394947,
"loss": 0.3085,
"step": 1385
},
{
"epoch": 1.2140672782874617,
"grad_norm": 1.3840276002883911,
"learning_rate": 0.00017261779661423407,
"loss": 0.2016,
"step": 1390
},
{
"epoch": 1.218435998252512,
"grad_norm": 1.2935765981674194,
"learning_rate": 0.00017242855788175015,
"loss": 0.2063,
"step": 1395
},
{
"epoch": 1.2228047182175623,
"grad_norm": 1.2883801460266113,
"learning_rate": 0.00017223877198610591,
"loss": 0.2181,
"step": 1400
},
{
"epoch": 1.2271734381826125,
"grad_norm": 1.3021811246871948,
"learning_rate": 0.00017204844036104318,
"loss": 0.2283,
"step": 1405
},
{
"epoch": 1.2315421581476627,
"grad_norm": 1.1819430589675903,
"learning_rate": 0.00017185756444442648,
"loss": 0.2652,
"step": 1410
},
{
"epoch": 1.235910878112713,
"grad_norm": 2.0612573623657227,
"learning_rate": 0.00017166614567823212,
"loss": 0.2977,
"step": 1415
},
{
"epoch": 1.2402795980777632,
"grad_norm": 3.0888679027557373,
"learning_rate": 0.00017147418550853756,
"loss": 0.2682,
"step": 1420
},
{
"epoch": 1.2446483180428134,
"grad_norm": 2.311062812805176,
"learning_rate": 0.0001712816853855101,
"loss": 0.3329,
"step": 1425
},
{
"epoch": 1.2490170380078638,
"grad_norm": 1.2064367532730103,
"learning_rate": 0.00017108864676339627,
"loss": 0.2065,
"step": 1430
},
{
"epoch": 1.253385757972914,
"grad_norm": 1.4042255878448486,
"learning_rate": 0.00017089507110051066,
"loss": 0.1738,
"step": 1435
},
{
"epoch": 1.2577544779379641,
"grad_norm": 2.3508129119873047,
"learning_rate": 0.00017070095985922493,
"loss": 0.403,
"step": 1440
},
{
"epoch": 1.2621231979030143,
"grad_norm": 1.2386358976364136,
"learning_rate": 0.0001705063145059568,
"loss": 0.272,
"step": 1445
},
{
"epoch": 1.2664919178680647,
"grad_norm": 0.806268036365509,
"learning_rate": 0.00017031113651115893,
"loss": 0.2549,
"step": 1450
},
{
"epoch": 1.2708606378331149,
"grad_norm": 1.6991655826568604,
"learning_rate": 0.00017011542734930786,
"loss": 0.2331,
"step": 1455
},
{
"epoch": 1.2752293577981653,
"grad_norm": 1.1343746185302734,
"learning_rate": 0.00016991918849889283,
"loss": 0.3112,
"step": 1460
},
{
"epoch": 1.2795980777632154,
"grad_norm": 1.3381041288375854,
"learning_rate": 0.00016972242144240463,
"loss": 0.1974,
"step": 1465
},
{
"epoch": 1.2839667977282656,
"grad_norm": 3.6427793502807617,
"learning_rate": 0.00016952512766632439,
"loss": 0.2315,
"step": 1470
},
{
"epoch": 1.2883355176933158,
"grad_norm": 1.8139313459396362,
"learning_rate": 0.0001693273086611123,
"loss": 0.1771,
"step": 1475
},
{
"epoch": 1.2927042376583662,
"grad_norm": 1.205609679222107,
"learning_rate": 0.00016912896592119654,
"loss": 0.2551,
"step": 1480
},
{
"epoch": 1.2970729576234163,
"grad_norm": 1.355162262916565,
"learning_rate": 0.00016893010094496172,
"loss": 0.2452,
"step": 1485
},
{
"epoch": 1.3014416775884665,
"grad_norm": 1.2561094760894775,
"learning_rate": 0.00016873071523473777,
"loss": 0.2163,
"step": 1490
},
{
"epoch": 1.305810397553517,
"grad_norm": 1.3165076971054077,
"learning_rate": 0.00016853081029678853,
"loss": 0.3273,
"step": 1495
},
{
"epoch": 1.310179117518567,
"grad_norm": 1.8802030086517334,
"learning_rate": 0.00016833038764130028,
"loss": 0.3797,
"step": 1500
},
{
"epoch": 1.3145478374836173,
"grad_norm": 1.7062153816223145,
"learning_rate": 0.0001681294487823704,
"loss": 0.2989,
"step": 1505
},
{
"epoch": 1.3189165574486674,
"grad_norm": 2.0729176998138428,
"learning_rate": 0.00016792799523799613,
"loss": 0.2587,
"step": 1510
},
{
"epoch": 1.3232852774137178,
"grad_norm": 1.129841685295105,
"learning_rate": 0.00016772602853006268,
"loss": 0.2201,
"step": 1515
},
{
"epoch": 1.327653997378768,
"grad_norm": 1.2515584230422974,
"learning_rate": 0.00016752355018433206,
"loss": 0.2397,
"step": 1520
},
{
"epoch": 1.3320227173438184,
"grad_norm": 1.2597646713256836,
"learning_rate": 0.0001673205617304315,
"loss": 0.2157,
"step": 1525
},
{
"epoch": 1.3363914373088686,
"grad_norm": 1.8813763856887817,
"learning_rate": 0.0001671170647018418,
"loss": 0.2765,
"step": 1530
},
{
"epoch": 1.3407601572739187,
"grad_norm": 2.208132266998291,
"learning_rate": 0.00016691306063588583,
"loss": 0.2258,
"step": 1535
},
{
"epoch": 1.345128877238969,
"grad_norm": 1.9504673480987549,
"learning_rate": 0.00016670855107371683,
"loss": 0.2779,
"step": 1540
},
{
"epoch": 1.3494975972040193,
"grad_norm": 2.171309471130371,
"learning_rate": 0.00016650353756030692,
"loss": 0.3031,
"step": 1545
},
{
"epoch": 1.3538663171690695,
"grad_norm": 2.3320510387420654,
"learning_rate": 0.00016629802164443519,
"loss": 0.3288,
"step": 1550
},
{
"epoch": 1.3582350371341196,
"grad_norm": 1.4883947372436523,
"learning_rate": 0.0001660920048786763,
"loss": 0.2416,
"step": 1555
},
{
"epoch": 1.36260375709917,
"grad_norm": 1.1198906898498535,
"learning_rate": 0.00016588548881938845,
"loss": 0.2337,
"step": 1560
},
{
"epoch": 1.3669724770642202,
"grad_norm": 1.4867557287216187,
"learning_rate": 0.0001656784750267019,
"loss": 0.3154,
"step": 1565
},
{
"epoch": 1.3713411970292704,
"grad_norm": 2.2435972690582275,
"learning_rate": 0.0001654709650645069,
"loss": 0.245,
"step": 1570
},
{
"epoch": 1.3757099169943205,
"grad_norm": 2.2508065700531006,
"learning_rate": 0.00016526296050044215,
"loss": 0.3097,
"step": 1575
},
{
"epoch": 1.380078636959371,
"grad_norm": 0.8681387901306152,
"learning_rate": 0.00016505446290588277,
"loss": 0.295,
"step": 1580
},
{
"epoch": 1.3844473569244211,
"grad_norm": 1.143965244293213,
"learning_rate": 0.00016484547385592848,
"loss": 0.2534,
"step": 1585
},
{
"epoch": 1.3888160768894715,
"grad_norm": 2.3972761631011963,
"learning_rate": 0.00016463599492939177,
"loss": 0.2527,
"step": 1590
},
{
"epoch": 1.3931847968545217,
"grad_norm": 2.7718844413757324,
"learning_rate": 0.00016442602770878586,
"loss": 0.2697,
"step": 1595
},
{
"epoch": 1.3975535168195719,
"grad_norm": 1.7743192911148071,
"learning_rate": 0.00016421557378031279,
"loss": 0.2784,
"step": 1600
},
{
"epoch": 1.401922236784622,
"grad_norm": 1.879279613494873,
"learning_rate": 0.0001640046347338515,
"loss": 0.2828,
"step": 1605
},
{
"epoch": 1.4062909567496724,
"grad_norm": 1.5005972385406494,
"learning_rate": 0.00016379321216294574,
"loss": 0.2161,
"step": 1610
},
{
"epoch": 1.4106596767147226,
"grad_norm": 1.1289541721343994,
"learning_rate": 0.00016358130766479202,
"loss": 0.1995,
"step": 1615
},
{
"epoch": 1.4150283966797728,
"grad_norm": 0.7074740529060364,
"learning_rate": 0.0001633689228402276,
"loss": 0.2569,
"step": 1620
},
{
"epoch": 1.4193971166448232,
"grad_norm": 1.4253774881362915,
"learning_rate": 0.00016315605929371842,
"loss": 0.3133,
"step": 1625
},
{
"epoch": 1.4237658366098733,
"grad_norm": 1.6986387968063354,
"learning_rate": 0.0001629427186333469,
"loss": 0.2671,
"step": 1630
},
{
"epoch": 1.4281345565749235,
"grad_norm": 1.194734811782837,
"learning_rate": 0.0001627289024707998,
"loss": 0.2592,
"step": 1635
},
{
"epoch": 1.4325032765399737,
"grad_norm": 1.6335885524749756,
"learning_rate": 0.00016251461242135616,
"loss": 0.2628,
"step": 1640
},
{
"epoch": 1.436871996505024,
"grad_norm": 1.417205572128296,
"learning_rate": 0.0001622998501038749,
"loss": 0.2129,
"step": 1645
},
{
"epoch": 1.4412407164700742,
"grad_norm": 1.0214331150054932,
"learning_rate": 0.0001620846171407828,
"loss": 0.2505,
"step": 1650
},
{
"epoch": 1.4456094364351246,
"grad_norm": 0.9859942197799683,
"learning_rate": 0.000161868915158062,
"loss": 0.1947,
"step": 1655
},
{
"epoch": 1.4499781564001748,
"grad_norm": 1.9299548864364624,
"learning_rate": 0.00016165274578523807,
"loss": 0.2716,
"step": 1660
},
{
"epoch": 1.454346876365225,
"grad_norm": 1.4493621587753296,
"learning_rate": 0.00016143611065536727,
"loss": 0.2066,
"step": 1665
},
{
"epoch": 1.4587155963302751,
"grad_norm": 1.4922335147857666,
"learning_rate": 0.00016121901140502456,
"loss": 0.291,
"step": 1670
},
{
"epoch": 1.4630843162953255,
"grad_norm": 0.9710771441459656,
"learning_rate": 0.00016100144967429113,
"loss": 0.207,
"step": 1675
},
{
"epoch": 1.4674530362603757,
"grad_norm": 2.2792580127716064,
"learning_rate": 0.0001607834271067419,
"loss": 0.1705,
"step": 1680
},
{
"epoch": 1.4718217562254259,
"grad_norm": 2.3811166286468506,
"learning_rate": 0.00016056494534943323,
"loss": 0.2994,
"step": 1685
},
{
"epoch": 1.4761904761904763,
"grad_norm": 1.6609042882919312,
"learning_rate": 0.00016034600605289046,
"loss": 0.2,
"step": 1690
},
{
"epoch": 1.4805591961555264,
"grad_norm": 2.835557222366333,
"learning_rate": 0.0001601266108710954,
"loss": 0.3283,
"step": 1695
},
{
"epoch": 1.4849279161205766,
"grad_norm": 1.5667732954025269,
"learning_rate": 0.00015990676146147384,
"loss": 0.2685,
"step": 1700
},
{
"epoch": 1.4892966360856268,
"grad_norm": 1.7098650932312012,
"learning_rate": 0.0001596864594848831,
"loss": 0.2466,
"step": 1705
},
{
"epoch": 1.4936653560506772,
"grad_norm": 1.5239036083221436,
"learning_rate": 0.00015946570660559933,
"loss": 0.2577,
"step": 1710
},
{
"epoch": 1.4980340760157274,
"grad_norm": 1.0807280540466309,
"learning_rate": 0.00015924450449130513,
"loss": 0.2017,
"step": 1715
},
{
"epoch": 1.5024027959807777,
"grad_norm": 1.5457370281219482,
"learning_rate": 0.0001590228548130768,
"loss": 0.2695,
"step": 1720
},
{
"epoch": 1.506771515945828,
"grad_norm": 1.1175668239593506,
"learning_rate": 0.00015880075924537185,
"loss": 0.2727,
"step": 1725
},
{
"epoch": 1.511140235910878,
"grad_norm": 0.9887433052062988,
"learning_rate": 0.00015857821946601615,
"loss": 0.2561,
"step": 1730
},
{
"epoch": 1.5155089558759283,
"grad_norm": 1.205710768699646,
"learning_rate": 0.00015835523715619144,
"loss": 0.2441,
"step": 1735
},
{
"epoch": 1.5198776758409784,
"grad_norm": 1.3492207527160645,
"learning_rate": 0.00015813181400042262,
"loss": 0.2832,
"step": 1740
},
{
"epoch": 1.5242463958060288,
"grad_norm": 1.2649726867675781,
"learning_rate": 0.00015790795168656486,
"loss": 0.2393,
"step": 1745
},
{
"epoch": 1.5286151157710792,
"grad_norm": 2.4311208724975586,
"learning_rate": 0.00015768365190579103,
"loss": 0.2831,
"step": 1750
},
{
"epoch": 1.5329838357361294,
"grad_norm": 1.1982014179229736,
"learning_rate": 0.00015745891635257885,
"loss": 0.2893,
"step": 1755
},
{
"epoch": 1.5373525557011796,
"grad_norm": 1.842958688735962,
"learning_rate": 0.0001572337467246981,
"loss": 0.2158,
"step": 1760
},
{
"epoch": 1.5417212756662297,
"grad_norm": 1.9764633178710938,
"learning_rate": 0.00015700814472319774,
"loss": 0.3043,
"step": 1765
},
{
"epoch": 1.54608999563128,
"grad_norm": 1.0637012720108032,
"learning_rate": 0.00015678211205239314,
"loss": 0.258,
"step": 1770
},
{
"epoch": 1.5504587155963303,
"grad_norm": 1.0332783460617065,
"learning_rate": 0.00015655565041985318,
"loss": 0.2373,
"step": 1775
},
{
"epoch": 1.5548274355613805,
"grad_norm": 1.965062141418457,
"learning_rate": 0.00015632876153638732,
"loss": 0.1725,
"step": 1780
},
{
"epoch": 1.5591961555264309,
"grad_norm": 0.6934140920639038,
"learning_rate": 0.00015610144711603272,
"loss": 0.2201,
"step": 1785
},
{
"epoch": 1.563564875491481,
"grad_norm": 1.2010079622268677,
"learning_rate": 0.00015587370887604123,
"loss": 0.1796,
"step": 1790
},
{
"epoch": 1.5679335954565312,
"grad_norm": 1.765724539756775,
"learning_rate": 0.00015564554853686645,
"loss": 0.2457,
"step": 1795
},
{
"epoch": 1.5723023154215814,
"grad_norm": 1.848937749862671,
"learning_rate": 0.00015541696782215084,
"loss": 0.2096,
"step": 1800
},
{
"epoch": 1.5766710353866316,
"grad_norm": 1.9904989004135132,
"learning_rate": 0.00015518796845871247,
"loss": 0.2651,
"step": 1805
},
{
"epoch": 1.581039755351682,
"grad_norm": 0.8786830902099609,
"learning_rate": 0.00015495855217653216,
"loss": 0.2256,
"step": 1810
},
{
"epoch": 1.5854084753167323,
"grad_norm": 2.0665247440338135,
"learning_rate": 0.00015472872070874033,
"loss": 0.3413,
"step": 1815
},
{
"epoch": 1.5897771952817825,
"grad_norm": 2.08548903465271,
"learning_rate": 0.000154498475791604,
"loss": 0.2753,
"step": 1820
},
{
"epoch": 1.5941459152468327,
"grad_norm": 1.4281671047210693,
"learning_rate": 0.00015426781916451346,
"loss": 0.2527,
"step": 1825
},
{
"epoch": 1.5985146352118829,
"grad_norm": 0.9327197074890137,
"learning_rate": 0.00015403675256996942,
"loss": 0.1906,
"step": 1830
},
{
"epoch": 1.602883355176933,
"grad_norm": 2.447357177734375,
"learning_rate": 0.00015380527775356962,
"loss": 0.2377,
"step": 1835
},
{
"epoch": 1.6072520751419834,
"grad_norm": 1.3971562385559082,
"learning_rate": 0.00015357339646399578,
"loss": 0.2307,
"step": 1840
},
{
"epoch": 1.6116207951070336,
"grad_norm": 1.4695461988449097,
"learning_rate": 0.00015334111045300022,
"loss": 0.2273,
"step": 1845
},
{
"epoch": 1.615989515072084,
"grad_norm": 2.3516275882720947,
"learning_rate": 0.0001531084214753928,
"loss": 0.3006,
"step": 1850
},
{
"epoch": 1.6203582350371342,
"grad_norm": 1.7472774982452393,
"learning_rate": 0.00015287533128902764,
"loss": 0.2943,
"step": 1855
},
{
"epoch": 1.6247269550021843,
"grad_norm": 2.3994486331939697,
"learning_rate": 0.00015264184165478977,
"loss": 0.2277,
"step": 1860
},
{
"epoch": 1.6290956749672345,
"grad_norm": 2.234163522720337,
"learning_rate": 0.00015240795433658187,
"loss": 0.2319,
"step": 1865
},
{
"epoch": 1.6334643949322847,
"grad_norm": 1.5536202192306519,
"learning_rate": 0.00015217367110131086,
"loss": 0.1685,
"step": 1870
},
{
"epoch": 1.637833114897335,
"grad_norm": 1.564252257347107,
"learning_rate": 0.0001519389937188747,
"loss": 0.2817,
"step": 1875
},
{
"epoch": 1.6422018348623855,
"grad_norm": 1.6615930795669556,
"learning_rate": 0.00015170392396214897,
"loss": 0.2409,
"step": 1880
},
{
"epoch": 1.6465705548274356,
"grad_norm": 2.7048113346099854,
"learning_rate": 0.00015146846360697332,
"loss": 0.3594,
"step": 1885
},
{
"epoch": 1.6509392747924858,
"grad_norm": 1.7237365245819092,
"learning_rate": 0.00015123261443213837,
"loss": 0.2692,
"step": 1890
},
{
"epoch": 1.655307994757536,
"grad_norm": 0.9304201602935791,
"learning_rate": 0.00015099637821937192,
"loss": 0.2457,
"step": 1895
},
{
"epoch": 1.6596767147225862,
"grad_norm": 1.8771346807479858,
"learning_rate": 0.00015075975675332573,
"loss": 0.2152,
"step": 1900
},
{
"epoch": 1.6640454346876365,
"grad_norm": 1.0985363721847534,
"learning_rate": 0.00015052275182156198,
"loss": 0.2191,
"step": 1905
},
{
"epoch": 1.6684141546526867,
"grad_norm": 1.3073694705963135,
"learning_rate": 0.00015028536521453968,
"loss": 0.3686,
"step": 1910
},
{
"epoch": 1.6727828746177371,
"grad_norm": 0.7887938618659973,
"learning_rate": 0.0001500475987256013,
"loss": 0.2015,
"step": 1915
},
{
"epoch": 1.6771515945827873,
"grad_norm": 1.5100387334823608,
"learning_rate": 0.0001498094541509591,
"loss": 0.2545,
"step": 1920
},
{
"epoch": 1.6815203145478375,
"grad_norm": 1.5141887664794922,
"learning_rate": 0.00014957093328968156,
"loss": 0.2385,
"step": 1925
},
{
"epoch": 1.6858890345128876,
"grad_norm": 1.1331568956375122,
"learning_rate": 0.00014933203794367992,
"loss": 0.2578,
"step": 1930
},
{
"epoch": 1.6902577544779378,
"grad_norm": 1.6967277526855469,
"learning_rate": 0.00014909276991769435,
"loss": 0.3004,
"step": 1935
},
{
"epoch": 1.6946264744429882,
"grad_norm": 1.1680165529251099,
"learning_rate": 0.00014885313101928055,
"loss": 0.2306,
"step": 1940
},
{
"epoch": 1.6989951944080386,
"grad_norm": 0.8564541339874268,
"learning_rate": 0.00014861312305879592,
"loss": 0.2384,
"step": 1945
},
{
"epoch": 1.7033639143730888,
"grad_norm": 0.8954014778137207,
"learning_rate": 0.00014837274784938596,
"loss": 0.1804,
"step": 1950
},
{
"epoch": 1.707732634338139,
"grad_norm": 1.9697721004486084,
"learning_rate": 0.00014813200720697055,
"loss": 0.2337,
"step": 1955
},
{
"epoch": 1.712101354303189,
"grad_norm": 1.3099297285079956,
"learning_rate": 0.00014789090295023031,
"loss": 0.2387,
"step": 1960
},
{
"epoch": 1.7164700742682393,
"grad_norm": 1.8576496839523315,
"learning_rate": 0.00014764943690059269,
"loss": 0.2739,
"step": 1965
},
{
"epoch": 1.7208387942332897,
"grad_norm": 1.8436728715896606,
"learning_rate": 0.0001474076108822184,
"loss": 0.1997,
"step": 1970
},
{
"epoch": 1.7252075141983398,
"grad_norm": 1.0517886877059937,
"learning_rate": 0.0001471654267219875,
"loss": 0.2432,
"step": 1975
},
{
"epoch": 1.7295762341633902,
"grad_norm": 0.9853880405426025,
"learning_rate": 0.00014692288624948557,
"loss": 0.3059,
"step": 1980
},
{
"epoch": 1.7339449541284404,
"grad_norm": 2.1775450706481934,
"learning_rate": 0.00014667999129699011,
"loss": 0.3374,
"step": 1985
},
{
"epoch": 1.7383136740934906,
"grad_norm": 0.7085615396499634,
"learning_rate": 0.0001464367436994565,
"loss": 0.1907,
"step": 1990
},
{
"epoch": 1.7426823940585408,
"grad_norm": 1.3183834552764893,
"learning_rate": 0.00014619314529450405,
"loss": 0.2587,
"step": 1995
},
{
"epoch": 1.747051114023591,
"grad_norm": 2.9087324142456055,
"learning_rate": 0.00014594919792240246,
"loss": 0.301,
"step": 2000
},
{
"epoch": 1.7514198339886413,
"grad_norm": 1.5489122867584229,
"learning_rate": 0.00014570490342605751,
"loss": 0.2061,
"step": 2005
},
{
"epoch": 1.7557885539536917,
"grad_norm": 1.4719579219818115,
"learning_rate": 0.00014546026365099753,
"loss": 0.2334,
"step": 2010
},
{
"epoch": 1.7601572739187419,
"grad_norm": 1.499053955078125,
"learning_rate": 0.0001452152804453591,
"loss": 0.2817,
"step": 2015
},
{
"epoch": 1.764525993883792,
"grad_norm": 1.0219850540161133,
"learning_rate": 0.00014496995565987337,
"loss": 0.2292,
"step": 2020
},
{
"epoch": 1.7688947138488422,
"grad_norm": 1.4518020153045654,
"learning_rate": 0.00014472429114785194,
"loss": 0.2321,
"step": 2025
},
{
"epoch": 1.7732634338138924,
"grad_norm": 1.0398428440093994,
"learning_rate": 0.00014447828876517277,
"loss": 0.2649,
"step": 2030
},
{
"epoch": 1.7776321537789428,
"grad_norm": 0.5719377994537354,
"learning_rate": 0.00014423195037026646,
"loss": 0.2239,
"step": 2035
},
{
"epoch": 1.782000873743993,
"grad_norm": 1.4201310873031616,
"learning_rate": 0.00014398527782410187,
"loss": 0.1812,
"step": 2040
},
{
"epoch": 1.7863695937090434,
"grad_norm": 1.380936622619629,
"learning_rate": 0.00014373827299017227,
"loss": 0.2379,
"step": 2045
},
{
"epoch": 1.7907383136740935,
"grad_norm": 0.9390376210212708,
"learning_rate": 0.0001434909377344812,
"loss": 0.1879,
"step": 2050
},
{
"epoch": 1.7951070336391437,
"grad_norm": 1.697022557258606,
"learning_rate": 0.0001432432739255284,
"loss": 0.2644,
"step": 2055
},
{
"epoch": 1.7994757536041939,
"grad_norm": 1.7329260110855103,
"learning_rate": 0.00014299528343429566,
"loss": 0.4845,
"step": 2060
},
{
"epoch": 1.8038444735692443,
"grad_norm": 1.098286747932434,
"learning_rate": 0.00014274696813423269,
"loss": 0.2452,
"step": 2065
},
{
"epoch": 1.8082131935342944,
"grad_norm": 0.6370453238487244,
"learning_rate": 0.00014249832990124292,
"loss": 0.2423,
"step": 2070
},
{
"epoch": 1.8125819134993448,
"grad_norm": 1.1073155403137207,
"learning_rate": 0.0001422493706136695,
"loss": 0.269,
"step": 2075
},
{
"epoch": 1.816950633464395,
"grad_norm": 1.5754518508911133,
"learning_rate": 0.0001420000921522809,
"loss": 0.2506,
"step": 2080
},
{
"epoch": 1.8213193534294452,
"grad_norm": 2.118640422821045,
"learning_rate": 0.0001417504964002569,
"loss": 0.2072,
"step": 2085
},
{
"epoch": 1.8256880733944953,
"grad_norm": 1.0548226833343506,
"learning_rate": 0.0001415005852431741,
"loss": 0.2509,
"step": 2090
},
{
"epoch": 1.8300567933595455,
"grad_norm": 2.12402606010437,
"learning_rate": 0.00014125036056899197,
"loss": 0.2599,
"step": 2095
},
{
"epoch": 1.834425513324596,
"grad_norm": 1.251313328742981,
"learning_rate": 0.00014099982426803842,
"loss": 0.2302,
"step": 2100
},
{
"epoch": 1.838794233289646,
"grad_norm": 1.9044132232666016,
"learning_rate": 0.0001407489782329955,
"loss": 0.2487,
"step": 2105
},
{
"epoch": 1.8431629532546965,
"grad_norm": 0.7197313904762268,
"learning_rate": 0.00014049782435888525,
"loss": 0.1864,
"step": 2110
},
{
"epoch": 1.8475316732197467,
"grad_norm": 1.1280083656311035,
"learning_rate": 0.00014024636454305515,
"loss": 0.2028,
"step": 2115
},
{
"epoch": 1.8519003931847968,
"grad_norm": 2.0197250843048096,
"learning_rate": 0.00013999460068516407,
"loss": 0.3204,
"step": 2120
},
{
"epoch": 1.856269113149847,
"grad_norm": 1.2510887384414673,
"learning_rate": 0.0001397425346871677,
"loss": 0.213,
"step": 2125
},
{
"epoch": 1.8606378331148974,
"grad_norm": 1.8764947652816772,
"learning_rate": 0.0001394901684533042,
"loss": 0.2815,
"step": 2130
},
{
"epoch": 1.8650065530799476,
"grad_norm": 2.152601480484009,
"learning_rate": 0.00013923750389007998,
"loss": 0.2884,
"step": 2135
},
{
"epoch": 1.869375273044998,
"grad_norm": 1.9770750999450684,
"learning_rate": 0.00013898454290625515,
"loss": 0.3571,
"step": 2140
},
{
"epoch": 1.8737439930100481,
"grad_norm": 1.4003419876098633,
"learning_rate": 0.00013873128741282906,
"loss": 0.247,
"step": 2145
},
{
"epoch": 1.8781127129750983,
"grad_norm": 1.565500020980835,
"learning_rate": 0.00013847773932302603,
"loss": 0.2227,
"step": 2150
},
{
"epoch": 1.8824814329401485,
"grad_norm": 0.588405966758728,
"learning_rate": 0.00013822390055228079,
"loss": 0.2331,
"step": 2155
},
{
"epoch": 1.8868501529051986,
"grad_norm": 0.9293955564498901,
"learning_rate": 0.00013796977301822397,
"loss": 0.258,
"step": 2160
},
{
"epoch": 1.891218872870249,
"grad_norm": 3.2338411808013916,
"learning_rate": 0.00013771535864066773,
"loss": 0.4014,
"step": 2165
},
{
"epoch": 1.8955875928352992,
"grad_norm": 1.0802291631698608,
"learning_rate": 0.00013746065934159123,
"loss": 0.1996,
"step": 2170
},
{
"epoch": 1.8999563128003496,
"grad_norm": 2.0415797233581543,
"learning_rate": 0.00013720567704512593,
"loss": 0.2392,
"step": 2175
},
{
"epoch": 1.9043250327653998,
"grad_norm": 1.554219365119934,
"learning_rate": 0.00013695041367754133,
"loss": 0.2165,
"step": 2180
},
{
"epoch": 1.90869375273045,
"grad_norm": 8.951557159423828,
"learning_rate": 0.00013669487116723024,
"loss": 0.2473,
"step": 2185
},
{
"epoch": 1.9130624726955001,
"grad_norm": 2.31048321723938,
"learning_rate": 0.0001364390514446943,
"loss": 0.2333,
"step": 2190
},
{
"epoch": 1.9174311926605505,
"grad_norm": 1.3068311214447021,
"learning_rate": 0.0001361829564425293,
"loss": 0.2034,
"step": 2195
},
{
"epoch": 1.9217999126256007,
"grad_norm": 0.9649588465690613,
"learning_rate": 0.00013592658809541064,
"loss": 0.2276,
"step": 2200
},
{
"epoch": 1.926168632590651,
"grad_norm": 2.0399038791656494,
"learning_rate": 0.00013566994834007877,
"loss": 0.1955,
"step": 2205
},
{
"epoch": 1.9305373525557012,
"grad_norm": 1.4719631671905518,
"learning_rate": 0.00013541303911532445,
"loss": 0.1883,
"step": 2210
},
{
"epoch": 1.9349060725207514,
"grad_norm": 1.938029170036316,
"learning_rate": 0.00013515586236197418,
"loss": 0.2386,
"step": 2215
},
{
"epoch": 1.9392747924858016,
"grad_norm": 1.9169769287109375,
"learning_rate": 0.00013489842002287542,
"loss": 0.2435,
"step": 2220
},
{
"epoch": 1.9436435124508518,
"grad_norm": 1.1561371088027954,
"learning_rate": 0.0001346407140428822,
"loss": 0.2305,
"step": 2225
},
{
"epoch": 1.9480122324159022,
"grad_norm": 1.9873589277267456,
"learning_rate": 0.00013438274636884,
"loss": 0.2706,
"step": 2230
},
{
"epoch": 1.9523809523809523,
"grad_norm": 1.9053728580474854,
"learning_rate": 0.00013412451894957144,
"loss": 0.187,
"step": 2235
},
{
"epoch": 1.9567496723460027,
"grad_norm": 1.3117566108703613,
"learning_rate": 0.00013386603373586134,
"loss": 0.2317,
"step": 2240
},
{
"epoch": 1.961118392311053,
"grad_norm": 0.7097095251083374,
"learning_rate": 0.000133607292680442,
"loss": 0.2926,
"step": 2245
},
{
"epoch": 1.965487112276103,
"grad_norm": 1.8591192960739136,
"learning_rate": 0.0001333482977379785,
"loss": 0.2025,
"step": 2250
},
{
"epoch": 1.9698558322411532,
"grad_norm": 2.633700370788574,
"learning_rate": 0.00013308905086505395,
"loss": 0.2513,
"step": 2255
},
{
"epoch": 1.9742245522062036,
"grad_norm": 1.4099256992340088,
"learning_rate": 0.0001328295540201546,
"loss": 0.2447,
"step": 2260
},
{
"epoch": 1.9785932721712538,
"grad_norm": 1.20600163936615,
"learning_rate": 0.00013256980916365527,
"loss": 0.2449,
"step": 2265
},
{
"epoch": 1.9829619921363042,
"grad_norm": 1.1572972536087036,
"learning_rate": 0.0001323098182578042,
"loss": 0.1937,
"step": 2270
},
{
"epoch": 1.9873307121013544,
"grad_norm": 1.0929369926452637,
"learning_rate": 0.00013204958326670853,
"loss": 0.2273,
"step": 2275
},
{
"epoch": 1.9916994320664045,
"grad_norm": 1.674107313156128,
"learning_rate": 0.00013178910615631933,
"loss": 0.3191,
"step": 2280
},
{
"epoch": 1.9960681520314547,
"grad_norm": 1.306754469871521,
"learning_rate": 0.00013152838889441673,
"loss": 0.2723,
"step": 2285
},
{
"epoch": 2.0,
"grad_norm": 1.877669334411621,
"learning_rate": 0.00013126743345059512,
"loss": 0.2246,
"step": 2290
},
{
"epoch": 2.00436871996505,
"grad_norm": 0.8999947905540466,
"learning_rate": 0.00013100624179624828,
"loss": 0.1528,
"step": 2295
},
{
"epoch": 2.0087374399301003,
"grad_norm": 1.1233611106872559,
"learning_rate": 0.00013074481590455433,
"loss": 0.215,
"step": 2300
},
{
"epoch": 2.0131061598951505,
"grad_norm": 1.4673850536346436,
"learning_rate": 0.00013048315775046108,
"loss": 0.1379,
"step": 2305
},
{
"epoch": 2.017474879860201,
"grad_norm": 1.4777660369873047,
"learning_rate": 0.0001302212693106709,
"loss": 0.1343,
"step": 2310
},
{
"epoch": 2.0218435998252513,
"grad_norm": 1.7447019815444946,
"learning_rate": 0.00012995915256362584,
"loss": 0.1591,
"step": 2315
},
{
"epoch": 2.0262123197903015,
"grad_norm": 0.8237628936767578,
"learning_rate": 0.00012969680948949272,
"loss": 0.1182,
"step": 2320
},
{
"epoch": 2.0305810397553516,
"grad_norm": 1.5945308208465576,
"learning_rate": 0.00012943424207014818,
"loss": 0.1624,
"step": 2325
},
{
"epoch": 2.034949759720402,
"grad_norm": 1.3121789693832397,
"learning_rate": 0.00012917145228916367,
"loss": 0.1313,
"step": 2330
},
{
"epoch": 2.039318479685452,
"grad_norm": 1.028647780418396,
"learning_rate": 0.00012890844213179044,
"loss": 0.214,
"step": 2335
},
{
"epoch": 2.0436871996505026,
"grad_norm": 1.0864648818969727,
"learning_rate": 0.00012864521358494464,
"loss": 0.1929,
"step": 2340
},
{
"epoch": 2.0480559196155528,
"grad_norm": 1.2443722486495972,
"learning_rate": 0.00012838176863719217,
"loss": 0.1479,
"step": 2345
},
{
"epoch": 2.052424639580603,
"grad_norm": 3.708357095718384,
"learning_rate": 0.00012811810927873386,
"loss": 0.1897,
"step": 2350
},
{
"epoch": 2.056793359545653,
"grad_norm": 1.483973741531372,
"learning_rate": 0.00012785423750139008,
"loss": 0.1188,
"step": 2355
},
{
"epoch": 2.0611620795107033,
"grad_norm": 1.0019135475158691,
"learning_rate": 0.00012759015529858624,
"loss": 0.1683,
"step": 2360
},
{
"epoch": 2.0655307994757535,
"grad_norm": 1.5917549133300781,
"learning_rate": 0.00012732586466533715,
"loss": 0.2293,
"step": 2365
},
{
"epoch": 2.0698995194408036,
"grad_norm": 1.4390945434570312,
"learning_rate": 0.00012706136759823233,
"loss": 0.1873,
"step": 2370
},
{
"epoch": 2.0742682394058543,
"grad_norm": 2.951580762863159,
"learning_rate": 0.00012679666609542083,
"loss": 0.1984,
"step": 2375
},
{
"epoch": 2.0786369593709044,
"grad_norm": 2.2444632053375244,
"learning_rate": 0.00012653176215659596,
"loss": 0.1709,
"step": 2380
},
{
"epoch": 2.0830056793359546,
"grad_norm": 1.191437840461731,
"learning_rate": 0.0001262666577829806,
"loss": 0.1926,
"step": 2385
},
{
"epoch": 2.0873743993010048,
"grad_norm": 2.2964091300964355,
"learning_rate": 0.00012600135497731156,
"loss": 0.1583,
"step": 2390
},
{
"epoch": 2.091743119266055,
"grad_norm": 0.9333243370056152,
"learning_rate": 0.00012573585574382487,
"loss": 0.1261,
"step": 2395
},
{
"epoch": 2.096111839231105,
"grad_norm": 0.8678691387176514,
"learning_rate": 0.00012547016208824038,
"loss": 0.1491,
"step": 2400
},
{
"epoch": 2.1004805591961557,
"grad_norm": 1.7139450311660767,
"learning_rate": 0.00012520427601774682,
"loss": 0.1599,
"step": 2405
},
{
"epoch": 2.104849279161206,
"grad_norm": 1.161952257156372,
"learning_rate": 0.0001249381995409864,
"loss": 0.1806,
"step": 2410
},
{
"epoch": 2.109217999126256,
"grad_norm": 2.1318793296813965,
"learning_rate": 0.00012467193466803982,
"loss": 0.273,
"step": 2415
},
{
"epoch": 2.1135867190913062,
"grad_norm": 1.7493711709976196,
"learning_rate": 0.00012440548341041108,
"loss": 0.192,
"step": 2420
},
{
"epoch": 2.1179554390563564,
"grad_norm": 1.2488105297088623,
"learning_rate": 0.00012413884778101207,
"loss": 0.1602,
"step": 2425
},
{
"epoch": 2.1223241590214066,
"grad_norm": 1.1086466312408447,
"learning_rate": 0.00012387202979414767,
"loss": 0.1318,
"step": 2430
},
{
"epoch": 2.126692878986457,
"grad_norm": 1.6508293151855469,
"learning_rate": 0.00012360503146550034,
"loss": 0.1595,
"step": 2435
},
{
"epoch": 2.1310615989515074,
"grad_norm": 0.8604519367218018,
"learning_rate": 0.00012333785481211487,
"loss": 0.1409,
"step": 2440
},
{
"epoch": 2.1354303189165575,
"grad_norm": 1.6457815170288086,
"learning_rate": 0.00012307050185238333,
"loss": 0.1807,
"step": 2445
},
{
"epoch": 2.1397990388816077,
"grad_norm": 1.9465317726135254,
"learning_rate": 0.00012280297460602957,
"loss": 0.1804,
"step": 2450
},
{
"epoch": 2.144167758846658,
"grad_norm": 1.763515591621399,
"learning_rate": 0.00012253527509409418,
"loss": 0.152,
"step": 2455
},
{
"epoch": 2.148536478811708,
"grad_norm": 2.8693459033966064,
"learning_rate": 0.00012226740533891913,
"loss": 0.1469,
"step": 2460
},
{
"epoch": 2.1529051987767582,
"grad_norm": 1.7148571014404297,
"learning_rate": 0.00012199936736413246,
"loss": 0.1892,
"step": 2465
},
{
"epoch": 2.157273918741809,
"grad_norm": 1.3995704650878906,
"learning_rate": 0.00012173116319463306,
"loss": 0.2003,
"step": 2470
},
{
"epoch": 2.161642638706859,
"grad_norm": 1.0834511518478394,
"learning_rate": 0.00012146279485657532,
"loss": 0.1784,
"step": 2475
},
{
"epoch": 2.166011358671909,
"grad_norm": 1.1982789039611816,
"learning_rate": 0.00012119426437735384,
"loss": 0.1722,
"step": 2480
},
{
"epoch": 2.1703800786369594,
"grad_norm": 2.481656551361084,
"learning_rate": 0.0001209255737855881,
"loss": 0.1938,
"step": 2485
},
{
"epoch": 2.1747487986020095,
"grad_norm": 0.7617926001548767,
"learning_rate": 0.00012065672511110728,
"loss": 0.1533,
"step": 2490
},
{
"epoch": 2.1791175185670597,
"grad_norm": 0.9713643789291382,
"learning_rate": 0.0001203877203849346,
"loss": 0.1241,
"step": 2495
},
{
"epoch": 2.18348623853211,
"grad_norm": 1.348576545715332,
"learning_rate": 0.00012011856163927235,
"loss": 0.1882,
"step": 2500
},
{
"epoch": 2.1878549584971605,
"grad_norm": 3.1400299072265625,
"learning_rate": 0.00011984925090748626,
"loss": 0.2369,
"step": 2505
},
{
"epoch": 2.1922236784622107,
"grad_norm": 1.1611807346343994,
"learning_rate": 0.00011957979022409027,
"loss": 0.1614,
"step": 2510
},
{
"epoch": 2.196592398427261,
"grad_norm": 1.4003978967666626,
"learning_rate": 0.00011931018162473117,
"loss": 0.1431,
"step": 2515
},
{
"epoch": 2.200961118392311,
"grad_norm": 1.7134933471679688,
"learning_rate": 0.00011904042714617311,
"loss": 0.1917,
"step": 2520
},
{
"epoch": 2.205329838357361,
"grad_norm": 1.4700738191604614,
"learning_rate": 0.00011877052882628237,
"loss": 0.1506,
"step": 2525
},
{
"epoch": 2.2096985583224114,
"grad_norm": 0.9795822501182556,
"learning_rate": 0.00011850048870401185,
"loss": 0.1663,
"step": 2530
},
{
"epoch": 2.214067278287462,
"grad_norm": 1.8647089004516602,
"learning_rate": 0.00011823030881938564,
"loss": 0.1984,
"step": 2535
},
{
"epoch": 2.218435998252512,
"grad_norm": 1.2616095542907715,
"learning_rate": 0.00011795999121348378,
"loss": 0.1158,
"step": 2540
},
{
"epoch": 2.2228047182175623,
"grad_norm": 1.105303168296814,
"learning_rate": 0.00011768953792842663,
"loss": 0.1932,
"step": 2545
},
{
"epoch": 2.2271734381826125,
"grad_norm": 1.0846420526504517,
"learning_rate": 0.00011741895100735958,
"loss": 0.156,
"step": 2550
},
{
"epoch": 2.2315421581476627,
"grad_norm": 2.285141944885254,
"learning_rate": 0.00011714823249443763,
"loss": 0.1639,
"step": 2555
},
{
"epoch": 2.235910878112713,
"grad_norm": 1.022466778755188,
"learning_rate": 0.00011687738443480975,
"loss": 0.1591,
"step": 2560
},
{
"epoch": 2.2402795980777634,
"grad_norm": 2.86263370513916,
"learning_rate": 0.00011660640887460377,
"loss": 0.24,
"step": 2565
},
{
"epoch": 2.2446483180428136,
"grad_norm": 1.2612568140029907,
"learning_rate": 0.00011633530786091051,
"loss": 0.1625,
"step": 2570
},
{
"epoch": 2.249017038007864,
"grad_norm": 1.171590805053711,
"learning_rate": 0.00011606408344176873,
"loss": 0.1702,
"step": 2575
},
{
"epoch": 2.253385757972914,
"grad_norm": 1.118406891822815,
"learning_rate": 0.0001157927376661493,
"loss": 0.1518,
"step": 2580
},
{
"epoch": 2.257754477937964,
"grad_norm": 0.8476281762123108,
"learning_rate": 0.00011552127258394003,
"loss": 0.2093,
"step": 2585
},
{
"epoch": 2.2621231979030143,
"grad_norm": 1.7625765800476074,
"learning_rate": 0.0001152496902459299,
"loss": 0.1764,
"step": 2590
},
{
"epoch": 2.2664919178680645,
"grad_norm": 1.5414633750915527,
"learning_rate": 0.00011497799270379374,
"loss": 0.1185,
"step": 2595
},
{
"epoch": 2.270860637833115,
"grad_norm": 2.2619729042053223,
"learning_rate": 0.00011470618201007677,
"loss": 0.1554,
"step": 2600
},
{
"epoch": 2.2752293577981653,
"grad_norm": 0.9168559312820435,
"learning_rate": 0.00011443426021817887,
"loss": 0.1241,
"step": 2605
},
{
"epoch": 2.2795980777632154,
"grad_norm": 1.3820409774780273,
"learning_rate": 0.00011416222938233936,
"loss": 0.1387,
"step": 2610
},
{
"epoch": 2.2839667977282656,
"grad_norm": 2.417088270187378,
"learning_rate": 0.00011389009155762128,
"loss": 0.267,
"step": 2615
},
{
"epoch": 2.288335517693316,
"grad_norm": 1.2400354146957397,
"learning_rate": 0.00011361784879989588,
"loss": 0.1586,
"step": 2620
},
{
"epoch": 2.292704237658366,
"grad_norm": 1.5232576131820679,
"learning_rate": 0.00011334550316582717,
"loss": 0.2155,
"step": 2625
},
{
"epoch": 2.297072957623416,
"grad_norm": 1.0652902126312256,
"learning_rate": 0.00011307305671285633,
"loss": 0.1849,
"step": 2630
},
{
"epoch": 2.3014416775884667,
"grad_norm": 1.0222703218460083,
"learning_rate": 0.00011280051149918622,
"loss": 0.1351,
"step": 2635
},
{
"epoch": 2.305810397553517,
"grad_norm": 2.64607310295105,
"learning_rate": 0.00011252786958376569,
"loss": 0.1929,
"step": 2640
},
{
"epoch": 2.310179117518567,
"grad_norm": 1.090824842453003,
"learning_rate": 0.00011225513302627422,
"loss": 0.1215,
"step": 2645
},
{
"epoch": 2.3145478374836173,
"grad_norm": 1.6671061515808105,
"learning_rate": 0.00011198230388710625,
"loss": 0.1279,
"step": 2650
},
{
"epoch": 2.3189165574486674,
"grad_norm": 1.3377041816711426,
"learning_rate": 0.00011170938422735558,
"loss": 0.1527,
"step": 2655
},
{
"epoch": 2.3232852774137176,
"grad_norm": 1.554307460784912,
"learning_rate": 0.00011143637610879989,
"loss": 0.16,
"step": 2660
},
{
"epoch": 2.327653997378768,
"grad_norm": 1.2246366739273071,
"learning_rate": 0.00011116328159388514,
"loss": 0.1791,
"step": 2665
},
{
"epoch": 2.3320227173438184,
"grad_norm": 3.7316997051239014,
"learning_rate": 0.00011089010274570992,
"loss": 0.2552,
"step": 2670
},
{
"epoch": 2.3363914373088686,
"grad_norm": 2.306547164916992,
"learning_rate": 0.00011061684162801,
"loss": 0.2081,
"step": 2675
},
{
"epoch": 2.3407601572739187,
"grad_norm": 1.9740854501724243,
"learning_rate": 0.00011034350030514253,
"loss": 0.147,
"step": 2680
},
{
"epoch": 2.345128877238969,
"grad_norm": 1.5787196159362793,
"learning_rate": 0.00011007008084207072,
"loss": 0.152,
"step": 2685
},
{
"epoch": 2.349497597204019,
"grad_norm": 1.394114375114441,
"learning_rate": 0.00010979658530434793,
"loss": 0.2236,
"step": 2690
},
{
"epoch": 2.3538663171690697,
"grad_norm": 1.1155906915664673,
"learning_rate": 0.0001095230157581024,
"loss": 0.1224,
"step": 2695
},
{
"epoch": 2.35823503713412,
"grad_norm": 1.6753724813461304,
"learning_rate": 0.00010924937427002136,
"loss": 0.1937,
"step": 2700
},
{
"epoch": 2.36260375709917,
"grad_norm": 1.3327598571777344,
"learning_rate": 0.00010897566290733552,
"loss": 0.2221,
"step": 2705
},
{
"epoch": 2.36697247706422,
"grad_norm": 2.0531933307647705,
"learning_rate": 0.00010870188373780352,
"loss": 0.146,
"step": 2710
},
{
"epoch": 2.3713411970292704,
"grad_norm": 1.6777331829071045,
"learning_rate": 0.00010842803882969623,
"loss": 0.1931,
"step": 2715
},
{
"epoch": 2.3757099169943205,
"grad_norm": 2.2374494075775146,
"learning_rate": 0.00010815413025178112,
"loss": 0.1637,
"step": 2720
},
{
"epoch": 2.3800786369593707,
"grad_norm": 2.244675636291504,
"learning_rate": 0.00010788016007330665,
"loss": 0.1644,
"step": 2725
},
{
"epoch": 2.3844473569244213,
"grad_norm": 0.7124090194702148,
"learning_rate": 0.00010760613036398668,
"loss": 0.1246,
"step": 2730
},
{
"epoch": 2.3888160768894715,
"grad_norm": 1.298221230506897,
"learning_rate": 0.00010733204319398477,
"loss": 0.1604,
"step": 2735
},
{
"epoch": 2.3931847968545217,
"grad_norm": 3.08442759513855,
"learning_rate": 0.00010705790063389858,
"loss": 0.2225,
"step": 2740
},
{
"epoch": 2.397553516819572,
"grad_norm": 1.3459490537643433,
"learning_rate": 0.00010678370475474424,
"loss": 0.2159,
"step": 2745
},
{
"epoch": 2.401922236784622,
"grad_norm": 1.9191830158233643,
"learning_rate": 0.00010650945762794058,
"loss": 0.2479,
"step": 2750
},
{
"epoch": 2.406290956749672,
"grad_norm": 1.102486491203308,
"learning_rate": 0.00010623516132529372,
"loss": 0.1371,
"step": 2755
},
{
"epoch": 2.4106596767147224,
"grad_norm": 1.311330795288086,
"learning_rate": 0.00010596081791898118,
"loss": 0.1672,
"step": 2760
},
{
"epoch": 2.415028396679773,
"grad_norm": 2.336740732192993,
"learning_rate": 0.00010568642948153636,
"loss": 0.2096,
"step": 2765
},
{
"epoch": 2.419397116644823,
"grad_norm": 0.9913440942764282,
"learning_rate": 0.00010541199808583286,
"loss": 0.2231,
"step": 2770
},
{
"epoch": 2.4237658366098733,
"grad_norm": 1.6791926622390747,
"learning_rate": 0.00010513752580506878,
"loss": 0.1871,
"step": 2775
},
{
"epoch": 2.4281345565749235,
"grad_norm": 1.459643840789795,
"learning_rate": 0.00010486301471275111,
"loss": 0.2017,
"step": 2780
},
{
"epoch": 2.4325032765399737,
"grad_norm": 2.661412239074707,
"learning_rate": 0.00010458846688268003,
"loss": 0.2038,
"step": 2785
},
{
"epoch": 2.436871996505024,
"grad_norm": 2.1982479095458984,
"learning_rate": 0.00010431388438893326,
"loss": 0.1776,
"step": 2790
},
{
"epoch": 2.4412407164700745,
"grad_norm": 0.9297656416893005,
"learning_rate": 0.00010403926930585042,
"loss": 0.124,
"step": 2795
},
{
"epoch": 2.4456094364351246,
"grad_norm": 0.9144073128700256,
"learning_rate": 0.00010376462370801724,
"loss": 0.1728,
"step": 2800
},
{
"epoch": 2.449978156400175,
"grad_norm": 0.8969748616218567,
"learning_rate": 0.00010348994967025012,
"loss": 0.1268,
"step": 2805
},
{
"epoch": 2.454346876365225,
"grad_norm": 1.952581763267517,
"learning_rate": 0.00010321524926758012,
"loss": 0.1516,
"step": 2810
},
{
"epoch": 2.458715596330275,
"grad_norm": 1.463630199432373,
"learning_rate": 0.00010294052457523766,
"loss": 0.1748,
"step": 2815
},
{
"epoch": 2.4630843162953253,
"grad_norm": 0.9995607137680054,
"learning_rate": 0.0001026657776686365,
"loss": 0.1815,
"step": 2820
},
{
"epoch": 2.467453036260376,
"grad_norm": 1.2662900686264038,
"learning_rate": 0.00010239101062335834,
"loss": 0.1727,
"step": 2825
},
{
"epoch": 2.471821756225426,
"grad_norm": 1.2032655477523804,
"learning_rate": 0.00010211622551513697,
"loss": 0.1398,
"step": 2830
},
{
"epoch": 2.4761904761904763,
"grad_norm": 1.363953948020935,
"learning_rate": 0.00010184142441984259,
"loss": 0.1405,
"step": 2835
},
{
"epoch": 2.4805591961555264,
"grad_norm": 1.4486302137374878,
"learning_rate": 0.00010156660941346627,
"loss": 0.1877,
"step": 2840
},
{
"epoch": 2.4849279161205766,
"grad_norm": 1.4691563844680786,
"learning_rate": 0.00010129178257210413,
"loss": 0.1261,
"step": 2845
},
{
"epoch": 2.489296636085627,
"grad_norm": 1.7420620918273926,
"learning_rate": 0.0001010169459719416,
"loss": 0.1672,
"step": 2850
},
{
"epoch": 2.493665356050677,
"grad_norm": 1.8223110437393188,
"learning_rate": 0.00010074210168923804,
"loss": 0.1476,
"step": 2855
},
{
"epoch": 2.4980340760157276,
"grad_norm": 1.197109580039978,
"learning_rate": 0.00010046725180031062,
"loss": 0.1227,
"step": 2860
},
{
"epoch": 2.5024027959807777,
"grad_norm": 1.7598116397857666,
"learning_rate": 0.00010019239838151906,
"loss": 0.1882,
"step": 2865
},
{
"epoch": 2.506771515945828,
"grad_norm": 1.498608946800232,
"learning_rate": 9.99175435092496e-05,
"loss": 0.248,
"step": 2870
},
{
"epoch": 2.511140235910878,
"grad_norm": 1.8565483093261719,
"learning_rate": 9.964268925989954e-05,
"loss": 0.1935,
"step": 2875
},
{
"epoch": 2.5155089558759283,
"grad_norm": 0.8620087504386902,
"learning_rate": 9.936783770986145e-05,
"loss": 0.126,
"step": 2880
},
{
"epoch": 2.5198776758409784,
"grad_norm": 0.7721559405326843,
"learning_rate": 9.909299093550757e-05,
"loss": 0.1066,
"step": 2885
},
{
"epoch": 2.5242463958060286,
"grad_norm": 1.5533393621444702,
"learning_rate": 9.88181510131739e-05,
"loss": 0.2407,
"step": 2890
},
{
"epoch": 2.5286151157710792,
"grad_norm": 1.1294517517089844,
"learning_rate": 9.854332001914486e-05,
"loss": 0.1601,
"step": 2895
},
{
"epoch": 2.5329838357361294,
"grad_norm": 1.7611277103424072,
"learning_rate": 9.826850002963729e-05,
"loss": 0.192,
"step": 2900
},
{
"epoch": 2.5373525557011796,
"grad_norm": 1.9960044622421265,
"learning_rate": 9.799369312078502e-05,
"loss": 0.1327,
"step": 2905
},
{
"epoch": 2.5417212756662297,
"grad_norm": 0.9330713748931885,
"learning_rate": 9.771890136862288e-05,
"loss": 0.1479,
"step": 2910
},
{
"epoch": 2.54608999563128,
"grad_norm": 1.2665588855743408,
"learning_rate": 9.744412684907138e-05,
"loss": 0.1906,
"step": 2915
},
{
"epoch": 2.5504587155963305,
"grad_norm": 1.5277937650680542,
"learning_rate": 9.716937163792075e-05,
"loss": 0.1372,
"step": 2920
},
{
"epoch": 2.5548274355613803,
"grad_norm": 1.4961965084075928,
"learning_rate": 9.689463781081542e-05,
"loss": 0.1955,
"step": 2925
},
{
"epoch": 2.559196155526431,
"grad_norm": 1.1431431770324707,
"learning_rate": 9.661992744323818e-05,
"loss": 0.1879,
"step": 2930
},
{
"epoch": 2.563564875491481,
"grad_norm": 1.1940690279006958,
"learning_rate": 9.634524261049464e-05,
"loss": 0.1565,
"step": 2935
},
{
"epoch": 2.567933595456531,
"grad_norm": 1.741918921470642,
"learning_rate": 9.607058538769756e-05,
"loss": 0.1563,
"step": 2940
},
{
"epoch": 2.5723023154215814,
"grad_norm": 2.1051597595214844,
"learning_rate": 9.579595784975103e-05,
"loss": 0.1136,
"step": 2945
},
{
"epoch": 2.5766710353866316,
"grad_norm": 1.361091136932373,
"learning_rate": 9.552136207133495e-05,
"loss": 0.1554,
"step": 2950
},
{
"epoch": 2.581039755351682,
"grad_norm": 1.1515636444091797,
"learning_rate": 9.524680012688928e-05,
"loss": 0.1832,
"step": 2955
},
{
"epoch": 2.5854084753167323,
"grad_norm": 1.6642508506774902,
"learning_rate": 9.497227409059832e-05,
"loss": 0.1716,
"step": 2960
},
{
"epoch": 2.5897771952817825,
"grad_norm": 1.1159862279891968,
"learning_rate": 9.469778603637518e-05,
"loss": 0.1392,
"step": 2965
},
{
"epoch": 2.5941459152468327,
"grad_norm": 1.3787099123001099,
"learning_rate": 9.4423338037846e-05,
"loss": 0.1429,
"step": 2970
},
{
"epoch": 2.598514635211883,
"grad_norm": 1.0721005201339722,
"learning_rate": 9.414893216833435e-05,
"loss": 0.1687,
"step": 2975
},
{
"epoch": 2.602883355176933,
"grad_norm": 1.7461411952972412,
"learning_rate": 9.387457050084552e-05,
"loss": 0.1757,
"step": 2980
},
{
"epoch": 2.607252075141983,
"grad_norm": 1.4739048480987549,
"learning_rate": 9.360025510805078e-05,
"loss": 0.1686,
"step": 2985
},
{
"epoch": 2.611620795107034,
"grad_norm": 1.7249925136566162,
"learning_rate": 9.332598806227195e-05,
"loss": 0.2044,
"step": 2990
},
{
"epoch": 2.615989515072084,
"grad_norm": 0.8930152654647827,
"learning_rate": 9.305177143546557e-05,
"loss": 0.1248,
"step": 2995
},
{
"epoch": 2.620358235037134,
"grad_norm": 0.8902339935302734,
"learning_rate": 9.277760729920728e-05,
"loss": 0.1633,
"step": 3000
},
{
"epoch": 2.6247269550021843,
"grad_norm": 2.3700525760650635,
"learning_rate": 9.250349772467618e-05,
"loss": 0.1805,
"step": 3005
},
{
"epoch": 2.6290956749672345,
"grad_norm": 1.4570140838623047,
"learning_rate": 9.222944478263915e-05,
"loss": 0.1918,
"step": 3010
},
{
"epoch": 2.6334643949322847,
"grad_norm": 1.3790675401687622,
"learning_rate": 9.195545054343529e-05,
"loss": 0.1127,
"step": 3015
},
{
"epoch": 2.637833114897335,
"grad_norm": 2.068342924118042,
"learning_rate": 9.16815170769602e-05,
"loss": 0.2045,
"step": 3020
},
{
"epoch": 2.6422018348623855,
"grad_norm": 2.2792484760284424,
"learning_rate": 9.14076464526504e-05,
"loss": 0.1942,
"step": 3025
},
{
"epoch": 2.6465705548274356,
"grad_norm": 2.240042209625244,
"learning_rate": 9.113384073946765e-05,
"loss": 0.1725,
"step": 3030
},
{
"epoch": 2.650939274792486,
"grad_norm": 1.1308009624481201,
"learning_rate": 9.086010200588328e-05,
"loss": 0.1611,
"step": 3035
},
{
"epoch": 2.655307994757536,
"grad_norm": 1.0229357481002808,
"learning_rate": 9.05864323198627e-05,
"loss": 0.1892,
"step": 3040
},
{
"epoch": 2.659676714722586,
"grad_norm": 1.2049839496612549,
"learning_rate": 9.03128337488497e-05,
"loss": 0.1585,
"step": 3045
},
{
"epoch": 2.6640454346876368,
"grad_norm": 1.1866313219070435,
"learning_rate": 9.003930835975082e-05,
"loss": 0.1224,
"step": 3050
},
{
"epoch": 2.6684141546526865,
"grad_norm": 1.1551754474639893,
"learning_rate": 8.976585821891966e-05,
"loss": 0.1275,
"step": 3055
},
{
"epoch": 2.672782874617737,
"grad_norm": 1.956389307975769,
"learning_rate": 8.949248539214145e-05,
"loss": 0.1537,
"step": 3060
},
{
"epoch": 2.6771515945827873,
"grad_norm": 0.9731565117835999,
"learning_rate": 8.921919194461735e-05,
"loss": 0.1233,
"step": 3065
},
{
"epoch": 2.6815203145478375,
"grad_norm": 0.7945232391357422,
"learning_rate": 8.894597994094879e-05,
"loss": 0.1518,
"step": 3070
},
{
"epoch": 2.6858890345128876,
"grad_norm": 1.3885526657104492,
"learning_rate": 8.867285144512202e-05,
"loss": 0.187,
"step": 3075
},
{
"epoch": 2.690257754477938,
"grad_norm": 1.2687430381774902,
"learning_rate": 8.839980852049229e-05,
"loss": 0.1599,
"step": 3080
},
{
"epoch": 2.6946264744429884,
"grad_norm": 1.0735893249511719,
"learning_rate": 8.812685322976851e-05,
"loss": 0.1826,
"step": 3085
},
{
"epoch": 2.6989951944080386,
"grad_norm": 1.6104284524917603,
"learning_rate": 8.785398763499755e-05,
"loss": 0.1791,
"step": 3090
},
{
"epoch": 2.7033639143730888,
"grad_norm": 1.6893842220306396,
"learning_rate": 8.758121379754865e-05,
"loss": 0.1588,
"step": 3095
},
{
"epoch": 2.707732634338139,
"grad_norm": 0.967934787273407,
"learning_rate": 8.730853377809784e-05,
"loss": 0.1376,
"step": 3100
},
{
"epoch": 2.712101354303189,
"grad_norm": 1.2705645561218262,
"learning_rate": 8.703594963661241e-05,
"loss": 0.1689,
"step": 3105
},
{
"epoch": 2.7164700742682393,
"grad_norm": 1.4289767742156982,
"learning_rate": 8.67634634323354e-05,
"loss": 0.1802,
"step": 3110
},
{
"epoch": 2.7208387942332894,
"grad_norm": 1.4124540090560913,
"learning_rate": 8.64910772237699e-05,
"loss": 0.1616,
"step": 3115
},
{
"epoch": 2.72520751419834,
"grad_norm": 1.1596726179122925,
"learning_rate": 8.62187930686636e-05,
"loss": 0.1676,
"step": 3120
},
{
"epoch": 2.7295762341633902,
"grad_norm": 1.243022084236145,
"learning_rate": 8.594661302399332e-05,
"loss": 0.139,
"step": 3125
},
{
"epoch": 2.7339449541284404,
"grad_norm": 1.3218811750411987,
"learning_rate": 8.56745391459492e-05,
"loss": 0.2164,
"step": 3130
},
{
"epoch": 2.7383136740934906,
"grad_norm": 2.225620985031128,
"learning_rate": 8.540257348991947e-05,
"loss": 0.1359,
"step": 3135
},
{
"epoch": 2.7426823940585408,
"grad_norm": 2.3583812713623047,
"learning_rate": 8.513071811047478e-05,
"loss": 0.2217,
"step": 3140
},
{
"epoch": 2.747051114023591,
"grad_norm": 2.345612049102783,
"learning_rate": 8.48589750613527e-05,
"loss": 0.1527,
"step": 3145
},
{
"epoch": 2.751419833988641,
"grad_norm": 0.6265102028846741,
"learning_rate": 8.458734639544207e-05,
"loss": 0.1387,
"step": 3150
},
{
"epoch": 2.7557885539536917,
"grad_norm": 1.4767670631408691,
"learning_rate": 8.431583416476779e-05,
"loss": 0.1774,
"step": 3155
},
{
"epoch": 2.760157273918742,
"grad_norm": 1.541107416152954,
"learning_rate": 8.404444042047507e-05,
"loss": 0.1522,
"step": 3160
},
{
"epoch": 2.764525993883792,
"grad_norm": 1.396525263786316,
"learning_rate": 8.377316721281402e-05,
"loss": 0.1972,
"step": 3165
},
{
"epoch": 2.7688947138488422,
"grad_norm": 1.0470671653747559,
"learning_rate": 8.35020165911242e-05,
"loss": 0.1808,
"step": 3170
},
{
"epoch": 2.7732634338138924,
"grad_norm": 1.216828465461731,
"learning_rate": 8.323099060381896e-05,
"loss": 0.1383,
"step": 3175
},
{
"epoch": 2.777632153778943,
"grad_norm": 1.0746128559112549,
"learning_rate": 8.296009129837022e-05,
"loss": 0.1591,
"step": 3180
},
{
"epoch": 2.7820008737439927,
"grad_norm": 1.3712584972381592,
"learning_rate": 8.268932072129287e-05,
"loss": 0.161,
"step": 3185
},
{
"epoch": 2.7863695937090434,
"grad_norm": 2.859886646270752,
"learning_rate": 8.241868091812924e-05,
"loss": 0.1858,
"step": 3190
},
{
"epoch": 2.7907383136740935,
"grad_norm": 1.0394188165664673,
"learning_rate": 8.214817393343383e-05,
"loss": 0.1196,
"step": 3195
},
{
"epoch": 2.7951070336391437,
"grad_norm": 1.1338419914245605,
"learning_rate": 8.187780181075766e-05,
"loss": 0.1264,
"step": 3200
},
{
"epoch": 2.799475753604194,
"grad_norm": 0.9405713677406311,
"learning_rate": 8.160756659263298e-05,
"loss": 0.1177,
"step": 3205
},
{
"epoch": 2.803844473569244,
"grad_norm": 1.4486256837844849,
"learning_rate": 8.13374703205578e-05,
"loss": 0.1745,
"step": 3210
},
{
"epoch": 2.8082131935342947,
"grad_norm": 1.058084487915039,
"learning_rate": 8.106751503498045e-05,
"loss": 0.1639,
"step": 3215
},
{
"epoch": 2.812581913499345,
"grad_norm": 1.712044358253479,
"learning_rate": 8.079770277528422e-05,
"loss": 0.1335,
"step": 3220
},
{
"epoch": 2.816950633464395,
"grad_norm": 1.0125361680984497,
"learning_rate": 8.052803557977175e-05,
"loss": 0.1197,
"step": 3225
},
{
"epoch": 2.821319353429445,
"grad_norm": 1.248329758644104,
"learning_rate": 8.025851548564999e-05,
"loss": 0.1143,
"step": 3230
},
{
"epoch": 2.8256880733944953,
"grad_norm": 0.976321280002594,
"learning_rate": 7.998914452901447e-05,
"loss": 0.1318,
"step": 3235
},
{
"epoch": 2.8300567933595455,
"grad_norm": 1.4181995391845703,
"learning_rate": 7.971992474483413e-05,
"loss": 0.1309,
"step": 3240
},
{
"epoch": 2.8344255133245957,
"grad_norm": 2.2041068077087402,
"learning_rate": 7.945085816693589e-05,
"loss": 0.1843,
"step": 3245
},
{
"epoch": 2.8387942332896463,
"grad_norm": 0.7392102479934692,
"learning_rate": 7.918194682798914e-05,
"loss": 0.1484,
"step": 3250
},
{
"epoch": 2.8431629532546965,
"grad_norm": 1.1940444707870483,
"learning_rate": 7.891319275949066e-05,
"loss": 0.1984,
"step": 3255
},
{
"epoch": 2.8475316732197467,
"grad_norm": 0.9530165791511536,
"learning_rate": 7.864459799174904e-05,
"loss": 0.1537,
"step": 3260
},
{
"epoch": 2.851900393184797,
"grad_norm": 1.3266198635101318,
"learning_rate": 7.837616455386954e-05,
"loss": 0.1721,
"step": 3265
},
{
"epoch": 2.856269113149847,
"grad_norm": 1.8742265701293945,
"learning_rate": 7.810789447373846e-05,
"loss": 0.1385,
"step": 3270
},
{
"epoch": 2.8606378331148976,
"grad_norm": 0.6404191851615906,
"learning_rate": 7.783978977800818e-05,
"loss": 0.1598,
"step": 3275
},
{
"epoch": 2.8650065530799473,
"grad_norm": 1.6875426769256592,
"learning_rate": 7.757185249208163e-05,
"loss": 0.1133,
"step": 3280
},
{
"epoch": 2.869375273044998,
"grad_norm": 2.657672166824341,
"learning_rate": 7.730408464009698e-05,
"loss": 0.1347,
"step": 3285
},
{
"epoch": 2.873743993010048,
"grad_norm": 1.3153127431869507,
"learning_rate": 7.70364882449125e-05,
"loss": 0.1183,
"step": 3290
},
{
"epoch": 2.8781127129750983,
"grad_norm": 2.145387649536133,
"learning_rate": 7.676906532809115e-05,
"loss": 0.1643,
"step": 3295
},
{
"epoch": 2.8824814329401485,
"grad_norm": 1.2466741800308228,
"learning_rate": 7.650181790988527e-05,
"loss": 0.156,
"step": 3300
},
{
"epoch": 2.8868501529051986,
"grad_norm": 1.4382154941558838,
"learning_rate": 7.62347480092215e-05,
"loss": 0.1441,
"step": 3305
},
{
"epoch": 2.8912188728702493,
"grad_norm": 1.2591091394424438,
"learning_rate": 7.596785764368539e-05,
"loss": 0.1966,
"step": 3310
},
{
"epoch": 2.895587592835299,
"grad_norm": 1.8565058708190918,
"learning_rate": 7.570114882950619e-05,
"loss": 0.1394,
"step": 3315
},
{
"epoch": 2.8999563128003496,
"grad_norm": 1.146375298500061,
"learning_rate": 7.543462358154153e-05,
"loss": 0.0894,
"step": 3320
},
{
"epoch": 2.9043250327653998,
"grad_norm": 1.0392186641693115,
"learning_rate": 7.51682839132624e-05,
"loss": 0.1452,
"step": 3325
},
{
"epoch": 2.90869375273045,
"grad_norm": 2.525780439376831,
"learning_rate": 7.49021318367378e-05,
"loss": 0.1675,
"step": 3330
},
{
"epoch": 2.9130624726955,
"grad_norm": 2.238649368286133,
"learning_rate": 7.463616936261952e-05,
"loss": 0.1889,
"step": 3335
},
{
"epoch": 2.9174311926605503,
"grad_norm": 1.5493022203445435,
"learning_rate": 7.437039850012704e-05,
"loss": 0.1021,
"step": 3340
},
{
"epoch": 2.921799912625601,
"grad_norm": 1.0183563232421875,
"learning_rate": 7.410482125703225e-05,
"loss": 0.1179,
"step": 3345
},
{
"epoch": 2.926168632590651,
"grad_norm": 0.9551679491996765,
"learning_rate": 7.383943963964439e-05,
"loss": 0.1458,
"step": 3350
},
{
"epoch": 2.9305373525557012,
"grad_norm": 3.2518422603607178,
"learning_rate": 7.357425565279483e-05,
"loss": 0.1401,
"step": 3355
},
{
"epoch": 2.9349060725207514,
"grad_norm": 2.4154436588287354,
"learning_rate": 7.330927129982191e-05,
"loss": 0.2123,
"step": 3360
},
{
"epoch": 2.9392747924858016,
"grad_norm": 1.7631040811538696,
"learning_rate": 7.304448858255588e-05,
"loss": 0.1705,
"step": 3365
},
{
"epoch": 2.9436435124508518,
"grad_norm": 1.741014003753662,
"learning_rate": 7.277990950130369e-05,
"loss": 0.1277,
"step": 3370
},
{
"epoch": 2.948012232415902,
"grad_norm": 1.7614057064056396,
"learning_rate": 7.25155360548339e-05,
"loss": 0.147,
"step": 3375
},
{
"epoch": 2.9523809523809526,
"grad_norm": 1.403290867805481,
"learning_rate": 7.225137024036164e-05,
"loss": 0.172,
"step": 3380
},
{
"epoch": 2.9567496723460027,
"grad_norm": 1.9428198337554932,
"learning_rate": 7.19874140535335e-05,
"loss": 0.2027,
"step": 3385
},
{
"epoch": 2.961118392311053,
"grad_norm": 2.5754446983337402,
"learning_rate": 7.172366948841232e-05,
"loss": 0.1601,
"step": 3390
},
{
"epoch": 2.965487112276103,
"grad_norm": 1.068940281867981,
"learning_rate": 7.146013853746237e-05,
"loss": 0.1444,
"step": 3395
},
{
"epoch": 2.9698558322411532,
"grad_norm": 1.6970683336257935,
"learning_rate": 7.119682319153409e-05,
"loss": 0.148,
"step": 3400
},
{
"epoch": 2.974224552206204,
"grad_norm": 1.3835630416870117,
"learning_rate": 7.093372543984915e-05,
"loss": 0.1352,
"step": 3405
},
{
"epoch": 2.9785932721712536,
"grad_norm": 0.9599865078926086,
"learning_rate": 7.067084726998548e-05,
"loss": 0.1035,
"step": 3410
},
{
"epoch": 2.982961992136304,
"grad_norm": 2.335697889328003,
"learning_rate": 7.040819066786195e-05,
"loss": 0.1958,
"step": 3415
},
{
"epoch": 2.9873307121013544,
"grad_norm": 1.150230050086975,
"learning_rate": 7.014575761772382e-05,
"loss": 0.1602,
"step": 3420
},
{
"epoch": 2.9916994320664045,
"grad_norm": 1.7480928897857666,
"learning_rate": 6.988355010212742e-05,
"loss": 0.1794,
"step": 3425
},
{
"epoch": 2.9960681520314547,
"grad_norm": 1.2962545156478882,
"learning_rate": 6.962157010192529e-05,
"loss": 0.1637,
"step": 3430
},
{
"epoch": 3.0,
"grad_norm": 0.600903332233429,
"learning_rate": 6.935981959625126e-05,
"loss": 0.1508,
"step": 3435
},
{
"epoch": 3.00436871996505,
"grad_norm": 0.9637101888656616,
"learning_rate": 6.909830056250527e-05,
"loss": 0.1208,
"step": 3440
},
{
"epoch": 3.0087374399301003,
"grad_norm": 1.0528173446655273,
"learning_rate": 6.883701497633876e-05,
"loss": 0.0717,
"step": 3445
},
{
"epoch": 3.0131061598951505,
"grad_norm": 1.6418970823287964,
"learning_rate": 6.857596481163957e-05,
"loss": 0.0916,
"step": 3450
},
{
"epoch": 3.017474879860201,
"grad_norm": 1.6248525381088257,
"learning_rate": 6.831515204051692e-05,
"loss": 0.0801,
"step": 3455
},
{
"epoch": 3.0218435998252513,
"grad_norm": 1.015778660774231,
"learning_rate": 6.805457863328683e-05,
"loss": 0.0989,
"step": 3460
},
{
"epoch": 3.0262123197903015,
"grad_norm": 0.8818852305412292,
"learning_rate": 6.779424655845687e-05,
"loss": 0.0915,
"step": 3465
},
{
"epoch": 3.0305810397553516,
"grad_norm": 0.8511345982551575,
"learning_rate": 6.75341577827115e-05,
"loss": 0.0819,
"step": 3470
},
{
"epoch": 3.034949759720402,
"grad_norm": 0.9664357304573059,
"learning_rate": 6.727431427089724e-05,
"loss": 0.0947,
"step": 3475
},
{
"epoch": 3.039318479685452,
"grad_norm": 1.4519518613815308,
"learning_rate": 6.701471798600766e-05,
"loss": 0.0688,
"step": 3480
},
{
"epoch": 3.0436871996505026,
"grad_norm": 1.7815632820129395,
"learning_rate": 6.675537088916882e-05,
"loss": 0.0865,
"step": 3485
},
{
"epoch": 3.0480559196155528,
"grad_norm": 1.5605847835540771,
"learning_rate": 6.6496274939624e-05,
"loss": 0.1307,
"step": 3490
},
{
"epoch": 3.052424639580603,
"grad_norm": 1.2095025777816772,
"learning_rate": 6.623743209471942e-05,
"loss": 0.1197,
"step": 3495
},
{
"epoch": 3.056793359545653,
"grad_norm": 1.2208069562911987,
"learning_rate": 6.597884430988917e-05,
"loss": 0.1115,
"step": 3500
},
{
"epoch": 3.0611620795107033,
"grad_norm": 1.4211591482162476,
"learning_rate": 6.572051353864043e-05,
"loss": 0.072,
"step": 3505
},
{
"epoch": 3.0655307994757535,
"grad_norm": 1.2047765254974365,
"learning_rate": 6.546244173253878e-05,
"loss": 0.0544,
"step": 3510
},
{
"epoch": 3.0698995194408036,
"grad_norm": 1.7253801822662354,
"learning_rate": 6.520463084119343e-05,
"loss": 0.0794,
"step": 3515
},
{
"epoch": 3.0742682394058543,
"grad_norm": 7.309013366699219,
"learning_rate": 6.494708281224255e-05,
"loss": 0.1807,
"step": 3520
},
{
"epoch": 3.0786369593709044,
"grad_norm": 1.258571743965149,
"learning_rate": 6.468979959133852e-05,
"loss": 0.0716,
"step": 3525
},
{
"epoch": 3.0830056793359546,
"grad_norm": 1.4243378639221191,
"learning_rate": 6.443278312213312e-05,
"loss": 0.1365,
"step": 3530
},
{
"epoch": 3.0873743993010048,
"grad_norm": 2.169480562210083,
"learning_rate": 6.417603534626306e-05,
"loss": 0.1059,
"step": 3535
},
{
"epoch": 3.091743119266055,
"grad_norm": 1.2186070680618286,
"learning_rate": 6.391955820333513e-05,
"loss": 0.0943,
"step": 3540
},
{
"epoch": 3.096111839231105,
"grad_norm": 2.4399561882019043,
"learning_rate": 6.366335363091165e-05,
"loss": 0.0943,
"step": 3545
},
{
"epoch": 3.1004805591961557,
"grad_norm": 1.6928596496582031,
"learning_rate": 6.340742356449579e-05,
"loss": 0.1019,
"step": 3550
},
{
"epoch": 3.104849279161206,
"grad_norm": 2.0251004695892334,
"learning_rate": 6.315176993751699e-05,
"loss": 0.0932,
"step": 3555
},
{
"epoch": 3.109217999126256,
"grad_norm": 1.7529054880142212,
"learning_rate": 6.289639468131622e-05,
"loss": 0.0946,
"step": 3560
},
{
"epoch": 3.1135867190913062,
"grad_norm": 1.67819344997406,
"learning_rate": 6.264129972513163e-05,
"loss": 0.0813,
"step": 3565
},
{
"epoch": 3.1179554390563564,
"grad_norm": 1.2460740804672241,
"learning_rate": 6.238648699608375e-05,
"loss": 0.1024,
"step": 3570
},
{
"epoch": 3.1223241590214066,
"grad_norm": 1.6184308528900146,
"learning_rate": 6.213195841916104e-05,
"loss": 0.0847,
"step": 3575
},
{
"epoch": 3.126692878986457,
"grad_norm": 1.5642549991607666,
"learning_rate": 6.187771591720536e-05,
"loss": 0.1008,
"step": 3580
},
{
"epoch": 3.1310615989515074,
"grad_norm": 1.9698232412338257,
"learning_rate": 6.16237614108973e-05,
"loss": 0.0994,
"step": 3585
},
{
"epoch": 3.1354303189165575,
"grad_norm": 1.6373701095581055,
"learning_rate": 6.137009681874192e-05,
"loss": 0.1204,
"step": 3590
},
{
"epoch": 3.1397990388816077,
"grad_norm": 0.9783329963684082,
"learning_rate": 6.111672405705402e-05,
"loss": 0.0716,
"step": 3595
},
{
"epoch": 3.144167758846658,
"grad_norm": 1.6490020751953125,
"learning_rate": 6.086364503994382e-05,
"loss": 0.078,
"step": 3600
}
],
"logging_steps": 5,
"max_steps": 5725,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1749970321545216.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}
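
The log above records one entry every 5 optimizer steps (matching "logging_steps": 5). Below is a minimal sketch, assuming the file has been downloaded locally as trainer_state.json (the path and the use of the standard json module are assumptions, not part of the checkpoint itself), of how the loss and learning-rate trajectory in "log_history" could be pulled out and summarized:

import json

# Load the trainer state exported by the HuggingFace Trainer.
# "trainer_state.json" is assumed to be the local path to this file.
with open("trainer_state.json") as f:
    state = json.load(f)

history = state["log_history"]

# Each logged entry carries step, loss, learning_rate and grad_norm.
steps = [entry["step"] for entry in history]
losses = [entry["loss"] for entry in history]
lrs = [entry["learning_rate"] for entry in history]

# Summarize the tail of training: mean loss over the last 20 logged points
# (i.e. the last 100 steps, given logging_steps = 5).
window = losses[-20:]
print(f"final logged step: {steps[-1]} of {state['max_steps']}")
print(f"mean loss over last {len(window)} logged points: {sum(window) / len(window):.4f}")
print(f"learning rate at final logged step: {lrs[-1]:.2e}")

The same lists could be handed to any plotting library to visualize the decay of the loss (roughly 2.5 at step 5 down to below 0.1 by step 3600) alongside the cosine-style learning-rate schedule.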