gemma-270m / checkpoint-4600 / trainer_state.json
thinkthink-dev: Upload folder using huggingface_hub (commit d482ce9, verified)
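The rest of this page is the raw trainer_state.json for the gemma-270m fine-tune: a log_history list where each entry records epoch, grad_norm, learning_rate, loss, and step. A minimal sketch for inspecting it is shown below; it assumes matplotlib is installed and uses the file path from the header above purely as an illustration, not as part of the checkpoint itself.

# Minimal sketch (not part of the checkpoint): load trainer_state.json and
# plot the logged training loss. The file path is assumed from the header.
import json

import matplotlib.pyplot as plt

with open("gemma-270m/checkpoint-4600/trainer_state.json") as f:
    state = json.load(f)

# Keep only entries that actually log a training loss.
logged = [entry for entry in state["log_history"] if "loss" in entry]
steps = [entry["step"] for entry in logged]
losses = [entry["loss"] for entry in logged]

plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.title("gemma-270m fine-tune, checkpoint-4600")
plt.show()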
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.017474879860201,
"eval_steps": 500,
"global_step": 4600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00436871996505024,
"grad_norm": 9.839941024780273,
"learning_rate": 8e-05,
"loss": 2.5246,
"step": 5
},
{
"epoch": 0.00873743993010048,
"grad_norm": 13.773455619812012,
"learning_rate": 0.00018,
"loss": 1.1343,
"step": 10
},
{
"epoch": 0.01310615989515072,
"grad_norm": 5.6580424308776855,
"learning_rate": 0.0001999997582552296,
"loss": 0.7712,
"step": 15
},
{
"epoch": 0.01747487986020096,
"grad_norm": 5.294467926025391,
"learning_rate": 0.0001999987761691029,
"loss": 0.73,
"step": 20
},
{
"epoch": 0.021843599825251202,
"grad_norm": 2.8633503913879395,
"learning_rate": 0.00019999703863998527,
"loss": 0.7289,
"step": 25
},
{
"epoch": 0.02621231979030144,
"grad_norm": 3.2836177349090576,
"learning_rate": 0.00019999454568100293,
"loss": 0.4686,
"step": 30
},
{
"epoch": 0.03058103975535168,
"grad_norm": 4.878258228302002,
"learning_rate": 0.00019999129731098898,
"loss": 0.6629,
"step": 35
},
{
"epoch": 0.03494975972040192,
"grad_norm": 2.899914026260376,
"learning_rate": 0.00019998729355448326,
"loss": 0.6038,
"step": 40
},
{
"epoch": 0.039318479685452164,
"grad_norm": 3.289844274520874,
"learning_rate": 0.00019998253444173235,
"loss": 0.4573,
"step": 45
},
{
"epoch": 0.043687199650502405,
"grad_norm": 2.957254648208618,
"learning_rate": 0.00019997702000868896,
"loss": 0.594,
"step": 50
},
{
"epoch": 0.048055919615552646,
"grad_norm": 3.171276807785034,
"learning_rate": 0.00019997075029701207,
"loss": 0.5719,
"step": 55
},
{
"epoch": 0.05242463958060288,
"grad_norm": 2.55605149269104,
"learning_rate": 0.0001999637253540663,
"loss": 0.5971,
"step": 60
},
{
"epoch": 0.05679335954565312,
"grad_norm": 2.127289295196533,
"learning_rate": 0.00019995594523292178,
"loss": 0.5712,
"step": 65
},
{
"epoch": 0.06116207951070336,
"grad_norm": 3.3928685188293457,
"learning_rate": 0.00019994740999235359,
"loss": 0.5712,
"step": 70
},
{
"epoch": 0.0655307994757536,
"grad_norm": 2.6700279712677,
"learning_rate": 0.00019993811969684142,
"loss": 0.427,
"step": 75
},
{
"epoch": 0.06989951944080385,
"grad_norm": 2.6936633586883545,
"learning_rate": 0.00019992807441656898,
"loss": 0.5321,
"step": 80
},
{
"epoch": 0.07426823940585409,
"grad_norm": 3.9897687435150146,
"learning_rate": 0.00019991727422742362,
"loss": 0.6025,
"step": 85
},
{
"epoch": 0.07863695937090433,
"grad_norm": 2.3496663570404053,
"learning_rate": 0.00019990571921099553,
"loss": 0.5975,
"step": 90
},
{
"epoch": 0.08300567933595457,
"grad_norm": 3.3796467781066895,
"learning_rate": 0.0001998934094545774,
"loss": 0.5255,
"step": 95
},
{
"epoch": 0.08737439930100481,
"grad_norm": 3.1103007793426514,
"learning_rate": 0.00019988034505116352,
"loss": 0.4946,
"step": 100
},
{
"epoch": 0.09174311926605505,
"grad_norm": 2.002304792404175,
"learning_rate": 0.00019986652609944926,
"loss": 0.425,
"step": 105
},
{
"epoch": 0.09611183923110529,
"grad_norm": 1.7572168111801147,
"learning_rate": 0.00019985195270383018,
"loss": 0.6073,
"step": 110
},
{
"epoch": 0.10048055919615553,
"grad_norm": 2.745215654373169,
"learning_rate": 0.00019983662497440133,
"loss": 0.586,
"step": 115
},
{
"epoch": 0.10484927916120576,
"grad_norm": 1.8170915842056274,
"learning_rate": 0.0001998205430269564,
"loss": 0.5255,
"step": 120
},
{
"epoch": 0.109217999126256,
"grad_norm": 1.4944056272506714,
"learning_rate": 0.00019980370698298677,
"loss": 0.4219,
"step": 125
},
{
"epoch": 0.11358671909130624,
"grad_norm": 1.6616989374160767,
"learning_rate": 0.00019978611696968074,
"loss": 0.4231,
"step": 130
},
{
"epoch": 0.11795543905635648,
"grad_norm": 2.0523645877838135,
"learning_rate": 0.00019976777311992247,
"loss": 0.5298,
"step": 135
},
{
"epoch": 0.12232415902140673,
"grad_norm": 2.065765619277954,
"learning_rate": 0.00019974867557229098,
"loss": 0.5228,
"step": 140
},
{
"epoch": 0.12669287898645698,
"grad_norm": 1.7283438444137573,
"learning_rate": 0.00019972882447105912,
"loss": 0.3452,
"step": 145
},
{
"epoch": 0.1310615989515072,
"grad_norm": 2.655750274658203,
"learning_rate": 0.00019970821996619244,
"loss": 0.508,
"step": 150
},
{
"epoch": 0.13543031891655744,
"grad_norm": 2.67799973487854,
"learning_rate": 0.0001996868622133482,
"loss": 0.4359,
"step": 155
},
{
"epoch": 0.1397990388816077,
"grad_norm": 1.6298809051513672,
"learning_rate": 0.00019966475137387396,
"loss": 0.5447,
"step": 160
},
{
"epoch": 0.14416775884665792,
"grad_norm": 1.4772286415100098,
"learning_rate": 0.00019964188761480657,
"loss": 0.4105,
"step": 165
},
{
"epoch": 0.14853647881170817,
"grad_norm": 2.2986271381378174,
"learning_rate": 0.00019961827110887083,
"loss": 0.603,
"step": 170
},
{
"epoch": 0.1529051987767584,
"grad_norm": 2.8261911869049072,
"learning_rate": 0.00019959390203447817,
"loss": 0.4649,
"step": 175
},
{
"epoch": 0.15727391874180865,
"grad_norm": 1.7771011590957642,
"learning_rate": 0.00019956878057572524,
"loss": 0.4394,
"step": 180
},
{
"epoch": 0.16164263870685888,
"grad_norm": 1.7315421104431152,
"learning_rate": 0.00019954290692239274,
"loss": 0.5289,
"step": 185
},
{
"epoch": 0.16601135867190914,
"grad_norm": 1.6124423742294312,
"learning_rate": 0.00019951628126994373,
"loss": 0.4173,
"step": 190
},
{
"epoch": 0.17038007863695936,
"grad_norm": 1.792577862739563,
"learning_rate": 0.00019948890381952232,
"loss": 0.4331,
"step": 195
},
{
"epoch": 0.17474879860200962,
"grad_norm": 1.9038774967193604,
"learning_rate": 0.000199460774777952,
"loss": 0.4247,
"step": 200
},
{
"epoch": 0.17911751856705985,
"grad_norm": 2.457122802734375,
"learning_rate": 0.00019943189435773432,
"loss": 0.4519,
"step": 205
},
{
"epoch": 0.1834862385321101,
"grad_norm": 1.97683584690094,
"learning_rate": 0.00019940226277704706,
"loss": 0.4761,
"step": 210
},
{
"epoch": 0.18785495849716033,
"grad_norm": 2.1646862030029297,
"learning_rate": 0.0001993718802597426,
"loss": 0.5294,
"step": 215
},
{
"epoch": 0.19222367846221058,
"grad_norm": 1.565412998199463,
"learning_rate": 0.00019934074703534637,
"loss": 0.3999,
"step": 220
},
{
"epoch": 0.1965923984272608,
"grad_norm": 2.4315876960754395,
"learning_rate": 0.00019930886333905504,
"loss": 0.378,
"step": 225
},
{
"epoch": 0.20096111839231107,
"grad_norm": 2.7567529678344727,
"learning_rate": 0.00019927622941173467,
"loss": 0.5075,
"step": 230
},
{
"epoch": 0.2053298383573613,
"grad_norm": 1.8640387058258057,
"learning_rate": 0.00019924284549991902,
"loss": 0.4749,
"step": 235
},
{
"epoch": 0.20969855832241152,
"grad_norm": 2.090924024581909,
"learning_rate": 0.00019920871185580757,
"loss": 0.4353,
"step": 240
},
{
"epoch": 0.21406727828746178,
"grad_norm": 1.9691081047058105,
"learning_rate": 0.00019917382873726376,
"loss": 0.4051,
"step": 245
},
{
"epoch": 0.218435998252512,
"grad_norm": 1.8130213022232056,
"learning_rate": 0.0001991381964078128,
"loss": 0.526,
"step": 250
},
{
"epoch": 0.22280471821756226,
"grad_norm": 2.078805923461914,
"learning_rate": 0.00019910181513664,
"loss": 0.5654,
"step": 255
},
{
"epoch": 0.22717343818261249,
"grad_norm": 2.0686287879943848,
"learning_rate": 0.0001990646851985884,
"loss": 0.43,
"step": 260
},
{
"epoch": 0.23154215814766274,
"grad_norm": 1.475821614265442,
"learning_rate": 0.00019902680687415705,
"loss": 0.355,
"step": 265
},
{
"epoch": 0.23591087811271297,
"grad_norm": 1.901236891746521,
"learning_rate": 0.0001989881804494985,
"loss": 0.4522,
"step": 270
},
{
"epoch": 0.24027959807776322,
"grad_norm": 1.2583553791046143,
"learning_rate": 0.00019894880621641704,
"loss": 0.3869,
"step": 275
},
{
"epoch": 0.24464831804281345,
"grad_norm": 1.712336540222168,
"learning_rate": 0.00019890868447236613,
"loss": 0.454,
"step": 280
},
{
"epoch": 0.2490170380078637,
"grad_norm": 2.3967206478118896,
"learning_rate": 0.00019886781552044634,
"loss": 0.4074,
"step": 285
},
{
"epoch": 0.25338575797291396,
"grad_norm": 2.0578925609588623,
"learning_rate": 0.0001988261996694032,
"loss": 0.4268,
"step": 290
},
{
"epoch": 0.2577544779379642,
"grad_norm": 1.7411088943481445,
"learning_rate": 0.0001987838372336245,
"loss": 0.334,
"step": 295
},
{
"epoch": 0.2621231979030144,
"grad_norm": 1.8145533800125122,
"learning_rate": 0.0001987407285331382,
"loss": 0.4019,
"step": 300
},
{
"epoch": 0.26649191786806464,
"grad_norm": 1.3501653671264648,
"learning_rate": 0.00019869687389361,
"loss": 0.32,
"step": 305
},
{
"epoch": 0.27086063783311487,
"grad_norm": 1.208422303199768,
"learning_rate": 0.00019865227364634073,
"loss": 0.4548,
"step": 310
},
{
"epoch": 0.27522935779816515,
"grad_norm": 1.521690011024475,
"learning_rate": 0.00019860692812826396,
"loss": 0.3572,
"step": 315
},
{
"epoch": 0.2795980777632154,
"grad_norm": 2.2849714756011963,
"learning_rate": 0.0001985608376819434,
"loss": 0.4555,
"step": 320
},
{
"epoch": 0.2839667977282656,
"grad_norm": 2.7733798027038574,
"learning_rate": 0.00019851400265557037,
"loss": 0.4726,
"step": 325
},
{
"epoch": 0.28833551769331583,
"grad_norm": 1.973522424697876,
"learning_rate": 0.00019846642340296114,
"loss": 0.4585,
"step": 330
},
{
"epoch": 0.2927042376583661,
"grad_norm": 1.7133642435073853,
"learning_rate": 0.0001984181002835542,
"loss": 0.4679,
"step": 335
},
{
"epoch": 0.29707295762341634,
"grad_norm": 2.8383235931396484,
"learning_rate": 0.00019836903366240768,
"loss": 0.4119,
"step": 340
},
{
"epoch": 0.30144167758846657,
"grad_norm": 2.798276901245117,
"learning_rate": 0.00019831922391019645,
"loss": 0.3665,
"step": 345
},
{
"epoch": 0.3058103975535168,
"grad_norm": 2.171276569366455,
"learning_rate": 0.00019826867140320938,
"loss": 0.5691,
"step": 350
},
{
"epoch": 0.3101791175185671,
"grad_norm": 2.0866177082061768,
"learning_rate": 0.00019821737652334653,
"loss": 0.4074,
"step": 355
},
{
"epoch": 0.3145478374836173,
"grad_norm": 1.3713918924331665,
"learning_rate": 0.0001981653396581162,
"loss": 0.3379,
"step": 360
},
{
"epoch": 0.31891655744866754,
"grad_norm": 1.6086684465408325,
"learning_rate": 0.0001981125612006321,
"loss": 0.3563,
"step": 365
},
{
"epoch": 0.32328527741371776,
"grad_norm": 2.655686378479004,
"learning_rate": 0.0001980590415496102,
"loss": 0.3988,
"step": 370
},
{
"epoch": 0.32765399737876805,
"grad_norm": 1.5271559953689575,
"learning_rate": 0.00019800478110936596,
"loss": 0.5784,
"step": 375
},
{
"epoch": 0.3320227173438183,
"grad_norm": 1.3043195009231567,
"learning_rate": 0.00019794978028981106,
"loss": 0.2637,
"step": 380
},
{
"epoch": 0.3363914373088685,
"grad_norm": 2.539109706878662,
"learning_rate": 0.0001978940395064504,
"loss": 0.4658,
"step": 385
},
{
"epoch": 0.34076015727391873,
"grad_norm": 1.7521268129348755,
"learning_rate": 0.00019783755918037903,
"loss": 0.4253,
"step": 390
},
{
"epoch": 0.34512887723896896,
"grad_norm": 1.5679692029953003,
"learning_rate": 0.00019778033973827882,
"loss": 0.4528,
"step": 395
},
{
"epoch": 0.34949759720401924,
"grad_norm": 1.670640468597412,
"learning_rate": 0.00019772238161241528,
"loss": 0.3724,
"step": 400
},
{
"epoch": 0.35386631716906947,
"grad_norm": 1.520856261253357,
"learning_rate": 0.00019766368524063438,
"loss": 0.4141,
"step": 405
},
{
"epoch": 0.3582350371341197,
"grad_norm": 1.0802158117294312,
"learning_rate": 0.00019760425106635926,
"loss": 0.3268,
"step": 410
},
{
"epoch": 0.3626037570991699,
"grad_norm": 1.7306379079818726,
"learning_rate": 0.0001975440795385866,
"loss": 0.3654,
"step": 415
},
{
"epoch": 0.3669724770642202,
"grad_norm": 1.5037274360656738,
"learning_rate": 0.0001974831711118836,
"loss": 0.4285,
"step": 420
},
{
"epoch": 0.37134119702927043,
"grad_norm": 1.4654844999313354,
"learning_rate": 0.00019742152624638437,
"loss": 0.2548,
"step": 425
},
{
"epoch": 0.37570991699432066,
"grad_norm": 2.6770753860473633,
"learning_rate": 0.00019735914540778638,
"loss": 0.4238,
"step": 430
},
{
"epoch": 0.3800786369593709,
"grad_norm": 1.1864055395126343,
"learning_rate": 0.00019729602906734704,
"loss": 0.3959,
"step": 435
},
{
"epoch": 0.38444735692442117,
"grad_norm": 1.904876708984375,
"learning_rate": 0.00019723217770188024,
"loss": 0.3603,
"step": 440
},
{
"epoch": 0.3888160768894714,
"grad_norm": 1.7086598873138428,
"learning_rate": 0.0001971675917937525,
"loss": 0.551,
"step": 445
},
{
"epoch": 0.3931847968545216,
"grad_norm": 1.4635995626449585,
"learning_rate": 0.00019710227183087947,
"loss": 0.3738,
"step": 450
},
{
"epoch": 0.39755351681957185,
"grad_norm": 1.6047295331954956,
"learning_rate": 0.00019703621830672238,
"loss": 0.475,
"step": 455
},
{
"epoch": 0.40192223678462213,
"grad_norm": 1.4741933345794678,
"learning_rate": 0.00019696943172028394,
"loss": 0.4021,
"step": 460
},
{
"epoch": 0.40629095674967236,
"grad_norm": 2.8138020038604736,
"learning_rate": 0.00019690191257610497,
"loss": 0.3665,
"step": 465
},
{
"epoch": 0.4106596767147226,
"grad_norm": 1.6264874935150146,
"learning_rate": 0.00019683366138426034,
"loss": 0.3598,
"step": 470
},
{
"epoch": 0.4150283966797728,
"grad_norm": 1.6185061931610107,
"learning_rate": 0.00019676467866035525,
"loss": 0.5003,
"step": 475
},
{
"epoch": 0.41939711664482304,
"grad_norm": 1.8654040098190308,
"learning_rate": 0.00019669496492552113,
"loss": 0.397,
"step": 480
},
{
"epoch": 0.4237658366098733,
"grad_norm": 1.2525237798690796,
"learning_rate": 0.00019662452070641205,
"loss": 0.3235,
"step": 485
},
{
"epoch": 0.42813455657492355,
"grad_norm": 1.7755401134490967,
"learning_rate": 0.00019655334653520036,
"loss": 0.2978,
"step": 490
},
{
"epoch": 0.4325032765399738,
"grad_norm": 1.6025470495224,
"learning_rate": 0.00019648144294957297,
"loss": 0.4436,
"step": 495
},
{
"epoch": 0.436871996505024,
"grad_norm": 1.085461974143982,
"learning_rate": 0.00019640881049272713,
"loss": 0.22,
"step": 500
},
{
"epoch": 0.4412407164700743,
"grad_norm": 1.491818904876709,
"learning_rate": 0.00019633544971336636,
"loss": 0.2714,
"step": 505
},
{
"epoch": 0.4456094364351245,
"grad_norm": 0.9479840993881226,
"learning_rate": 0.0001962613611656963,
"loss": 0.3735,
"step": 510
},
{
"epoch": 0.44997815640017474,
"grad_norm": 3.0529448986053467,
"learning_rate": 0.0001961865454094205,
"loss": 0.4779,
"step": 515
},
{
"epoch": 0.45434687636522497,
"grad_norm": 2.831089973449707,
"learning_rate": 0.00019611100300973635,
"loss": 0.469,
"step": 520
},
{
"epoch": 0.45871559633027525,
"grad_norm": 2.1834311485290527,
"learning_rate": 0.00019603473453733052,
"loss": 0.4163,
"step": 525
},
{
"epoch": 0.4630843162953255,
"grad_norm": 1.3152204751968384,
"learning_rate": 0.00019595774056837493,
"loss": 0.3744,
"step": 530
},
{
"epoch": 0.4674530362603757,
"grad_norm": 1.4493387937545776,
"learning_rate": 0.00019588002168452223,
"loss": 0.3117,
"step": 535
},
{
"epoch": 0.47182175622542594,
"grad_norm": 1.1412076950073242,
"learning_rate": 0.00019580157847290147,
"loss": 0.3152,
"step": 540
},
{
"epoch": 0.47619047619047616,
"grad_norm": 1.5004645586013794,
"learning_rate": 0.00019572241152611365,
"loss": 0.3271,
"step": 545
},
{
"epoch": 0.48055919615552645,
"grad_norm": 2.3333992958068848,
"learning_rate": 0.0001956425214422272,
"loss": 0.3626,
"step": 550
},
{
"epoch": 0.4849279161205767,
"grad_norm": 1.5423107147216797,
"learning_rate": 0.0001955619088247736,
"loss": 0.4588,
"step": 555
},
{
"epoch": 0.4892966360856269,
"grad_norm": 3.008280038833618,
"learning_rate": 0.00019548057428274266,
"loss": 0.5275,
"step": 560
},
{
"epoch": 0.49366535605067713,
"grad_norm": 1.0968583822250366,
"learning_rate": 0.00019539851843057798,
"loss": 0.3233,
"step": 565
},
{
"epoch": 0.4980340760157274,
"grad_norm": 1.265228271484375,
"learning_rate": 0.00019531574188817234,
"loss": 0.2743,
"step": 570
},
{
"epoch": 0.5024027959807776,
"grad_norm": 1.9382916688919067,
"learning_rate": 0.000195232245280863,
"loss": 0.3189,
"step": 575
},
{
"epoch": 0.5067715159458279,
"grad_norm": 1.6710058450698853,
"learning_rate": 0.00019514802923942687,
"loss": 0.345,
"step": 580
},
{
"epoch": 0.5111402359108781,
"grad_norm": 1.8377633094787598,
"learning_rate": 0.000195063094400076,
"loss": 0.4441,
"step": 585
},
{
"epoch": 0.5155089558759284,
"grad_norm": 1.432173728942871,
"learning_rate": 0.0001949774414044525,
"loss": 0.3277,
"step": 590
},
{
"epoch": 0.5198776758409785,
"grad_norm": 1.096330165863037,
"learning_rate": 0.0001948910708996239,
"loss": 0.3821,
"step": 595
},
{
"epoch": 0.5242463958060288,
"grad_norm": 1.1951391696929932,
"learning_rate": 0.00019480398353807798,
"loss": 0.4303,
"step": 600
},
{
"epoch": 0.5286151157710791,
"grad_norm": 0.9764880537986755,
"learning_rate": 0.0001947161799777183,
"loss": 0.2693,
"step": 605
},
{
"epoch": 0.5329838357361293,
"grad_norm": 1.2566354274749756,
"learning_rate": 0.00019462766088185874,
"loss": 0.2851,
"step": 610
},
{
"epoch": 0.5373525557011796,
"grad_norm": 1.494903802871704,
"learning_rate": 0.0001945384269192188,
"loss": 0.36,
"step": 615
},
{
"epoch": 0.5417212756662297,
"grad_norm": 1.5508995056152344,
"learning_rate": 0.00019444847876391844,
"loss": 0.3682,
"step": 620
},
{
"epoch": 0.54608999563128,
"grad_norm": 2.227889060974121,
"learning_rate": 0.00019435781709547305,
"loss": 0.3889,
"step": 625
},
{
"epoch": 0.5504587155963303,
"grad_norm": 0.9221494197845459,
"learning_rate": 0.0001942664425987882,
"loss": 0.3375,
"step": 630
},
{
"epoch": 0.5548274355613805,
"grad_norm": 1.3386973142623901,
"learning_rate": 0.00019417435596415458,
"loss": 0.4833,
"step": 635
},
{
"epoch": 0.5591961555264308,
"grad_norm": 1.9686752557754517,
"learning_rate": 0.00019408155788724272,
"loss": 0.4739,
"step": 640
},
{
"epoch": 0.563564875491481,
"grad_norm": 2.3978073596954346,
"learning_rate": 0.00019398804906909777,
"loss": 0.4681,
"step": 645
},
{
"epoch": 0.5679335954565312,
"grad_norm": 1.536699652671814,
"learning_rate": 0.0001938938302161342,
"loss": 0.2684,
"step": 650
},
{
"epoch": 0.5723023154215815,
"grad_norm": 1.691787600517273,
"learning_rate": 0.00019379890204013043,
"loss": 0.3512,
"step": 655
},
{
"epoch": 0.5766710353866317,
"grad_norm": 1.7557870149612427,
"learning_rate": 0.0001937032652582235,
"loss": 0.3423,
"step": 660
},
{
"epoch": 0.581039755351682,
"grad_norm": 1.7950220108032227,
"learning_rate": 0.0001936069205929036,
"loss": 0.2831,
"step": 665
},
{
"epoch": 0.5854084753167322,
"grad_norm": 1.928232192993164,
"learning_rate": 0.00019350986877200867,
"loss": 0.323,
"step": 670
},
{
"epoch": 0.5897771952817824,
"grad_norm": 1.86429762840271,
"learning_rate": 0.00019341211052871887,
"loss": 0.4248,
"step": 675
},
{
"epoch": 0.5941459152468327,
"grad_norm": 2.022738456726074,
"learning_rate": 0.00019331364660155103,
"loss": 0.3411,
"step": 680
},
{
"epoch": 0.598514635211883,
"grad_norm": 1.2337995767593384,
"learning_rate": 0.00019321447773435306,
"loss": 0.2368,
"step": 685
},
{
"epoch": 0.6028833551769331,
"grad_norm": 2.015075445175171,
"learning_rate": 0.00019311460467629843,
"loss": 0.5116,
"step": 690
},
{
"epoch": 0.6072520751419834,
"grad_norm": 1.2344030141830444,
"learning_rate": 0.00019301402818188036,
"loss": 0.3313,
"step": 695
},
{
"epoch": 0.6116207951070336,
"grad_norm": 1.129764437675476,
"learning_rate": 0.00019291274901090625,
"loss": 0.408,
"step": 700
},
{
"epoch": 0.6159895150720839,
"grad_norm": 1.4350385665893555,
"learning_rate": 0.00019281076792849184,
"loss": 0.3729,
"step": 705
},
{
"epoch": 0.6203582350371342,
"grad_norm": 1.9586119651794434,
"learning_rate": 0.00019270808570505553,
"loss": 0.4315,
"step": 710
},
{
"epoch": 0.6247269550021843,
"grad_norm": 1.0157238245010376,
"learning_rate": 0.00019260470311631243,
"loss": 0.2861,
"step": 715
},
{
"epoch": 0.6290956749672346,
"grad_norm": 1.3841652870178223,
"learning_rate": 0.00019250062094326864,
"loss": 0.4037,
"step": 720
},
{
"epoch": 0.6334643949322848,
"grad_norm": 1.848821997642517,
"learning_rate": 0.00019239583997221525,
"loss": 0.3665,
"step": 725
},
{
"epoch": 0.6378331148973351,
"grad_norm": 0.9416481256484985,
"learning_rate": 0.0001922903609947225,
"loss": 0.339,
"step": 730
},
{
"epoch": 0.6422018348623854,
"grad_norm": 1.0696804523468018,
"learning_rate": 0.0001921841848076336,
"loss": 0.2783,
"step": 735
},
{
"epoch": 0.6465705548274355,
"grad_norm": 1.9199622869491577,
"learning_rate": 0.00019207731221305903,
"loss": 0.2904,
"step": 740
},
{
"epoch": 0.6509392747924858,
"grad_norm": 1.347430944442749,
"learning_rate": 0.00019196974401837008,
"loss": 0.2719,
"step": 745
},
{
"epoch": 0.6553079947575361,
"grad_norm": 0.9743670225143433,
"learning_rate": 0.0001918614810361932,
"loss": 0.2748,
"step": 750
},
{
"epoch": 0.6596767147225863,
"grad_norm": 1.4043099880218506,
"learning_rate": 0.00019175252408440343,
"loss": 0.3285,
"step": 755
},
{
"epoch": 0.6640454346876365,
"grad_norm": 2.9343338012695312,
"learning_rate": 0.0001916428739861185,
"loss": 0.4962,
"step": 760
},
{
"epoch": 0.6684141546526867,
"grad_norm": 2.3201515674591064,
"learning_rate": 0.0001915325315696926,
"loss": 0.3243,
"step": 765
},
{
"epoch": 0.672782874617737,
"grad_norm": 1.675564169883728,
"learning_rate": 0.00019142149766870992,
"loss": 0.4596,
"step": 770
},
{
"epoch": 0.6771515945827873,
"grad_norm": 1.664604663848877,
"learning_rate": 0.00019130977312197854,
"loss": 0.3024,
"step": 775
},
{
"epoch": 0.6815203145478375,
"grad_norm": 1.8358148336410522,
"learning_rate": 0.00019119735877352412,
"loss": 0.3862,
"step": 780
},
{
"epoch": 0.6858890345128877,
"grad_norm": 1.3632128238677979,
"learning_rate": 0.00019108425547258328,
"loss": 0.2374,
"step": 785
},
{
"epoch": 0.6902577544779379,
"grad_norm": 2.0279934406280518,
"learning_rate": 0.0001909704640735975,
"loss": 0.4392,
"step": 790
},
{
"epoch": 0.6946264744429882,
"grad_norm": 1.2824902534484863,
"learning_rate": 0.0001908559854362064,
"loss": 0.2782,
"step": 795
},
{
"epoch": 0.6989951944080385,
"grad_norm": 1.3477047681808472,
"learning_rate": 0.00019074082042524145,
"loss": 0.3631,
"step": 800
},
{
"epoch": 0.7033639143730887,
"grad_norm": 1.8478046655654907,
"learning_rate": 0.00019062496991071928,
"loss": 0.3788,
"step": 805
},
{
"epoch": 0.7077326343381389,
"grad_norm": 1.470382571220398,
"learning_rate": 0.0001905084347678352,
"loss": 0.3825,
"step": 810
},
{
"epoch": 0.7121013543031892,
"grad_norm": 2.4951813220977783,
"learning_rate": 0.00019039121587695652,
"loss": 0.3359,
"step": 815
},
{
"epoch": 0.7164700742682394,
"grad_norm": 2.3441359996795654,
"learning_rate": 0.000190273314123616,
"loss": 0.32,
"step": 820
},
{
"epoch": 0.7208387942332897,
"grad_norm": 2.372884750366211,
"learning_rate": 0.00019015473039850513,
"loss": 0.3651,
"step": 825
},
{
"epoch": 0.7252075141983398,
"grad_norm": 2.4474101066589355,
"learning_rate": 0.0001900354655974672,
"loss": 0.4401,
"step": 830
},
{
"epoch": 0.7295762341633901,
"grad_norm": 1.4031054973602295,
"learning_rate": 0.0001899155206214909,
"loss": 0.308,
"step": 835
},
{
"epoch": 0.7339449541284404,
"grad_norm": 1.6008141040802002,
"learning_rate": 0.00018979489637670322,
"loss": 0.2937,
"step": 840
},
{
"epoch": 0.7383136740934906,
"grad_norm": 0.9202178120613098,
"learning_rate": 0.0001896735937743627,
"loss": 0.3157,
"step": 845
},
{
"epoch": 0.7426823940585409,
"grad_norm": 1.024746298789978,
"learning_rate": 0.00018955161373085253,
"loss": 0.2934,
"step": 850
},
{
"epoch": 0.747051114023591,
"grad_norm": 1.1573566198349,
"learning_rate": 0.00018942895716767374,
"loss": 0.3617,
"step": 855
},
{
"epoch": 0.7514198339886413,
"grad_norm": 1.227409839630127,
"learning_rate": 0.00018930562501143805,
"loss": 0.3581,
"step": 860
},
{
"epoch": 0.7557885539536916,
"grad_norm": 1.5460100173950195,
"learning_rate": 0.00018918161819386095,
"loss": 0.3393,
"step": 865
},
{
"epoch": 0.7601572739187418,
"grad_norm": 1.688852310180664,
"learning_rate": 0.0001890569376517548,
"loss": 0.4389,
"step": 870
},
{
"epoch": 0.764525993883792,
"grad_norm": 1.5271598100662231,
"learning_rate": 0.00018893158432702149,
"loss": 0.2915,
"step": 875
},
{
"epoch": 0.7688947138488423,
"grad_norm": 1.695788860321045,
"learning_rate": 0.00018880555916664555,
"loss": 0.4026,
"step": 880
},
{
"epoch": 0.7732634338138925,
"grad_norm": 1.6879792213439941,
"learning_rate": 0.00018867886312268683,
"loss": 0.2857,
"step": 885
},
{
"epoch": 0.7776321537789428,
"grad_norm": 2.0718719959259033,
"learning_rate": 0.00018855149715227344,
"loss": 0.4236,
"step": 890
},
{
"epoch": 0.782000873743993,
"grad_norm": 1.5112775564193726,
"learning_rate": 0.00018842346221759448,
"loss": 0.325,
"step": 895
},
{
"epoch": 0.7863695937090432,
"grad_norm": 1.2844749689102173,
"learning_rate": 0.00018829475928589271,
"loss": 0.3782,
"step": 900
},
{
"epoch": 0.7907383136740935,
"grad_norm": 2.150299072265625,
"learning_rate": 0.00018816538932945728,
"loss": 0.3726,
"step": 905
},
{
"epoch": 0.7951070336391437,
"grad_norm": 1.7050650119781494,
"learning_rate": 0.00018803535332561646,
"loss": 0.3824,
"step": 910
},
{
"epoch": 0.799475753604194,
"grad_norm": 1.8164982795715332,
"learning_rate": 0.00018790465225673012,
"loss": 0.3664,
"step": 915
},
{
"epoch": 0.8038444735692443,
"grad_norm": 1.1102941036224365,
"learning_rate": 0.00018777328711018244,
"loss": 0.3166,
"step": 920
},
{
"epoch": 0.8082131935342944,
"grad_norm": 1.4220764636993408,
"learning_rate": 0.0001876412588783743,
"loss": 0.3049,
"step": 925
},
{
"epoch": 0.8125819134993447,
"grad_norm": 2.11336088180542,
"learning_rate": 0.000187508568558716,
"loss": 0.3076,
"step": 930
},
{
"epoch": 0.8169506334643949,
"grad_norm": 1.9948710203170776,
"learning_rate": 0.00018737521715361948,
"loss": 0.3846,
"step": 935
},
{
"epoch": 0.8213193534294452,
"grad_norm": 1.8913676738739014,
"learning_rate": 0.00018724120567049094,
"loss": 0.4296,
"step": 940
},
{
"epoch": 0.8256880733944955,
"grad_norm": 1.3633447885513306,
"learning_rate": 0.0001871065351217231,
"loss": 0.3569,
"step": 945
},
{
"epoch": 0.8300567933595456,
"grad_norm": 1.4957417249679565,
"learning_rate": 0.00018697120652468762,
"loss": 0.3085,
"step": 950
},
{
"epoch": 0.8344255133245959,
"grad_norm": 2.076399803161621,
"learning_rate": 0.0001868352209017275,
"loss": 0.3331,
"step": 955
},
{
"epoch": 0.8387942332896461,
"grad_norm": 1.1817855834960938,
"learning_rate": 0.00018669857928014906,
"loss": 0.3414,
"step": 960
},
{
"epoch": 0.8431629532546964,
"grad_norm": 1.4255414009094238,
"learning_rate": 0.00018656128269221454,
"loss": 0.2782,
"step": 965
},
{
"epoch": 0.8475316732197467,
"grad_norm": 1.326687216758728,
"learning_rate": 0.0001864233321751341,
"loss": 0.2998,
"step": 970
},
{
"epoch": 0.8519003931847968,
"grad_norm": 2.222280263900757,
"learning_rate": 0.00018628472877105793,
"loss": 0.3348,
"step": 975
},
{
"epoch": 0.8562691131498471,
"grad_norm": 1.518401026725769,
"learning_rate": 0.00018614547352706863,
"loss": 0.3816,
"step": 980
},
{
"epoch": 0.8606378331148974,
"grad_norm": 1.1030207872390747,
"learning_rate": 0.00018600556749517305,
"loss": 0.3222,
"step": 985
},
{
"epoch": 0.8650065530799476,
"grad_norm": 2.406994104385376,
"learning_rate": 0.00018586501173229437,
"loss": 0.3754,
"step": 990
},
{
"epoch": 0.8693752730449978,
"grad_norm": 1.2401646375656128,
"learning_rate": 0.00018572380730026434,
"loss": 0.4402,
"step": 995
},
{
"epoch": 0.873743993010048,
"grad_norm": 2.0233402252197266,
"learning_rate": 0.0001855819552658149,
"loss": 0.3323,
"step": 1000
},
{
"epoch": 0.8781127129750983,
"grad_norm": 1.5329450368881226,
"learning_rate": 0.00018543945670057045,
"loss": 0.235,
"step": 1005
},
{
"epoch": 0.8824814329401486,
"grad_norm": 1.8849459886550903,
"learning_rate": 0.00018529631268103964,
"loss": 0.357,
"step": 1010
},
{
"epoch": 0.8868501529051988,
"grad_norm": 2.016646146774292,
"learning_rate": 0.0001851525242886071,
"loss": 0.2663,
"step": 1015
},
{
"epoch": 0.891218872870249,
"grad_norm": 2.3272440433502197,
"learning_rate": 0.0001850080926095255,
"loss": 0.2926,
"step": 1020
},
{
"epoch": 0.8955875928352992,
"grad_norm": 1.7760261297225952,
"learning_rate": 0.00018486301873490713,
"loss": 0.4155,
"step": 1025
},
{
"epoch": 0.8999563128003495,
"grad_norm": 1.4679979085922241,
"learning_rate": 0.0001847173037607159,
"loss": 0.2877,
"step": 1030
},
{
"epoch": 0.9043250327653998,
"grad_norm": 1.8398054838180542,
"learning_rate": 0.0001845709487877588,
"loss": 0.2856,
"step": 1035
},
{
"epoch": 0.9086937527304499,
"grad_norm": 3.05880069732666,
"learning_rate": 0.00018442395492167775,
"loss": 0.3373,
"step": 1040
},
{
"epoch": 0.9130624726955002,
"grad_norm": 1.2527328729629517,
"learning_rate": 0.0001842763232729412,
"loss": 0.2412,
"step": 1045
},
{
"epoch": 0.9174311926605505,
"grad_norm": 1.7745814323425293,
"learning_rate": 0.00018412805495683575,
"loss": 0.3955,
"step": 1050
},
{
"epoch": 0.9217999126256007,
"grad_norm": 3.2864468097686768,
"learning_rate": 0.0001839791510934577,
"loss": 0.333,
"step": 1055
},
{
"epoch": 0.926168632590651,
"grad_norm": 2.0274927616119385,
"learning_rate": 0.0001838296128077046,
"loss": 0.4004,
"step": 1060
},
{
"epoch": 0.9305373525557011,
"grad_norm": 1.9851633310317993,
"learning_rate": 0.0001836794412292668,
"loss": 0.3132,
"step": 1065
},
{
"epoch": 0.9349060725207514,
"grad_norm": 1.3309999704360962,
"learning_rate": 0.00018352863749261883,
"loss": 0.2645,
"step": 1070
},
{
"epoch": 0.9392747924858017,
"grad_norm": 2.0173072814941406,
"learning_rate": 0.00018337720273701088,
"loss": 0.4376,
"step": 1075
},
{
"epoch": 0.9436435124508519,
"grad_norm": 1.815408706665039,
"learning_rate": 0.00018322513810646024,
"loss": 0.2851,
"step": 1080
},
{
"epoch": 0.9480122324159022,
"grad_norm": 1.1190584897994995,
"learning_rate": 0.00018307244474974254,
"loss": 0.4664,
"step": 1085
},
{
"epoch": 0.9523809523809523,
"grad_norm": 0.9746566414833069,
"learning_rate": 0.00018291912382038317,
"loss": 0.3816,
"step": 1090
},
{
"epoch": 0.9567496723460026,
"grad_norm": 1.9062715768814087,
"learning_rate": 0.0001827651764766485,
"loss": 0.3031,
"step": 1095
},
{
"epoch": 0.9611183923110529,
"grad_norm": 1.027502417564392,
"learning_rate": 0.00018261060388153718,
"loss": 0.2657,
"step": 1100
},
{
"epoch": 0.9654871122761031,
"grad_norm": 2.239164352416992,
"learning_rate": 0.00018245540720277135,
"loss": 0.3367,
"step": 1105
},
{
"epoch": 0.9698558322411533,
"grad_norm": 1.5922635793685913,
"learning_rate": 0.0001822995876127878,
"loss": 0.3044,
"step": 1110
},
{
"epoch": 0.9742245522062036,
"grad_norm": 1.9189236164093018,
"learning_rate": 0.00018214314628872905,
"loss": 0.3326,
"step": 1115
},
{
"epoch": 0.9785932721712538,
"grad_norm": 1.1626375913619995,
"learning_rate": 0.00018198608441243467,
"loss": 0.2761,
"step": 1120
},
{
"epoch": 0.9829619921363041,
"grad_norm": 1.805367112159729,
"learning_rate": 0.00018182840317043202,
"loss": 0.3337,
"step": 1125
},
{
"epoch": 0.9873307121013543,
"grad_norm": 1.5879418849945068,
"learning_rate": 0.0001816701037539277,
"loss": 0.3242,
"step": 1130
},
{
"epoch": 0.9916994320664045,
"grad_norm": 1.3560898303985596,
"learning_rate": 0.00018151118735879805,
"loss": 0.2794,
"step": 1135
},
{
"epoch": 0.9960681520314548,
"grad_norm": 1.0656763315200806,
"learning_rate": 0.0001813516551855806,
"loss": 0.3336,
"step": 1140
},
{
"epoch": 1.0,
"grad_norm": 3.2105913162231445,
"learning_rate": 0.00018119150843946472,
"loss": 0.3753,
"step": 1145
},
{
"epoch": 1.0043687199650502,
"grad_norm": 1.2890548706054688,
"learning_rate": 0.00018103074833028258,
"loss": 0.2943,
"step": 1150
},
{
"epoch": 1.0087374399301006,
"grad_norm": 1.8480075597763062,
"learning_rate": 0.00018086937607250002,
"loss": 0.3057,
"step": 1155
},
{
"epoch": 1.0131061598951507,
"grad_norm": 1.3526337146759033,
"learning_rate": 0.00018070739288520736,
"loss": 0.2328,
"step": 1160
},
{
"epoch": 1.017474879860201,
"grad_norm": 1.0462696552276611,
"learning_rate": 0.00018054479999211025,
"loss": 0.1896,
"step": 1165
},
{
"epoch": 1.0218435998252513,
"grad_norm": 0.9586630463600159,
"learning_rate": 0.00018038159862152027,
"loss": 0.2567,
"step": 1170
},
{
"epoch": 1.0262123197903015,
"grad_norm": 1.0181586742401123,
"learning_rate": 0.0001802177900063459,
"loss": 0.2653,
"step": 1175
},
{
"epoch": 1.0305810397553516,
"grad_norm": 1.2034084796905518,
"learning_rate": 0.0001800533753840829,
"loss": 0.3953,
"step": 1180
},
{
"epoch": 1.0349497597204018,
"grad_norm": 2.6563191413879395,
"learning_rate": 0.0001798883559968053,
"loss": 0.185,
"step": 1185
},
{
"epoch": 1.0393184796854522,
"grad_norm": 1.2556034326553345,
"learning_rate": 0.00017972273309115568,
"loss": 0.2452,
"step": 1190
},
{
"epoch": 1.0436871996505024,
"grad_norm": 1.4751702547073364,
"learning_rate": 0.00017955650791833604,
"loss": 0.2827,
"step": 1195
},
{
"epoch": 1.0480559196155526,
"grad_norm": 3.8620717525482178,
"learning_rate": 0.00017938968173409811,
"loss": 0.2953,
"step": 1200
},
{
"epoch": 1.052424639580603,
"grad_norm": 1.2123383283615112,
"learning_rate": 0.00017922225579873407,
"loss": 0.2165,
"step": 1205
},
{
"epoch": 1.0567933595456531,
"grad_norm": 1.911566972732544,
"learning_rate": 0.0001790542313770669,
"loss": 0.2444,
"step": 1210
},
{
"epoch": 1.0611620795107033,
"grad_norm": 1.9949162006378174,
"learning_rate": 0.00017888560973844083,
"loss": 0.255,
"step": 1215
},
{
"epoch": 1.0655307994757537,
"grad_norm": 0.9666941165924072,
"learning_rate": 0.0001787163921567118,
"loss": 0.1913,
"step": 1220
},
{
"epoch": 1.0698995194408039,
"grad_norm": 0.7195447087287903,
"learning_rate": 0.0001785465799102378,
"loss": 0.2541,
"step": 1225
},
{
"epoch": 1.074268239405854,
"grad_norm": 1.5414377450942993,
"learning_rate": 0.0001783761742818693,
"loss": 0.3626,
"step": 1230
},
{
"epoch": 1.0786369593709044,
"grad_norm": 1.2173676490783691,
"learning_rate": 0.0001782051765589394,
"loss": 0.2913,
"step": 1235
},
{
"epoch": 1.0830056793359546,
"grad_norm": 1.6966580152511597,
"learning_rate": 0.00017803358803325416,
"loss": 0.2613,
"step": 1240
},
{
"epoch": 1.0873743993010048,
"grad_norm": 1.8033946752548218,
"learning_rate": 0.00017786141000108302,
"loss": 0.2734,
"step": 1245
},
{
"epoch": 1.091743119266055,
"grad_norm": 1.1834598779678345,
"learning_rate": 0.00017768864376314873,
"loss": 0.2548,
"step": 1250
},
{
"epoch": 1.0961118392311053,
"grad_norm": 1.441835641860962,
"learning_rate": 0.00017751529062461777,
"loss": 0.3404,
"step": 1255
},
{
"epoch": 1.1004805591961555,
"grad_norm": 1.443575382232666,
"learning_rate": 0.0001773413518950902,
"loss": 0.312,
"step": 1260
},
{
"epoch": 1.1048492791612057,
"grad_norm": 1.2344982624053955,
"learning_rate": 0.0001771668288885901,
"loss": 0.2594,
"step": 1265
},
{
"epoch": 1.109217999126256,
"grad_norm": 1.3508884906768799,
"learning_rate": 0.0001769917229235554,
"loss": 0.3467,
"step": 1270
},
{
"epoch": 1.1135867190913062,
"grad_norm": 1.3105831146240234,
"learning_rate": 0.00017681603532282805,
"loss": 0.2393,
"step": 1275
},
{
"epoch": 1.1179554390563564,
"grad_norm": 1.3476370573043823,
"learning_rate": 0.00017663976741364394,
"loss": 0.3318,
"step": 1280
},
{
"epoch": 1.1223241590214068,
"grad_norm": 1.9715685844421387,
"learning_rate": 0.00017646292052762296,
"loss": 0.2808,
"step": 1285
},
{
"epoch": 1.126692878986457,
"grad_norm": 1.2339669466018677,
"learning_rate": 0.00017628549600075884,
"loss": 0.2753,
"step": 1290
},
{
"epoch": 1.1310615989515072,
"grad_norm": 1.3921184539794922,
"learning_rate": 0.00017610749517340914,
"loss": 0.2096,
"step": 1295
},
{
"epoch": 1.1354303189165575,
"grad_norm": 1.3537594079971313,
"learning_rate": 0.0001759289193902851,
"loss": 0.2232,
"step": 1300
},
{
"epoch": 1.1397990388816077,
"grad_norm": 2.207932472229004,
"learning_rate": 0.00017574977000044147,
"loss": 0.4179,
"step": 1305
},
{
"epoch": 1.144167758846658,
"grad_norm": 2.4464988708496094,
"learning_rate": 0.0001755700483572663,
"loss": 0.3863,
"step": 1310
},
{
"epoch": 1.148536478811708,
"grad_norm": 1.2169779539108276,
"learning_rate": 0.00017538975581847077,
"loss": 0.2131,
"step": 1315
},
{
"epoch": 1.1529051987767585,
"grad_norm": 1.2162644863128662,
"learning_rate": 0.00017520889374607893,
"loss": 0.2299,
"step": 1320
},
{
"epoch": 1.1572739187418086,
"grad_norm": 1.7051069736480713,
"learning_rate": 0.0001750274635064173,
"loss": 0.2994,
"step": 1325
},
{
"epoch": 1.1616426387068588,
"grad_norm": 1.814450740814209,
"learning_rate": 0.00017484546647010473,
"loss": 0.2948,
"step": 1330
},
{
"epoch": 1.1660113586719092,
"grad_norm": 1.3929287195205688,
"learning_rate": 0.00017466290401204186,
"loss": 0.3837,
"step": 1335
},
{
"epoch": 1.1703800786369594,
"grad_norm": 1.1973505020141602,
"learning_rate": 0.00017447977751140086,
"loss": 0.2335,
"step": 1340
},
{
"epoch": 1.1747487986020095,
"grad_norm": 1.4987421035766602,
"learning_rate": 0.00017429608835161506,
"loss": 0.2484,
"step": 1345
},
{
"epoch": 1.17911751856706,
"grad_norm": 1.3741153478622437,
"learning_rate": 0.00017411183792036822,
"loss": 0.2475,
"step": 1350
},
{
"epoch": 1.18348623853211,
"grad_norm": 1.8292722702026367,
"learning_rate": 0.0001739270276095844,
"loss": 0.2898,
"step": 1355
},
{
"epoch": 1.1878549584971603,
"grad_norm": 1.854209065437317,
"learning_rate": 0.00017374165881541717,
"loss": 0.3992,
"step": 1360
},
{
"epoch": 1.1922236784622107,
"grad_norm": 1.6196980476379395,
"learning_rate": 0.0001735557329382393,
"loss": 0.3053,
"step": 1365
},
{
"epoch": 1.1965923984272608,
"grad_norm": 1.6935441493988037,
"learning_rate": 0.00017336925138263195,
"loss": 0.239,
"step": 1370
},
{
"epoch": 1.200961118392311,
"grad_norm": 1.7809889316558838,
"learning_rate": 0.00017318221555737422,
"loss": 0.2152,
"step": 1375
},
{
"epoch": 1.2053298383573612,
"grad_norm": 2.3215832710266113,
"learning_rate": 0.0001729946268754324,
"loss": 0.3185,
"step": 1380
},
{
"epoch": 1.2096985583224116,
"grad_norm": 1.347947120666504,
"learning_rate": 0.00017280648675394947,
"loss": 0.3085,
"step": 1385
},
{
"epoch": 1.2140672782874617,
"grad_norm": 1.3840276002883911,
"learning_rate": 0.00017261779661423407,
"loss": 0.2016,
"step": 1390
},
{
"epoch": 1.218435998252512,
"grad_norm": 1.2935765981674194,
"learning_rate": 0.00017242855788175015,
"loss": 0.2063,
"step": 1395
},
{
"epoch": 1.2228047182175623,
"grad_norm": 1.2883801460266113,
"learning_rate": 0.00017223877198610591,
"loss": 0.2181,
"step": 1400
},
{
"epoch": 1.2271734381826125,
"grad_norm": 1.3021811246871948,
"learning_rate": 0.00017204844036104318,
"loss": 0.2283,
"step": 1405
},
{
"epoch": 1.2315421581476627,
"grad_norm": 1.1819430589675903,
"learning_rate": 0.00017185756444442648,
"loss": 0.2652,
"step": 1410
},
{
"epoch": 1.235910878112713,
"grad_norm": 2.0612573623657227,
"learning_rate": 0.00017166614567823212,
"loss": 0.2977,
"step": 1415
},
{
"epoch": 1.2402795980777632,
"grad_norm": 3.0888679027557373,
"learning_rate": 0.00017147418550853756,
"loss": 0.2682,
"step": 1420
},
{
"epoch": 1.2446483180428134,
"grad_norm": 2.311062812805176,
"learning_rate": 0.0001712816853855101,
"loss": 0.3329,
"step": 1425
},
{
"epoch": 1.2490170380078638,
"grad_norm": 1.2064367532730103,
"learning_rate": 0.00017108864676339627,
"loss": 0.2065,
"step": 1430
},
{
"epoch": 1.253385757972914,
"grad_norm": 1.4042255878448486,
"learning_rate": 0.00017089507110051066,
"loss": 0.1738,
"step": 1435
},
{
"epoch": 1.2577544779379641,
"grad_norm": 2.3508129119873047,
"learning_rate": 0.00017070095985922493,
"loss": 0.403,
"step": 1440
},
{
"epoch": 1.2621231979030143,
"grad_norm": 1.2386358976364136,
"learning_rate": 0.0001705063145059568,
"loss": 0.272,
"step": 1445
},
{
"epoch": 1.2664919178680647,
"grad_norm": 0.806268036365509,
"learning_rate": 0.00017031113651115893,
"loss": 0.2549,
"step": 1450
},
{
"epoch": 1.2708606378331149,
"grad_norm": 1.6991655826568604,
"learning_rate": 0.00017011542734930786,
"loss": 0.2331,
"step": 1455
},
{
"epoch": 1.2752293577981653,
"grad_norm": 1.1343746185302734,
"learning_rate": 0.00016991918849889283,
"loss": 0.3112,
"step": 1460
},
{
"epoch": 1.2795980777632154,
"grad_norm": 1.3381041288375854,
"learning_rate": 0.00016972242144240463,
"loss": 0.1974,
"step": 1465
},
{
"epoch": 1.2839667977282656,
"grad_norm": 3.6427793502807617,
"learning_rate": 0.00016952512766632439,
"loss": 0.2315,
"step": 1470
},
{
"epoch": 1.2883355176933158,
"grad_norm": 1.8139313459396362,
"learning_rate": 0.0001693273086611123,
"loss": 0.1771,
"step": 1475
},
{
"epoch": 1.2927042376583662,
"grad_norm": 1.205609679222107,
"learning_rate": 0.00016912896592119654,
"loss": 0.2551,
"step": 1480
},
{
"epoch": 1.2970729576234163,
"grad_norm": 1.355162262916565,
"learning_rate": 0.00016893010094496172,
"loss": 0.2452,
"step": 1485
},
{
"epoch": 1.3014416775884665,
"grad_norm": 1.2561094760894775,
"learning_rate": 0.00016873071523473777,
"loss": 0.2163,
"step": 1490
},
{
"epoch": 1.305810397553517,
"grad_norm": 1.3165076971054077,
"learning_rate": 0.00016853081029678853,
"loss": 0.3273,
"step": 1495
},
{
"epoch": 1.310179117518567,
"grad_norm": 1.8802030086517334,
"learning_rate": 0.00016833038764130028,
"loss": 0.3797,
"step": 1500
},
{
"epoch": 1.3145478374836173,
"grad_norm": 1.7062153816223145,
"learning_rate": 0.0001681294487823704,
"loss": 0.2989,
"step": 1505
},
{
"epoch": 1.3189165574486674,
"grad_norm": 2.0729176998138428,
"learning_rate": 0.00016792799523799613,
"loss": 0.2587,
"step": 1510
},
{
"epoch": 1.3232852774137178,
"grad_norm": 1.129841685295105,
"learning_rate": 0.00016772602853006268,
"loss": 0.2201,
"step": 1515
},
{
"epoch": 1.327653997378768,
"grad_norm": 1.2515584230422974,
"learning_rate": 0.00016752355018433206,
"loss": 0.2397,
"step": 1520
},
{
"epoch": 1.3320227173438184,
"grad_norm": 1.2597646713256836,
"learning_rate": 0.0001673205617304315,
"loss": 0.2157,
"step": 1525
},
{
"epoch": 1.3363914373088686,
"grad_norm": 1.8813763856887817,
"learning_rate": 0.0001671170647018418,
"loss": 0.2765,
"step": 1530
},
{
"epoch": 1.3407601572739187,
"grad_norm": 2.208132266998291,
"learning_rate": 0.00016691306063588583,
"loss": 0.2258,
"step": 1535
},
{
"epoch": 1.345128877238969,
"grad_norm": 1.9504673480987549,
"learning_rate": 0.00016670855107371683,
"loss": 0.2779,
"step": 1540
},
{
"epoch": 1.3494975972040193,
"grad_norm": 2.171309471130371,
"learning_rate": 0.00016650353756030692,
"loss": 0.3031,
"step": 1545
},
{
"epoch": 1.3538663171690695,
"grad_norm": 2.3320510387420654,
"learning_rate": 0.00016629802164443519,
"loss": 0.3288,
"step": 1550
},
{
"epoch": 1.3582350371341196,
"grad_norm": 1.4883947372436523,
"learning_rate": 0.0001660920048786763,
"loss": 0.2416,
"step": 1555
},
{
"epoch": 1.36260375709917,
"grad_norm": 1.1198906898498535,
"learning_rate": 0.00016588548881938845,
"loss": 0.2337,
"step": 1560
},
{
"epoch": 1.3669724770642202,
"grad_norm": 1.4867557287216187,
"learning_rate": 0.0001656784750267019,
"loss": 0.3154,
"step": 1565
},
{
"epoch": 1.3713411970292704,
"grad_norm": 2.2435972690582275,
"learning_rate": 0.0001654709650645069,
"loss": 0.245,
"step": 1570
},
{
"epoch": 1.3757099169943205,
"grad_norm": 2.2508065700531006,
"learning_rate": 0.00016526296050044215,
"loss": 0.3097,
"step": 1575
},
{
"epoch": 1.380078636959371,
"grad_norm": 0.8681387901306152,
"learning_rate": 0.00016505446290588277,
"loss": 0.295,
"step": 1580
},
{
"epoch": 1.3844473569244211,
"grad_norm": 1.143965244293213,
"learning_rate": 0.00016484547385592848,
"loss": 0.2534,
"step": 1585
},
{
"epoch": 1.3888160768894715,
"grad_norm": 2.3972761631011963,
"learning_rate": 0.00016463599492939177,
"loss": 0.2527,
"step": 1590
},
{
"epoch": 1.3931847968545217,
"grad_norm": 2.7718844413757324,
"learning_rate": 0.00016442602770878586,
"loss": 0.2697,
"step": 1595
},
{
"epoch": 1.3975535168195719,
"grad_norm": 1.7743192911148071,
"learning_rate": 0.00016421557378031279,
"loss": 0.2784,
"step": 1600
},
{
"epoch": 1.401922236784622,
"grad_norm": 1.879279613494873,
"learning_rate": 0.0001640046347338515,
"loss": 0.2828,
"step": 1605
},
{
"epoch": 1.4062909567496724,
"grad_norm": 1.5005972385406494,
"learning_rate": 0.00016379321216294574,
"loss": 0.2161,
"step": 1610
},
{
"epoch": 1.4106596767147226,
"grad_norm": 1.1289541721343994,
"learning_rate": 0.00016358130766479202,
"loss": 0.1995,
"step": 1615
},
{
"epoch": 1.4150283966797728,
"grad_norm": 0.7074740529060364,
"learning_rate": 0.0001633689228402276,
"loss": 0.2569,
"step": 1620
},
{
"epoch": 1.4193971166448232,
"grad_norm": 1.4253774881362915,
"learning_rate": 0.00016315605929371842,
"loss": 0.3133,
"step": 1625
},
{
"epoch": 1.4237658366098733,
"grad_norm": 1.6986387968063354,
"learning_rate": 0.0001629427186333469,
"loss": 0.2671,
"step": 1630
},
{
"epoch": 1.4281345565749235,
"grad_norm": 1.194734811782837,
"learning_rate": 0.0001627289024707998,
"loss": 0.2592,
"step": 1635
},
{
"epoch": 1.4325032765399737,
"grad_norm": 1.6335885524749756,
"learning_rate": 0.00016251461242135616,
"loss": 0.2628,
"step": 1640
},
{
"epoch": 1.436871996505024,
"grad_norm": 1.417205572128296,
"learning_rate": 0.0001622998501038749,
"loss": 0.2129,
"step": 1645
},
{
"epoch": 1.4412407164700742,
"grad_norm": 1.0214331150054932,
"learning_rate": 0.0001620846171407828,
"loss": 0.2505,
"step": 1650
},
{
"epoch": 1.4456094364351246,
"grad_norm": 0.9859942197799683,
"learning_rate": 0.000161868915158062,
"loss": 0.1947,
"step": 1655
},
{
"epoch": 1.4499781564001748,
"grad_norm": 1.9299548864364624,
"learning_rate": 0.00016165274578523807,
"loss": 0.2716,
"step": 1660
},
{
"epoch": 1.454346876365225,
"grad_norm": 1.4493621587753296,
"learning_rate": 0.00016143611065536727,
"loss": 0.2066,
"step": 1665
},
{
"epoch": 1.4587155963302751,
"grad_norm": 1.4922335147857666,
"learning_rate": 0.00016121901140502456,
"loss": 0.291,
"step": 1670
},
{
"epoch": 1.4630843162953255,
"grad_norm": 0.9710771441459656,
"learning_rate": 0.00016100144967429113,
"loss": 0.207,
"step": 1675
},
{
"epoch": 1.4674530362603757,
"grad_norm": 2.2792580127716064,
"learning_rate": 0.0001607834271067419,
"loss": 0.1705,
"step": 1680
},
{
"epoch": 1.4718217562254259,
"grad_norm": 2.3811166286468506,
"learning_rate": 0.00016056494534943323,
"loss": 0.2994,
"step": 1685
},
{
"epoch": 1.4761904761904763,
"grad_norm": 1.6609042882919312,
"learning_rate": 0.00016034600605289046,
"loss": 0.2,
"step": 1690
},
{
"epoch": 1.4805591961555264,
"grad_norm": 2.835557222366333,
"learning_rate": 0.0001601266108710954,
"loss": 0.3283,
"step": 1695
},
{
"epoch": 1.4849279161205766,
"grad_norm": 1.5667732954025269,
"learning_rate": 0.00015990676146147384,
"loss": 0.2685,
"step": 1700
},
{
"epoch": 1.4892966360856268,
"grad_norm": 1.7098650932312012,
"learning_rate": 0.0001596864594848831,
"loss": 0.2466,
"step": 1705
},
{
"epoch": 1.4936653560506772,
"grad_norm": 1.5239036083221436,
"learning_rate": 0.00015946570660559933,
"loss": 0.2577,
"step": 1710
},
{
"epoch": 1.4980340760157274,
"grad_norm": 1.0807280540466309,
"learning_rate": 0.00015924450449130513,
"loss": 0.2017,
"step": 1715
},
{
"epoch": 1.5024027959807777,
"grad_norm": 1.5457370281219482,
"learning_rate": 0.0001590228548130768,
"loss": 0.2695,
"step": 1720
},
{
"epoch": 1.506771515945828,
"grad_norm": 1.1175668239593506,
"learning_rate": 0.00015880075924537185,
"loss": 0.2727,
"step": 1725
},
{
"epoch": 1.511140235910878,
"grad_norm": 0.9887433052062988,
"learning_rate": 0.00015857821946601615,
"loss": 0.2561,
"step": 1730
},
{
"epoch": 1.5155089558759283,
"grad_norm": 1.205710768699646,
"learning_rate": 0.00015835523715619144,
"loss": 0.2441,
"step": 1735
},
{
"epoch": 1.5198776758409784,
"grad_norm": 1.3492207527160645,
"learning_rate": 0.00015813181400042262,
"loss": 0.2832,
"step": 1740
},
{
"epoch": 1.5242463958060288,
"grad_norm": 1.2649726867675781,
"learning_rate": 0.00015790795168656486,
"loss": 0.2393,
"step": 1745
},
{
"epoch": 1.5286151157710792,
"grad_norm": 2.4311208724975586,
"learning_rate": 0.00015768365190579103,
"loss": 0.2831,
"step": 1750
},
{
"epoch": 1.5329838357361294,
"grad_norm": 1.1982014179229736,
"learning_rate": 0.00015745891635257885,
"loss": 0.2893,
"step": 1755
},
{
"epoch": 1.5373525557011796,
"grad_norm": 1.842958688735962,
"learning_rate": 0.0001572337467246981,
"loss": 0.2158,
"step": 1760
},
{
"epoch": 1.5417212756662297,
"grad_norm": 1.9764633178710938,
"learning_rate": 0.00015700814472319774,
"loss": 0.3043,
"step": 1765
},
{
"epoch": 1.54608999563128,
"grad_norm": 1.0637012720108032,
"learning_rate": 0.00015678211205239314,
"loss": 0.258,
"step": 1770
},
{
"epoch": 1.5504587155963303,
"grad_norm": 1.0332783460617065,
"learning_rate": 0.00015655565041985318,
"loss": 0.2373,
"step": 1775
},
{
"epoch": 1.5548274355613805,
"grad_norm": 1.965062141418457,
"learning_rate": 0.00015632876153638732,
"loss": 0.1725,
"step": 1780
},
{
"epoch": 1.5591961555264309,
"grad_norm": 0.6934140920639038,
"learning_rate": 0.00015610144711603272,
"loss": 0.2201,
"step": 1785
},
{
"epoch": 1.563564875491481,
"grad_norm": 1.2010079622268677,
"learning_rate": 0.00015587370887604123,
"loss": 0.1796,
"step": 1790
},
{
"epoch": 1.5679335954565312,
"grad_norm": 1.765724539756775,
"learning_rate": 0.00015564554853686645,
"loss": 0.2457,
"step": 1795
},
{
"epoch": 1.5723023154215814,
"grad_norm": 1.848937749862671,
"learning_rate": 0.00015541696782215084,
"loss": 0.2096,
"step": 1800
},
{
"epoch": 1.5766710353866316,
"grad_norm": 1.9904989004135132,
"learning_rate": 0.00015518796845871247,
"loss": 0.2651,
"step": 1805
},
{
"epoch": 1.581039755351682,
"grad_norm": 0.8786830902099609,
"learning_rate": 0.00015495855217653216,
"loss": 0.2256,
"step": 1810
},
{
"epoch": 1.5854084753167323,
"grad_norm": 2.0665247440338135,
"learning_rate": 0.00015472872070874033,
"loss": 0.3413,
"step": 1815
},
{
"epoch": 1.5897771952817825,
"grad_norm": 2.08548903465271,
"learning_rate": 0.000154498475791604,
"loss": 0.2753,
"step": 1820
},
{
"epoch": 1.5941459152468327,
"grad_norm": 1.4281671047210693,
"learning_rate": 0.00015426781916451346,
"loss": 0.2527,
"step": 1825
},
{
"epoch": 1.5985146352118829,
"grad_norm": 0.9327197074890137,
"learning_rate": 0.00015403675256996942,
"loss": 0.1906,
"step": 1830
},
{
"epoch": 1.602883355176933,
"grad_norm": 2.447357177734375,
"learning_rate": 0.00015380527775356962,
"loss": 0.2377,
"step": 1835
},
{
"epoch": 1.6072520751419834,
"grad_norm": 1.3971562385559082,
"learning_rate": 0.00015357339646399578,
"loss": 0.2307,
"step": 1840
},
{
"epoch": 1.6116207951070336,
"grad_norm": 1.4695461988449097,
"learning_rate": 0.00015334111045300022,
"loss": 0.2273,
"step": 1845
},
{
"epoch": 1.615989515072084,
"grad_norm": 2.3516275882720947,
"learning_rate": 0.0001531084214753928,
"loss": 0.3006,
"step": 1850
},
{
"epoch": 1.6203582350371342,
"grad_norm": 1.7472774982452393,
"learning_rate": 0.00015287533128902764,
"loss": 0.2943,
"step": 1855
},
{
"epoch": 1.6247269550021843,
"grad_norm": 2.3994486331939697,
"learning_rate": 0.00015264184165478977,
"loss": 0.2277,
"step": 1860
},
{
"epoch": 1.6290956749672345,
"grad_norm": 2.234163522720337,
"learning_rate": 0.00015240795433658187,
"loss": 0.2319,
"step": 1865
},
{
"epoch": 1.6334643949322847,
"grad_norm": 1.5536202192306519,
"learning_rate": 0.00015217367110131086,
"loss": 0.1685,
"step": 1870
},
{
"epoch": 1.637833114897335,
"grad_norm": 1.564252257347107,
"learning_rate": 0.0001519389937188747,
"loss": 0.2817,
"step": 1875
},
{
"epoch": 1.6422018348623855,
"grad_norm": 1.6615930795669556,
"learning_rate": 0.00015170392396214897,
"loss": 0.2409,
"step": 1880
},
{
"epoch": 1.6465705548274356,
"grad_norm": 2.7048113346099854,
"learning_rate": 0.00015146846360697332,
"loss": 0.3594,
"step": 1885
},
{
"epoch": 1.6509392747924858,
"grad_norm": 1.7237365245819092,
"learning_rate": 0.00015123261443213837,
"loss": 0.2692,
"step": 1890
},
{
"epoch": 1.655307994757536,
"grad_norm": 0.9304201602935791,
"learning_rate": 0.00015099637821937192,
"loss": 0.2457,
"step": 1895
},
{
"epoch": 1.6596767147225862,
"grad_norm": 1.8771346807479858,
"learning_rate": 0.00015075975675332573,
"loss": 0.2152,
"step": 1900
},
{
"epoch": 1.6640454346876365,
"grad_norm": 1.0985363721847534,
"learning_rate": 0.00015052275182156198,
"loss": 0.2191,
"step": 1905
},
{
"epoch": 1.6684141546526867,
"grad_norm": 1.3073694705963135,
"learning_rate": 0.00015028536521453968,
"loss": 0.3686,
"step": 1910
},
{
"epoch": 1.6727828746177371,
"grad_norm": 0.7887938618659973,
"learning_rate": 0.0001500475987256013,
"loss": 0.2015,
"step": 1915
},
{
"epoch": 1.6771515945827873,
"grad_norm": 1.5100387334823608,
"learning_rate": 0.0001498094541509591,
"loss": 0.2545,
"step": 1920
},
{
"epoch": 1.6815203145478375,
"grad_norm": 1.5141887664794922,
"learning_rate": 0.00014957093328968156,
"loss": 0.2385,
"step": 1925
},
{
"epoch": 1.6858890345128876,
"grad_norm": 1.1331568956375122,
"learning_rate": 0.00014933203794367992,
"loss": 0.2578,
"step": 1930
},
{
"epoch": 1.6902577544779378,
"grad_norm": 1.6967277526855469,
"learning_rate": 0.00014909276991769435,
"loss": 0.3004,
"step": 1935
},
{
"epoch": 1.6946264744429882,
"grad_norm": 1.1680165529251099,
"learning_rate": 0.00014885313101928055,
"loss": 0.2306,
"step": 1940
},
{
"epoch": 1.6989951944080386,
"grad_norm": 0.8564541339874268,
"learning_rate": 0.00014861312305879592,
"loss": 0.2384,
"step": 1945
},
{
"epoch": 1.7033639143730888,
"grad_norm": 0.8954014778137207,
"learning_rate": 0.00014837274784938596,
"loss": 0.1804,
"step": 1950
},
{
"epoch": 1.707732634338139,
"grad_norm": 1.9697721004486084,
"learning_rate": 0.00014813200720697055,
"loss": 0.2337,
"step": 1955
},
{
"epoch": 1.712101354303189,
"grad_norm": 1.3099297285079956,
"learning_rate": 0.00014789090295023031,
"loss": 0.2387,
"step": 1960
},
{
"epoch": 1.7164700742682393,
"grad_norm": 1.8576496839523315,
"learning_rate": 0.00014764943690059269,
"loss": 0.2739,
"step": 1965
},
{
"epoch": 1.7208387942332897,
"grad_norm": 1.8436728715896606,
"learning_rate": 0.0001474076108822184,
"loss": 0.1997,
"step": 1970
},
{
"epoch": 1.7252075141983398,
"grad_norm": 1.0517886877059937,
"learning_rate": 0.0001471654267219875,
"loss": 0.2432,
"step": 1975
},
{
"epoch": 1.7295762341633902,
"grad_norm": 0.9853880405426025,
"learning_rate": 0.00014692288624948557,
"loss": 0.3059,
"step": 1980
},
{
"epoch": 1.7339449541284404,
"grad_norm": 2.1775450706481934,
"learning_rate": 0.00014667999129699011,
"loss": 0.3374,
"step": 1985
},
{
"epoch": 1.7383136740934906,
"grad_norm": 0.7085615396499634,
"learning_rate": 0.0001464367436994565,
"loss": 0.1907,
"step": 1990
},
{
"epoch": 1.7426823940585408,
"grad_norm": 1.3183834552764893,
"learning_rate": 0.00014619314529450405,
"loss": 0.2587,
"step": 1995
},
{
"epoch": 1.747051114023591,
"grad_norm": 2.9087324142456055,
"learning_rate": 0.00014594919792240246,
"loss": 0.301,
"step": 2000
},
{
"epoch": 1.7514198339886413,
"grad_norm": 1.5489122867584229,
"learning_rate": 0.00014570490342605751,
"loss": 0.2061,
"step": 2005
},
{
"epoch": 1.7557885539536917,
"grad_norm": 1.4719579219818115,
"learning_rate": 0.00014546026365099753,
"loss": 0.2334,
"step": 2010
},
{
"epoch": 1.7601572739187419,
"grad_norm": 1.499053955078125,
"learning_rate": 0.0001452152804453591,
"loss": 0.2817,
"step": 2015
},
{
"epoch": 1.764525993883792,
"grad_norm": 1.0219850540161133,
"learning_rate": 0.00014496995565987337,
"loss": 0.2292,
"step": 2020
},
{
"epoch": 1.7688947138488422,
"grad_norm": 1.4518020153045654,
"learning_rate": 0.00014472429114785194,
"loss": 0.2321,
"step": 2025
},
{
"epoch": 1.7732634338138924,
"grad_norm": 1.0398428440093994,
"learning_rate": 0.00014447828876517277,
"loss": 0.2649,
"step": 2030
},
{
"epoch": 1.7776321537789428,
"grad_norm": 0.5719377994537354,
"learning_rate": 0.00014423195037026646,
"loss": 0.2239,
"step": 2035
},
{
"epoch": 1.782000873743993,
"grad_norm": 1.4201310873031616,
"learning_rate": 0.00014398527782410187,
"loss": 0.1812,
"step": 2040
},
{
"epoch": 1.7863695937090434,
"grad_norm": 1.380936622619629,
"learning_rate": 0.00014373827299017227,
"loss": 0.2379,
"step": 2045
},
{
"epoch": 1.7907383136740935,
"grad_norm": 0.9390376210212708,
"learning_rate": 0.0001434909377344812,
"loss": 0.1879,
"step": 2050
},
{
"epoch": 1.7951070336391437,
"grad_norm": 1.697022557258606,
"learning_rate": 0.0001432432739255284,
"loss": 0.2644,
"step": 2055
},
{
"epoch": 1.7994757536041939,
"grad_norm": 1.7329260110855103,
"learning_rate": 0.00014299528343429566,
"loss": 0.4845,
"step": 2060
},
{
"epoch": 1.8038444735692443,
"grad_norm": 1.098286747932434,
"learning_rate": 0.00014274696813423269,
"loss": 0.2452,
"step": 2065
},
{
"epoch": 1.8082131935342944,
"grad_norm": 0.6370453238487244,
"learning_rate": 0.00014249832990124292,
"loss": 0.2423,
"step": 2070
},
{
"epoch": 1.8125819134993448,
"grad_norm": 1.1073155403137207,
"learning_rate": 0.0001422493706136695,
"loss": 0.269,
"step": 2075
},
{
"epoch": 1.816950633464395,
"grad_norm": 1.5754518508911133,
"learning_rate": 0.0001420000921522809,
"loss": 0.2506,
"step": 2080
},
{
"epoch": 1.8213193534294452,
"grad_norm": 2.118640422821045,
"learning_rate": 0.0001417504964002569,
"loss": 0.2072,
"step": 2085
},
{
"epoch": 1.8256880733944953,
"grad_norm": 1.0548226833343506,
"learning_rate": 0.0001415005852431741,
"loss": 0.2509,
"step": 2090
},
{
"epoch": 1.8300567933595455,
"grad_norm": 2.12402606010437,
"learning_rate": 0.00014125036056899197,
"loss": 0.2599,
"step": 2095
},
{
"epoch": 1.834425513324596,
"grad_norm": 1.251313328742981,
"learning_rate": 0.00014099982426803842,
"loss": 0.2302,
"step": 2100
},
{
"epoch": 1.838794233289646,
"grad_norm": 1.9044132232666016,
"learning_rate": 0.0001407489782329955,
"loss": 0.2487,
"step": 2105
},
{
"epoch": 1.8431629532546965,
"grad_norm": 0.7197313904762268,
"learning_rate": 0.00014049782435888525,
"loss": 0.1864,
"step": 2110
},
{
"epoch": 1.8475316732197467,
"grad_norm": 1.1280083656311035,
"learning_rate": 0.00014024636454305515,
"loss": 0.2028,
"step": 2115
},
{
"epoch": 1.8519003931847968,
"grad_norm": 2.0197250843048096,
"learning_rate": 0.00013999460068516407,
"loss": 0.3204,
"step": 2120
},
{
"epoch": 1.856269113149847,
"grad_norm": 1.2510887384414673,
"learning_rate": 0.0001397425346871677,
"loss": 0.213,
"step": 2125
},
{
"epoch": 1.8606378331148974,
"grad_norm": 1.8764947652816772,
"learning_rate": 0.0001394901684533042,
"loss": 0.2815,
"step": 2130
},
{
"epoch": 1.8650065530799476,
"grad_norm": 2.152601480484009,
"learning_rate": 0.00013923750389007998,
"loss": 0.2884,
"step": 2135
},
{
"epoch": 1.869375273044998,
"grad_norm": 1.9770750999450684,
"learning_rate": 0.00013898454290625515,
"loss": 0.3571,
"step": 2140
},
{
"epoch": 1.8737439930100481,
"grad_norm": 1.4003419876098633,
"learning_rate": 0.00013873128741282906,
"loss": 0.247,
"step": 2145
},
{
"epoch": 1.8781127129750983,
"grad_norm": 1.565500020980835,
"learning_rate": 0.00013847773932302603,
"loss": 0.2227,
"step": 2150
},
{
"epoch": 1.8824814329401485,
"grad_norm": 0.588405966758728,
"learning_rate": 0.00013822390055228079,
"loss": 0.2331,
"step": 2155
},
{
"epoch": 1.8868501529051986,
"grad_norm": 0.9293955564498901,
"learning_rate": 0.00013796977301822397,
"loss": 0.258,
"step": 2160
},
{
"epoch": 1.891218872870249,
"grad_norm": 3.2338411808013916,
"learning_rate": 0.00013771535864066773,
"loss": 0.4014,
"step": 2165
},
{
"epoch": 1.8955875928352992,
"grad_norm": 1.0802291631698608,
"learning_rate": 0.00013746065934159123,
"loss": 0.1996,
"step": 2170
},
{
"epoch": 1.8999563128003496,
"grad_norm": 2.0415797233581543,
"learning_rate": 0.00013720567704512593,
"loss": 0.2392,
"step": 2175
},
{
"epoch": 1.9043250327653998,
"grad_norm": 1.554219365119934,
"learning_rate": 0.00013695041367754133,
"loss": 0.2165,
"step": 2180
},
{
"epoch": 1.90869375273045,
"grad_norm": 8.951557159423828,
"learning_rate": 0.00013669487116723024,
"loss": 0.2473,
"step": 2185
},
{
"epoch": 1.9130624726955001,
"grad_norm": 2.31048321723938,
"learning_rate": 0.0001364390514446943,
"loss": 0.2333,
"step": 2190
},
{
"epoch": 1.9174311926605505,
"grad_norm": 1.3068311214447021,
"learning_rate": 0.0001361829564425293,
"loss": 0.2034,
"step": 2195
},
{
"epoch": 1.9217999126256007,
"grad_norm": 0.9649588465690613,
"learning_rate": 0.00013592658809541064,
"loss": 0.2276,
"step": 2200
},
{
"epoch": 1.926168632590651,
"grad_norm": 2.0399038791656494,
"learning_rate": 0.00013566994834007877,
"loss": 0.1955,
"step": 2205
},
{
"epoch": 1.9305373525557012,
"grad_norm": 1.4719631671905518,
"learning_rate": 0.00013541303911532445,
"loss": 0.1883,
"step": 2210
},
{
"epoch": 1.9349060725207514,
"grad_norm": 1.938029170036316,
"learning_rate": 0.00013515586236197418,
"loss": 0.2386,
"step": 2215
},
{
"epoch": 1.9392747924858016,
"grad_norm": 1.9169769287109375,
"learning_rate": 0.00013489842002287542,
"loss": 0.2435,
"step": 2220
},
{
"epoch": 1.9436435124508518,
"grad_norm": 1.1561371088027954,
"learning_rate": 0.0001346407140428822,
"loss": 0.2305,
"step": 2225
},
{
"epoch": 1.9480122324159022,
"grad_norm": 1.9873589277267456,
"learning_rate": 0.00013438274636884,
"loss": 0.2706,
"step": 2230
},
{
"epoch": 1.9523809523809523,
"grad_norm": 1.9053728580474854,
"learning_rate": 0.00013412451894957144,
"loss": 0.187,
"step": 2235
},
{
"epoch": 1.9567496723460027,
"grad_norm": 1.3117566108703613,
"learning_rate": 0.00013386603373586134,
"loss": 0.2317,
"step": 2240
},
{
"epoch": 1.961118392311053,
"grad_norm": 0.7097095251083374,
"learning_rate": 0.000133607292680442,
"loss": 0.2926,
"step": 2245
},
{
"epoch": 1.965487112276103,
"grad_norm": 1.8591192960739136,
"learning_rate": 0.0001333482977379785,
"loss": 0.2025,
"step": 2250
},
{
"epoch": 1.9698558322411532,
"grad_norm": 2.633700370788574,
"learning_rate": 0.00013308905086505395,
"loss": 0.2513,
"step": 2255
},
{
"epoch": 1.9742245522062036,
"grad_norm": 1.4099256992340088,
"learning_rate": 0.0001328295540201546,
"loss": 0.2447,
"step": 2260
},
{
"epoch": 1.9785932721712538,
"grad_norm": 1.20600163936615,
"learning_rate": 0.00013256980916365527,
"loss": 0.2449,
"step": 2265
},
{
"epoch": 1.9829619921363042,
"grad_norm": 1.1572972536087036,
"learning_rate": 0.0001323098182578042,
"loss": 0.1937,
"step": 2270
},
{
"epoch": 1.9873307121013544,
"grad_norm": 1.0929369926452637,
"learning_rate": 0.00013204958326670853,
"loss": 0.2273,
"step": 2275
},
{
"epoch": 1.9916994320664045,
"grad_norm": 1.674107313156128,
"learning_rate": 0.00013178910615631933,
"loss": 0.3191,
"step": 2280
},
{
"epoch": 1.9960681520314547,
"grad_norm": 1.306754469871521,
"learning_rate": 0.00013152838889441673,
"loss": 0.2723,
"step": 2285
},
{
"epoch": 2.0,
"grad_norm": 1.877669334411621,
"learning_rate": 0.00013126743345059512,
"loss": 0.2246,
"step": 2290
},
{
"epoch": 2.00436871996505,
"grad_norm": 0.8999947905540466,
"learning_rate": 0.00013100624179624828,
"loss": 0.1528,
"step": 2295
},
{
"epoch": 2.0087374399301003,
"grad_norm": 1.1233611106872559,
"learning_rate": 0.00013074481590455433,
"loss": 0.215,
"step": 2300
},
{
"epoch": 2.0131061598951505,
"grad_norm": 1.4673850536346436,
"learning_rate": 0.00013048315775046108,
"loss": 0.1379,
"step": 2305
},
{
"epoch": 2.017474879860201,
"grad_norm": 1.4777660369873047,
"learning_rate": 0.0001302212693106709,
"loss": 0.1343,
"step": 2310
},
{
"epoch": 2.0218435998252513,
"grad_norm": 1.7447019815444946,
"learning_rate": 0.00012995915256362584,
"loss": 0.1591,
"step": 2315
},
{
"epoch": 2.0262123197903015,
"grad_norm": 0.8237628936767578,
"learning_rate": 0.00012969680948949272,
"loss": 0.1182,
"step": 2320
},
{
"epoch": 2.0305810397553516,
"grad_norm": 1.5945308208465576,
"learning_rate": 0.00012943424207014818,
"loss": 0.1624,
"step": 2325
},
{
"epoch": 2.034949759720402,
"grad_norm": 1.3121789693832397,
"learning_rate": 0.00012917145228916367,
"loss": 0.1313,
"step": 2330
},
{
"epoch": 2.039318479685452,
"grad_norm": 1.028647780418396,
"learning_rate": 0.00012890844213179044,
"loss": 0.214,
"step": 2335
},
{
"epoch": 2.0436871996505026,
"grad_norm": 1.0864648818969727,
"learning_rate": 0.00012864521358494464,
"loss": 0.1929,
"step": 2340
},
{
"epoch": 2.0480559196155528,
"grad_norm": 1.2443722486495972,
"learning_rate": 0.00012838176863719217,
"loss": 0.1479,
"step": 2345
},
{
"epoch": 2.052424639580603,
"grad_norm": 3.708357095718384,
"learning_rate": 0.00012811810927873386,
"loss": 0.1897,
"step": 2350
},
{
"epoch": 2.056793359545653,
"grad_norm": 1.483973741531372,
"learning_rate": 0.00012785423750139008,
"loss": 0.1188,
"step": 2355
},
{
"epoch": 2.0611620795107033,
"grad_norm": 1.0019135475158691,
"learning_rate": 0.00012759015529858624,
"loss": 0.1683,
"step": 2360
},
{
"epoch": 2.0655307994757535,
"grad_norm": 1.5917549133300781,
"learning_rate": 0.00012732586466533715,
"loss": 0.2293,
"step": 2365
},
{
"epoch": 2.0698995194408036,
"grad_norm": 1.4390945434570312,
"learning_rate": 0.00012706136759823233,
"loss": 0.1873,
"step": 2370
},
{
"epoch": 2.0742682394058543,
"grad_norm": 2.951580762863159,
"learning_rate": 0.00012679666609542083,
"loss": 0.1984,
"step": 2375
},
{
"epoch": 2.0786369593709044,
"grad_norm": 2.2444632053375244,
"learning_rate": 0.00012653176215659596,
"loss": 0.1709,
"step": 2380
},
{
"epoch": 2.0830056793359546,
"grad_norm": 1.191437840461731,
"learning_rate": 0.0001262666577829806,
"loss": 0.1926,
"step": 2385
},
{
"epoch": 2.0873743993010048,
"grad_norm": 2.2964091300964355,
"learning_rate": 0.00012600135497731156,
"loss": 0.1583,
"step": 2390
},
{
"epoch": 2.091743119266055,
"grad_norm": 0.9333243370056152,
"learning_rate": 0.00012573585574382487,
"loss": 0.1261,
"step": 2395
},
{
"epoch": 2.096111839231105,
"grad_norm": 0.8678691387176514,
"learning_rate": 0.00012547016208824038,
"loss": 0.1491,
"step": 2400
},
{
"epoch": 2.1004805591961557,
"grad_norm": 1.7139450311660767,
"learning_rate": 0.00012520427601774682,
"loss": 0.1599,
"step": 2405
},
{
"epoch": 2.104849279161206,
"grad_norm": 1.161952257156372,
"learning_rate": 0.0001249381995409864,
"loss": 0.1806,
"step": 2410
},
{
"epoch": 2.109217999126256,
"grad_norm": 2.1318793296813965,
"learning_rate": 0.00012467193466803982,
"loss": 0.273,
"step": 2415
},
{
"epoch": 2.1135867190913062,
"grad_norm": 1.7493711709976196,
"learning_rate": 0.00012440548341041108,
"loss": 0.192,
"step": 2420
},
{
"epoch": 2.1179554390563564,
"grad_norm": 1.2488105297088623,
"learning_rate": 0.00012413884778101207,
"loss": 0.1602,
"step": 2425
},
{
"epoch": 2.1223241590214066,
"grad_norm": 1.1086466312408447,
"learning_rate": 0.00012387202979414767,
"loss": 0.1318,
"step": 2430
},
{
"epoch": 2.126692878986457,
"grad_norm": 1.6508293151855469,
"learning_rate": 0.00012360503146550034,
"loss": 0.1595,
"step": 2435
},
{
"epoch": 2.1310615989515074,
"grad_norm": 0.8604519367218018,
"learning_rate": 0.00012333785481211487,
"loss": 0.1409,
"step": 2440
},
{
"epoch": 2.1354303189165575,
"grad_norm": 1.6457815170288086,
"learning_rate": 0.00012307050185238333,
"loss": 0.1807,
"step": 2445
},
{
"epoch": 2.1397990388816077,
"grad_norm": 1.9465317726135254,
"learning_rate": 0.00012280297460602957,
"loss": 0.1804,
"step": 2450
},
{
"epoch": 2.144167758846658,
"grad_norm": 1.763515591621399,
"learning_rate": 0.00012253527509409418,
"loss": 0.152,
"step": 2455
},
{
"epoch": 2.148536478811708,
"grad_norm": 2.8693459033966064,
"learning_rate": 0.00012226740533891913,
"loss": 0.1469,
"step": 2460
},
{
"epoch": 2.1529051987767582,
"grad_norm": 1.7148571014404297,
"learning_rate": 0.00012199936736413246,
"loss": 0.1892,
"step": 2465
},
{
"epoch": 2.157273918741809,
"grad_norm": 1.3995704650878906,
"learning_rate": 0.00012173116319463306,
"loss": 0.2003,
"step": 2470
},
{
"epoch": 2.161642638706859,
"grad_norm": 1.0834511518478394,
"learning_rate": 0.00012146279485657532,
"loss": 0.1784,
"step": 2475
},
{
"epoch": 2.166011358671909,
"grad_norm": 1.1982789039611816,
"learning_rate": 0.00012119426437735384,
"loss": 0.1722,
"step": 2480
},
{
"epoch": 2.1703800786369594,
"grad_norm": 2.481656551361084,
"learning_rate": 0.0001209255737855881,
"loss": 0.1938,
"step": 2485
},
{
"epoch": 2.1747487986020095,
"grad_norm": 0.7617926001548767,
"learning_rate": 0.00012065672511110728,
"loss": 0.1533,
"step": 2490
},
{
"epoch": 2.1791175185670597,
"grad_norm": 0.9713643789291382,
"learning_rate": 0.0001203877203849346,
"loss": 0.1241,
"step": 2495
},
{
"epoch": 2.18348623853211,
"grad_norm": 1.348576545715332,
"learning_rate": 0.00012011856163927235,
"loss": 0.1882,
"step": 2500
},
{
"epoch": 2.1878549584971605,
"grad_norm": 3.1400299072265625,
"learning_rate": 0.00011984925090748626,
"loss": 0.2369,
"step": 2505
},
{
"epoch": 2.1922236784622107,
"grad_norm": 1.1611807346343994,
"learning_rate": 0.00011957979022409027,
"loss": 0.1614,
"step": 2510
},
{
"epoch": 2.196592398427261,
"grad_norm": 1.4003978967666626,
"learning_rate": 0.00011931018162473117,
"loss": 0.1431,
"step": 2515
},
{
"epoch": 2.200961118392311,
"grad_norm": 1.7134933471679688,
"learning_rate": 0.00011904042714617311,
"loss": 0.1917,
"step": 2520
},
{
"epoch": 2.205329838357361,
"grad_norm": 1.4700738191604614,
"learning_rate": 0.00011877052882628237,
"loss": 0.1506,
"step": 2525
},
{
"epoch": 2.2096985583224114,
"grad_norm": 0.9795822501182556,
"learning_rate": 0.00011850048870401185,
"loss": 0.1663,
"step": 2530
},
{
"epoch": 2.214067278287462,
"grad_norm": 1.8647089004516602,
"learning_rate": 0.00011823030881938564,
"loss": 0.1984,
"step": 2535
},
{
"epoch": 2.218435998252512,
"grad_norm": 1.2616095542907715,
"learning_rate": 0.00011795999121348378,
"loss": 0.1158,
"step": 2540
},
{
"epoch": 2.2228047182175623,
"grad_norm": 1.105303168296814,
"learning_rate": 0.00011768953792842663,
"loss": 0.1932,
"step": 2545
},
{
"epoch": 2.2271734381826125,
"grad_norm": 1.0846420526504517,
"learning_rate": 0.00011741895100735958,
"loss": 0.156,
"step": 2550
},
{
"epoch": 2.2315421581476627,
"grad_norm": 2.285141944885254,
"learning_rate": 0.00011714823249443763,
"loss": 0.1639,
"step": 2555
},
{
"epoch": 2.235910878112713,
"grad_norm": 1.022466778755188,
"learning_rate": 0.00011687738443480975,
"loss": 0.1591,
"step": 2560
},
{
"epoch": 2.2402795980777634,
"grad_norm": 2.86263370513916,
"learning_rate": 0.00011660640887460377,
"loss": 0.24,
"step": 2565
},
{
"epoch": 2.2446483180428136,
"grad_norm": 1.2612568140029907,
"learning_rate": 0.00011633530786091051,
"loss": 0.1625,
"step": 2570
},
{
"epoch": 2.249017038007864,
"grad_norm": 1.171590805053711,
"learning_rate": 0.00011606408344176873,
"loss": 0.1702,
"step": 2575
},
{
"epoch": 2.253385757972914,
"grad_norm": 1.118406891822815,
"learning_rate": 0.0001157927376661493,
"loss": 0.1518,
"step": 2580
},
{
"epoch": 2.257754477937964,
"grad_norm": 0.8476281762123108,
"learning_rate": 0.00011552127258394003,
"loss": 0.2093,
"step": 2585
},
{
"epoch": 2.2621231979030143,
"grad_norm": 1.7625765800476074,
"learning_rate": 0.0001152496902459299,
"loss": 0.1764,
"step": 2590
},
{
"epoch": 2.2664919178680645,
"grad_norm": 1.5414633750915527,
"learning_rate": 0.00011497799270379374,
"loss": 0.1185,
"step": 2595
},
{
"epoch": 2.270860637833115,
"grad_norm": 2.2619729042053223,
"learning_rate": 0.00011470618201007677,
"loss": 0.1554,
"step": 2600
},
{
"epoch": 2.2752293577981653,
"grad_norm": 0.9168559312820435,
"learning_rate": 0.00011443426021817887,
"loss": 0.1241,
"step": 2605
},
{
"epoch": 2.2795980777632154,
"grad_norm": 1.3820409774780273,
"learning_rate": 0.00011416222938233936,
"loss": 0.1387,
"step": 2610
},
{
"epoch": 2.2839667977282656,
"grad_norm": 2.417088270187378,
"learning_rate": 0.00011389009155762128,
"loss": 0.267,
"step": 2615
},
{
"epoch": 2.288335517693316,
"grad_norm": 1.2400354146957397,
"learning_rate": 0.00011361784879989588,
"loss": 0.1586,
"step": 2620
},
{
"epoch": 2.292704237658366,
"grad_norm": 1.5232576131820679,
"learning_rate": 0.00011334550316582717,
"loss": 0.2155,
"step": 2625
},
{
"epoch": 2.297072957623416,
"grad_norm": 1.0652902126312256,
"learning_rate": 0.00011307305671285633,
"loss": 0.1849,
"step": 2630
},
{
"epoch": 2.3014416775884667,
"grad_norm": 1.0222703218460083,
"learning_rate": 0.00011280051149918622,
"loss": 0.1351,
"step": 2635
},
{
"epoch": 2.305810397553517,
"grad_norm": 2.64607310295105,
"learning_rate": 0.00011252786958376569,
"loss": 0.1929,
"step": 2640
},
{
"epoch": 2.310179117518567,
"grad_norm": 1.090824842453003,
"learning_rate": 0.00011225513302627422,
"loss": 0.1215,
"step": 2645
},
{
"epoch": 2.3145478374836173,
"grad_norm": 1.6671061515808105,
"learning_rate": 0.00011198230388710625,
"loss": 0.1279,
"step": 2650
},
{
"epoch": 2.3189165574486674,
"grad_norm": 1.3377041816711426,
"learning_rate": 0.00011170938422735558,
"loss": 0.1527,
"step": 2655
},
{
"epoch": 2.3232852774137176,
"grad_norm": 1.554307460784912,
"learning_rate": 0.00011143637610879989,
"loss": 0.16,
"step": 2660
},
{
"epoch": 2.327653997378768,
"grad_norm": 1.2246366739273071,
"learning_rate": 0.00011116328159388514,
"loss": 0.1791,
"step": 2665
},
{
"epoch": 2.3320227173438184,
"grad_norm": 3.7316997051239014,
"learning_rate": 0.00011089010274570992,
"loss": 0.2552,
"step": 2670
},
{
"epoch": 2.3363914373088686,
"grad_norm": 2.306547164916992,
"learning_rate": 0.00011061684162801,
"loss": 0.2081,
"step": 2675
},
{
"epoch": 2.3407601572739187,
"grad_norm": 1.9740854501724243,
"learning_rate": 0.00011034350030514253,
"loss": 0.147,
"step": 2680
},
{
"epoch": 2.345128877238969,
"grad_norm": 1.5787196159362793,
"learning_rate": 0.00011007008084207072,
"loss": 0.152,
"step": 2685
},
{
"epoch": 2.349497597204019,
"grad_norm": 1.394114375114441,
"learning_rate": 0.00010979658530434793,
"loss": 0.2236,
"step": 2690
},
{
"epoch": 2.3538663171690697,
"grad_norm": 1.1155906915664673,
"learning_rate": 0.0001095230157581024,
"loss": 0.1224,
"step": 2695
},
{
"epoch": 2.35823503713412,
"grad_norm": 1.6753724813461304,
"learning_rate": 0.00010924937427002136,
"loss": 0.1937,
"step": 2700
},
{
"epoch": 2.36260375709917,
"grad_norm": 1.3327598571777344,
"learning_rate": 0.00010897566290733552,
"loss": 0.2221,
"step": 2705
},
{
"epoch": 2.36697247706422,
"grad_norm": 2.0531933307647705,
"learning_rate": 0.00010870188373780352,
"loss": 0.146,
"step": 2710
},
{
"epoch": 2.3713411970292704,
"grad_norm": 1.6777331829071045,
"learning_rate": 0.00010842803882969623,
"loss": 0.1931,
"step": 2715
},
{
"epoch": 2.3757099169943205,
"grad_norm": 2.2374494075775146,
"learning_rate": 0.00010815413025178112,
"loss": 0.1637,
"step": 2720
},
{
"epoch": 2.3800786369593707,
"grad_norm": 2.244675636291504,
"learning_rate": 0.00010788016007330665,
"loss": 0.1644,
"step": 2725
},
{
"epoch": 2.3844473569244213,
"grad_norm": 0.7124090194702148,
"learning_rate": 0.00010760613036398668,
"loss": 0.1246,
"step": 2730
},
{
"epoch": 2.3888160768894715,
"grad_norm": 1.298221230506897,
"learning_rate": 0.00010733204319398477,
"loss": 0.1604,
"step": 2735
},
{
"epoch": 2.3931847968545217,
"grad_norm": 3.08442759513855,
"learning_rate": 0.00010705790063389858,
"loss": 0.2225,
"step": 2740
},
{
"epoch": 2.397553516819572,
"grad_norm": 1.3459490537643433,
"learning_rate": 0.00010678370475474424,
"loss": 0.2159,
"step": 2745
},
{
"epoch": 2.401922236784622,
"grad_norm": 1.9191830158233643,
"learning_rate": 0.00010650945762794058,
"loss": 0.2479,
"step": 2750
},
{
"epoch": 2.406290956749672,
"grad_norm": 1.102486491203308,
"learning_rate": 0.00010623516132529372,
"loss": 0.1371,
"step": 2755
},
{
"epoch": 2.4106596767147224,
"grad_norm": 1.311330795288086,
"learning_rate": 0.00010596081791898118,
"loss": 0.1672,
"step": 2760
},
{
"epoch": 2.415028396679773,
"grad_norm": 2.336740732192993,
"learning_rate": 0.00010568642948153636,
"loss": 0.2096,
"step": 2765
},
{
"epoch": 2.419397116644823,
"grad_norm": 0.9913440942764282,
"learning_rate": 0.00010541199808583286,
"loss": 0.2231,
"step": 2770
},
{
"epoch": 2.4237658366098733,
"grad_norm": 1.6791926622390747,
"learning_rate": 0.00010513752580506878,
"loss": 0.1871,
"step": 2775
},
{
"epoch": 2.4281345565749235,
"grad_norm": 1.459643840789795,
"learning_rate": 0.00010486301471275111,
"loss": 0.2017,
"step": 2780
},
{
"epoch": 2.4325032765399737,
"grad_norm": 2.661412239074707,
"learning_rate": 0.00010458846688268003,
"loss": 0.2038,
"step": 2785
},
{
"epoch": 2.436871996505024,
"grad_norm": 2.1982479095458984,
"learning_rate": 0.00010431388438893326,
"loss": 0.1776,
"step": 2790
},
{
"epoch": 2.4412407164700745,
"grad_norm": 0.9297656416893005,
"learning_rate": 0.00010403926930585042,
"loss": 0.124,
"step": 2795
},
{
"epoch": 2.4456094364351246,
"grad_norm": 0.9144073128700256,
"learning_rate": 0.00010376462370801724,
"loss": 0.1728,
"step": 2800
},
{
"epoch": 2.449978156400175,
"grad_norm": 0.8969748616218567,
"learning_rate": 0.00010348994967025012,
"loss": 0.1268,
"step": 2805
},
{
"epoch": 2.454346876365225,
"grad_norm": 1.952581763267517,
"learning_rate": 0.00010321524926758012,
"loss": 0.1516,
"step": 2810
},
{
"epoch": 2.458715596330275,
"grad_norm": 1.463630199432373,
"learning_rate": 0.00010294052457523766,
"loss": 0.1748,
"step": 2815
},
{
"epoch": 2.4630843162953253,
"grad_norm": 0.9995607137680054,
"learning_rate": 0.0001026657776686365,
"loss": 0.1815,
"step": 2820
},
{
"epoch": 2.467453036260376,
"grad_norm": 1.2662900686264038,
"learning_rate": 0.00010239101062335834,
"loss": 0.1727,
"step": 2825
},
{
"epoch": 2.471821756225426,
"grad_norm": 1.2032655477523804,
"learning_rate": 0.00010211622551513697,
"loss": 0.1398,
"step": 2830
},
{
"epoch": 2.4761904761904763,
"grad_norm": 1.363953948020935,
"learning_rate": 0.00010184142441984259,
"loss": 0.1405,
"step": 2835
},
{
"epoch": 2.4805591961555264,
"grad_norm": 1.4486302137374878,
"learning_rate": 0.00010156660941346627,
"loss": 0.1877,
"step": 2840
},
{
"epoch": 2.4849279161205766,
"grad_norm": 1.4691563844680786,
"learning_rate": 0.00010129178257210413,
"loss": 0.1261,
"step": 2845
},
{
"epoch": 2.489296636085627,
"grad_norm": 1.7420620918273926,
"learning_rate": 0.0001010169459719416,
"loss": 0.1672,
"step": 2850
},
{
"epoch": 2.493665356050677,
"grad_norm": 1.8223110437393188,
"learning_rate": 0.00010074210168923804,
"loss": 0.1476,
"step": 2855
},
{
"epoch": 2.4980340760157276,
"grad_norm": 1.197109580039978,
"learning_rate": 0.00010046725180031062,
"loss": 0.1227,
"step": 2860
},
{
"epoch": 2.5024027959807777,
"grad_norm": 1.7598116397857666,
"learning_rate": 0.00010019239838151906,
"loss": 0.1882,
"step": 2865
},
{
"epoch": 2.506771515945828,
"grad_norm": 1.498608946800232,
"learning_rate": 9.99175435092496e-05,
"loss": 0.248,
"step": 2870
},
{
"epoch": 2.511140235910878,
"grad_norm": 1.8565483093261719,
"learning_rate": 9.964268925989954e-05,
"loss": 0.1935,
"step": 2875
},
{
"epoch": 2.5155089558759283,
"grad_norm": 0.8620087504386902,
"learning_rate": 9.936783770986145e-05,
"loss": 0.126,
"step": 2880
},
{
"epoch": 2.5198776758409784,
"grad_norm": 0.7721559405326843,
"learning_rate": 9.909299093550757e-05,
"loss": 0.1066,
"step": 2885
},
{
"epoch": 2.5242463958060286,
"grad_norm": 1.5533393621444702,
"learning_rate": 9.88181510131739e-05,
"loss": 0.2407,
"step": 2890
},
{
"epoch": 2.5286151157710792,
"grad_norm": 1.1294517517089844,
"learning_rate": 9.854332001914486e-05,
"loss": 0.1601,
"step": 2895
},
{
"epoch": 2.5329838357361294,
"grad_norm": 1.7611277103424072,
"learning_rate": 9.826850002963729e-05,
"loss": 0.192,
"step": 2900
},
{
"epoch": 2.5373525557011796,
"grad_norm": 1.9960044622421265,
"learning_rate": 9.799369312078502e-05,
"loss": 0.1327,
"step": 2905
},
{
"epoch": 2.5417212756662297,
"grad_norm": 0.9330713748931885,
"learning_rate": 9.771890136862288e-05,
"loss": 0.1479,
"step": 2910
},
{
"epoch": 2.54608999563128,
"grad_norm": 1.2665588855743408,
"learning_rate": 9.744412684907138e-05,
"loss": 0.1906,
"step": 2915
},
{
"epoch": 2.5504587155963305,
"grad_norm": 1.5277937650680542,
"learning_rate": 9.716937163792075e-05,
"loss": 0.1372,
"step": 2920
},
{
"epoch": 2.5548274355613803,
"grad_norm": 1.4961965084075928,
"learning_rate": 9.689463781081542e-05,
"loss": 0.1955,
"step": 2925
},
{
"epoch": 2.559196155526431,
"grad_norm": 1.1431431770324707,
"learning_rate": 9.661992744323818e-05,
"loss": 0.1879,
"step": 2930
},
{
"epoch": 2.563564875491481,
"grad_norm": 1.1940690279006958,
"learning_rate": 9.634524261049464e-05,
"loss": 0.1565,
"step": 2935
},
{
"epoch": 2.567933595456531,
"grad_norm": 1.741918921470642,
"learning_rate": 9.607058538769756e-05,
"loss": 0.1563,
"step": 2940
},
{
"epoch": 2.5723023154215814,
"grad_norm": 2.1051597595214844,
"learning_rate": 9.579595784975103e-05,
"loss": 0.1136,
"step": 2945
},
{
"epoch": 2.5766710353866316,
"grad_norm": 1.361091136932373,
"learning_rate": 9.552136207133495e-05,
"loss": 0.1554,
"step": 2950
},
{
"epoch": 2.581039755351682,
"grad_norm": 1.1515636444091797,
"learning_rate": 9.524680012688928e-05,
"loss": 0.1832,
"step": 2955
},
{
"epoch": 2.5854084753167323,
"grad_norm": 1.6642508506774902,
"learning_rate": 9.497227409059832e-05,
"loss": 0.1716,
"step": 2960
},
{
"epoch": 2.5897771952817825,
"grad_norm": 1.1159862279891968,
"learning_rate": 9.469778603637518e-05,
"loss": 0.1392,
"step": 2965
},
{
"epoch": 2.5941459152468327,
"grad_norm": 1.3787099123001099,
"learning_rate": 9.4423338037846e-05,
"loss": 0.1429,
"step": 2970
},
{
"epoch": 2.598514635211883,
"grad_norm": 1.0721005201339722,
"learning_rate": 9.414893216833435e-05,
"loss": 0.1687,
"step": 2975
},
{
"epoch": 2.602883355176933,
"grad_norm": 1.7461411952972412,
"learning_rate": 9.387457050084552e-05,
"loss": 0.1757,
"step": 2980
},
{
"epoch": 2.607252075141983,
"grad_norm": 1.4739048480987549,
"learning_rate": 9.360025510805078e-05,
"loss": 0.1686,
"step": 2985
},
{
"epoch": 2.611620795107034,
"grad_norm": 1.7249925136566162,
"learning_rate": 9.332598806227195e-05,
"loss": 0.2044,
"step": 2990
},
{
"epoch": 2.615989515072084,
"grad_norm": 0.8930152654647827,
"learning_rate": 9.305177143546557e-05,
"loss": 0.1248,
"step": 2995
},
{
"epoch": 2.620358235037134,
"grad_norm": 0.8902339935302734,
"learning_rate": 9.277760729920728e-05,
"loss": 0.1633,
"step": 3000
},
{
"epoch": 2.6247269550021843,
"grad_norm": 2.3700525760650635,
"learning_rate": 9.250349772467618e-05,
"loss": 0.1805,
"step": 3005
},
{
"epoch": 2.6290956749672345,
"grad_norm": 1.4570140838623047,
"learning_rate": 9.222944478263915e-05,
"loss": 0.1918,
"step": 3010
},
{
"epoch": 2.6334643949322847,
"grad_norm": 1.3790675401687622,
"learning_rate": 9.195545054343529e-05,
"loss": 0.1127,
"step": 3015
},
{
"epoch": 2.637833114897335,
"grad_norm": 2.068342924118042,
"learning_rate": 9.16815170769602e-05,
"loss": 0.2045,
"step": 3020
},
{
"epoch": 2.6422018348623855,
"grad_norm": 2.2792484760284424,
"learning_rate": 9.14076464526504e-05,
"loss": 0.1942,
"step": 3025
},
{
"epoch": 2.6465705548274356,
"grad_norm": 2.240042209625244,
"learning_rate": 9.113384073946765e-05,
"loss": 0.1725,
"step": 3030
},
{
"epoch": 2.650939274792486,
"grad_norm": 1.1308009624481201,
"learning_rate": 9.086010200588328e-05,
"loss": 0.1611,
"step": 3035
},
{
"epoch": 2.655307994757536,
"grad_norm": 1.0229357481002808,
"learning_rate": 9.05864323198627e-05,
"loss": 0.1892,
"step": 3040
},
{
"epoch": 2.659676714722586,
"grad_norm": 1.2049839496612549,
"learning_rate": 9.03128337488497e-05,
"loss": 0.1585,
"step": 3045
},
{
"epoch": 2.6640454346876368,
"grad_norm": 1.1866313219070435,
"learning_rate": 9.003930835975082e-05,
"loss": 0.1224,
"step": 3050
},
{
"epoch": 2.6684141546526865,
"grad_norm": 1.1551754474639893,
"learning_rate": 8.976585821891966e-05,
"loss": 0.1275,
"step": 3055
},
{
"epoch": 2.672782874617737,
"grad_norm": 1.956389307975769,
"learning_rate": 8.949248539214145e-05,
"loss": 0.1537,
"step": 3060
},
{
"epoch": 2.6771515945827873,
"grad_norm": 0.9731565117835999,
"learning_rate": 8.921919194461735e-05,
"loss": 0.1233,
"step": 3065
},
{
"epoch": 2.6815203145478375,
"grad_norm": 0.7945232391357422,
"learning_rate": 8.894597994094879e-05,
"loss": 0.1518,
"step": 3070
},
{
"epoch": 2.6858890345128876,
"grad_norm": 1.3885526657104492,
"learning_rate": 8.867285144512202e-05,
"loss": 0.187,
"step": 3075
},
{
"epoch": 2.690257754477938,
"grad_norm": 1.2687430381774902,
"learning_rate": 8.839980852049229e-05,
"loss": 0.1599,
"step": 3080
},
{
"epoch": 2.6946264744429884,
"grad_norm": 1.0735893249511719,
"learning_rate": 8.812685322976851e-05,
"loss": 0.1826,
"step": 3085
},
{
"epoch": 2.6989951944080386,
"grad_norm": 1.6104284524917603,
"learning_rate": 8.785398763499755e-05,
"loss": 0.1791,
"step": 3090
},
{
"epoch": 2.7033639143730888,
"grad_norm": 1.6893842220306396,
"learning_rate": 8.758121379754865e-05,
"loss": 0.1588,
"step": 3095
},
{
"epoch": 2.707732634338139,
"grad_norm": 0.967934787273407,
"learning_rate": 8.730853377809784e-05,
"loss": 0.1376,
"step": 3100
},
{
"epoch": 2.712101354303189,
"grad_norm": 1.2705645561218262,
"learning_rate": 8.703594963661241e-05,
"loss": 0.1689,
"step": 3105
},
{
"epoch": 2.7164700742682393,
"grad_norm": 1.4289767742156982,
"learning_rate": 8.67634634323354e-05,
"loss": 0.1802,
"step": 3110
},
{
"epoch": 2.7208387942332894,
"grad_norm": 1.4124540090560913,
"learning_rate": 8.64910772237699e-05,
"loss": 0.1616,
"step": 3115
},
{
"epoch": 2.72520751419834,
"grad_norm": 1.1596726179122925,
"learning_rate": 8.62187930686636e-05,
"loss": 0.1676,
"step": 3120
},
{
"epoch": 2.7295762341633902,
"grad_norm": 1.243022084236145,
"learning_rate": 8.594661302399332e-05,
"loss": 0.139,
"step": 3125
},
{
"epoch": 2.7339449541284404,
"grad_norm": 1.3218811750411987,
"learning_rate": 8.56745391459492e-05,
"loss": 0.2164,
"step": 3130
},
{
"epoch": 2.7383136740934906,
"grad_norm": 2.225620985031128,
"learning_rate": 8.540257348991947e-05,
"loss": 0.1359,
"step": 3135
},
{
"epoch": 2.7426823940585408,
"grad_norm": 2.3583812713623047,
"learning_rate": 8.513071811047478e-05,
"loss": 0.2217,
"step": 3140
},
{
"epoch": 2.747051114023591,
"grad_norm": 2.345612049102783,
"learning_rate": 8.48589750613527e-05,
"loss": 0.1527,
"step": 3145
},
{
"epoch": 2.751419833988641,
"grad_norm": 0.6265102028846741,
"learning_rate": 8.458734639544207e-05,
"loss": 0.1387,
"step": 3150
},
{
"epoch": 2.7557885539536917,
"grad_norm": 1.4767670631408691,
"learning_rate": 8.431583416476779e-05,
"loss": 0.1774,
"step": 3155
},
{
"epoch": 2.760157273918742,
"grad_norm": 1.541107416152954,
"learning_rate": 8.404444042047507e-05,
"loss": 0.1522,
"step": 3160
},
{
"epoch": 2.764525993883792,
"grad_norm": 1.396525263786316,
"learning_rate": 8.377316721281402e-05,
"loss": 0.1972,
"step": 3165
},
{
"epoch": 2.7688947138488422,
"grad_norm": 1.0470671653747559,
"learning_rate": 8.35020165911242e-05,
"loss": 0.1808,
"step": 3170
},
{
"epoch": 2.7732634338138924,
"grad_norm": 1.216828465461731,
"learning_rate": 8.323099060381896e-05,
"loss": 0.1383,
"step": 3175
},
{
"epoch": 2.777632153778943,
"grad_norm": 1.0746128559112549,
"learning_rate": 8.296009129837022e-05,
"loss": 0.1591,
"step": 3180
},
{
"epoch": 2.7820008737439927,
"grad_norm": 1.3712584972381592,
"learning_rate": 8.268932072129287e-05,
"loss": 0.161,
"step": 3185
},
{
"epoch": 2.7863695937090434,
"grad_norm": 2.859886646270752,
"learning_rate": 8.241868091812924e-05,
"loss": 0.1858,
"step": 3190
},
{
"epoch": 2.7907383136740935,
"grad_norm": 1.0394188165664673,
"learning_rate": 8.214817393343383e-05,
"loss": 0.1196,
"step": 3195
},
{
"epoch": 2.7951070336391437,
"grad_norm": 1.1338419914245605,
"learning_rate": 8.187780181075766e-05,
"loss": 0.1264,
"step": 3200
},
{
"epoch": 2.799475753604194,
"grad_norm": 0.9405713677406311,
"learning_rate": 8.160756659263298e-05,
"loss": 0.1177,
"step": 3205
},
{
"epoch": 2.803844473569244,
"grad_norm": 1.4486256837844849,
"learning_rate": 8.13374703205578e-05,
"loss": 0.1745,
"step": 3210
},
{
"epoch": 2.8082131935342947,
"grad_norm": 1.058084487915039,
"learning_rate": 8.106751503498045e-05,
"loss": 0.1639,
"step": 3215
},
{
"epoch": 2.812581913499345,
"grad_norm": 1.712044358253479,
"learning_rate": 8.079770277528422e-05,
"loss": 0.1335,
"step": 3220
},
{
"epoch": 2.816950633464395,
"grad_norm": 1.0125361680984497,
"learning_rate": 8.052803557977175e-05,
"loss": 0.1197,
"step": 3225
},
{
"epoch": 2.821319353429445,
"grad_norm": 1.248329758644104,
"learning_rate": 8.025851548564999e-05,
"loss": 0.1143,
"step": 3230
},
{
"epoch": 2.8256880733944953,
"grad_norm": 0.976321280002594,
"learning_rate": 7.998914452901447e-05,
"loss": 0.1318,
"step": 3235
},
{
"epoch": 2.8300567933595455,
"grad_norm": 1.4181995391845703,
"learning_rate": 7.971992474483413e-05,
"loss": 0.1309,
"step": 3240
},
{
"epoch": 2.8344255133245957,
"grad_norm": 2.2041068077087402,
"learning_rate": 7.945085816693589e-05,
"loss": 0.1843,
"step": 3245
},
{
"epoch": 2.8387942332896463,
"grad_norm": 0.7392102479934692,
"learning_rate": 7.918194682798914e-05,
"loss": 0.1484,
"step": 3250
},
{
"epoch": 2.8431629532546965,
"grad_norm": 1.1940444707870483,
"learning_rate": 7.891319275949066e-05,
"loss": 0.1984,
"step": 3255
},
{
"epoch": 2.8475316732197467,
"grad_norm": 0.9530165791511536,
"learning_rate": 7.864459799174904e-05,
"loss": 0.1537,
"step": 3260
},
{
"epoch": 2.851900393184797,
"grad_norm": 1.3266198635101318,
"learning_rate": 7.837616455386954e-05,
"loss": 0.1721,
"step": 3265
},
{
"epoch": 2.856269113149847,
"grad_norm": 1.8742265701293945,
"learning_rate": 7.810789447373846e-05,
"loss": 0.1385,
"step": 3270
},
{
"epoch": 2.8606378331148976,
"grad_norm": 0.6404191851615906,
"learning_rate": 7.783978977800818e-05,
"loss": 0.1598,
"step": 3275
},
{
"epoch": 2.8650065530799473,
"grad_norm": 1.6875426769256592,
"learning_rate": 7.757185249208163e-05,
"loss": 0.1133,
"step": 3280
},
{
"epoch": 2.869375273044998,
"grad_norm": 2.657672166824341,
"learning_rate": 7.730408464009698e-05,
"loss": 0.1347,
"step": 3285
},
{
"epoch": 2.873743993010048,
"grad_norm": 1.3153127431869507,
"learning_rate": 7.70364882449125e-05,
"loss": 0.1183,
"step": 3290
},
{
"epoch": 2.8781127129750983,
"grad_norm": 2.145387649536133,
"learning_rate": 7.676906532809115e-05,
"loss": 0.1643,
"step": 3295
},
{
"epoch": 2.8824814329401485,
"grad_norm": 1.2466741800308228,
"learning_rate": 7.650181790988527e-05,
"loss": 0.156,
"step": 3300
},
{
"epoch": 2.8868501529051986,
"grad_norm": 1.4382154941558838,
"learning_rate": 7.62347480092215e-05,
"loss": 0.1441,
"step": 3305
},
{
"epoch": 2.8912188728702493,
"grad_norm": 1.2591091394424438,
"learning_rate": 7.596785764368539e-05,
"loss": 0.1966,
"step": 3310
},
{
"epoch": 2.895587592835299,
"grad_norm": 1.8565058708190918,
"learning_rate": 7.570114882950619e-05,
"loss": 0.1394,
"step": 3315
},
{
"epoch": 2.8999563128003496,
"grad_norm": 1.146375298500061,
"learning_rate": 7.543462358154153e-05,
"loss": 0.0894,
"step": 3320
},
{
"epoch": 2.9043250327653998,
"grad_norm": 1.0392186641693115,
"learning_rate": 7.51682839132624e-05,
"loss": 0.1452,
"step": 3325
},
{
"epoch": 2.90869375273045,
"grad_norm": 2.525780439376831,
"learning_rate": 7.49021318367378e-05,
"loss": 0.1675,
"step": 3330
},
{
"epoch": 2.9130624726955,
"grad_norm": 2.238649368286133,
"learning_rate": 7.463616936261952e-05,
"loss": 0.1889,
"step": 3335
},
{
"epoch": 2.9174311926605503,
"grad_norm": 1.5493022203445435,
"learning_rate": 7.437039850012704e-05,
"loss": 0.1021,
"step": 3340
},
{
"epoch": 2.921799912625601,
"grad_norm": 1.0183563232421875,
"learning_rate": 7.410482125703225e-05,
"loss": 0.1179,
"step": 3345
},
{
"epoch": 2.926168632590651,
"grad_norm": 0.9551679491996765,
"learning_rate": 7.383943963964439e-05,
"loss": 0.1458,
"step": 3350
},
{
"epoch": 2.9305373525557012,
"grad_norm": 3.2518422603607178,
"learning_rate": 7.357425565279483e-05,
"loss": 0.1401,
"step": 3355
},
{
"epoch": 2.9349060725207514,
"grad_norm": 2.4154436588287354,
"learning_rate": 7.330927129982191e-05,
"loss": 0.2123,
"step": 3360
},
{
"epoch": 2.9392747924858016,
"grad_norm": 1.7631040811538696,
"learning_rate": 7.304448858255588e-05,
"loss": 0.1705,
"step": 3365
},
{
"epoch": 2.9436435124508518,
"grad_norm": 1.741014003753662,
"learning_rate": 7.277990950130369e-05,
"loss": 0.1277,
"step": 3370
},
{
"epoch": 2.948012232415902,
"grad_norm": 1.7614057064056396,
"learning_rate": 7.25155360548339e-05,
"loss": 0.147,
"step": 3375
},
{
"epoch": 2.9523809523809526,
"grad_norm": 1.403290867805481,
"learning_rate": 7.225137024036164e-05,
"loss": 0.172,
"step": 3380
},
{
"epoch": 2.9567496723460027,
"grad_norm": 1.9428198337554932,
"learning_rate": 7.19874140535335e-05,
"loss": 0.2027,
"step": 3385
},
{
"epoch": 2.961118392311053,
"grad_norm": 2.5754446983337402,
"learning_rate": 7.172366948841232e-05,
"loss": 0.1601,
"step": 3390
},
{
"epoch": 2.965487112276103,
"grad_norm": 1.068940281867981,
"learning_rate": 7.146013853746237e-05,
"loss": 0.1444,
"step": 3395
},
{
"epoch": 2.9698558322411532,
"grad_norm": 1.6970683336257935,
"learning_rate": 7.119682319153409e-05,
"loss": 0.148,
"step": 3400
},
{
"epoch": 2.974224552206204,
"grad_norm": 1.3835630416870117,
"learning_rate": 7.093372543984915e-05,
"loss": 0.1352,
"step": 3405
},
{
"epoch": 2.9785932721712536,
"grad_norm": 0.9599865078926086,
"learning_rate": 7.067084726998548e-05,
"loss": 0.1035,
"step": 3410
},
{
"epoch": 2.982961992136304,
"grad_norm": 2.335697889328003,
"learning_rate": 7.040819066786195e-05,
"loss": 0.1958,
"step": 3415
},
{
"epoch": 2.9873307121013544,
"grad_norm": 1.150230050086975,
"learning_rate": 7.014575761772382e-05,
"loss": 0.1602,
"step": 3420
},
{
"epoch": 2.9916994320664045,
"grad_norm": 1.7480928897857666,
"learning_rate": 6.988355010212742e-05,
"loss": 0.1794,
"step": 3425
},
{
"epoch": 2.9960681520314547,
"grad_norm": 1.2962545156478882,
"learning_rate": 6.962157010192529e-05,
"loss": 0.1637,
"step": 3430
},
{
"epoch": 3.0,
"grad_norm": 0.600903332233429,
"learning_rate": 6.935981959625126e-05,
"loss": 0.1508,
"step": 3435
},
{
"epoch": 3.00436871996505,
"grad_norm": 0.9637101888656616,
"learning_rate": 6.909830056250527e-05,
"loss": 0.1208,
"step": 3440
},
{
"epoch": 3.0087374399301003,
"grad_norm": 1.0528173446655273,
"learning_rate": 6.883701497633876e-05,
"loss": 0.0717,
"step": 3445
},
{
"epoch": 3.0131061598951505,
"grad_norm": 1.6418970823287964,
"learning_rate": 6.857596481163957e-05,
"loss": 0.0916,
"step": 3450
},
{
"epoch": 3.017474879860201,
"grad_norm": 1.6248525381088257,
"learning_rate": 6.831515204051692e-05,
"loss": 0.0801,
"step": 3455
},
{
"epoch": 3.0218435998252513,
"grad_norm": 1.015778660774231,
"learning_rate": 6.805457863328683e-05,
"loss": 0.0989,
"step": 3460
},
{
"epoch": 3.0262123197903015,
"grad_norm": 0.8818852305412292,
"learning_rate": 6.779424655845687e-05,
"loss": 0.0915,
"step": 3465
},
{
"epoch": 3.0305810397553516,
"grad_norm": 0.8511345982551575,
"learning_rate": 6.75341577827115e-05,
"loss": 0.0819,
"step": 3470
},
{
"epoch": 3.034949759720402,
"grad_norm": 0.9664357304573059,
"learning_rate": 6.727431427089724e-05,
"loss": 0.0947,
"step": 3475
},
{
"epoch": 3.039318479685452,
"grad_norm": 1.4519518613815308,
"learning_rate": 6.701471798600766e-05,
"loss": 0.0688,
"step": 3480
},
{
"epoch": 3.0436871996505026,
"grad_norm": 1.7815632820129395,
"learning_rate": 6.675537088916882e-05,
"loss": 0.0865,
"step": 3485
},
{
"epoch": 3.0480559196155528,
"grad_norm": 1.5605847835540771,
"learning_rate": 6.6496274939624e-05,
"loss": 0.1307,
"step": 3490
},
{
"epoch": 3.052424639580603,
"grad_norm": 1.2095025777816772,
"learning_rate": 6.623743209471942e-05,
"loss": 0.1197,
"step": 3495
},
{
"epoch": 3.056793359545653,
"grad_norm": 1.2208069562911987,
"learning_rate": 6.597884430988917e-05,
"loss": 0.1115,
"step": 3500
},
{
"epoch": 3.0611620795107033,
"grad_norm": 1.4211591482162476,
"learning_rate": 6.572051353864043e-05,
"loss": 0.072,
"step": 3505
},
{
"epoch": 3.0655307994757535,
"grad_norm": 1.2047765254974365,
"learning_rate": 6.546244173253878e-05,
"loss": 0.0544,
"step": 3510
},
{
"epoch": 3.0698995194408036,
"grad_norm": 1.7253801822662354,
"learning_rate": 6.520463084119343e-05,
"loss": 0.0794,
"step": 3515
},
{
"epoch": 3.0742682394058543,
"grad_norm": 7.309013366699219,
"learning_rate": 6.494708281224255e-05,
"loss": 0.1807,
"step": 3520
},
{
"epoch": 3.0786369593709044,
"grad_norm": 1.258571743965149,
"learning_rate": 6.468979959133852e-05,
"loss": 0.0716,
"step": 3525
},
{
"epoch": 3.0830056793359546,
"grad_norm": 1.4243378639221191,
"learning_rate": 6.443278312213312e-05,
"loss": 0.1365,
"step": 3530
},
{
"epoch": 3.0873743993010048,
"grad_norm": 2.169480562210083,
"learning_rate": 6.417603534626306e-05,
"loss": 0.1059,
"step": 3535
},
{
"epoch": 3.091743119266055,
"grad_norm": 1.2186070680618286,
"learning_rate": 6.391955820333513e-05,
"loss": 0.0943,
"step": 3540
},
{
"epoch": 3.096111839231105,
"grad_norm": 2.4399561882019043,
"learning_rate": 6.366335363091165e-05,
"loss": 0.0943,
"step": 3545
},
{
"epoch": 3.1004805591961557,
"grad_norm": 1.6928596496582031,
"learning_rate": 6.340742356449579e-05,
"loss": 0.1019,
"step": 3550
},
{
"epoch": 3.104849279161206,
"grad_norm": 2.0251004695892334,
"learning_rate": 6.315176993751699e-05,
"loss": 0.0932,
"step": 3555
},
{
"epoch": 3.109217999126256,
"grad_norm": 1.7529054880142212,
"learning_rate": 6.289639468131622e-05,
"loss": 0.0946,
"step": 3560
},
{
"epoch": 3.1135867190913062,
"grad_norm": 1.67819344997406,
"learning_rate": 6.264129972513163e-05,
"loss": 0.0813,
"step": 3565
},
{
"epoch": 3.1179554390563564,
"grad_norm": 1.2460740804672241,
"learning_rate": 6.238648699608375e-05,
"loss": 0.1024,
"step": 3570
},
{
"epoch": 3.1223241590214066,
"grad_norm": 1.6184308528900146,
"learning_rate": 6.213195841916104e-05,
"loss": 0.0847,
"step": 3575
},
{
"epoch": 3.126692878986457,
"grad_norm": 1.5642549991607666,
"learning_rate": 6.187771591720536e-05,
"loss": 0.1008,
"step": 3580
},
{
"epoch": 3.1310615989515074,
"grad_norm": 1.9698232412338257,
"learning_rate": 6.16237614108973e-05,
"loss": 0.0994,
"step": 3585
},
{
"epoch": 3.1354303189165575,
"grad_norm": 1.6373701095581055,
"learning_rate": 6.137009681874192e-05,
"loss": 0.1204,
"step": 3590
},
{
"epoch": 3.1397990388816077,
"grad_norm": 0.9783329963684082,
"learning_rate": 6.111672405705402e-05,
"loss": 0.0716,
"step": 3595
},
{
"epoch": 3.144167758846658,
"grad_norm": 1.6490020751953125,
"learning_rate": 6.086364503994382e-05,
"loss": 0.078,
"step": 3600
},
{
"epoch": 3.148536478811708,
"grad_norm": 2.7672507762908936,
"learning_rate": 6.061086167930245e-05,
"loss": 0.1102,
"step": 3605
},
{
"epoch": 3.1529051987767582,
"grad_norm": 2.6075148582458496,
"learning_rate": 6.035837588478737e-05,
"loss": 0.0995,
"step": 3610
},
{
"epoch": 3.157273918741809,
"grad_norm": 1.1910531520843506,
"learning_rate": 6.010618956380821e-05,
"loss": 0.104,
"step": 3615
},
{
"epoch": 3.161642638706859,
"grad_norm": 2.8808867931365967,
"learning_rate": 5.985430462151219e-05,
"loss": 0.1065,
"step": 3620
},
{
"epoch": 3.166011358671909,
"grad_norm": 2.5939221382141113,
"learning_rate": 5.960272296076972e-05,
"loss": 0.1232,
"step": 3625
},
{
"epoch": 3.1703800786369594,
"grad_norm": 0.7496421933174133,
"learning_rate": 5.935144648216007e-05,
"loss": 0.0906,
"step": 3630
},
{
"epoch": 3.1747487986020095,
"grad_norm": 1.836545705795288,
"learning_rate": 5.910047708395703e-05,
"loss": 0.0794,
"step": 3635
},
{
"epoch": 3.1791175185670597,
"grad_norm": 1.6734864711761475,
"learning_rate": 5.884981666211452e-05,
"loss": 0.1003,
"step": 3640
},
{
"epoch": 3.18348623853211,
"grad_norm": 2.2766218185424805,
"learning_rate": 5.8599467110252305e-05,
"loss": 0.1069,
"step": 3645
},
{
"epoch": 3.1878549584971605,
"grad_norm": 1.0550603866577148,
"learning_rate": 5.8349430319641716e-05,
"loss": 0.0758,
"step": 3650
},
{
"epoch": 3.1922236784622107,
"grad_norm": 1.994390606880188,
"learning_rate": 5.809970817919121e-05,
"loss": 0.0956,
"step": 3655
},
{
"epoch": 3.196592398427261,
"grad_norm": 1.6109845638275146,
"learning_rate": 5.785030257543234e-05,
"loss": 0.1103,
"step": 3660
},
{
"epoch": 3.200961118392311,
"grad_norm": 1.7017037868499756,
"learning_rate": 5.760121539250532e-05,
"loss": 0.0709,
"step": 3665
},
{
"epoch": 3.205329838357361,
"grad_norm": 1.8740450143814087,
"learning_rate": 5.7352448512144764e-05,
"loss": 0.1109,
"step": 3670
},
{
"epoch": 3.2096985583224114,
"grad_norm": 2.410780191421509,
"learning_rate": 5.71040038136657e-05,
"loss": 0.1256,
"step": 3675
},
{
"epoch": 3.214067278287462,
"grad_norm": 3.1283838748931885,
"learning_rate": 5.6855883173949186e-05,
"loss": 0.0943,
"step": 3680
},
{
"epoch": 3.218435998252512,
"grad_norm": 1.051555871963501,
"learning_rate": 5.660808846742807e-05,
"loss": 0.048,
"step": 3685
},
{
"epoch": 3.2228047182175623,
"grad_norm": 0.789137601852417,
"learning_rate": 5.636062156607302e-05,
"loss": 0.0728,
"step": 3690
},
{
"epoch": 3.2271734381826125,
"grad_norm": 1.454079031944275,
"learning_rate": 5.6113484339378306e-05,
"loss": 0.0892,
"step": 3695
},
{
"epoch": 3.2315421581476627,
"grad_norm": 1.1673469543457031,
"learning_rate": 5.586667865434766e-05,
"loss": 0.0887,
"step": 3700
},
{
"epoch": 3.235910878112713,
"grad_norm": 0.7643448710441589,
"learning_rate": 5.5620206375480066e-05,
"loss": 0.0947,
"step": 3705
},
{
"epoch": 3.2402795980777634,
"grad_norm": 1.4417402744293213,
"learning_rate": 5.537406936475592e-05,
"loss": 0.0704,
"step": 3710
},
{
"epoch": 3.2446483180428136,
"grad_norm": 1.4992611408233643,
"learning_rate": 5.5128269481622765e-05,
"loss": 0.0945,
"step": 3715
},
{
"epoch": 3.249017038007864,
"grad_norm": 1.2667427062988281,
"learning_rate": 5.488280858298131e-05,
"loss": 0.0671,
"step": 3720
},
{
"epoch": 3.253385757972914,
"grad_norm": 1.1633776426315308,
"learning_rate": 5.463768852317146e-05,
"loss": 0.0743,
"step": 3725
},
{
"epoch": 3.257754477937964,
"grad_norm": 2.00858211517334,
"learning_rate": 5.439291115395808e-05,
"loss": 0.0889,
"step": 3730
},
{
"epoch": 3.2621231979030143,
"grad_norm": 0.9733744263648987,
"learning_rate": 5.414847832451735e-05,
"loss": 0.0445,
"step": 3735
},
{
"epoch": 3.2664919178680645,
"grad_norm": 1.1338526010513306,
"learning_rate": 5.390439188142253e-05,
"loss": 0.0911,
"step": 3740
},
{
"epoch": 3.270860637833115,
"grad_norm": 0.9580100178718567,
"learning_rate": 5.366065366863017e-05,
"loss": 0.0815,
"step": 3745
},
{
"epoch": 3.2752293577981653,
"grad_norm": 1.1516432762145996,
"learning_rate": 5.341726552746594e-05,
"loss": 0.0628,
"step": 3750
},
{
"epoch": 3.2795980777632154,
"grad_norm": 1.5042197704315186,
"learning_rate": 5.3174229296611066e-05,
"loss": 0.1095,
"step": 3755
},
{
"epoch": 3.2839667977282656,
"grad_norm": 1.399878740310669,
"learning_rate": 5.293154681208816e-05,
"loss": 0.0836,
"step": 3760
},
{
"epoch": 3.288335517693316,
"grad_norm": 2.0793116092681885,
"learning_rate": 5.268921990724751e-05,
"loss": 0.1065,
"step": 3765
},
{
"epoch": 3.292704237658366,
"grad_norm": 1.0440312623977661,
"learning_rate": 5.2447250412753157e-05,
"loss": 0.0904,
"step": 3770
},
{
"epoch": 3.297072957623416,
"grad_norm": 1.3085564374923706,
"learning_rate": 5.220564015656899e-05,
"loss": 0.0697,
"step": 3775
},
{
"epoch": 3.3014416775884667,
"grad_norm": 1.2351759672164917,
"learning_rate": 5.196439096394512e-05,
"loss": 0.0894,
"step": 3780
},
{
"epoch": 3.305810397553517,
"grad_norm": 1.8387460708618164,
"learning_rate": 5.1723504657403966e-05,
"loss": 0.1251,
"step": 3785
},
{
"epoch": 3.310179117518567,
"grad_norm": 2.4653780460357666,
"learning_rate": 5.1482983056726544e-05,
"loss": 0.061,
"step": 3790
},
{
"epoch": 3.3145478374836173,
"grad_norm": 1.5587716102600098,
"learning_rate": 5.124282797893867e-05,
"loss": 0.129,
"step": 3795
},
{
"epoch": 3.3189165574486674,
"grad_norm": 0.9583266377449036,
"learning_rate": 5.10030412382972e-05,
"loss": 0.0981,
"step": 3800
},
{
"epoch": 3.3232852774137176,
"grad_norm": 1.4593812227249146,
"learning_rate": 5.0763624646276444e-05,
"loss": 0.1017,
"step": 3805
},
{
"epoch": 3.327653997378768,
"grad_norm": 1.0001953840255737,
"learning_rate": 5.052458001155442e-05,
"loss": 0.1175,
"step": 3810
},
{
"epoch": 3.3320227173438184,
"grad_norm": 2.0613253116607666,
"learning_rate": 5.0285909139999175e-05,
"loss": 0.0927,
"step": 3815
},
{
"epoch": 3.3363914373088686,
"grad_norm": 1.731712818145752,
"learning_rate": 5.004761383465515e-05,
"loss": 0.1034,
"step": 3820
},
{
"epoch": 3.3407601572739187,
"grad_norm": 1.1507972478866577,
"learning_rate": 4.98096958957295e-05,
"loss": 0.0569,
"step": 3825
},
{
"epoch": 3.345128877238969,
"grad_norm": 0.8323259353637695,
"learning_rate": 4.9572157120578666e-05,
"loss": 0.0931,
"step": 3830
},
{
"epoch": 3.349497597204019,
"grad_norm": 4.5710344314575195,
"learning_rate": 4.933499930369466e-05,
"loss": 0.0772,
"step": 3835
},
{
"epoch": 3.3538663171690697,
"grad_norm": 1.2585368156433105,
"learning_rate": 4.909822423669145e-05,
"loss": 0.0881,
"step": 3840
},
{
"epoch": 3.35823503713412,
"grad_norm": 1.0395569801330566,
"learning_rate": 4.8861833708291584e-05,
"loss": 0.0835,
"step": 3845
},
{
"epoch": 3.36260375709917,
"grad_norm": 1.3017204999923706,
"learning_rate": 4.862582950431267e-05,
"loss": 0.0804,
"step": 3850
},
{
"epoch": 3.36697247706422,
"grad_norm": 1.038264274597168,
"learning_rate": 4.839021340765364e-05,
"loss": 0.1365,
"step": 3855
},
{
"epoch": 3.3713411970292704,
"grad_norm": 1.034326434135437,
"learning_rate": 4.815498719828163e-05,
"loss": 0.082,
"step": 3860
},
{
"epoch": 3.3757099169943205,
"grad_norm": 1.3960469961166382,
"learning_rate": 4.7920152653218274e-05,
"loss": 0.1569,
"step": 3865
},
{
"epoch": 3.3800786369593707,
"grad_norm": 0.787963330745697,
"learning_rate": 4.768571154652645e-05,
"loss": 0.0815,
"step": 3870
},
{
"epoch": 3.3844473569244213,
"grad_norm": 0.9106404185295105,
"learning_rate": 4.745166564929665e-05,
"loss": 0.0726,
"step": 3875
},
{
"epoch": 3.3888160768894715,
"grad_norm": 1.1823543310165405,
"learning_rate": 4.7218016729633894e-05,
"loss": 0.087,
"step": 3880
},
{
"epoch": 3.3931847968545217,
"grad_norm": 0.7293349504470825,
"learning_rate": 4.6984766552644164e-05,
"loss": 0.0734,
"step": 3885
},
{
"epoch": 3.397553516819572,
"grad_norm": 4.221632957458496,
"learning_rate": 4.675191688042116e-05,
"loss": 0.1181,
"step": 3890
},
{
"epoch": 3.401922236784622,
"grad_norm": 1.8997467756271362,
"learning_rate": 4.65194694720329e-05,
"loss": 0.0869,
"step": 3895
},
{
"epoch": 3.406290956749672,
"grad_norm": 1.2236734628677368,
"learning_rate": 4.6287426083508536e-05,
"loss": 0.1093,
"step": 3900
},
{
"epoch": 3.4106596767147224,
"grad_norm": 1.8938541412353516,
"learning_rate": 4.6055788467825046e-05,
"loss": 0.0902,
"step": 3905
},
{
"epoch": 3.415028396679773,
"grad_norm": 1.122361421585083,
"learning_rate": 4.5824558374893964e-05,
"loss": 0.0838,
"step": 3910
},
{
"epoch": 3.419397116644823,
"grad_norm": 3.0877389907836914,
"learning_rate": 4.5593737551548266e-05,
"loss": 0.0861,
"step": 3915
},
{
"epoch": 3.4237658366098733,
"grad_norm": 1.2574303150177002,
"learning_rate": 4.5363327741528916e-05,
"loss": 0.0985,
"step": 3920
},
{
"epoch": 3.4281345565749235,
"grad_norm": 1.3278526067733765,
"learning_rate": 4.5133330685472044e-05,
"loss": 0.0877,
"step": 3925
},
{
"epoch": 3.4325032765399737,
"grad_norm": 1.873443841934204,
"learning_rate": 4.490374812089553e-05,
"loss": 0.0861,
"step": 3930
},
{
"epoch": 3.436871996505024,
"grad_norm": 0.9117352962493896,
"learning_rate": 4.4674581782186e-05,
"loss": 0.0854,
"step": 3935
},
{
"epoch": 3.4412407164700745,
"grad_norm": 0.6332014203071594,
"learning_rate": 4.4445833400585724e-05,
"loss": 0.0629,
"step": 3940
},
{
"epoch": 3.4456094364351246,
"grad_norm": 1.233370065689087,
"learning_rate": 4.42175047041794e-05,
"loss": 0.103,
"step": 3945
},
{
"epoch": 3.449978156400175,
"grad_norm": 1.5003916025161743,
"learning_rate": 4.398959741788129e-05,
"loss": 0.116,
"step": 3950
},
{
"epoch": 3.454346876365225,
"grad_norm": 1.4277344942092896,
"learning_rate": 4.37621132634221e-05,
"loss": 0.1092,
"step": 3955
},
{
"epoch": 3.458715596330275,
"grad_norm": 1.2345412969589233,
"learning_rate": 4.353505395933596e-05,
"loss": 0.1223,
"step": 3960
},
{
"epoch": 3.4630843162953253,
"grad_norm": 1.273594617843628,
"learning_rate": 4.330842122094749e-05,
"loss": 0.1185,
"step": 3965
},
{
"epoch": 3.467453036260376,
"grad_norm": 1.2188774347305298,
"learning_rate": 4.308221676035872e-05,
"loss": 0.1116,
"step": 3970
},
{
"epoch": 3.471821756225426,
"grad_norm": 1.0663518905639648,
"learning_rate": 4.285644228643633e-05,
"loss": 0.1012,
"step": 3975
},
{
"epoch": 3.4761904761904763,
"grad_norm": 1.385391116142273,
"learning_rate": 4.263109950479863e-05,
"loss": 0.0935,
"step": 3980
},
{
"epoch": 3.4805591961555264,
"grad_norm": 1.8860664367675781,
"learning_rate": 4.240619011780273e-05,
"loss": 0.0949,
"step": 3985
},
{
"epoch": 3.4849279161205766,
"grad_norm": 1.262670636177063,
"learning_rate": 4.218171582453163e-05,
"loss": 0.061,
"step": 3990
},
{
"epoch": 3.489296636085627,
"grad_norm": 0.7418899536132812,
"learning_rate": 4.195767832078134e-05,
"loss": 0.0726,
"step": 3995
},
{
"epoch": 3.493665356050677,
"grad_norm": 1.0607564449310303,
"learning_rate": 4.1734079299048214e-05,
"loss": 0.0992,
"step": 4000
},
{
"epoch": 3.4980340760157276,
"grad_norm": 2.385943651199341,
"learning_rate": 4.151092044851612e-05,
"loss": 0.1036,
"step": 4005
},
{
"epoch": 3.5024027959807777,
"grad_norm": 1.1264841556549072,
"learning_rate": 4.1288203455043504e-05,
"loss": 0.0756,
"step": 4010
},
{
"epoch": 3.506771515945828,
"grad_norm": 1.05461585521698,
"learning_rate": 4.106593000115091e-05,
"loss": 0.1329,
"step": 4015
},
{
"epoch": 3.511140235910878,
"grad_norm": 0.9836155772209167,
"learning_rate": 4.084410176600815e-05,
"loss": 0.0716,
"step": 4020
},
{
"epoch": 3.5155089558759283,
"grad_norm": 1.2975742816925049,
"learning_rate": 4.062272042542151e-05,
"loss": 0.0982,
"step": 4025
},
{
"epoch": 3.5198776758409784,
"grad_norm": 1.0592769384384155,
"learning_rate": 4.040178765182133e-05,
"loss": 0.0805,
"step": 4030
},
{
"epoch": 3.5242463958060286,
"grad_norm": 1.3170608282089233,
"learning_rate": 4.0181305114249215e-05,
"loss": 0.0898,
"step": 4035
},
{
"epoch": 3.5286151157710792,
"grad_norm": 2.1055305004119873,
"learning_rate": 3.996127447834536e-05,
"loss": 0.1372,
"step": 4040
},
{
"epoch": 3.5329838357361294,
"grad_norm": 1.4089146852493286,
"learning_rate": 3.974169740633614e-05,
"loss": 0.1174,
"step": 4045
},
{
"epoch": 3.5373525557011796,
"grad_norm": 0.8465004563331604,
"learning_rate": 3.952257555702149e-05,
"loss": 0.0694,
"step": 4050
},
{
"epoch": 3.5417212756662297,
"grad_norm": 1.5724999904632568,
"learning_rate": 3.9303910585762296e-05,
"loss": 0.0866,
"step": 4055
},
{
"epoch": 3.54608999563128,
"grad_norm": 1.2664058208465576,
"learning_rate": 3.908570414446802e-05,
"loss": 0.08,
"step": 4060
},
{
"epoch": 3.5504587155963305,
"grad_norm": 1.2871352434158325,
"learning_rate": 3.8867957881584014e-05,
"loss": 0.1048,
"step": 4065
},
{
"epoch": 3.5548274355613803,
"grad_norm": 1.2822154760360718,
"learning_rate": 3.8650673442079355e-05,
"loss": 0.0611,
"step": 4070
},
{
"epoch": 3.559196155526431,
"grad_norm": 1.6582127809524536,
"learning_rate": 3.843385246743417e-05,
"loss": 0.0934,
"step": 4075
},
{
"epoch": 3.563564875491481,
"grad_norm": 1.706179141998291,
"learning_rate": 3.821749659562739e-05,
"loss": 0.0735,
"step": 4080
},
{
"epoch": 3.567933595456531,
"grad_norm": 1.0783063173294067,
"learning_rate": 3.8001607461124314e-05,
"loss": 0.0953,
"step": 4085
},
{
"epoch": 3.5723023154215814,
"grad_norm": 1.1074267625808716,
"learning_rate": 3.7786186694864145e-05,
"loss": 0.0873,
"step": 4090
},
{
"epoch": 3.5766710353866316,
"grad_norm": 1.518434762954712,
"learning_rate": 3.757123592424794e-05,
"loss": 0.0786,
"step": 4095
},
{
"epoch": 3.581039755351682,
"grad_norm": 2.0353386402130127,
"learning_rate": 3.735675677312609e-05,
"loss": 0.0998,
"step": 4100
},
{
"epoch": 3.5854084753167323,
"grad_norm": 2.324700117111206,
"learning_rate": 3.714275086178614e-05,
"loss": 0.0869,
"step": 4105
},
{
"epoch": 3.5897771952817825,
"grad_norm": 1.0576910972595215,
"learning_rate": 3.692921980694057e-05,
"loss": 0.1051,
"step": 4110
},
{
"epoch": 3.5941459152468327,
"grad_norm": 2.1334621906280518,
"learning_rate": 3.671616522171445e-05,
"loss": 0.0839,
"step": 4115
},
{
"epoch": 3.598514635211883,
"grad_norm": 1.1847120523452759,
"learning_rate": 3.650358871563343e-05,
"loss": 0.0862,
"step": 4120
},
{
"epoch": 3.602883355176933,
"grad_norm": 1.9714267253875732,
"learning_rate": 3.629149189461147e-05,
"loss": 0.0978,
"step": 4125
},
{
"epoch": 3.607252075141983,
"grad_norm": 1.4258538484573364,
"learning_rate": 3.6079876360938805e-05,
"loss": 0.0919,
"step": 4130
},
{
"epoch": 3.611620795107034,
"grad_norm": 3.2902631759643555,
"learning_rate": 3.5868743713269635e-05,
"loss": 0.0724,
"step": 4135
},
{
"epoch": 3.615989515072084,
"grad_norm": 1.475931167602539,
"learning_rate": 3.5658095546610306e-05,
"loss": 0.1023,
"step": 4140
},
{
"epoch": 3.620358235037134,
"grad_norm": 1.440937876701355,
"learning_rate": 3.54479334523071e-05,
"loss": 0.0965,
"step": 4145
},
{
"epoch": 3.6247269550021843,
"grad_norm": 1.6201328039169312,
"learning_rate": 3.523825901803427e-05,
"loss": 0.0625,
"step": 4150
},
{
"epoch": 3.6290956749672345,
"grad_norm": 1.9484727382659912,
"learning_rate": 3.502907382778204e-05,
"loss": 0.1019,
"step": 4155
},
{
"epoch": 3.6334643949322847,
"grad_norm": 1.3583996295928955,
"learning_rate": 3.482037946184456e-05,
"loss": 0.0797,
"step": 4160
},
{
"epoch": 3.637833114897335,
"grad_norm": 1.250138282775879,
"learning_rate": 3.461217749680807e-05,
"loss": 0.0689,
"step": 4165
},
{
"epoch": 3.6422018348623855,
"grad_norm": 1.7499765157699585,
"learning_rate": 3.440446950553904e-05,
"loss": 0.0668,
"step": 4170
},
{
"epoch": 3.6465705548274356,
"grad_norm": 1.8903212547302246,
"learning_rate": 3.41972570571721e-05,
"loss": 0.0834,
"step": 4175
},
{
"epoch": 3.650939274792486,
"grad_norm": 1.5133988857269287,
"learning_rate": 3.3990541717098315e-05,
"loss": 0.0839,
"step": 4180
},
{
"epoch": 3.655307994757536,
"grad_norm": 0.8838937878608704,
"learning_rate": 3.378432504695335e-05,
"loss": 0.0721,
"step": 4185
},
{
"epoch": 3.659676714722586,
"grad_norm": 1.4382675886154175,
"learning_rate": 3.3578608604605686e-05,
"loss": 0.0823,
"step": 4190
},
{
"epoch": 3.6640454346876368,
"grad_norm": 1.3646224737167358,
"learning_rate": 3.337339394414473e-05,
"loss": 0.1051,
"step": 4195
},
{
"epoch": 3.6684141546526865,
"grad_norm": 1.4034249782562256,
"learning_rate": 3.316868261586924e-05,
"loss": 0.0734,
"step": 4200
},
{
"epoch": 3.672782874617737,
"grad_norm": 1.422706127166748,
"learning_rate": 3.296447616627557e-05,
"loss": 0.0803,
"step": 4205
},
{
"epoch": 3.6771515945827873,
"grad_norm": 2.778165817260742,
"learning_rate": 3.2760776138045846e-05,
"loss": 0.0705,
"step": 4210
},
{
"epoch": 3.6815203145478375,
"grad_norm": 2.6988394260406494,
"learning_rate": 3.255758407003652e-05,
"loss": 0.1061,
"step": 4215
},
{
"epoch": 3.6858890345128876,
"grad_norm": 1.93489670753479,
"learning_rate": 3.2354901497266596e-05,
"loss": 0.0971,
"step": 4220
},
{
"epoch": 3.690257754477938,
"grad_norm": 1.7685363292694092,
"learning_rate": 3.215272995090612e-05,
"loss": 0.0756,
"step": 4225
},
{
"epoch": 3.6946264744429884,
"grad_norm": 1.8435022830963135,
"learning_rate": 3.195107095826461e-05,
"loss": 0.07,
"step": 4230
},
{
"epoch": 3.6989951944080386,
"grad_norm": 1.3396873474121094,
"learning_rate": 3.174992604277932e-05,
"loss": 0.0739,
"step": 4235
},
{
"epoch": 3.7033639143730888,
"grad_norm": 0.8422791957855225,
"learning_rate": 3.15492967240041e-05,
"loss": 0.0737,
"step": 4240
},
{
"epoch": 3.707732634338139,
"grad_norm": 2.0195155143737793,
"learning_rate": 3.1349184517597586e-05,
"loss": 0.0794,
"step": 4245
},
{
"epoch": 3.712101354303189,
"grad_norm": 1.469390630722046,
"learning_rate": 3.1149590935311964e-05,
"loss": 0.0589,
"step": 4250
},
{
"epoch": 3.7164700742682393,
"grad_norm": 1.3921406269073486,
"learning_rate": 3.0950517484981346e-05,
"loss": 0.0893,
"step": 4255
},
{
"epoch": 3.7208387942332894,
"grad_norm": 3.0969502925872803,
"learning_rate": 3.075196567051061e-05,
"loss": 0.0744,
"step": 4260
},
{
"epoch": 3.72520751419834,
"grad_norm": 1.414971947669983,
"learning_rate": 3.055393699186386e-05,
"loss": 0.0762,
"step": 4265
},
{
"epoch": 3.7295762341633902,
"grad_norm": 1.7889645099639893,
"learning_rate": 3.0356432945053203e-05,
"loss": 0.0496,
"step": 4270
},
{
"epoch": 3.7339449541284404,
"grad_norm": 1.676378846168518,
"learning_rate": 3.0159455022127382e-05,
"loss": 0.1013,
"step": 4275
},
{
"epoch": 3.7383136740934906,
"grad_norm": 1.6016095876693726,
"learning_rate": 2.996300471116047e-05,
"loss": 0.087,
"step": 4280
},
{
"epoch": 3.7426823940585408,
"grad_norm": 1.4002799987792969,
"learning_rate": 2.9767083496240766e-05,
"loss": 0.0772,
"step": 4285
},
{
"epoch": 3.747051114023591,
"grad_norm": 1.0628836154937744,
"learning_rate": 2.9571692857459467e-05,
"loss": 0.0795,
"step": 4290
},
{
"epoch": 3.751419833988641,
"grad_norm": 1.440360426902771,
"learning_rate": 2.9376834270899523e-05,
"loss": 0.0807,
"step": 4295
},
{
"epoch": 3.7557885539536917,
"grad_norm": 1.8291728496551514,
"learning_rate": 2.9182509208624518e-05,
"loss": 0.0868,
"step": 4300
},
{
"epoch": 3.760157273918742,
"grad_norm": 2.1751294136047363,
"learning_rate": 2.898871913866743e-05,
"loss": 0.1253,
"step": 4305
},
{
"epoch": 3.764525993883792,
"grad_norm": 1.6346979141235352,
"learning_rate": 2.879546552501968e-05,
"loss": 0.1091,
"step": 4310
},
{
"epoch": 3.7688947138488422,
"grad_norm": 1.4858824014663696,
"learning_rate": 2.8602749827620033e-05,
"loss": 0.0822,
"step": 4315
},
{
"epoch": 3.7732634338138924,
"grad_norm": 1.60005521774292,
"learning_rate": 2.841057350234354e-05,
"loss": 0.1123,
"step": 4320
},
{
"epoch": 3.777632153778943,
"grad_norm": 2.3461992740631104,
"learning_rate": 2.8218938000990557e-05,
"loss": 0.0868,
"step": 4325
},
{
"epoch": 3.7820008737439927,
"grad_norm": 1.346450686454773,
"learning_rate": 2.8027844771275723e-05,
"loss": 0.0947,
"step": 4330
},
{
"epoch": 3.7863695937090434,
"grad_norm": 5.1837897300720215,
"learning_rate": 2.7837295256817142e-05,
"loss": 0.1054,
"step": 4335
},
{
"epoch": 3.7907383136740935,
"grad_norm": 2.5261070728302,
"learning_rate": 2.7647290897125366e-05,
"loss": 0.0751,
"step": 4340
},
{
"epoch": 3.7951070336391437,
"grad_norm": 1.8613882064819336,
"learning_rate": 2.7457833127592646e-05,
"loss": 0.0675,
"step": 4345
},
{
"epoch": 3.799475753604194,
"grad_norm": 1.7511388063430786,
"learning_rate": 2.7268923379481838e-05,
"loss": 0.0767,
"step": 4350
},
{
"epoch": 3.803844473569244,
"grad_norm": 1.0222926139831543,
"learning_rate": 2.708056307991592e-05,
"loss": 0.1155,
"step": 4355
},
{
"epoch": 3.8082131935342947,
"grad_norm": 2.234628677368164,
"learning_rate": 2.6892753651866997e-05,
"loss": 0.1068,
"step": 4360
},
{
"epoch": 3.812581913499345,
"grad_norm": 1.8765060901641846,
"learning_rate": 2.6705496514145546e-05,
"loss": 0.0806,
"step": 4365
},
{
"epoch": 3.816950633464395,
"grad_norm": 1.1284332275390625,
"learning_rate": 2.6518793081389826e-05,
"loss": 0.0605,
"step": 4370
},
{
"epoch": 3.821319353429445,
"grad_norm": 1.271816372871399,
"learning_rate": 2.6332644764055116e-05,
"loss": 0.095,
"step": 4375
},
{
"epoch": 3.8256880733944953,
"grad_norm": 0.6583630442619324,
"learning_rate": 2.614705296840301e-05,
"loss": 0.0785,
"step": 4380
},
{
"epoch": 3.8300567933595455,
"grad_norm": 0.9801815152168274,
"learning_rate": 2.5962019096490875e-05,
"loss": 0.0781,
"step": 4385
},
{
"epoch": 3.8344255133245957,
"grad_norm": 0.9474557638168335,
"learning_rate": 2.5777544546161246e-05,
"loss": 0.0735,
"step": 4390
},
{
"epoch": 3.8387942332896463,
"grad_norm": 1.8281831741333008,
"learning_rate": 2.559363071103129e-05,
"loss": 0.0916,
"step": 4395
},
{
"epoch": 3.8431629532546965,
"grad_norm": 1.8808575868606567,
"learning_rate": 2.5410278980482107e-05,
"loss": 0.1045,
"step": 4400
},
{
"epoch": 3.8475316732197467,
"grad_norm": 0.7540014982223511,
"learning_rate": 2.5227490739648487e-05,
"loss": 0.0826,
"step": 4405
},
{
"epoch": 3.851900393184797,
"grad_norm": 1.4811105728149414,
"learning_rate": 2.5045267369408265e-05,
"loss": 0.0919,
"step": 4410
},
{
"epoch": 3.856269113149847,
"grad_norm": 1.621551275253296,
"learning_rate": 2.486361024637198e-05,
"loss": 0.0674,
"step": 4415
},
{
"epoch": 3.8606378331148976,
"grad_norm": 1.2833832502365112,
"learning_rate": 2.468252074287245e-05,
"loss": 0.1036,
"step": 4420
},
{
"epoch": 3.8650065530799473,
"grad_norm": 1.6416436433792114,
"learning_rate": 2.4502000226954337e-05,
"loss": 0.1043,
"step": 4425
},
{
"epoch": 3.869375273044998,
"grad_norm": 1.1891071796417236,
"learning_rate": 2.4322050062363945e-05,
"loss": 0.0864,
"step": 4430
},
{
"epoch": 3.873743993010048,
"grad_norm": 1.4844835996627808,
"learning_rate": 2.4142671608538792e-05,
"loss": 0.0876,
"step": 4435
},
{
"epoch": 3.8781127129750983,
"grad_norm": 1.3718053102493286,
"learning_rate": 2.3963866220597442e-05,
"loss": 0.0898,
"step": 4440
},
{
"epoch": 3.8824814329401485,
"grad_norm": 2.1567909717559814,
"learning_rate": 2.378563524932922e-05,
"loss": 0.0705,
"step": 4445
},
{
"epoch": 3.8868501529051986,
"grad_norm": 2.1048190593719482,
"learning_rate": 2.3607980041183918e-05,
"loss": 0.0985,
"step": 4450
},
{
"epoch": 3.8912188728702493,
"grad_norm": 1.1503431797027588,
"learning_rate": 2.343090193826183e-05,
"loss": 0.084,
"step": 4455
},
{
"epoch": 3.895587592835299,
"grad_norm": 0.7724549770355225,
"learning_rate": 2.3254402278303423e-05,
"loss": 0.0734,
"step": 4460
},
{
"epoch": 3.8999563128003496,
"grad_norm": 0.9084768891334534,
"learning_rate": 2.3078482394679325e-05,
"loss": 0.086,
"step": 4465
},
{
"epoch": 3.9043250327653998,
"grad_norm": 1.570974588394165,
"learning_rate": 2.2903143616380274e-05,
"loss": 0.096,
"step": 4470
},
{
"epoch": 3.90869375273045,
"grad_norm": 1.4236451387405396,
"learning_rate": 2.2728387268006922e-05,
"loss": 0.0946,
"step": 4475
},
{
"epoch": 3.9130624726955,
"grad_norm": 1.0874404907226562,
"learning_rate": 2.2554214669760054e-05,
"loss": 0.0875,
"step": 4480
},
{
"epoch": 3.9174311926605503,
"grad_norm": 1.8805941343307495,
"learning_rate": 2.2380627137430443e-05,
"loss": 0.0804,
"step": 4485
},
{
"epoch": 3.921799912625601,
"grad_norm": 2.1116044521331787,
"learning_rate": 2.2207625982389025e-05,
"loss": 0.1026,
"step": 4490
},
{
"epoch": 3.926168632590651,
"grad_norm": 1.3165743350982666,
"learning_rate": 2.203521251157682e-05,
"loss": 0.0777,
"step": 4495
},
{
"epoch": 3.9305373525557012,
"grad_norm": 1.7966806888580322,
"learning_rate": 2.18633880274953e-05,
"loss": 0.0906,
"step": 4500
},
{
"epoch": 3.9349060725207514,
"grad_norm": 1.0090104341506958,
"learning_rate": 2.169215382819636e-05,
"loss": 0.0713,
"step": 4505
},
{
"epoch": 3.9392747924858016,
"grad_norm": 0.9656439423561096,
"learning_rate": 2.1521511207272595e-05,
"loss": 0.0654,
"step": 4510
},
{
"epoch": 3.9436435124508518,
"grad_norm": 0.5651183724403381,
"learning_rate": 2.135146145384752e-05,
"loss": 0.0727,
"step": 4515
},
{
"epoch": 3.948012232415902,
"grad_norm": 2.1474905014038086,
"learning_rate": 2.1182005852565755e-05,
"loss": 0.0808,
"step": 4520
},
{
"epoch": 3.9523809523809526,
"grad_norm": 1.0668432712554932,
"learning_rate": 2.1013145683583446e-05,
"loss": 0.1044,
"step": 4525
},
{
"epoch": 3.9567496723460027,
"grad_norm": 0.9837005138397217,
"learning_rate": 2.084488222255855e-05,
"loss": 0.0537,
"step": 4530
},
{
"epoch": 3.961118392311053,
"grad_norm": 0.860316812992096,
"learning_rate": 2.0677216740641082e-05,
"loss": 0.0666,
"step": 4535
},
{
"epoch": 3.965487112276103,
"grad_norm": 1.4890446662902832,
"learning_rate": 2.051015050446372e-05,
"loss": 0.0707,
"step": 4540
},
{
"epoch": 3.9698558322411532,
"grad_norm": 0.803032636642456,
"learning_rate": 2.0343684776132098e-05,
"loss": 0.0836,
"step": 4545
},
{
"epoch": 3.974224552206204,
"grad_norm": 1.6997323036193848,
"learning_rate": 2.017782081321523e-05,
"loss": 0.077,
"step": 4550
},
{
"epoch": 3.9785932721712536,
"grad_norm": 0.9332925081253052,
"learning_rate": 2.0012559868736157e-05,
"loss": 0.0739,
"step": 4555
},
{
"epoch": 3.982961992136304,
"grad_norm": 0.9704071879386902,
"learning_rate": 1.984790319116239e-05,
"loss": 0.0738,
"step": 4560
},
{
"epoch": 3.9873307121013544,
"grad_norm": 4.083499431610107,
"learning_rate": 1.96838520243965e-05,
"loss": 0.0937,
"step": 4565
},
{
"epoch": 3.9916994320664045,
"grad_norm": 1.2434906959533691,
"learning_rate": 1.9520407607766644e-05,
"loss": 0.0514,
"step": 4570
},
{
"epoch": 3.9960681520314547,
"grad_norm": 1.4120557308197021,
"learning_rate": 1.9357571176017343e-05,
"loss": 0.0881,
"step": 4575
},
{
"epoch": 4.0,
"grad_norm": 2.6155974864959717,
"learning_rate": 1.9195343959300073e-05,
"loss": 0.079,
"step": 4580
},
{
"epoch": 4.004368719965051,
"grad_norm": 0.6208202838897705,
"learning_rate": 1.9033727183163964e-05,
"loss": 0.054,
"step": 4585
},
{
"epoch": 4.0087374399301,
"grad_norm": 1.010776162147522,
"learning_rate": 1.88727220685466e-05,
"loss": 0.0404,
"step": 4590
},
{
"epoch": 4.013106159895151,
"grad_norm": 0.9651484489440918,
"learning_rate": 1.8712329831764663e-05,
"loss": 0.077,
"step": 4595
},
{
"epoch": 4.017474879860201,
"grad_norm": 0.5756092071533203,
"learning_rate": 1.8552551684504904e-05,
"loss": 0.0353,
"step": 4600
}
],
"logging_steps": 5,
"max_steps": 5725,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2233814967263232.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}