Qwen2.5-0.5B-Open-R1-Code-GRPO / trainer_state.json
rasdani's picture
Model save
b6cdafc verified
raw
history blame
200 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.07184166097920183,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 577.5625,
"epoch": 0.00014368332195840368,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.3333333333333335e-07,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 595.375,
"epoch": 0.00028736664391680735,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 6.666666666666667e-07,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 489.875,
"epoch": 0.00043104996587521106,
"grad_norm": 1.5707740783691406,
"kl": 0.0,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.0478,
"reward": 0.0062500000931322575,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0625,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 465.875,
"epoch": 0.0005747332878336147,
"grad_norm": 0.003557927906513214,
"kl": 0.00043487548828125,
"learning_rate": 1.3333333333333334e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 671.5,
"epoch": 0.0007184166097920184,
"grad_norm": 0.0033763274550437927,
"kl": 0.0005903244018554688,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 507.375,
"epoch": 0.0008620999317504221,
"grad_norm": 0.003343924880027771,
"kl": 0.0004489421844482422,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 640.125,
"epoch": 0.0010057832537088258,
"grad_norm": 0.003249621717259288,
"kl": 0.0005922317504882812,
"learning_rate": 2.3333333333333336e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 574.875,
"epoch": 0.0011494665756672294,
"grad_norm": 0.004247524309903383,
"kl": 0.000701904296875,
"learning_rate": 2.666666666666667e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 571.1875,
"epoch": 0.001293149897625633,
"grad_norm": 0.004429314751178026,
"kl": 0.0005669593811035156,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 624.5625,
"epoch": 0.0014368332195840367,
"grad_norm": 0.004792630672454834,
"kl": 0.0008535385131835938,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 634.9375,
"epoch": 0.0015805165415424404,
"grad_norm": 0.004688590299338102,
"kl": 0.00080108642578125,
"learning_rate": 3.6666666666666666e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 681.125,
"epoch": 0.0017241998635008442,
"grad_norm": 0.004042426124215126,
"kl": 0.0007867813110351562,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 552.8125,
"epoch": 0.0018678831854592479,
"grad_norm": 0.007403955794870853,
"kl": 0.0011396408081054688,
"learning_rate": 4.333333333333334e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 774.5,
"epoch": 0.0020115665074176515,
"grad_norm": 0.007942945696413517,
"kl": 0.00086212158203125,
"learning_rate": 4.666666666666667e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 648.875,
"epoch": 0.0021552498293760554,
"grad_norm": 0.011233330704271793,
"kl": 0.0015621185302734375,
"learning_rate": 5e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 395.125,
"epoch": 0.002298933151334459,
"grad_norm": 2.4543673992156982,
"kl": 0.007465362548828125,
"learning_rate": 4.999952797253148e-06,
"loss": -0.0963,
"reward": 0.0062500000931322575,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0625,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 590.5625,
"epoch": 0.0024426164732928627,
"grad_norm": 0.03457748889923096,
"kl": 0.0042629241943359375,
"learning_rate": 4.9998111909931225e-06,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 466.75,
"epoch": 0.002586299795251266,
"grad_norm": 1.883467674255371,
"kl": 0.013484954833984375,
"learning_rate": 4.999575187161439e-06,
"loss": -0.0091,
"reward": 0.0062500000931322575,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0625,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 644.25,
"epoch": 0.00272998311720967,
"grad_norm": 1.1460031270980835,
"kl": 0.02417755126953125,
"learning_rate": 4.9992447956603455e-06,
"loss": 0.0375,
"reward": 0.0062500000931322575,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0625,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 424.125,
"epoch": 0.0028736664391680734,
"grad_norm": 38.29722213745117,
"kl": 5.15692138671875,
"learning_rate": 4.998820030352409e-06,
"loss": 0.0845,
"reward": 0.012500000186264515,
"reward_std": 0.014433757402002811,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.125,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 465.375,
"epoch": 0.0030173497611264773,
"grad_norm": 554.5137329101562,
"kl": 12.4742431640625,
"learning_rate": 4.998300909059929e-06,
"loss": 0.1845,
"reward": 0.02500000037252903,
"reward_std": 0.03943375777453184,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.25,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 531.0625,
"epoch": 0.0031610330830848807,
"grad_norm": 5.935389041900635,
"kl": 0.32598876953125,
"learning_rate": 4.997687453564198e-06,
"loss": 0.003,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 391.9375,
"epoch": 0.0033047164050432846,
"grad_norm": 2.4972121715545654,
"kl": 0.0226593017578125,
"learning_rate": 4.9969796896045775e-06,
"loss": -0.0487,
"reward": 0.0062500000931322575,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0625,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 595.8125,
"epoch": 0.0034483997270016884,
"grad_norm": 0.053129617124795914,
"kl": 0.0148468017578125,
"learning_rate": 4.996177646877426e-06,
"loss": 0.0001,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 570.125,
"epoch": 0.003592083048960092,
"grad_norm": 0.026312552392482758,
"kl": 0.0113067626953125,
"learning_rate": 4.995281359034851e-06,
"loss": 0.0001,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 474.0,
"epoch": 0.0037357663709184957,
"grad_norm": 0.03218882903456688,
"kl": 0.0166015625,
"learning_rate": 4.994290863683296e-06,
"loss": 0.0002,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 610.6875,
"epoch": 0.003879449692876899,
"grad_norm": 0.051387328654527664,
"kl": 0.01434326171875,
"learning_rate": 4.99320620238196e-06,
"loss": 0.0001,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 594.0625,
"epoch": 0.004023133014835303,
"grad_norm": 0.26794055104255676,
"kl": 0.034637451171875,
"learning_rate": 4.99202742064106e-06,
"loss": 0.0003,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 523.8125,
"epoch": 0.004166816336793707,
"grad_norm": 0.02611120045185089,
"kl": 0.01556396484375,
"learning_rate": 4.990754567919917e-06,
"loss": 0.0002,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 606.4375,
"epoch": 0.004310499658752111,
"grad_norm": 0.028897596523165703,
"kl": 0.019256591796875,
"learning_rate": 4.989387697624881e-06,
"loss": 0.0002,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 569.1875,
"epoch": 0.004454182980710514,
"grad_norm": 0.024760432541370392,
"kl": 0.01702880859375,
"learning_rate": 4.987926867107095e-06,
"loss": 0.0002,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 694.3125,
"epoch": 0.004597866302668918,
"grad_norm": 0.027595188468694687,
"kl": 0.018402099609375,
"learning_rate": 4.986372137660078e-06,
"loss": 0.0002,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 536.375,
"epoch": 0.0047415496246273215,
"grad_norm": 0.02404957078397274,
"kl": 0.01806640625,
"learning_rate": 4.984723574517165e-06,
"loss": 0.0002,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 604.375,
"epoch": 0.004885232946585725,
"grad_norm": 0.025466497987508774,
"kl": 0.01824951171875,
"learning_rate": 4.9829812468487655e-06,
"loss": 0.0002,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 637.8125,
"epoch": 0.005028916268544128,
"grad_norm": 0.029370181262493134,
"kl": 0.0229644775390625,
"learning_rate": 4.981145227759457e-06,
"loss": 0.0002,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 597.1875,
"epoch": 0.005172599590502532,
"grad_norm": 0.03050071932375431,
"kl": 0.022796630859375,
"learning_rate": 4.979215594284924e-06,
"loss": 0.0002,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 650.5625,
"epoch": 0.005316282912460936,
"grad_norm": 0.022108623757958412,
"kl": 0.019073486328125,
"learning_rate": 4.977192427388722e-06,
"loss": 0.0002,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 711.1875,
"epoch": 0.00545996623441934,
"grad_norm": 0.034452371299266815,
"kl": 0.022613525390625,
"learning_rate": 4.9750758119588824e-06,
"loss": 0.0002,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 613.1875,
"epoch": 0.005603649556377744,
"grad_norm": 0.022646795958280563,
"kl": 0.019561767578125,
"learning_rate": 4.972865836804349e-06,
"loss": 0.0002,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 592.3125,
"epoch": 0.005747332878336147,
"grad_norm": 0.04067623242735863,
"kl": 0.02325439453125,
"learning_rate": 4.970562594651254e-06,
"loss": 0.0002,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 553.875,
"epoch": 0.005891016200294551,
"grad_norm": 0.0206755418330431,
"kl": 0.0185546875,
"learning_rate": 4.968166182139026e-06,
"loss": 0.0002,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 576.0,
"epoch": 0.0060346995222529546,
"grad_norm": 0.027665315195918083,
"kl": 0.025482177734375,
"learning_rate": 4.9656766998163306e-06,
"loss": 0.0003,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 673.0625,
"epoch": 0.006178382844211358,
"grad_norm": 0.028020743280649185,
"kl": 0.0281982421875,
"learning_rate": 4.963094252136865e-06,
"loss": 0.0003,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 758.5,
"epoch": 0.006322066166169761,
"grad_norm": 0.018797732889652252,
"kl": 0.0179443359375,
"learning_rate": 4.960418947454958e-06,
"loss": 0.0002,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 650.125,
"epoch": 0.006465749488128165,
"grad_norm": 0.02184910513460636,
"kl": 0.021484375,
"learning_rate": 4.957650898021038e-06,
"loss": 0.0002,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 659.1875,
"epoch": 0.006609432810086569,
"grad_norm": 0.020105620846152306,
"kl": 0.021453857421875,
"learning_rate": 4.954790219976915e-06,
"loss": 0.0002,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 564.5625,
"epoch": 0.006753116132044973,
"grad_norm": 0.021222786977887154,
"kl": 0.02001953125,
"learning_rate": 4.95183703335091e-06,
"loss": 0.0002,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 534.0625,
"epoch": 0.006896799454003377,
"grad_norm": 0.03832077980041504,
"kl": 0.0220489501953125,
"learning_rate": 4.948791462052819e-06,
"loss": 0.0002,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 689.8125,
"epoch": 0.00704048277596178,
"grad_norm": 7.35048770904541,
"kl": 0.247314453125,
"learning_rate": 4.945653633868716e-06,
"loss": 0.0019,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 699.75,
"epoch": 0.007184166097920184,
"grad_norm": 0.02264636568725109,
"kl": 0.01715087890625,
"learning_rate": 4.942423680455584e-06,
"loss": 0.0002,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 627.75,
"epoch": 0.007327849419878588,
"grad_norm": 0.06823224574327469,
"kl": 0.2841796875,
"learning_rate": 4.939101737335802e-06,
"loss": 0.0028,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 716.875,
"epoch": 0.0074715327418369915,
"grad_norm": 0.12825427949428558,
"kl": 0.3291015625,
"learning_rate": 4.935687943891447e-06,
"loss": 0.0034,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 584.125,
"epoch": 0.007615216063795395,
"grad_norm": 0.09328959882259369,
"kl": 0.33056640625,
"learning_rate": 4.932182443358458e-06,
"loss": 0.0034,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 679.8125,
"epoch": 0.007758899385753798,
"grad_norm": 0.1614077389240265,
"kl": 0.3330078125,
"learning_rate": 4.928585382820616e-06,
"loss": 0.0033,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 524.5,
"epoch": 0.007902582707712203,
"grad_norm": 0.08818939328193665,
"kl": 0.32177734375,
"learning_rate": 4.924896913203376e-06,
"loss": 0.0031,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 613.125,
"epoch": 0.008046266029670606,
"grad_norm": 0.10239657759666443,
"kl": 0.337890625,
"learning_rate": 4.921117189267535e-06,
"loss": 0.0033,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 684.375,
"epoch": 0.008189949351629009,
"grad_norm": 0.21877345442771912,
"kl": 0.295166015625,
"learning_rate": 4.917246369602742e-06,
"loss": 0.0029,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 595.125,
"epoch": 0.008333632673587414,
"grad_norm": 0.17493651807308197,
"kl": 0.32080078125,
"learning_rate": 4.9132846166208355e-06,
"loss": 0.0031,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 654.1875,
"epoch": 0.008477315995545817,
"grad_norm": 0.08061961829662323,
"kl": 0.28564453125,
"learning_rate": 4.9092320965490365e-06,
"loss": 0.0029,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 704.5625,
"epoch": 0.008620999317504222,
"grad_norm": 0.05985090509057045,
"kl": 0.2939453125,
"learning_rate": 4.905088979422971e-06,
"loss": 0.003,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 663.4375,
"epoch": 0.008764682639462625,
"grad_norm": 0.06948310881853104,
"kl": 0.265625,
"learning_rate": 4.900855439079536e-06,
"loss": 0.0026,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 608.5625,
"epoch": 0.008908365961421028,
"grad_norm": 0.053070612251758575,
"kl": 0.240966796875,
"learning_rate": 4.8965316531496055e-06,
"loss": 0.0024,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 726.3125,
"epoch": 0.009052049283379432,
"grad_norm": 0.05458589643239975,
"kl": 0.271484375,
"learning_rate": 4.892117803050578e-06,
"loss": 0.0027,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 760.3125,
"epoch": 0.009195732605337835,
"grad_norm": 0.06874047219753265,
"kl": 0.2734375,
"learning_rate": 4.887614073978761e-06,
"loss": 0.0027,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 895.75,
"epoch": 0.009339415927296238,
"grad_norm": 0.055456191301345825,
"kl": 0.260009765625,
"learning_rate": 4.883020654901609e-06,
"loss": 0.0026,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 769.875,
"epoch": 0.009483099249254643,
"grad_norm": 0.04558374360203743,
"kl": 0.234375,
"learning_rate": 4.878337738549785e-06,
"loss": 0.0023,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 825.875,
"epoch": 0.009626782571213046,
"grad_norm": 0.050083279609680176,
"kl": 0.259765625,
"learning_rate": 4.873565521409082e-06,
"loss": 0.0026,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 679.75,
"epoch": 0.00977046589317145,
"grad_norm": 0.07992015033960342,
"kl": 0.26416015625,
"learning_rate": 4.868704203712173e-06,
"loss": 0.0026,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 807.375,
"epoch": 0.009914149215129854,
"grad_norm": 0.07405384629964828,
"kl": 0.27001953125,
"learning_rate": 4.86375398943021e-06,
"loss": 0.0026,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 638.5,
"epoch": 0.010057832537088257,
"grad_norm": 0.5143739581108093,
"kl": 0.352783203125,
"learning_rate": 4.858715086264274e-06,
"loss": 0.003,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 668.875,
"epoch": 0.010201515859046661,
"grad_norm": 0.053426701575517654,
"kl": 0.2099609375,
"learning_rate": 4.853587705636646e-06,
"loss": 0.0021,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 605.375,
"epoch": 0.010345199181005064,
"grad_norm": 0.18540872633457184,
"kl": 0.29345703125,
"learning_rate": 4.84837206268195e-06,
"loss": 0.003,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 663.4375,
"epoch": 0.01048888250296347,
"grad_norm": 0.11063985526561737,
"kl": 0.263671875,
"learning_rate": 4.8430683762381195e-06,
"loss": 0.0026,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 627.625,
"epoch": 0.010632565824921872,
"grad_norm": 0.09360551834106445,
"kl": 0.267822265625,
"learning_rate": 4.837676868837213e-06,
"loss": 0.0026,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 581.25,
"epoch": 0.010776249146880275,
"grad_norm": 0.4334953725337982,
"kl": 0.3037109375,
"learning_rate": 4.832197766696085e-06,
"loss": 0.003,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 570.875,
"epoch": 0.01091993246883868,
"grad_norm": 0.1146390363574028,
"kl": 0.36865234375,
"learning_rate": 4.826631299706887e-06,
"loss": 0.0028,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 431.25,
"epoch": 0.011063615790797083,
"grad_norm": 0.06237626075744629,
"kl": 0.34423828125,
"learning_rate": 4.820977701427424e-06,
"loss": 0.0027,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 304.25,
"epoch": 0.011207299112755488,
"grad_norm": 0.06463415175676346,
"kl": 0.2822265625,
"learning_rate": 4.81523720907136e-06,
"loss": 0.0026,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 407.0,
"epoch": 0.01135098243471389,
"grad_norm": 0.12471897900104523,
"kl": 0.29931640625,
"learning_rate": 4.809410063498254e-06,
"loss": 0.0028,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 294.875,
"epoch": 0.011494665756672294,
"grad_norm": 0.14441123604774475,
"kl": 0.40478515625,
"learning_rate": 4.8034965092034656e-06,
"loss": 0.0032,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 483.25,
"epoch": 0.011638349078630698,
"grad_norm": 0.07597321271896362,
"kl": 0.30029296875,
"learning_rate": 4.797496794307889e-06,
"loss": 0.003,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 456.875,
"epoch": 0.011782032400589101,
"grad_norm": 0.09612507373094559,
"kl": 0.3154296875,
"learning_rate": 4.791411170547545e-06,
"loss": 0.0031,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 346.75,
"epoch": 0.011925715722547506,
"grad_norm": 0.16882579028606415,
"kl": 0.29443359375,
"learning_rate": 4.785239893263017e-06,
"loss": 0.0029,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 380.25,
"epoch": 0.012069399044505909,
"grad_norm": 0.07839754223823547,
"kl": 0.29638671875,
"learning_rate": 4.778983221388742e-06,
"loss": 0.0028,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 339.0625,
"epoch": 0.012213082366464312,
"grad_norm": 0.11466533690690994,
"kl": 0.298583984375,
"learning_rate": 4.77264141744214e-06,
"loss": 0.0027,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 449.0,
"epoch": 0.012356765688422717,
"grad_norm": 0.09201041609048843,
"kl": 0.29052734375,
"learning_rate": 4.766214747512603e-06,
"loss": 0.0028,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 397.875,
"epoch": 0.01250044901038112,
"grad_norm": 0.5096721053123474,
"kl": 0.31689453125,
"learning_rate": 4.759703481250331e-06,
"loss": 0.0033,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 388.625,
"epoch": 0.012644132332339523,
"grad_norm": 0.07259467989206314,
"kl": 0.352294921875,
"learning_rate": 4.753107891855015e-06,
"loss": 0.0025,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 374.125,
"epoch": 0.012787815654297928,
"grad_norm": 0.07488111406564713,
"kl": 0.257080078125,
"learning_rate": 4.746428256064375e-06,
"loss": 0.0026,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 514.6875,
"epoch": 0.01293149897625633,
"grad_norm": 0.04593910649418831,
"kl": 0.2216796875,
"learning_rate": 4.7396648541425534e-06,
"loss": 0.0023,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 374.0,
"epoch": 0.013075182298214735,
"grad_norm": 0.28726693987846375,
"kl": 0.4638671875,
"learning_rate": 4.732817969868348e-06,
"loss": 0.0044,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 430.0625,
"epoch": 0.013218865620173138,
"grad_norm": 0.0935889482498169,
"kl": 0.26025390625,
"learning_rate": 4.7258878905233095e-06,
"loss": 0.0025,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 450.75,
"epoch": 0.013362548942131541,
"grad_norm": 0.13612554967403412,
"kl": 0.31982421875,
"learning_rate": 4.718874906879688e-06,
"loss": 0.003,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 419.25,
"epoch": 0.013506232264089946,
"grad_norm": 0.5188797116279602,
"kl": 0.435546875,
"learning_rate": 4.711779313188231e-06,
"loss": 0.004,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 428.75,
"epoch": 0.013649915586048349,
"grad_norm": 0.06785713881254196,
"kl": 0.2880859375,
"learning_rate": 4.70460140716584e-06,
"loss": 0.0027,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 417.1875,
"epoch": 0.013793598908006754,
"grad_norm": 0.10226578265428543,
"kl": 0.3046875,
"learning_rate": 4.697341489983076e-06,
"loss": 0.0027,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 529.8125,
"epoch": 0.013937282229965157,
"grad_norm": 0.09858646988868713,
"kl": 0.30908203125,
"learning_rate": 4.6899998662515215e-06,
"loss": 0.0029,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 432.4375,
"epoch": 0.01408096555192356,
"grad_norm": 0.4978332817554474,
"kl": 0.31787109375,
"learning_rate": 4.682576844011007e-06,
"loss": 0.0032,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 568.4375,
"epoch": 0.014224648873881965,
"grad_norm": 0.07428912818431854,
"kl": 0.277587890625,
"learning_rate": 4.675072734716678e-06,
"loss": 0.0025,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 99
},
{
"clip_ratio": 0.0,
"completion_length": 410.3125,
"epoch": 0.014368332195840368,
"grad_norm": 0.09108950942754745,
"kl": 0.333251953125,
"learning_rate": 4.667487853225931e-06,
"loss": 0.003,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 583.8125,
"epoch": 0.014512015517798772,
"grad_norm": 0.07506731152534485,
"kl": 0.4638671875,
"learning_rate": 4.659822517785203e-06,
"loss": 0.0029,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 468.25,
"epoch": 0.014655698839757175,
"grad_norm": 0.08625821769237518,
"kl": 0.314453125,
"learning_rate": 4.6520770500166165e-06,
"loss": 0.0028,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 462.5,
"epoch": 0.014799382161715578,
"grad_norm": 0.06079576537013054,
"kl": 0.28369140625,
"learning_rate": 4.644251774904487e-06,
"loss": 0.0028,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 440.6875,
"epoch": 0.014943065483673983,
"grad_norm": 0.0945528894662857,
"kl": 0.33642578125,
"learning_rate": 4.636347020781684e-06,
"loss": 0.0036,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 607.875,
"epoch": 0.015086748805632386,
"grad_norm": 0.08069407939910889,
"kl": 0.31298828125,
"learning_rate": 4.6283631193158605e-06,
"loss": 0.0031,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 564.375,
"epoch": 0.01523043212759079,
"grad_norm": 0.055641427636146545,
"kl": 0.29150390625,
"learning_rate": 4.620300405495532e-06,
"loss": 0.003,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 531.4375,
"epoch": 0.015374115449549194,
"grad_norm": 0.061065420508384705,
"kl": 0.302734375,
"learning_rate": 4.612159217616022e-06,
"loss": 0.0029,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 493.1875,
"epoch": 0.015517798771507597,
"grad_norm": 0.06338890641927719,
"kl": 0.32373046875,
"learning_rate": 4.603939897265268e-06,
"loss": 0.0027,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 508.625,
"epoch": 0.015661482093466,
"grad_norm": 0.4134579300880432,
"kl": 0.328125,
"learning_rate": 4.595642789309492e-06,
"loss": 0.0033,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 460.9375,
"epoch": 0.015805165415424406,
"grad_norm": 0.04516274109482765,
"kl": 0.276611328125,
"learning_rate": 4.587268241878724e-06,
"loss": 0.0026,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 561.3125,
"epoch": 0.01594884873738281,
"grad_norm": 0.0691392570734024,
"kl": 0.27783203125,
"learning_rate": 4.578816606352205e-06,
"loss": 0.0027,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 548.25,
"epoch": 0.016092532059341212,
"grad_norm": 0.05729848891496658,
"kl": 0.342529296875,
"learning_rate": 4.570288237343632e-06,
"loss": 0.0027,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 516.1875,
"epoch": 0.016236215381299615,
"grad_norm": 0.073029063642025,
"kl": 0.267333984375,
"learning_rate": 4.561683492686289e-06,
"loss": 0.0024,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 502.375,
"epoch": 0.016379898703258018,
"grad_norm": 1.27533757686615,
"kl": 0.22216796875,
"learning_rate": 4.5530027334180285e-06,
"loss": 0.0409,
"reward": 0.0062500000931322575,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0625,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 522.25,
"epoch": 0.016523582025216425,
"grad_norm": 0.0462871678173542,
"kl": 0.23681640625,
"learning_rate": 4.544246323766122e-06,
"loss": 0.0023,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 582.8125,
"epoch": 0.016667265347174828,
"grad_norm": 0.042445018887519836,
"kl": 0.22998046875,
"learning_rate": 4.535414631131983e-06,
"loss": 0.0022,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 514.3125,
"epoch": 0.01681094866913323,
"grad_norm": 1.90855872631073,
"kl": 0.34375,
"learning_rate": 4.526508026075746e-06,
"loss": -0.0011,
"reward": 0.012500000186264515,
"reward_std": 0.014433757402002811,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.125,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 440.4375,
"epoch": 0.016954631991091634,
"grad_norm": 0.08046559244394302,
"kl": 0.358154296875,
"learning_rate": 4.517526882300721e-06,
"loss": 0.0024,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 449.5,
"epoch": 0.017098315313050037,
"grad_norm": 0.14999990165233612,
"kl": 0.302978515625,
"learning_rate": 4.508471576637713e-06,
"loss": 0.0024,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 472.5,
"epoch": 0.017241998635008443,
"grad_norm": 1.8022429943084717,
"kl": 0.251953125,
"learning_rate": 4.499342489029211e-06,
"loss": 0.0915,
"reward": 0.012500000186264515,
"reward_std": 0.02500000037252903,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.125,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 475.9375,
"epoch": 0.017385681956966846,
"grad_norm": 0.07380665838718414,
"kl": 0.29052734375,
"learning_rate": 4.490140002513449e-06,
"loss": 0.0026,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 466.0625,
"epoch": 0.01752936527892525,
"grad_norm": 0.0822119191288948,
"kl": 0.294189453125,
"learning_rate": 4.48086450320833e-06,
"loss": 0.0026,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 406.75,
"epoch": 0.017673048600883652,
"grad_norm": 1.9022564888000488,
"kl": 0.2939453125,
"learning_rate": 4.4715163802952266e-06,
"loss": -0.1072,
"reward": 0.0062500000931322575,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0625,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 523.875,
"epoch": 0.017816731922842055,
"grad_norm": 0.10652535408735275,
"kl": 0.25537109375,
"learning_rate": 4.462096026002655e-06,
"loss": 0.0025,
"reward": 0.0,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 519.125,
"epoch": 0.017960415244800458,
"grad_norm": 1.6003073453903198,
"kl": 0.260986328125,
"learning_rate": 4.4526038355898144e-06,
"loss": -0.074,
"reward": 0.012500000186264515,
"reward_std": 0.014433757402002811,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.125,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 609.3125,
"epoch": 0.018104098566758865,
"grad_norm": 1.3760863542556763,
"kl": 0.2841796875,
"learning_rate": 4.4430402073300035e-06,
"loss": 0.0366,
"reward": 0.012500000186264515,
"reward_std": 0.014433757402002811,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.125,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 467.5,
"epoch": 0.018247781888717268,
"grad_norm": 1.8627442121505737,
"kl": 0.284423828125,
"learning_rate": 4.433405542493909e-06,
"loss": 0.0936,
"reward": 0.018750000279396772,
"reward_std": 0.026933757588267326,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.1875,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 618.1875,
"epoch": 0.01839146521067567,
"grad_norm": 1.4385063648223877,
"kl": 0.298828125,
"learning_rate": 4.4237002453327734e-06,
"loss": 0.0351,
"reward": 0.012500000186264515,
"reward_std": 0.02500000037252903,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.125,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 506.875,
"epoch": 0.018535148532634074,
"grad_norm": 2.3687305450439453,
"kl": 0.316162109375,
"learning_rate": 4.4139247230614245e-06,
"loss": -0.0343,
"reward": 0.018750000279396772,
"reward_std": 0.026933757588267326,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.1875,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 444.375,
"epoch": 0.018678831854592477,
"grad_norm": 2.7708656787872314,
"kl": 0.35009765625,
"learning_rate": 4.404079385841201e-06,
"loss": -0.0774,
"reward": 0.056250001303851604,
"reward_std": 0.05580127239227295,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.5625,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 494.0625,
"epoch": 0.018822515176550883,
"grad_norm": 1.9888551235198975,
"kl": 0.37109375,
"learning_rate": 4.394164646762734e-06,
"loss": -0.1476,
"reward": 0.08750000223517418,
"reward_std": 0.02500000037252903,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 505.5,
"epoch": 0.018966198498509286,
"grad_norm": 2.9615418910980225,
"kl": 0.39111328125,
"learning_rate": 4.384180921828618e-06,
"loss": 0.1607,
"reward": 0.06250000186264515,
"reward_std": 0.05386751517653465,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.625,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 374.0625,
"epoch": 0.01910988182046769,
"grad_norm": 2.8335933685302734,
"kl": 0.720703125,
"learning_rate": 4.374128629935955e-06,
"loss": 0.0248,
"reward": 0.08750000223517418,
"reward_std": 0.02500000037252903,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 491.25,
"epoch": 0.019253565142426092,
"grad_norm": 2.8478641510009766,
"kl": 0.4345703125,
"learning_rate": 4.364008192858781e-06,
"loss": -0.0556,
"reward": 0.06250000186264515,
"reward_std": 0.05386751517653465,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.625,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 363.625,
"epoch": 0.019397248464384495,
"grad_norm": 2.654599905014038,
"kl": 0.5478515625,
"learning_rate": 4.353820035230366e-06,
"loss": -0.0377,
"reward": 0.08125000167638063,
"reward_std": 0.026933757588267326,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8125,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 558.4375,
"epoch": 0.0195409317863429,
"grad_norm": 1.9685494899749756,
"kl": 0.3134765625,
"learning_rate": 4.3435645845254e-06,
"loss": 0.0397,
"reward": 0.08125000167638063,
"reward_std": 0.026933757588267326,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8125,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 407.1875,
"epoch": 0.019684615108301304,
"grad_norm": 3.2566757202148438,
"kl": 0.39111328125,
"learning_rate": 4.333242271042054e-06,
"loss": 0.1668,
"reward": 0.07500000204890966,
"reward_std": 0.03943375777453184,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.75,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 372.9375,
"epoch": 0.019828298430259707,
"grad_norm": 2.8907630443573,
"kl": 0.41943359375,
"learning_rate": 4.32285352788393e-06,
"loss": 0.1106,
"reward": 0.07500000204890966,
"reward_std": 0.03943375777453184,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.75,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 385.625,
"epoch": 0.01997198175221811,
"grad_norm": 3.1960320472717285,
"kl": 0.44140625,
"learning_rate": 4.312398790941882e-06,
"loss": 0.0251,
"reward": 0.07500000204890966,
"reward_std": 0.03943375777453184,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.75,
"step": 139
},
{
"clip_ratio": 0.0,
"completion_length": 222.3125,
"epoch": 0.020115665074176513,
"grad_norm": 3.963812828063965,
"kl": 0.42822265625,
"learning_rate": 4.301878498875735e-06,
"loss": -0.0553,
"reward": 0.07500000204890966,
"reward_std": 0.03943375777453184,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.75,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 208.6875,
"epoch": 0.02025934839613492,
"grad_norm": 24269.09375,
"kl": 984.3603515625,
"learning_rate": 4.291293093095873e-06,
"loss": 13.9158,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 272.0625,
"epoch": 0.020403031718093323,
"grad_norm": 41.54574966430664,
"kl": 1.75048828125,
"learning_rate": 4.280643017744723e-06,
"loss": 0.0095,
"reward": 0.08125000121071935,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8125,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 301.5,
"epoch": 0.020546715040051726,
"grad_norm": 1.2713871002197266,
"kl": 0.52978515625,
"learning_rate": 4.269928719678117e-06,
"loss": 0.005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 239.375,
"epoch": 0.02069039836201013,
"grad_norm": 3.0878281593322754,
"kl": 0.755859375,
"learning_rate": 4.2591506484465426e-06,
"loss": -0.0796,
"reward": 0.08750000223517418,
"reward_std": 0.02500000037252903,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 193.3125,
"epoch": 0.020834081683968532,
"grad_norm": 3.4972658157348633,
"kl": 0.58154296875,
"learning_rate": 4.248309256276283e-06,
"loss": -0.0246,
"reward": 0.08125000167638063,
"reward_std": 0.026933757588267326,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8125,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 222.8125,
"epoch": 0.02097776500592694,
"grad_norm": 4.0359272956848145,
"kl": 0.568359375,
"learning_rate": 4.23740499805044e-06,
"loss": -0.1349,
"reward": 0.08125000260770321,
"reward_std": 0.037500000558793545,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8125,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 153.375,
"epoch": 0.02112144832788534,
"grad_norm": 4.588460922241211,
"kl": 0.5205078125,
"learning_rate": 4.22643833128985e-06,
"loss": 0.1225,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 144.9375,
"epoch": 0.021265131649843744,
"grad_norm": 3.1051583290100098,
"kl": 0.5810546875,
"learning_rate": 4.215409716133885e-06,
"loss": 0.0097,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 135.3125,
"epoch": 0.021408814971802147,
"grad_norm": 0.4494142532348633,
"kl": 0.62890625,
"learning_rate": 4.204319615321151e-06,
"loss": 0.0061,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 149
},
{
"clip_ratio": 0.0,
"completion_length": 184.125,
"epoch": 0.02155249829376055,
"grad_norm": 3.1240758895874023,
"kl": 0.4951171875,
"learning_rate": 4.193168494170065e-06,
"loss": 0.0225,
"reward": 0.08125000167638063,
"reward_std": 0.026933757588267326,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8125,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 161.25,
"epoch": 0.021696181615718957,
"grad_norm": 0.13801798224449158,
"kl": 0.5107421875,
"learning_rate": 4.181956820559339e-06,
"loss": 0.005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 204.75,
"epoch": 0.02183986493767736,
"grad_norm": 2.869321823120117,
"kl": 0.52685546875,
"learning_rate": 4.170685064908342e-06,
"loss": 0.0007,
"reward": 0.08750000223517418,
"reward_std": 0.02500000037252903,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 217.625,
"epoch": 0.021983548259635763,
"grad_norm": 2.987393379211426,
"kl": 0.4775390625,
"learning_rate": 4.159353700157365e-06,
"loss": 0.0874,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 301.1875,
"epoch": 0.022127231581594166,
"grad_norm": 4.155872344970703,
"kl": 0.513671875,
"learning_rate": 4.14796320174778e-06,
"loss": 0.1495,
"reward": 0.08125000260770321,
"reward_std": 0.037500000558793545,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8125,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 203.375,
"epoch": 0.02227091490355257,
"grad_norm": 3.3726768493652344,
"kl": 0.5556640625,
"learning_rate": 4.136514047602087e-06,
"loss": -0.1008,
"reward": 0.08125000167638063,
"reward_std": 0.026933757588267326,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8125,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 239.5625,
"epoch": 0.022414598225510975,
"grad_norm": 1.6632219552993774,
"kl": 0.43505859375,
"learning_rate": 4.1250067181038635e-06,
"loss": -0.0251,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 279.3125,
"epoch": 0.02255828154746938,
"grad_norm": 1.4800124168395996,
"kl": 0.5205078125,
"learning_rate": 4.113441696077608e-06,
"loss": 0.0334,
"reward": 0.0875000013038516,
"reward_std": 0.014433757402002811,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 228.5,
"epoch": 0.02270196486942778,
"grad_norm": 3.3807125091552734,
"kl": 0.5283203125,
"learning_rate": 4.101819466768484e-06,
"loss": 0.0686,
"reward": 0.08750000223517418,
"reward_std": 0.02500000037252903,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 265.375,
"epoch": 0.022845648191386184,
"grad_norm": 0.11021065711975098,
"kl": 0.4150390625,
"learning_rate": 4.0901405178219535e-06,
"loss": 0.0041,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 159
},
{
"clip_ratio": 0.0,
"completion_length": 236.5,
"epoch": 0.022989331513344587,
"grad_norm": 0.12716983258724213,
"kl": 0.48828125,
"learning_rate": 4.078405339263326e-06,
"loss": 0.0048,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 394.75,
"epoch": 0.023133014835302994,
"grad_norm": 2.502532720565796,
"kl": 0.43359375,
"learning_rate": 4.06661442347719e-06,
"loss": 0.1277,
"reward": 0.08750000223517418,
"reward_std": 0.02500000037252903,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 307.75,
"epoch": 0.023276698157261397,
"grad_norm": 2.625666618347168,
"kl": 0.603515625,
"learning_rate": 4.054768265186758e-06,
"loss": -0.0039,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 320.875,
"epoch": 0.0234203814792198,
"grad_norm": 10.025872230529785,
"kl": 3.30126953125,
"learning_rate": 4.0428673614331036e-06,
"loss": -0.1205,
"reward": 0.08125000260770321,
"reward_std": 0.037500000558793545,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8125,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 340.0625,
"epoch": 0.023564064801178203,
"grad_norm": 1.7133045196533203,
"kl": 0.4716796875,
"learning_rate": 4.030912211554316e-06,
"loss": -0.0296,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 323.9375,
"epoch": 0.023707748123136606,
"grad_norm": 2.04508638381958,
"kl": 0.47265625,
"learning_rate": 4.018903317164539e-06,
"loss": 0.1483,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 407.75,
"epoch": 0.023851431445095012,
"grad_norm": 1.8505865335464478,
"kl": 0.435546875,
"learning_rate": 4.006841182132932e-06,
"loss": 0.1073,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 340.6875,
"epoch": 0.023995114767053415,
"grad_norm": 1.863888144493103,
"kl": 0.44677734375,
"learning_rate": 3.9947263125625195e-06,
"loss": -0.0231,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 359.8125,
"epoch": 0.024138798089011818,
"grad_norm": 3.2325119972229004,
"kl": 0.4931640625,
"learning_rate": 3.982559216768967e-06,
"loss": -0.0656,
"reward": 0.08125000260770321,
"reward_std": 0.037500000558793545,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8125,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 228.0,
"epoch": 0.02428248141097022,
"grad_norm": 3.302624464035034,
"kl": 0.5654296875,
"learning_rate": 3.970340405259245e-06,
"loss": 0.0314,
"reward": 0.08750000223517418,
"reward_std": 0.02500000037252903,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 169
},
{
"clip_ratio": 0.0,
"completion_length": 353.3125,
"epoch": 0.024426164732928624,
"grad_norm": 4.623762607574463,
"kl": 0.5009765625,
"learning_rate": 3.958070390710214e-06,
"loss": 0.3016,
"reward": 0.08125000260770321,
"reward_std": 0.037500000558793545,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8125,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 378.5,
"epoch": 0.02456984805488703,
"grad_norm": 1.5588092803955078,
"kl": 0.43212890625,
"learning_rate": 3.945749687947109e-06,
"loss": 0.0641,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 283.125,
"epoch": 0.024713531376845434,
"grad_norm": 2.220501184463501,
"kl": 0.771484375,
"learning_rate": 3.933378813921942e-06,
"loss": 0.0059,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 373.9375,
"epoch": 0.024857214698803837,
"grad_norm": 1084.8433837890625,
"kl": 40.76611328125,
"learning_rate": 3.920958287691811e-06,
"loss": 0.8411,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 373.25,
"epoch": 0.02500089802076224,
"grad_norm": 117.17958068847656,
"kl": 4.12109375,
"learning_rate": 3.908488630397121e-06,
"loss": 0.0798,
"reward": 0.08125000260770321,
"reward_std": 0.037500000558793545,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8125,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 331.5625,
"epoch": 0.025144581342720643,
"grad_norm": 9.010733604431152,
"kl": 1.80224609375,
"learning_rate": 3.8959703652397175e-06,
"loss": -0.0073,
"reward": 0.08125000167638063,
"reward_std": 0.026933757588267326,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8125,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 389.875,
"epoch": 0.025288264664679046,
"grad_norm": 5.297504901885986,
"kl": 0.67919921875,
"learning_rate": 3.883404017460935e-06,
"loss": 0.1025,
"reward": 0.07500000204890966,
"reward_std": 0.03943375777453184,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.75,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 394.375,
"epoch": 0.025431947986637452,
"grad_norm": 2.6283674240112305,
"kl": 0.5390625,
"learning_rate": 3.870790114319559e-06,
"loss": 0.0492,
"reward": 0.0437500006519258,
"reward_std": 0.026933757588267326,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.4375,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 321.5,
"epoch": 0.025575631308595855,
"grad_norm": 3.5162456035614014,
"kl": 0.7216796875,
"learning_rate": 3.858129185069701e-06,
"loss": 0.1675,
"reward": 0.03125000046566129,
"reward_std": 0.04136751499027014,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.3125,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 333.3125,
"epoch": 0.025719314630554258,
"grad_norm": 4.047501564025879,
"kl": 0.751953125,
"learning_rate": 3.845421760938597e-06,
"loss": 0.1119,
"reward": 0.050000001676380634,
"reward_std": 0.03943375777453184,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.5,
"step": 179
},
{
"clip_ratio": 0.0,
"completion_length": 305.6875,
"epoch": 0.02586299795251266,
"grad_norm": 4.252400875091553,
"kl": 0.6796875,
"learning_rate": 3.832668375104312e-06,
"loss": 0.0201,
"reward": 0.050000001676380634,
"reward_std": 0.05000000074505806,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.5,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 225.9375,
"epoch": 0.026006681274471064,
"grad_norm": 7.807644844055176,
"kl": 1.560546875,
"learning_rate": 3.8198695626733725e-06,
"loss": 0.0647,
"reward": 0.04375000111758709,
"reward_std": 0.051933757960796356,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.4375,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 212.625,
"epoch": 0.02615036459642947,
"grad_norm": 5.3708038330078125,
"kl": 0.736328125,
"learning_rate": 3.8070258606583156e-06,
"loss": -0.0523,
"reward": 0.03125000046566129,
"reward_std": 0.051933757960796356,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.3125,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 263.9375,
"epoch": 0.026294047918387874,
"grad_norm": 3.724597454071045,
"kl": 0.6953125,
"learning_rate": 3.7941378079551544e-06,
"loss": -0.0584,
"reward": 0.06875000149011612,
"reward_std": 0.04136751499027014,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.6875,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 231.25,
"epoch": 0.026437731240346277,
"grad_norm": 4.854169845581055,
"kl": 0.8095703125,
"learning_rate": 3.7812059453207677e-06,
"loss": -0.0794,
"reward": 0.05625000176951289,
"reward_std": 0.051933757960796356,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.5625,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 283.125,
"epoch": 0.02658141456230468,
"grad_norm": 4.72191858291626,
"kl": 0.6787109375,
"learning_rate": 3.768230815350213e-06,
"loss": 0.0998,
"reward": 0.06250000186264515,
"reward_std": 0.05386751517653465,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.625,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 315.5625,
"epoch": 0.026725097884263083,
"grad_norm": 4.964809417724609,
"kl": 0.6796875,
"learning_rate": 3.7552129624539557e-06,
"loss": 0.34,
"reward": 0.0687500024214387,
"reward_std": 0.051933757960796356,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.6875,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 258.375,
"epoch": 0.02686878120622149,
"grad_norm": 3.47402286529541,
"kl": 0.7421875,
"learning_rate": 3.7421529328350316e-06,
"loss": 0.0961,
"reward": 0.08125000167638063,
"reward_std": 0.026933757588267326,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8125,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 359.0,
"epoch": 0.027012464528179892,
"grad_norm": 480.0755615234375,
"kl": 37.884765625,
"learning_rate": 3.7290512744661274e-06,
"loss": 0.679,
"reward": 0.05000000121071935,
"reward_std": 0.05386751517653465,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.5,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 213.1875,
"epoch": 0.027156147850138295,
"grad_norm": 5.7086052894592285,
"kl": 1.61328125,
"learning_rate": 3.715908537066589e-06,
"loss": 0.1013,
"reward": 0.06250000186264515,
"reward_std": 0.05386751517653465,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.625,
"step": 189
},
{
"clip_ratio": 0.0,
"completion_length": 369.875,
"epoch": 0.027299831172096698,
"grad_norm": 3.870173215866089,
"kl": 0.7568359375,
"learning_rate": 3.7027252720793538e-06,
"loss": 0.2494,
"reward": 0.0687500019557774,
"reward_std": 0.037500000558793545,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.6875,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 158.25,
"epoch": 0.0274435144940551,
"grad_norm": 491.740478515625,
"kl": 18.1435546875,
"learning_rate": 3.689502032647817e-06,
"loss": 0.3011,
"reward": 0.07500000111758709,
"reward_std": 0.028867514804005623,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.75,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 280.75,
"epoch": 0.027587197816013508,
"grad_norm": 4.54683256149292,
"kl": 0.853515625,
"learning_rate": 3.6762393735926245e-06,
"loss": 0.2421,
"reward": 0.07500000204890966,
"reward_std": 0.03943375777453184,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.75,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 180.4375,
"epoch": 0.02773088113797191,
"grad_norm": 3.0036094188690186,
"kl": 0.7412109375,
"learning_rate": 3.6629378513883852e-06,
"loss": 0.0226,
"reward": 0.0875000013038516,
"reward_std": 0.014433757402002811,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 159.5625,
"epoch": 0.027874564459930314,
"grad_norm": 5.060415744781494,
"kl": 0.8330078125,
"learning_rate": 3.6495980241403307e-06,
"loss": 0.1031,
"reward": 0.08125000167638063,
"reward_std": 0.026933757588267326,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8125,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 203.4375,
"epoch": 0.028018247781888717,
"grad_norm": 3.40126633644104,
"kl": 0.8203125,
"learning_rate": 3.636220451560896e-06,
"loss": 0.2255,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 227.8125,
"epoch": 0.02816193110384712,
"grad_norm": 5.539950847625732,
"kl": 0.814453125,
"learning_rate": 3.622805694946235e-06,
"loss": 0.2887,
"reward": 0.08125000260770321,
"reward_std": 0.037500000558793545,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8125,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 196.625,
"epoch": 0.028305614425805526,
"grad_norm": 1.9450119733810425,
"kl": 0.75390625,
"learning_rate": 3.609354317152667e-06,
"loss": -0.0384,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 159.125,
"epoch": 0.02844929774776393,
"grad_norm": 5.7262701988220215,
"kl": 0.79296875,
"learning_rate": 3.595866882573063e-06,
"loss": 0.2286,
"reward": 0.07500000204890966,
"reward_std": 0.03943375777453184,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.75,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 156.875,
"epoch": 0.028592981069722332,
"grad_norm": 4.302977561950684,
"kl": 0.65625,
"learning_rate": 3.5823439571131675e-06,
"loss": 0.0149,
"reward": 0.08125000260770321,
"reward_std": 0.037500000558793545,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8125,
"step": 199
},
{
"clip_ratio": 0.0,
"completion_length": 153.5625,
"epoch": 0.028736664391680735,
"grad_norm": 4.005610466003418,
"kl": 0.9853515625,
"learning_rate": 3.5687861081678477e-06,
"loss": 0.0623,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 148.625,
"epoch": 0.028880347713639138,
"grad_norm": 3.1937036514282227,
"kl": 0.83203125,
"learning_rate": 3.555193904597291e-06,
"loss": 0.0186,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 193.1875,
"epoch": 0.029024031035597544,
"grad_norm": 3.0985937118530273,
"kl": 0.7939453125,
"learning_rate": 3.541567916703138e-06,
"loss": 0.2288,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 210.6875,
"epoch": 0.029167714357555947,
"grad_norm": 0.2118406444787979,
"kl": 0.8671875,
"learning_rate": 3.5279087162045517e-06,
"loss": 0.0088,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 150.25,
"epoch": 0.02931139767951435,
"grad_norm": 3.7620272636413574,
"kl": 1.2724609375,
"learning_rate": 3.5142168762142265e-06,
"loss": -0.0115,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 203.6875,
"epoch": 0.029455081001472753,
"grad_norm": 3.8695547580718994,
"kl": 0.7763671875,
"learning_rate": 3.500492971214347e-06,
"loss": 0.0121,
"reward": 0.08750000223517418,
"reward_std": 0.02500000037252903,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 244.125,
"epoch": 0.029598764323431156,
"grad_norm": 5.059806823730469,
"kl": 0.87890625,
"learning_rate": 3.48673757703248e-06,
"loss": 0.3029,
"reward": 0.08750000223517418,
"reward_std": 0.02500000037252903,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 136.125,
"epoch": 0.029742447645389563,
"grad_norm": 2.909438371658325,
"kl": 0.771484375,
"learning_rate": 3.472951270817418e-06,
"loss": 0.0315,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 207.375,
"epoch": 0.029886130967347966,
"grad_norm": 3.3862502574920654,
"kl": 0.8125,
"learning_rate": 3.4591346310149578e-06,
"loss": 0.0018,
"reward": 0.08125000167638063,
"reward_std": 0.026933757588267326,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8125,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 162.3125,
"epoch": 0.03002981428930637,
"grad_norm": 5.295530796051025,
"kl": 0.7373046875,
"learning_rate": 3.445288237343632e-06,
"loss": 0.1625,
"reward": 0.08125000167638063,
"reward_std": 0.026933757588267326,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8125,
"step": 209
},
{
"clip_ratio": 0.0,
"completion_length": 213.75,
"epoch": 0.030173497611264772,
"grad_norm": 1.1489182710647583,
"kl": 0.791015625,
"learning_rate": 3.4314126707703895e-06,
"loss": 0.2328,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 186.375,
"epoch": 0.030317180933223175,
"grad_norm": 0.16106684505939484,
"kl": 0.685546875,
"learning_rate": 3.4175085134862128e-06,
"loss": 0.0069,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 171.6875,
"epoch": 0.03046086425518158,
"grad_norm": 5.108158111572266,
"kl": 0.7587890625,
"learning_rate": 3.4035763488816953e-06,
"loss": 0.1661,
"reward": 0.08750000223517418,
"reward_std": 0.02500000037252903,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 165.8125,
"epoch": 0.030604547577139984,
"grad_norm": 0.24283349514007568,
"kl": 0.7724609375,
"learning_rate": 3.3896167615225594e-06,
"loss": 0.0074,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 169.375,
"epoch": 0.030748230899098387,
"grad_norm": 5.494866371154785,
"kl": 1.5595703125,
"learning_rate": 3.375630337125133e-06,
"loss": -0.0222,
"reward": 0.08750000223517418,
"reward_std": 0.02500000037252903,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 198.625,
"epoch": 0.03089191422105679,
"grad_norm": 0.21820229291915894,
"kl": 0.6953125,
"learning_rate": 3.361617662531772e-06,
"loss": 0.0069,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 209.8125,
"epoch": 0.031035597543015193,
"grad_norm": 1.8697510957717896,
"kl": 0.912109375,
"learning_rate": 3.347579325686237e-06,
"loss": 0.0101,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 216.375,
"epoch": 0.0311792808649736,
"grad_norm": 2.8091392517089844,
"kl": 0.724609375,
"learning_rate": 3.333515915609027e-06,
"loss": 0.0367,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 201.6875,
"epoch": 0.031322964186932,
"grad_norm": 3.437286853790283,
"kl": 0.6689453125,
"learning_rate": 3.3194280223726616e-06,
"loss": -0.0491,
"reward": 0.08750000223517418,
"reward_std": 0.02500000037252903,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 216.1875,
"epoch": 0.0314666475088904,
"grad_norm": 416.79510498046875,
"kl": 33.26171875,
"learning_rate": 3.305316237076927e-06,
"loss": 0.5898,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 219
},
{
"clip_ratio": 0.0,
"completion_length": 167.5,
"epoch": 0.03161033083084881,
"grad_norm": 4.085023403167725,
"kl": 0.8330078125,
"learning_rate": 3.291181151824071e-06,
"loss": 0.0318,
"reward": 0.08750000223517418,
"reward_std": 0.02500000037252903,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 247.1875,
"epoch": 0.031754014152807215,
"grad_norm": 5.383124351501465,
"kl": 0.810546875,
"learning_rate": 3.27702335969396e-06,
"loss": 0.151,
"reward": 0.0687500024214387,
"reward_std": 0.051933757960796356,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.6875,
"step": 221
},
{
"clip_ratio": 0.0,
"completion_length": 244.9375,
"epoch": 0.03189769747476562,
"grad_norm": 121.67268371582031,
"kl": 9.408203125,
"learning_rate": 3.2628434547191985e-06,
"loss": 0.0321,
"reward": 0.0687500024214387,
"reward_std": 0.051933757960796356,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.6875,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 216.0,
"epoch": 0.03204138079672402,
"grad_norm": 17.158824920654297,
"kl": 3.072265625,
"learning_rate": 3.2486420318601973e-06,
"loss": -0.0192,
"reward": 0.07500000204890966,
"reward_std": 0.03943375777453184,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.75,
"step": 223
},
{
"clip_ratio": 0.0,
"completion_length": 185.0,
"epoch": 0.032185064118682424,
"grad_norm": 4.799882411956787,
"kl": 0.859375,
"learning_rate": 3.2344196869802187e-06,
"loss": 0.0672,
"reward": 0.06250000093132257,
"reward_std": 0.043301272206008434,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.625,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 239.5625,
"epoch": 0.03232874744064083,
"grad_norm": 3.988457202911377,
"kl": 0.75390625,
"learning_rate": 3.2201770168203694e-06,
"loss": -0.0968,
"reward": 0.08125000260770321,
"reward_std": 0.037500000558793545,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8125,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 222.875,
"epoch": 0.03247243076259923,
"grad_norm": 1.923133134841919,
"kl": 0.615234375,
"learning_rate": 3.205914618974563e-06,
"loss": -0.0411,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 202.125,
"epoch": 0.03261611408455763,
"grad_norm": 2.7511911392211914,
"kl": 0.6474609375,
"learning_rate": 3.1916330918644496e-06,
"loss": 0.009,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 227
},
{
"clip_ratio": 0.0,
"completion_length": 246.5,
"epoch": 0.032759797406516036,
"grad_norm": 1.7069810628890991,
"kl": 0.5673828125,
"learning_rate": 3.177333034714303e-06,
"loss": -0.014,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 233.875,
"epoch": 0.03290348072847444,
"grad_norm": 2.153999090194702,
"kl": 0.611328125,
"learning_rate": 3.1630150475258813e-06,
"loss": 0.0533,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 229
},
{
"clip_ratio": 0.0,
"completion_length": 271.9375,
"epoch": 0.03304716405043285,
"grad_norm": 3.5415985584259033,
"kl": 0.5654296875,
"learning_rate": 3.148679731053252e-06,
"loss": -0.0361,
"reward": 0.08125000260770321,
"reward_std": 0.037500000558793545,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8125,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 269.3125,
"epoch": 0.03319084737239125,
"grad_norm": 8.814692497253418,
"kl": 0.642578125,
"learning_rate": 3.1343276867775805e-06,
"loss": -0.1332,
"reward": 0.07500000204890966,
"reward_std": 0.03943375777453184,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.75,
"step": 231
},
{
"clip_ratio": 0.0,
"completion_length": 254.125,
"epoch": 0.033334530694349655,
"grad_norm": 2.5323190689086914,
"kl": 0.5712890625,
"learning_rate": 3.1199595168819043e-06,
"loss": 0.096,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 295.3125,
"epoch": 0.03347821401630806,
"grad_norm": 3.8002569675445557,
"kl": 0.82421875,
"learning_rate": 3.105575824225852e-06,
"loss": -0.0877,
"reward": 0.06875000149011612,
"reward_std": 0.04136751499027014,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.6875,
"step": 233
},
{
"clip_ratio": 0.0,
"completion_length": 288.1875,
"epoch": 0.03362189733826646,
"grad_norm": 3.2891669273376465,
"kl": 0.55810546875,
"learning_rate": 3.091177212320363e-06,
"loss": 0.2101,
"reward": 0.08750000223517418,
"reward_std": 0.02500000037252903,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 299.6875,
"epoch": 0.033765580660224864,
"grad_norm": 2.444749355316162,
"kl": 0.54150390625,
"learning_rate": 3.0767642853023538e-06,
"loss": -0.0229,
"reward": 0.08750000223517418,
"reward_std": 0.02500000037252903,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 351.6875,
"epoch": 0.03390926398218327,
"grad_norm": 0.35167747735977173,
"kl": 0.49560546875,
"learning_rate": 3.062337647909376e-06,
"loss": 0.0049,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 325.0,
"epoch": 0.03405294730414167,
"grad_norm": 2.867281436920166,
"kl": 0.45166015625,
"learning_rate": 3.04789790545424e-06,
"loss": 0.1182,
"reward": 0.08750000223517418,
"reward_std": 0.02500000037252903,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 237
},
{
"clip_ratio": 0.0,
"completion_length": 310.25,
"epoch": 0.03419663062610007,
"grad_norm": 2.982635498046875,
"kl": 0.51708984375,
"learning_rate": 3.033445663799621e-06,
"loss": 0.0588,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 311.125,
"epoch": 0.034340313948058476,
"grad_norm": 2.8937182426452637,
"kl": 0.521484375,
"learning_rate": 3.018981529332633e-06,
"loss": 0.1857,
"reward": 0.08125000260770321,
"reward_std": 0.037500000558793545,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8125,
"step": 239
},
{
"clip_ratio": 0.0,
"completion_length": 262.9375,
"epoch": 0.034483997270016886,
"grad_norm": 3.6246678829193115,
"kl": 0.6162109375,
"learning_rate": 3.00450610893939e-06,
"loss": 0.1892,
"reward": 0.08750000223517418,
"reward_std": 0.02500000037252903,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 403.5625,
"epoch": 0.03462768059197529,
"grad_norm": 2.3801181316375732,
"kl": 0.52099609375,
"learning_rate": 2.9900200099795396e-06,
"loss": 0.0722,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 241
},
{
"clip_ratio": 0.0,
"completion_length": 287.3125,
"epoch": 0.03477136391393369,
"grad_norm": 0.24018186330795288,
"kl": 0.5986328125,
"learning_rate": 2.9755238402607826e-06,
"loss": 0.0058,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 393.625,
"epoch": 0.034915047235892095,
"grad_norm": 0.6572515964508057,
"kl": 0.53466796875,
"learning_rate": 2.961018208013367e-06,
"loss": 0.0049,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 243
},
{
"clip_ratio": 0.0,
"completion_length": 560.4375,
"epoch": 0.0350587305578505,
"grad_norm": 2.2393953800201416,
"kl": 0.4404296875,
"learning_rate": 2.9465037218645694e-06,
"loss": 0.3692,
"reward": 0.06250000186264515,
"reward_std": 0.05386751517653465,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.625,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 446.1875,
"epoch": 0.0352024138798089,
"grad_norm": 2.0634946823120117,
"kl": 0.57177734375,
"learning_rate": 2.9319809908131604e-06,
"loss": -0.0071,
"reward": 0.08750000223517418,
"reward_std": 0.02500000037252903,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 528.1875,
"epoch": 0.035346097201767304,
"grad_norm": 2.6367909908294678,
"kl": 0.5859375,
"learning_rate": 2.917450624203847e-06,
"loss": 0.0284,
"reward": 0.08125000260770321,
"reward_std": 0.037500000558793545,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8125,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 640.25,
"epoch": 0.03548978052372571,
"grad_norm": 0.06677532196044922,
"kl": 0.365234375,
"learning_rate": 2.9029132317017118e-06,
"loss": 0.0034,
"reward": 0.07500000111758709,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.75,
"step": 247
},
{
"clip_ratio": 0.0,
"completion_length": 625.75,
"epoch": 0.03563346384568411,
"grad_norm": 1.9811322689056396,
"kl": 0.39404296875,
"learning_rate": 2.888369423266629e-06,
"loss": 0.1424,
"reward": 0.0687500019557774,
"reward_std": 0.037500000558793545,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.6875,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 710.0,
"epoch": 0.03577714716764251,
"grad_norm": 1.6662659645080566,
"kl": 0.4345703125,
"learning_rate": 2.8738198091276712e-06,
"loss": 0.1986,
"reward": 0.06250000139698386,
"reward_std": 0.03943375777453184,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.625,
"step": 249
},
{
"clip_ratio": 0.0,
"completion_length": 680.0,
"epoch": 0.035920830489600916,
"grad_norm": 1.886515736579895,
"kl": 0.39013671875,
"learning_rate": 2.859264999757509e-06,
"loss": 0.2372,
"reward": 0.04375000111758709,
"reward_std": 0.04136751499027014,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.4375,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 764.6875,
"epoch": 0.036064513811559326,
"grad_norm": 5.7232513427734375,
"kl": 0.4990234375,
"learning_rate": 2.8447056058467928e-06,
"loss": 0.1301,
"reward": 0.05000000074505806,
"reward_std": 0.03943375777453184,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.5,
"step": 251
},
{
"clip_ratio": 0.0,
"completion_length": 524.6875,
"epoch": 0.03620819713351773,
"grad_norm": 1.8524736166000366,
"kl": 0.5546875,
"learning_rate": 2.830142238278531e-06,
"loss": 0.0687,
"reward": 0.056250001303851604,
"reward_std": 0.037500000558793545,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.5625,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 476.375,
"epoch": 0.03635188045547613,
"grad_norm": 2.285640239715576,
"kl": 0.56640625,
"learning_rate": 2.81557550810246e-06,
"loss": 0.2638,
"reward": 0.08125000167638063,
"reward_std": 0.026933757588267326,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8125,
"step": 253
},
{
"clip_ratio": 0.0,
"completion_length": 374.3125,
"epoch": 0.036495563777434535,
"grad_norm": 3.965292453765869,
"kl": 0.626953125,
"learning_rate": 2.8010060265094026e-06,
"loss": -0.0549,
"reward": 0.06250000186264515,
"reward_std": 0.05386751517653465,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.625,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 299.0625,
"epoch": 0.03663924709939294,
"grad_norm": 2.7410449981689453,
"kl": 0.54248046875,
"learning_rate": 2.786434404805629e-06,
"loss": -0.0428,
"reward": 0.08750000223517418,
"reward_std": 0.02500000037252903,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 242.25,
"epoch": 0.03678293042135134,
"grad_norm": 7.071990489959717,
"kl": 0.76611328125,
"learning_rate": 2.771861254387199e-06,
"loss": 0.1733,
"reward": 0.06875000149011612,
"reward_std": 0.04136751499027014,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.6875,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 216.6875,
"epoch": 0.036926613743309744,
"grad_norm": 126.14482116699219,
"kl": 5.0859375,
"learning_rate": 2.7572871867143204e-06,
"loss": 0.2472,
"reward": 0.06875000149011612,
"reward_std": 0.04136751499027014,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.6875,
"step": 257
},
{
"clip_ratio": 0.0,
"completion_length": 245.5625,
"epoch": 0.03707029706526815,
"grad_norm": 4.183584213256836,
"kl": 0.6787109375,
"learning_rate": 2.742712813285681e-06,
"loss": 0.0637,
"reward": 0.07500000204890966,
"reward_std": 0.03943375777453184,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.75,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 318.4375,
"epoch": 0.03721398038722655,
"grad_norm": 3.0855674743652344,
"kl": 0.5908203125,
"learning_rate": 2.7281387456128017e-06,
"loss": 0.1647,
"reward": 0.08750000223517418,
"reward_std": 0.02500000037252903,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 259
},
{
"clip_ratio": 0.0,
"completion_length": 215.6875,
"epoch": 0.03735766370918495,
"grad_norm": 4.139929294586182,
"kl": 0.673828125,
"learning_rate": 2.7135655951943716e-06,
"loss": -0.0256,
"reward": 0.06875000102445483,
"reward_std": 0.026933757588267326,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.6875,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 219.0625,
"epoch": 0.03750134703114336,
"grad_norm": 0.20174548029899597,
"kl": 0.6328125,
"learning_rate": 2.698993973490598e-06,
"loss": 0.0059,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 261
},
{
"clip_ratio": 0.0,
"completion_length": 158.75,
"epoch": 0.037645030353101766,
"grad_norm": 5.350340366363525,
"kl": 0.6455078125,
"learning_rate": 2.6844244918975416e-06,
"loss": 0.0784,
"reward": 0.07500000204890966,
"reward_std": 0.03943375777453184,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.75,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 135.625,
"epoch": 0.03778871367506017,
"grad_norm": 3.1076321601867676,
"kl": 0.9599609375,
"learning_rate": 2.66985776172147e-06,
"loss": 0.0643,
"reward": 0.08125000121071935,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8125,
"step": 263
},
{
"clip_ratio": 0.0,
"completion_length": 170.0625,
"epoch": 0.03793239699701857,
"grad_norm": 2.9959518909454346,
"kl": 0.8447265625,
"learning_rate": 2.6552943941532088e-06,
"loss": 0.0162,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 214.125,
"epoch": 0.038076080318976975,
"grad_norm": 0.19384361803531647,
"kl": 0.6650390625,
"learning_rate": 2.6407350002424927e-06,
"loss": 0.0066,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 250.5,
"epoch": 0.03821976364093538,
"grad_norm": 0.26606935262680054,
"kl": 0.71875,
"learning_rate": 2.626180190872329e-06,
"loss": 0.0068,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 240.375,
"epoch": 0.03836344696289378,
"grad_norm": 0.9030230641365051,
"kl": 0.8369140625,
"learning_rate": 2.611630576733372e-06,
"loss": 0.0062,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 267
},
{
"clip_ratio": 0.0,
"completion_length": 166.1875,
"epoch": 0.038507130284852184,
"grad_norm": 3.1588239669799805,
"kl": 1.0390625,
"learning_rate": 2.5970867682982885e-06,
"loss": 0.0089,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 165.5,
"epoch": 0.03865081360681059,
"grad_norm": 2.534489154815674,
"kl": 0.740234375,
"learning_rate": 2.582549375796154e-06,
"loss": -0.0202,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 269
},
{
"clip_ratio": 0.0,
"completion_length": 152.6875,
"epoch": 0.03879449692876899,
"grad_norm": 3.67244029045105,
"kl": 0.673828125,
"learning_rate": 2.568019009186841e-06,
"loss": 0.0254,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 207.4375,
"epoch": 0.0389381802507274,
"grad_norm": 2.7831411361694336,
"kl": 0.72265625,
"learning_rate": 2.5534962781354317e-06,
"loss": -0.0581,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 271
},
{
"clip_ratio": 0.0,
"completion_length": 156.0,
"epoch": 0.0390818635726858,
"grad_norm": 5.289279937744141,
"kl": 0.6708984375,
"learning_rate": 2.538981791986634e-06,
"loss": 0.0231,
"reward": 0.08750000223517418,
"reward_std": 0.02500000037252903,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 145.6875,
"epoch": 0.039225546894644206,
"grad_norm": 1.0131648778915405,
"kl": 0.8173828125,
"learning_rate": 2.524476159739218e-06,
"loss": 0.0082,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 273
},
{
"clip_ratio": 0.0,
"completion_length": 176.375,
"epoch": 0.03936923021660261,
"grad_norm": 2.1524786949157715,
"kl": 0.69921875,
"learning_rate": 2.5099799900204607e-06,
"loss": -0.031,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 169.625,
"epoch": 0.03951291353856101,
"grad_norm": 0.3758815824985504,
"kl": 0.60546875,
"learning_rate": 2.4954938910606108e-06,
"loss": 0.0056,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 120.0625,
"epoch": 0.039656596860519415,
"grad_norm": 2.6264796257019043,
"kl": 0.646484375,
"learning_rate": 2.481018470667368e-06,
"loss": 0.0003,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 194.75,
"epoch": 0.03980028018247782,
"grad_norm": 9.496621131896973,
"kl": 2.5390625,
"learning_rate": 2.4665543362003802e-06,
"loss": 0.0279,
"reward": 0.0875000013038516,
"reward_std": 0.014433757402002811,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 277
},
{
"clip_ratio": 0.0,
"completion_length": 129.4375,
"epoch": 0.03994396350443622,
"grad_norm": 0.19138182699680328,
"kl": 0.5478515625,
"learning_rate": 2.4521020945457615e-06,
"loss": 0.0053,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 128.125,
"epoch": 0.040087646826394624,
"grad_norm": 3.2331597805023193,
"kl": 0.599609375,
"learning_rate": 2.4376623520906255e-06,
"loss": 0.0059,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 279
},
{
"clip_ratio": 0.0,
"completion_length": 177.5,
"epoch": 0.04023133014835303,
"grad_norm": 2.2726430892944336,
"kl": 0.5234375,
"learning_rate": 2.4232357146976478e-06,
"loss": 0.0504,
"reward": 0.0875000013038516,
"reward_std": 0.014433757402002811,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 163.1875,
"epoch": 0.04037501347031144,
"grad_norm": 2.353566884994507,
"kl": 0.5107421875,
"learning_rate": 2.408822787679637e-06,
"loss": -0.0288,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 281
},
{
"clip_ratio": 0.0,
"completion_length": 199.25,
"epoch": 0.04051869679226984,
"grad_norm": 0.12594649195671082,
"kl": 0.49951171875,
"learning_rate": 2.3944241757741475e-06,
"loss": 0.0049,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 205.9375,
"epoch": 0.04066238011422824,
"grad_norm": 1.8537880182266235,
"kl": 0.544921875,
"learning_rate": 2.380040483118097e-06,
"loss": -0.0586,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 283
},
{
"clip_ratio": 0.0,
"completion_length": 189.3125,
"epoch": 0.040806063436186646,
"grad_norm": 3.44875168800354,
"kl": 0.900390625,
"learning_rate": 2.365672313222419e-06,
"loss": 0.1564,
"reward": 0.08750000223517418,
"reward_std": 0.02500000037252903,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 193.125,
"epoch": 0.04094974675814505,
"grad_norm": 1.9976215362548828,
"kl": 0.48974609375,
"learning_rate": 2.351320268946749e-06,
"loss": 0.0606,
"reward": 0.0875000013038516,
"reward_std": 0.014433757402002811,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 219.0,
"epoch": 0.04109343008010345,
"grad_norm": 3.5146334171295166,
"kl": 0.7607421875,
"learning_rate": 2.336984952474119e-06,
"loss": -0.06,
"reward": 0.08750000223517418,
"reward_std": 0.02500000037252903,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 177.3125,
"epoch": 0.041237113402061855,
"grad_norm": 3.8956093788146973,
"kl": 0.6533203125,
"learning_rate": 2.322666965285697e-06,
"loss": 0.0237,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 287
},
{
"clip_ratio": 0.0,
"completion_length": 165.375,
"epoch": 0.04138079672402026,
"grad_norm": 3.3750154972076416,
"kl": 0.607421875,
"learning_rate": 2.3083669081355507e-06,
"loss": 0.0068,
"reward": 0.08750000223517418,
"reward_std": 0.02500000037252903,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 267.0625,
"epoch": 0.04152448004597866,
"grad_norm": 2.528259754180908,
"kl": 0.5810546875,
"learning_rate": 2.2940853810254377e-06,
"loss": 0.1652,
"reward": 0.08125000167638063,
"reward_std": 0.026933757588267326,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8125,
"step": 289
},
{
"clip_ratio": 0.0,
"completion_length": 248.0,
"epoch": 0.041668163367937064,
"grad_norm": 0.24038171768188477,
"kl": 0.5234375,
"learning_rate": 2.2798229831796313e-06,
"loss": 0.0049,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 218.0,
"epoch": 0.041811846689895474,
"grad_norm": 3.456895589828491,
"kl": 0.49658203125,
"learning_rate": 2.2655803130197816e-06,
"loss": 0.2115,
"reward": 0.08125000167638063,
"reward_std": 0.026933757588267326,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8125,
"step": 291
},
{
"clip_ratio": 0.0,
"completion_length": 146.625,
"epoch": 0.04195553001185388,
"grad_norm": 0.15112897753715515,
"kl": 0.5068359375,
"learning_rate": 2.2513579681398034e-06,
"loss": 0.0049,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 200.6875,
"epoch": 0.04209921333381228,
"grad_norm": 0.16546125710010529,
"kl": 0.50732421875,
"learning_rate": 2.237156545280803e-06,
"loss": 0.005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 293
},
{
"clip_ratio": 0.0,
"completion_length": 241.375,
"epoch": 0.04224289665577068,
"grad_norm": 0.17799150943756104,
"kl": 0.49072265625,
"learning_rate": 2.2229766403060403e-06,
"loss": 0.0049,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 212.4375,
"epoch": 0.042386579977729086,
"grad_norm": 3.0936059951782227,
"kl": 0.5068359375,
"learning_rate": 2.2088188481759305e-06,
"loss": 0.0534,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 197.5625,
"epoch": 0.04253026329968749,
"grad_norm": 1.9481518268585205,
"kl": 0.478515625,
"learning_rate": 2.194683762923073e-06,
"loss": -0.0562,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 226.0,
"epoch": 0.04267394662164589,
"grad_norm": 2.9069528579711914,
"kl": 0.64111328125,
"learning_rate": 2.1805719776273387e-06,
"loss": 0.0548,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 297
},
{
"clip_ratio": 0.0,
"completion_length": 199.1875,
"epoch": 0.042817629943604295,
"grad_norm": 0.11291232705116272,
"kl": 0.49072265625,
"learning_rate": 2.166484084390974e-06,
"loss": 0.0046,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 195.875,
"epoch": 0.0429613132655627,
"grad_norm": 0.10604951530694962,
"kl": 0.4345703125,
"learning_rate": 2.1524206743137636e-06,
"loss": 0.0041,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 299
},
{
"clip_ratio": 0.0,
"completion_length": 205.5625,
"epoch": 0.0431049965875211,
"grad_norm": 0.18865613639354706,
"kl": 0.45849609375,
"learning_rate": 2.1383823374682287e-06,
"loss": 0.0044,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 200.25,
"epoch": 0.043248679909479504,
"grad_norm": 0.26521536707878113,
"kl": 0.43798828125,
"learning_rate": 2.124369662874868e-06,
"loss": 0.0041,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 301
},
{
"clip_ratio": 0.0,
"completion_length": 200.0625,
"epoch": 0.043392363231437914,
"grad_norm": 2.301034927368164,
"kl": 0.58837890625,
"learning_rate": 2.110383238477441e-06,
"loss": 0.0223,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 302
},
{
"clip_ratio": 0.0,
"completion_length": 154.1875,
"epoch": 0.04353604655339632,
"grad_norm": 0.2673363983631134,
"kl": 0.54736328125,
"learning_rate": 2.096423651118305e-06,
"loss": 0.0054,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 303
},
{
"clip_ratio": 0.0,
"completion_length": 263.875,
"epoch": 0.04367972987535472,
"grad_norm": 0.11367050558328629,
"kl": 0.3955078125,
"learning_rate": 2.082491486513788e-06,
"loss": 0.0039,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 304
},
{
"clip_ratio": 0.0,
"completion_length": 210.9375,
"epoch": 0.04382341319731312,
"grad_norm": 0.1046769991517067,
"kl": 0.47021484375,
"learning_rate": 2.0685873292296116e-06,
"loss": 0.0047,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 215.125,
"epoch": 0.043967096519271526,
"grad_norm": 0.15870000422000885,
"kl": 0.5517578125,
"learning_rate": 2.054711762656369e-06,
"loss": 0.0052,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 306
},
{
"clip_ratio": 0.0,
"completion_length": 162.4375,
"epoch": 0.04411077984122993,
"grad_norm": 0.12202607095241547,
"kl": 0.4443359375,
"learning_rate": 2.040865368985044e-06,
"loss": 0.0045,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 307
},
{
"clip_ratio": 0.0,
"completion_length": 282.8125,
"epoch": 0.04425446316318833,
"grad_norm": 0.5133360624313354,
"kl": 0.4326171875,
"learning_rate": 2.027048729182583e-06,
"loss": 0.0048,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 308
},
{
"clip_ratio": 0.0,
"completion_length": 225.75,
"epoch": 0.044398146485146735,
"grad_norm": 0.08721781522035599,
"kl": 0.3935546875,
"learning_rate": 2.0132624229675205e-06,
"loss": 0.0039,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 309
},
{
"clip_ratio": 0.0,
"completion_length": 184.3125,
"epoch": 0.04454182980710514,
"grad_norm": 0.13983947038650513,
"kl": 0.52587890625,
"learning_rate": 1.9995070287856546e-06,
"loss": 0.0055,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 213.625,
"epoch": 0.04468551312906354,
"grad_norm": 0.7404407858848572,
"kl": 0.45166015625,
"learning_rate": 1.985783123785774e-06,
"loss": 0.0045,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 311
},
{
"clip_ratio": 0.0,
"completion_length": 241.125,
"epoch": 0.04482919645102195,
"grad_norm": 0.11150796711444855,
"kl": 0.466796875,
"learning_rate": 1.9720912837954486e-06,
"loss": 0.0047,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 312
},
{
"clip_ratio": 0.0,
"completion_length": 247.25,
"epoch": 0.044972879772980354,
"grad_norm": 0.09887633472681046,
"kl": 0.44921875,
"learning_rate": 1.958432083296862e-06,
"loss": 0.0044,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 313
},
{
"clip_ratio": 0.0,
"completion_length": 228.1875,
"epoch": 0.04511656309493876,
"grad_norm": 0.08166426420211792,
"kl": 0.41015625,
"learning_rate": 1.9448060954027093e-06,
"loss": 0.0041,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 314
},
{
"clip_ratio": 0.0,
"completion_length": 295.5,
"epoch": 0.04526024641689716,
"grad_norm": 0.08792513608932495,
"kl": 0.40380859375,
"learning_rate": 1.931213891832153e-06,
"loss": 0.0042,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 286.5625,
"epoch": 0.04540392973885556,
"grad_norm": 3.032651662826538,
"kl": 0.4560546875,
"learning_rate": 1.9176560428868336e-06,
"loss": -0.0239,
"reward": 0.08750000223517418,
"reward_std": 0.02500000037252903,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 316
},
{
"clip_ratio": 0.0,
"completion_length": 296.4375,
"epoch": 0.045547613060813966,
"grad_norm": 0.11587885767221451,
"kl": 0.41650390625,
"learning_rate": 1.9041331174269373e-06,
"loss": 0.0042,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 317
},
{
"clip_ratio": 0.0,
"completion_length": 387.3125,
"epoch": 0.04569129638277237,
"grad_norm": 0.8805405497550964,
"kl": 0.43701171875,
"learning_rate": 1.8906456828473341e-06,
"loss": 0.0609,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 318
},
{
"clip_ratio": 0.0,
"completion_length": 283.75,
"epoch": 0.04583497970473077,
"grad_norm": 0.08596353977918625,
"kl": 0.43212890625,
"learning_rate": 1.8771943050537656e-06,
"loss": 0.0041,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 319
},
{
"clip_ratio": 0.0,
"completion_length": 299.375,
"epoch": 0.045978663026689175,
"grad_norm": 1.9583159685134888,
"kl": 0.4990234375,
"learning_rate": 1.8637795484391046e-06,
"loss": 0.06,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 271.3125,
"epoch": 0.04612234634864758,
"grad_norm": 0.15285852551460266,
"kl": 0.4501953125,
"learning_rate": 1.8504019758596698e-06,
"loss": 0.0045,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 321
},
{
"clip_ratio": 0.0,
"completion_length": 313.6875,
"epoch": 0.04626602967060599,
"grad_norm": 0.10856521874666214,
"kl": 0.44287109375,
"learning_rate": 1.8370621486116163e-06,
"loss": 0.0045,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 322
},
{
"clip_ratio": 0.0,
"completion_length": 291.8125,
"epoch": 0.04640971299256439,
"grad_norm": 2.9064958095550537,
"kl": 0.51025390625,
"learning_rate": 1.823760626407377e-06,
"loss": -0.0198,
"reward": 0.08750000223517418,
"reward_std": 0.02500000037252903,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 323
},
{
"clip_ratio": 0.0,
"completion_length": 304.0,
"epoch": 0.046553396314522794,
"grad_norm": 0.0989953801035881,
"kl": 0.41796875,
"learning_rate": 1.8104979673521838e-06,
"loss": 0.0041,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 324
},
{
"clip_ratio": 0.0,
"completion_length": 330.75,
"epoch": 0.0466970796364812,
"grad_norm": 1.5450448989868164,
"kl": 0.5390625,
"learning_rate": 1.7972747279206482e-06,
"loss": 0.0276,
"reward": 0.0875000013038516,
"reward_std": 0.014433757402002811,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 320.3125,
"epoch": 0.0468407629584396,
"grad_norm": 0.07848266512155533,
"kl": 0.359375,
"learning_rate": 1.7840914629334122e-06,
"loss": 0.0035,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 326
},
{
"clip_ratio": 0.0,
"completion_length": 350.75,
"epoch": 0.046984446280398,
"grad_norm": 0.11079081147909164,
"kl": 0.4228515625,
"learning_rate": 1.7709487255338731e-06,
"loss": 0.0043,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 327
},
{
"clip_ratio": 0.0,
"completion_length": 268.5625,
"epoch": 0.047128129602356406,
"grad_norm": 0.08232247084379196,
"kl": 0.43603515625,
"learning_rate": 1.7578470671649684e-06,
"loss": 0.004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 328
},
{
"clip_ratio": 0.0,
"completion_length": 350.3125,
"epoch": 0.04727181292431481,
"grad_norm": 0.09194315969944,
"kl": 0.40869140625,
"learning_rate": 1.744787037546045e-06,
"loss": 0.0039,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 329
},
{
"clip_ratio": 0.0,
"completion_length": 319.125,
"epoch": 0.04741549624627321,
"grad_norm": 0.08984750509262085,
"kl": 0.49169921875,
"learning_rate": 1.731769184649788e-06,
"loss": 0.0048,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 254.875,
"epoch": 0.047559179568231615,
"grad_norm": 0.09713928401470184,
"kl": 0.484375,
"learning_rate": 1.7187940546792325e-06,
"loss": 0.0046,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 331
},
{
"clip_ratio": 0.0,
"completion_length": 305.0625,
"epoch": 0.047702862890190025,
"grad_norm": 0.11803556233644485,
"kl": 0.435546875,
"learning_rate": 1.7058621920448465e-06,
"loss": 0.0043,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 332
},
{
"clip_ratio": 0.0,
"completion_length": 279.6875,
"epoch": 0.04784654621214843,
"grad_norm": 2.062950849533081,
"kl": 0.501953125,
"learning_rate": 1.6929741393416855e-06,
"loss": 0.0285,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 333
},
{
"clip_ratio": 0.0,
"completion_length": 327.8125,
"epoch": 0.04799022953410683,
"grad_norm": 3.389031410217285,
"kl": 0.4765625,
"learning_rate": 1.6801304373266286e-06,
"loss": -0.0063,
"reward": 0.08750000223517418,
"reward_std": 0.02500000037252903,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 334
},
{
"clip_ratio": 0.0,
"completion_length": 343.5,
"epoch": 0.048133912856065234,
"grad_norm": 0.17832499742507935,
"kl": 0.54443359375,
"learning_rate": 1.667331624895689e-06,
"loss": 0.0053,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 414.0,
"epoch": 0.048277596178023637,
"grad_norm": 1.3559800386428833,
"kl": 0.4990234375,
"learning_rate": 1.6545782390614037e-06,
"loss": 0.0622,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 336
},
{
"clip_ratio": 0.0,
"completion_length": 345.375,
"epoch": 0.04842127949998204,
"grad_norm": 0.11337132751941681,
"kl": 0.45654296875,
"learning_rate": 1.6418708149302992e-06,
"loss": 0.0042,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 337
},
{
"clip_ratio": 0.0,
"completion_length": 347.9375,
"epoch": 0.04856496282194044,
"grad_norm": 1.1885749101638794,
"kl": 0.3779296875,
"learning_rate": 1.6292098856804423e-06,
"loss": 0.1019,
"reward": 0.0875000013038516,
"reward_std": 0.014433757402002811,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 338
},
{
"clip_ratio": 0.0,
"completion_length": 421.4375,
"epoch": 0.048708646143898846,
"grad_norm": 0.9923747181892395,
"kl": 0.53564453125,
"learning_rate": 1.6165959825390661e-06,
"loss": -0.0729,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 339
},
{
"clip_ratio": 0.0,
"completion_length": 298.3125,
"epoch": 0.04885232946585725,
"grad_norm": 0.11337187141180038,
"kl": 0.50927734375,
"learning_rate": 1.604029634760284e-06,
"loss": 0.0055,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 306.3125,
"epoch": 0.04899601278781565,
"grad_norm": 0.1039213165640831,
"kl": 0.46630859375,
"learning_rate": 1.59151136960288e-06,
"loss": 0.0048,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 341
},
{
"clip_ratio": 0.0,
"completion_length": 281.1875,
"epoch": 0.04913969610977406,
"grad_norm": 0.08466682583093643,
"kl": 0.4306640625,
"learning_rate": 1.5790417123081903e-06,
"loss": 0.0043,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 342
},
{
"clip_ratio": 0.0,
"completion_length": 386.6875,
"epoch": 0.049283379431732464,
"grad_norm": 0.10602067410945892,
"kl": 0.58203125,
"learning_rate": 1.5666211860780583e-06,
"loss": 0.006,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 343
},
{
"clip_ratio": 0.0,
"completion_length": 296.5,
"epoch": 0.04942706275369087,
"grad_norm": 0.06954965740442276,
"kl": 0.435546875,
"learning_rate": 1.5542503120528918e-06,
"loss": 0.0044,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 344
},
{
"clip_ratio": 0.0,
"completion_length": 333.6875,
"epoch": 0.04957074607564927,
"grad_norm": 0.08143350481987,
"kl": 0.53125,
"learning_rate": 1.5419296092897866e-06,
"loss": 0.0053,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 280.6875,
"epoch": 0.04971442939760767,
"grad_norm": 0.11366530507802963,
"kl": 0.42529296875,
"learning_rate": 1.529659594740755e-06,
"loss": 0.004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 346
},
{
"clip_ratio": 0.0,
"completion_length": 308.5,
"epoch": 0.049858112719566076,
"grad_norm": 0.08413314819335938,
"kl": 0.43701171875,
"learning_rate": 1.5174407832310338e-06,
"loss": 0.0043,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 347
},
{
"clip_ratio": 0.0,
"completion_length": 361.1875,
"epoch": 0.05000179604152448,
"grad_norm": 1.0981367826461792,
"kl": 0.40869140625,
"learning_rate": 1.5052736874374815e-06,
"loss": 0.0653,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 348
},
{
"clip_ratio": 0.0,
"completion_length": 333.9375,
"epoch": 0.05014547936348288,
"grad_norm": 0.09738067537546158,
"kl": 0.38671875,
"learning_rate": 1.4931588178670695e-06,
"loss": 0.0039,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 349
},
{
"clip_ratio": 0.0,
"completion_length": 289.3125,
"epoch": 0.050289162685441285,
"grad_norm": 0.06618193536996841,
"kl": 0.36474609375,
"learning_rate": 1.4810966828354605e-06,
"loss": 0.0037,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 462.375,
"epoch": 0.05043284600739969,
"grad_norm": 2.784261703491211,
"kl": 0.47021484375,
"learning_rate": 1.469087788445684e-06,
"loss": 0.0589,
"reward": 0.08750000223517418,
"reward_std": 0.02500000037252903,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 351
},
{
"clip_ratio": 0.0,
"completion_length": 352.25,
"epoch": 0.05057652932935809,
"grad_norm": 0.08649200946092606,
"kl": 0.46240234375,
"learning_rate": 1.4571326385668965e-06,
"loss": 0.0042,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 352
},
{
"clip_ratio": 0.0,
"completion_length": 279.0625,
"epoch": 0.0507202126513165,
"grad_norm": 0.09761747717857361,
"kl": 0.431640625,
"learning_rate": 1.4452317348132434e-06,
"loss": 0.0044,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 353
},
{
"clip_ratio": 0.0,
"completion_length": 292.5625,
"epoch": 0.050863895973274904,
"grad_norm": 0.12047071009874344,
"kl": 0.47900390625,
"learning_rate": 1.4333855765228104e-06,
"loss": 0.0048,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 354
},
{
"clip_ratio": 0.0,
"completion_length": 385.75,
"epoch": 0.05100757929523331,
"grad_norm": 1.935134768486023,
"kl": 0.666015625,
"learning_rate": 1.421594660736675e-06,
"loss": 0.1214,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 388.5625,
"epoch": 0.05115126261719171,
"grad_norm": 0.0700010433793068,
"kl": 0.39013671875,
"learning_rate": 1.4098594821780476e-06,
"loss": 0.0039,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 356
},
{
"clip_ratio": 0.0,
"completion_length": 338.125,
"epoch": 0.05129494593915011,
"grad_norm": 0.07236258685588837,
"kl": 0.35693359375,
"learning_rate": 1.3981805332315174e-06,
"loss": 0.0035,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 357
},
{
"clip_ratio": 0.0,
"completion_length": 375.8125,
"epoch": 0.051438629261108516,
"grad_norm": 0.10169550776481628,
"kl": 0.48779296875,
"learning_rate": 1.3865583039223929e-06,
"loss": 0.0048,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 358
},
{
"clip_ratio": 0.0,
"completion_length": 341.4375,
"epoch": 0.05158231258306692,
"grad_norm": 4.062565803527832,
"kl": 0.7236328125,
"learning_rate": 1.374993281896137e-06,
"loss": 0.0212,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 359
},
{
"clip_ratio": 0.0,
"completion_length": 464.3125,
"epoch": 0.05172599590502532,
"grad_norm": 0.11676806211471558,
"kl": 0.5576171875,
"learning_rate": 1.3634859523979134e-06,
"loss": 0.0054,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 390.6875,
"epoch": 0.051869679226983725,
"grad_norm": 1.4354273080825806,
"kl": 0.486328125,
"learning_rate": 1.3520367982522208e-06,
"loss": 0.0869,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 361
},
{
"clip_ratio": 0.0,
"completion_length": 388.8125,
"epoch": 0.05201336254894213,
"grad_norm": 0.11781425029039383,
"kl": 0.41259765625,
"learning_rate": 1.3406462998426358e-06,
"loss": 0.0041,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 362
},
{
"clip_ratio": 0.0,
"completion_length": 352.375,
"epoch": 0.05215704587090054,
"grad_norm": 0.1065189316868782,
"kl": 0.4375,
"learning_rate": 1.3293149350916595e-06,
"loss": 0.0044,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 363
},
{
"clip_ratio": 0.0,
"completion_length": 358.3125,
"epoch": 0.05230072919285894,
"grad_norm": 0.08605165779590607,
"kl": 0.40380859375,
"learning_rate": 1.3180431794406623e-06,
"loss": 0.0039,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 364
},
{
"clip_ratio": 0.0,
"completion_length": 344.75,
"epoch": 0.052444412514817344,
"grad_norm": 5.735403537750244,
"kl": 0.64013671875,
"learning_rate": 1.3068315058299358e-06,
"loss": 0.1397,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 365
},
{
"clip_ratio": 0.0,
"completion_length": 350.25,
"epoch": 0.05258809583677575,
"grad_norm": 1.2408664226531982,
"kl": 0.40576171875,
"learning_rate": 1.2956803846788503e-06,
"loss": -0.0358,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 366
},
{
"clip_ratio": 0.0,
"completion_length": 372.125,
"epoch": 0.05273177915873415,
"grad_norm": 0.08045981079339981,
"kl": 0.451171875,
"learning_rate": 1.284590283866116e-06,
"loss": 0.0045,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 367
},
{
"clip_ratio": 0.0,
"completion_length": 319.5625,
"epoch": 0.05287546248069255,
"grad_norm": 0.08284196257591248,
"kl": 0.416015625,
"learning_rate": 1.2735616687101518e-06,
"loss": 0.0042,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 368
},
{
"clip_ratio": 0.0,
"completion_length": 361.0,
"epoch": 0.053019145802650956,
"grad_norm": 0.08355327695608139,
"kl": 0.4482421875,
"learning_rate": 1.2625950019495614e-06,
"loss": 0.0044,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 369
},
{
"clip_ratio": 0.0,
"completion_length": 403.875,
"epoch": 0.05316282912460936,
"grad_norm": 0.08809319883584976,
"kl": 0.44873046875,
"learning_rate": 1.251690743723718e-06,
"loss": 0.0045,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 318.75,
"epoch": 0.05330651244656776,
"grad_norm": 1.7776198387145996,
"kl": 0.384765625,
"learning_rate": 1.2408493515534581e-06,
"loss": 0.0241,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 371
},
{
"clip_ratio": 0.0,
"completion_length": 375.4375,
"epoch": 0.053450195768526165,
"grad_norm": 0.09417347609996796,
"kl": 0.51171875,
"learning_rate": 1.2300712803218834e-06,
"loss": 0.0051,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 372
},
{
"clip_ratio": 0.0,
"completion_length": 534.25,
"epoch": 0.053593879090484575,
"grad_norm": 1.3220620155334473,
"kl": 0.447265625,
"learning_rate": 1.2193569822552772e-06,
"loss": 0.0913,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 373
},
{
"clip_ratio": 0.0,
"completion_length": 392.0625,
"epoch": 0.05373756241244298,
"grad_norm": 1.8771501779556274,
"kl": 0.47216796875,
"learning_rate": 1.2087069069041268e-06,
"loss": 0.0733,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 374
},
{
"clip_ratio": 0.0,
"completion_length": 370.0,
"epoch": 0.05388124573440138,
"grad_norm": 0.08563094586133957,
"kl": 0.3857421875,
"learning_rate": 1.1981215011242654e-06,
"loss": 0.0038,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 375
},
{
"clip_ratio": 0.0,
"completion_length": 433.3125,
"epoch": 0.054024929056359784,
"grad_norm": 1.363516926765442,
"kl": 0.43994140625,
"learning_rate": 1.1876012090581184e-06,
"loss": 0.064,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 376
},
{
"clip_ratio": 0.0,
"completion_length": 417.9375,
"epoch": 0.05416861237831819,
"grad_norm": 1.752987027168274,
"kl": 0.404296875,
"learning_rate": 1.177146472116071e-06,
"loss": 0.1186,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 377
},
{
"clip_ratio": 0.0,
"completion_length": 360.0625,
"epoch": 0.05431229570027659,
"grad_norm": 0.08021112531423569,
"kl": 0.4033203125,
"learning_rate": 1.1667577289579462e-06,
"loss": 0.0043,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 378
},
{
"clip_ratio": 0.0,
"completion_length": 312.1875,
"epoch": 0.05445597902223499,
"grad_norm": 0.12521375715732574,
"kl": 0.4931640625,
"learning_rate": 1.1564354154746007e-06,
"loss": 0.0051,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 379
},
{
"clip_ratio": 0.0,
"completion_length": 321.8125,
"epoch": 0.054599662344193396,
"grad_norm": 0.09579505026340485,
"kl": 0.4921875,
"learning_rate": 1.146179964769635e-06,
"loss": 0.0048,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 380
},
{
"clip_ratio": 0.0,
"completion_length": 474.5,
"epoch": 0.0547433456661518,
"grad_norm": 0.0810822919011116,
"kl": 0.44970703125,
"learning_rate": 1.1359918071412195e-06,
"loss": 0.0046,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 381
},
{
"clip_ratio": 0.0,
"completion_length": 397.8125,
"epoch": 0.0548870289881102,
"grad_norm": 1.5699117183685303,
"kl": 0.4716796875,
"learning_rate": 1.1258713700640456e-06,
"loss": 0.0992,
"reward": 0.0875000013038516,
"reward_std": 0.014433757402002811,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 382
},
{
"clip_ratio": 0.0,
"completion_length": 421.875,
"epoch": 0.05503071231006861,
"grad_norm": 0.06795567274093628,
"kl": 0.47998046875,
"learning_rate": 1.115819078171383e-06,
"loss": 0.0049,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 383
},
{
"clip_ratio": 0.0,
"completion_length": 302.0,
"epoch": 0.055174395632027015,
"grad_norm": 2.255272150039673,
"kl": 0.5107421875,
"learning_rate": 1.1058353532372667e-06,
"loss": 0.0632,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 384
},
{
"clip_ratio": 0.0,
"completion_length": 323.25,
"epoch": 0.05531807895398542,
"grad_norm": 0.1344795525074005,
"kl": 0.4765625,
"learning_rate": 1.0959206141587998e-06,
"loss": 0.0048,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 385
},
{
"clip_ratio": 0.0,
"completion_length": 293.0,
"epoch": 0.05546176227594382,
"grad_norm": 2.3380484580993652,
"kl": 0.49462890625,
"learning_rate": 1.0860752769385766e-06,
"loss": -0.0554,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 386
},
{
"clip_ratio": 0.0,
"completion_length": 289.25,
"epoch": 0.055605445597902224,
"grad_norm": 2.3010873794555664,
"kl": 0.41650390625,
"learning_rate": 1.0762997546672279e-06,
"loss": -0.0603,
"reward": 0.08750000223517418,
"reward_std": 0.02500000037252903,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 387
},
{
"clip_ratio": 0.0,
"completion_length": 352.75,
"epoch": 0.05574912891986063,
"grad_norm": 0.09730658680200577,
"kl": 0.46533203125,
"learning_rate": 1.0665944575060914e-06,
"loss": 0.0047,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 388
},
{
"clip_ratio": 0.0,
"completion_length": 317.75,
"epoch": 0.05589281224181903,
"grad_norm": 0.08599717915058136,
"kl": 0.40087890625,
"learning_rate": 1.056959792669997e-06,
"loss": 0.004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 389
},
{
"clip_ratio": 0.0,
"completion_length": 301.9375,
"epoch": 0.05603649556377743,
"grad_norm": 0.08673281967639923,
"kl": 0.36865234375,
"learning_rate": 1.0473961644101856e-06,
"loss": 0.0037,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 390
},
{
"clip_ratio": 0.0,
"completion_length": 275.75,
"epoch": 0.056180178885735836,
"grad_norm": 0.08435692638158798,
"kl": 0.4267578125,
"learning_rate": 1.037903973997345e-06,
"loss": 0.0039,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 391
},
{
"clip_ratio": 0.0,
"completion_length": 416.0,
"epoch": 0.05632386220769424,
"grad_norm": 0.10442197322845459,
"kl": 0.447265625,
"learning_rate": 1.0284836197047737e-06,
"loss": 0.0046,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 392
},
{
"clip_ratio": 0.0,
"completion_length": 270.0625,
"epoch": 0.05646754552965264,
"grad_norm": 0.1144208163022995,
"kl": 0.4169921875,
"learning_rate": 1.0191354967916712e-06,
"loss": 0.0041,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 393
},
{
"clip_ratio": 0.0,
"completion_length": 413.375,
"epoch": 0.05661122885161105,
"grad_norm": 1.8321337699890137,
"kl": 0.46142578125,
"learning_rate": 1.0098599974865515e-06,
"loss": 0.1531,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 394
},
{
"clip_ratio": 0.0,
"completion_length": 283.375,
"epoch": 0.056754912173569455,
"grad_norm": 0.07575566321611404,
"kl": 0.44384765625,
"learning_rate": 1.0006575109707898e-06,
"loss": 0.0044,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 395
},
{
"clip_ratio": 0.0,
"completion_length": 300.25,
"epoch": 0.05689859549552786,
"grad_norm": 0.06213975325226784,
"kl": 0.4384765625,
"learning_rate": 9.915284233622877e-07,
"loss": 0.0044,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 396
},
{
"clip_ratio": 0.0,
"completion_length": 300.25,
"epoch": 0.05704227881748626,
"grad_norm": 0.09239588677883148,
"kl": 0.42138671875,
"learning_rate": 9.824731176992796e-07,
"loss": 0.0042,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 397
},
{
"clip_ratio": 0.0,
"completion_length": 300.6875,
"epoch": 0.057185962139444664,
"grad_norm": 0.21190443634986877,
"kl": 0.396484375,
"learning_rate": 9.734919739242543e-07,
"loss": 0.0041,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 398
},
{
"clip_ratio": 0.0,
"completion_length": 248.0,
"epoch": 0.05732964546140307,
"grad_norm": 0.1755290925502777,
"kl": 0.43212890625,
"learning_rate": 9.645853688680177e-07,
"loss": 0.0043,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 399
},
{
"clip_ratio": 0.0,
"completion_length": 331.6875,
"epoch": 0.05747332878336147,
"grad_norm": 1.920776128768921,
"kl": 0.4873046875,
"learning_rate": 9.557536762338786e-07,
"loss": 0.1366,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 400
},
{
"clip_ratio": 0.0,
"completion_length": 341.125,
"epoch": 0.05761701210531987,
"grad_norm": 0.12242202460765839,
"kl": 0.44580078125,
"learning_rate": 9.46997266581973e-07,
"loss": 0.0046,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 401
},
{
"clip_ratio": 0.0,
"completion_length": 294.75,
"epoch": 0.057760695427278276,
"grad_norm": 0.07106681913137436,
"kl": 0.3984375,
"learning_rate": 9.383165073137115e-07,
"loss": 0.0037,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 402
},
{
"clip_ratio": 0.0,
"completion_length": 328.9375,
"epoch": 0.05790437874923668,
"grad_norm": 0.10068362206220627,
"kl": 0.431640625,
"learning_rate": 9.297117626563687e-07,
"loss": 0.0044,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 403
},
{
"clip_ratio": 0.0,
"completion_length": 304.0625,
"epoch": 0.05804806207119509,
"grad_norm": 0.20983238518238068,
"kl": 0.4404296875,
"learning_rate": 9.211833936477957e-07,
"loss": 0.0045,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 404
},
{
"clip_ratio": 0.0,
"completion_length": 309.0625,
"epoch": 0.05819174539315349,
"grad_norm": 0.14584408700466156,
"kl": 0.4462890625,
"learning_rate": 9.127317581212753e-07,
"loss": 0.0046,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 405
},
{
"clip_ratio": 0.0,
"completion_length": 359.4375,
"epoch": 0.058335428715111895,
"grad_norm": 0.0764172375202179,
"kl": 0.41796875,
"learning_rate": 9.043572106905084e-07,
"loss": 0.0041,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 406
},
{
"clip_ratio": 0.0,
"completion_length": 380.6875,
"epoch": 0.0584791120370703,
"grad_norm": 0.0757213830947876,
"kl": 0.4384765625,
"learning_rate": 8.960601027347321e-07,
"loss": 0.0046,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 407
},
{
"clip_ratio": 0.0,
"completion_length": 306.8125,
"epoch": 0.0586227953590287,
"grad_norm": 0.0817841961979866,
"kl": 0.42529296875,
"learning_rate": 8.878407823839788e-07,
"loss": 0.0043,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 408
},
{
"clip_ratio": 0.0,
"completion_length": 248.0,
"epoch": 0.058766478680987104,
"grad_norm": 2.3475921154022217,
"kl": 0.4404296875,
"learning_rate": 8.796995945044689e-07,
"loss": 0.038,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 409
},
{
"clip_ratio": 0.0,
"completion_length": 327.25,
"epoch": 0.05891016200294551,
"grad_norm": 0.11481396108865738,
"kl": 0.4560546875,
"learning_rate": 8.716368806841405e-07,
"loss": 0.0047,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 410
},
{
"clip_ratio": 0.0,
"completion_length": 327.0625,
"epoch": 0.05905384532490391,
"grad_norm": 0.11284112930297852,
"kl": 0.36279296875,
"learning_rate": 8.636529792183171e-07,
"loss": 0.0036,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 411
},
{
"clip_ratio": 0.0,
"completion_length": 387.0625,
"epoch": 0.05919752864686231,
"grad_norm": 1.5390390157699585,
"kl": 0.48486328125,
"learning_rate": 8.557482250955144e-07,
"loss": 0.0038,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 412
},
{
"clip_ratio": 0.0,
"completion_length": 320.4375,
"epoch": 0.059341211968820716,
"grad_norm": 0.12584422528743744,
"kl": 0.45556640625,
"learning_rate": 8.479229499833844e-07,
"loss": 0.0047,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 413
},
{
"clip_ratio": 0.0,
"completion_length": 310.3125,
"epoch": 0.059484895290779126,
"grad_norm": 0.06749071925878525,
"kl": 0.4423828125,
"learning_rate": 8.401774822147976e-07,
"loss": 0.0044,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 414
},
{
"clip_ratio": 0.0,
"completion_length": 418.6875,
"epoch": 0.05962857861273753,
"grad_norm": 0.0747298002243042,
"kl": 0.501953125,
"learning_rate": 8.325121467740695e-07,
"loss": 0.005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 415
},
{
"clip_ratio": 0.0,
"completion_length": 310.6875,
"epoch": 0.05977226193469593,
"grad_norm": 1.954253911972046,
"kl": 0.3984375,
"learning_rate": 8.249272652833226e-07,
"loss": -0.0782,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 416
},
{
"clip_ratio": 0.0,
"completion_length": 442.5625,
"epoch": 0.059915945256654335,
"grad_norm": 0.09026701003313065,
"kl": 0.49267578125,
"learning_rate": 8.174231559889931e-07,
"loss": 0.0052,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 417
},
{
"clip_ratio": 0.0,
"completion_length": 264.4375,
"epoch": 0.06005962857861274,
"grad_norm": 0.10345678776502609,
"kl": 0.45751953125,
"learning_rate": 8.100001337484787e-07,
"loss": 0.0045,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 418
},
{
"clip_ratio": 0.0,
"completion_length": 311.375,
"epoch": 0.06020331190057114,
"grad_norm": 0.07524294406175613,
"kl": 0.38232421875,
"learning_rate": 8.026585100169251e-07,
"loss": 0.004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 419
},
{
"clip_ratio": 0.0,
"completion_length": 299.375,
"epoch": 0.060346995222529544,
"grad_norm": 0.08929236233234406,
"kl": 0.37255859375,
"learning_rate": 7.953985928341601e-07,
"loss": 0.0035,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 420
},
{
"clip_ratio": 0.0,
"completion_length": 291.8125,
"epoch": 0.06049067854448795,
"grad_norm": 0.08844827115535736,
"kl": 0.38427734375,
"learning_rate": 7.882206868117693e-07,
"loss": 0.0038,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 421
},
{
"clip_ratio": 0.0,
"completion_length": 388.0625,
"epoch": 0.06063436186644635,
"grad_norm": 0.08420619368553162,
"kl": 0.3828125,
"learning_rate": 7.81125093120313e-07,
"loss": 0.0039,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 422
},
{
"clip_ratio": 0.0,
"completion_length": 364.3125,
"epoch": 0.06077804518840475,
"grad_norm": 0.05994531139731407,
"kl": 0.39013671875,
"learning_rate": 7.741121094766916e-07,
"loss": 0.0039,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 423
},
{
"clip_ratio": 0.0,
"completion_length": 346.9375,
"epoch": 0.06092172851036316,
"grad_norm": 169.9329376220703,
"kl": 15.16455078125,
"learning_rate": 7.671820301316532e-07,
"loss": 0.0798,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 424
},
{
"clip_ratio": 0.0,
"completion_length": 301.875,
"epoch": 0.061065411832321566,
"grad_norm": 0.08597877621650696,
"kl": 0.4296875,
"learning_rate": 7.603351458574474e-07,
"loss": 0.0043,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 425
},
{
"clip_ratio": 0.0,
"completion_length": 360.625,
"epoch": 0.06120909515427997,
"grad_norm": 0.08660821616649628,
"kl": 0.4404296875,
"learning_rate": 7.535717439356255e-07,
"loss": 0.0045,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 426
},
{
"clip_ratio": 0.0,
"completion_length": 347.375,
"epoch": 0.06135277847623837,
"grad_norm": 0.09224473685026169,
"kl": 0.45654296875,
"learning_rate": 7.46892108144986e-07,
"loss": 0.0047,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 427
},
{
"clip_ratio": 0.0,
"completion_length": 342.3125,
"epoch": 0.061496461798196775,
"grad_norm": 0.11141007393598557,
"kl": 0.42578125,
"learning_rate": 7.402965187496697e-07,
"loss": 0.0041,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 428
},
{
"clip_ratio": 0.0,
"completion_length": 340.5,
"epoch": 0.06164014512015518,
"grad_norm": 2.9540467262268066,
"kl": 0.42919921875,
"learning_rate": 7.337852524873974e-07,
"loss": 0.2308,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 429
},
{
"clip_ratio": 0.0,
"completion_length": 370.4375,
"epoch": 0.06178382844211358,
"grad_norm": 0.10491559654474258,
"kl": 0.3974609375,
"learning_rate": 7.273585825578608e-07,
"loss": 0.0039,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 430
},
{
"clip_ratio": 0.0,
"completion_length": 334.8125,
"epoch": 0.061927511764071984,
"grad_norm": 2.0611469745635986,
"kl": 0.48681640625,
"learning_rate": 7.21016778611259e-07,
"loss": 0.0687,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 431
},
{
"clip_ratio": 0.0,
"completion_length": 346.1875,
"epoch": 0.06207119508603039,
"grad_norm": 1.7300693988800049,
"kl": 0.40234375,
"learning_rate": 7.147601067369835e-07,
"loss": 0.1007,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 432
},
{
"clip_ratio": 0.0,
"completion_length": 326.25,
"epoch": 0.06221487840798879,
"grad_norm": 1.881553053855896,
"kl": 0.3720703125,
"learning_rate": 7.085888294524561e-07,
"loss": 0.0915,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 433
},
{
"clip_ratio": 0.0,
"completion_length": 331.625,
"epoch": 0.0623585617299472,
"grad_norm": 0.08463376760482788,
"kl": 0.431640625,
"learning_rate": 7.025032056921117e-07,
"loss": 0.0044,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 434
},
{
"clip_ratio": 0.0,
"completion_length": 280.25,
"epoch": 0.0625022450519056,
"grad_norm": 0.08293016254901886,
"kl": 0.43505859375,
"learning_rate": 6.965034907965349e-07,
"loss": 0.0043,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 435
},
{
"clip_ratio": 0.0,
"completion_length": 362.3125,
"epoch": 0.062645928373864,
"grad_norm": 0.13934151828289032,
"kl": 0.45068359375,
"learning_rate": 6.905899365017462e-07,
"loss": 0.0044,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 436
},
{
"clip_ratio": 0.0,
"completion_length": 235.0,
"epoch": 0.06278961169582241,
"grad_norm": 2.300443649291992,
"kl": 0.53857421875,
"learning_rate": 6.847627909286409e-07,
"loss": -0.0633,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 437
},
{
"clip_ratio": 0.0,
"completion_length": 321.6875,
"epoch": 0.0629332950177808,
"grad_norm": 2.531771183013916,
"kl": 0.43359375,
"learning_rate": 6.790222985725761e-07,
"loss": 0.1378,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 438
},
{
"clip_ratio": 0.0,
"completion_length": 317.125,
"epoch": 0.06307697833973921,
"grad_norm": 0.17811354994773865,
"kl": 0.45751953125,
"learning_rate": 6.733687002931141e-07,
"loss": 0.0048,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 439
},
{
"clip_ratio": 0.0,
"completion_length": 386.5,
"epoch": 0.06322066166169762,
"grad_norm": 0.0891660824418068,
"kl": 0.38916015625,
"learning_rate": 6.678022333039158e-07,
"loss": 0.0038,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 440
},
{
"clip_ratio": 0.0,
"completion_length": 354.5625,
"epoch": 0.06336434498365602,
"grad_norm": 1.3903534412384033,
"kl": 0.38818359375,
"learning_rate": 6.623231311627876e-07,
"loss": 0.0717,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 441
},
{
"clip_ratio": 0.0,
"completion_length": 333.625,
"epoch": 0.06350802830561443,
"grad_norm": 0.08387104421854019,
"kl": 0.47412109375,
"learning_rate": 6.569316237618811e-07,
"loss": 0.005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 442
},
{
"clip_ratio": 0.0,
"completion_length": 329.75,
"epoch": 0.06365171162757283,
"grad_norm": 0.10497360676527023,
"kl": 0.46826171875,
"learning_rate": 6.516279373180499e-07,
"loss": 0.0048,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 443
},
{
"clip_ratio": 0.0,
"completion_length": 349.125,
"epoch": 0.06379539494953124,
"grad_norm": 0.1853691041469574,
"kl": 0.48046875,
"learning_rate": 6.464122943633543e-07,
"loss": 0.0047,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 444
},
{
"clip_ratio": 0.0,
"completion_length": 348.5,
"epoch": 0.06393907827148963,
"grad_norm": 1.2641898393630981,
"kl": 0.4384765625,
"learning_rate": 6.412849137357271e-07,
"loss": -0.041,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 445
},
{
"clip_ratio": 0.0,
"completion_length": 376.9375,
"epoch": 0.06408276159344804,
"grad_norm": 2.004131317138672,
"kl": 0.48583984375,
"learning_rate": 6.3624601056979e-07,
"loss": -0.0629,
"reward": 0.08750000223517418,
"reward_std": 0.02500000037252903,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 446
},
{
"clip_ratio": 0.0,
"completion_length": 392.8125,
"epoch": 0.06422644491540644,
"grad_norm": 3.0901613235473633,
"kl": 0.44677734375,
"learning_rate": 6.312957962878278e-07,
"loss": 0.1556,
"reward": 0.08125000260770321,
"reward_std": 0.037500000558793545,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.8125,
"step": 447
},
{
"clip_ratio": 0.0,
"completion_length": 356.1875,
"epoch": 0.06437012823736485,
"grad_norm": 0.08546911180019379,
"kl": 0.40185546875,
"learning_rate": 6.264344785909181e-07,
"loss": 0.004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 448
},
{
"clip_ratio": 0.0,
"completion_length": 262.875,
"epoch": 0.06451381155932324,
"grad_norm": 0.12542271614074707,
"kl": 0.40625,
"learning_rate": 6.216622614502149e-07,
"loss": 0.004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 449
},
{
"clip_ratio": 0.0,
"completion_length": 446.1875,
"epoch": 0.06465749488128165,
"grad_norm": 0.882746696472168,
"kl": 0.4072265625,
"learning_rate": 6.169793450983916e-07,
"loss": 0.0326,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 450
},
{
"clip_ratio": 0.0,
"completion_length": 243.6875,
"epoch": 0.06480117820324006,
"grad_norm": 0.08595617115497589,
"kl": 0.38916015625,
"learning_rate": 6.123859260212393e-07,
"loss": 0.0038,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 451
},
{
"clip_ratio": 0.0,
"completion_length": 320.625,
"epoch": 0.06494486152519846,
"grad_norm": 0.0691758245229721,
"kl": 0.40380859375,
"learning_rate": 6.07882196949423e-07,
"loss": 0.004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 452
},
{
"clip_ratio": 0.0,
"completion_length": 460.0625,
"epoch": 0.06508854484715687,
"grad_norm": 0.06517741084098816,
"kl": 0.37109375,
"learning_rate": 6.034683468503948e-07,
"loss": 0.0037,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 453
},
{
"clip_ratio": 0.0,
"completion_length": 338.625,
"epoch": 0.06523222816911527,
"grad_norm": 0.1255577653646469,
"kl": 0.52587890625,
"learning_rate": 5.991445609204641e-07,
"loss": 0.0053,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 454
},
{
"clip_ratio": 0.0,
"completion_length": 294.1875,
"epoch": 0.06537591149107368,
"grad_norm": 0.2892981767654419,
"kl": 0.421875,
"learning_rate": 5.949110205770292e-07,
"loss": 0.0044,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 455
},
{
"clip_ratio": 0.0,
"completion_length": 290.625,
"epoch": 0.06551959481303207,
"grad_norm": 0.08248059451580048,
"kl": 0.4111328125,
"learning_rate": 5.90767903450964e-07,
"loss": 0.0043,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 456
},
{
"clip_ratio": 0.0,
"completion_length": 316.8125,
"epoch": 0.06566327813499048,
"grad_norm": 0.06285588443279266,
"kl": 0.40771484375,
"learning_rate": 5.867153833791652e-07,
"loss": 0.0041,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 457
},
{
"clip_ratio": 0.0,
"completion_length": 407.375,
"epoch": 0.06580696145694888,
"grad_norm": 1.4286978244781494,
"kl": 0.41552734375,
"learning_rate": 5.827536303972587e-07,
"loss": 0.099,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 458
},
{
"clip_ratio": 0.0,
"completion_length": 293.9375,
"epoch": 0.06595064477890729,
"grad_norm": 0.07036440074443817,
"kl": 0.39599609375,
"learning_rate": 5.78882810732465e-07,
"loss": 0.0039,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 459
},
{
"clip_ratio": 0.0,
"completion_length": 346.875,
"epoch": 0.0660943281008657,
"grad_norm": 0.15339231491088867,
"kl": 0.416015625,
"learning_rate": 5.75103086796625e-07,
"loss": 0.0042,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 460
},
{
"clip_ratio": 0.0,
"completion_length": 375.5625,
"epoch": 0.0662380114228241,
"grad_norm": 0.07655762881040573,
"kl": 0.38818359375,
"learning_rate": 5.714146171793846e-07,
"loss": 0.0037,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 461
},
{
"clip_ratio": 0.0,
"completion_length": 326.375,
"epoch": 0.0663816947447825,
"grad_norm": 0.12055181711912155,
"kl": 0.53515625,
"learning_rate": 5.678175566415422e-07,
"loss": 0.0056,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 462
},
{
"clip_ratio": 0.0,
"completion_length": 314.1875,
"epoch": 0.0665253780667409,
"grad_norm": 0.0709441527724266,
"kl": 0.4140625,
"learning_rate": 5.643120561085528e-07,
"loss": 0.0041,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 463
},
{
"clip_ratio": 0.0,
"completion_length": 273.25,
"epoch": 0.06666906138869931,
"grad_norm": 0.10049540549516678,
"kl": 0.48583984375,
"learning_rate": 5.608982626641991e-07,
"loss": 0.0049,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 464
},
{
"clip_ratio": 0.0,
"completion_length": 336.9375,
"epoch": 0.0668127447106577,
"grad_norm": 0.18985526263713837,
"kl": 0.4140625,
"learning_rate": 5.575763195444166e-07,
"loss": 0.0042,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 465
},
{
"clip_ratio": 0.0,
"completion_length": 312.375,
"epoch": 0.06695642803261612,
"grad_norm": 1.2384103536605835,
"kl": 0.4248046875,
"learning_rate": 5.543463661312847e-07,
"loss": -0.0351,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 466
},
{
"clip_ratio": 0.0,
"completion_length": 367.9375,
"epoch": 0.06710011135457451,
"grad_norm": 0.08845322579145432,
"kl": 0.4287109375,
"learning_rate": 5.512085379471808e-07,
"loss": 0.0046,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 467
},
{
"clip_ratio": 0.0,
"completion_length": 403.25,
"epoch": 0.06724379467653292,
"grad_norm": 1.3062446117401123,
"kl": 0.412109375,
"learning_rate": 5.481629666490903e-07,
"loss": 0.0713,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 468
},
{
"clip_ratio": 0.0,
"completion_length": 365.1875,
"epoch": 0.06738747799849132,
"grad_norm": 0.07498609274625778,
"kl": 0.3583984375,
"learning_rate": 5.452097800230853e-07,
"loss": 0.0035,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 469
},
{
"clip_ratio": 0.0,
"completion_length": 347.8125,
"epoch": 0.06753116132044973,
"grad_norm": 1.3313813209533691,
"kl": 0.462890625,
"learning_rate": 5.423491019789623e-07,
"loss": -0.0492,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 470
},
{
"clip_ratio": 0.0,
"completion_length": 315.9375,
"epoch": 0.06767484464240814,
"grad_norm": 0.08418026566505432,
"kl": 0.44775390625,
"learning_rate": 5.395810525450425e-07,
"loss": 0.0045,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 471
},
{
"clip_ratio": 0.0,
"completion_length": 328.375,
"epoch": 0.06781852796436653,
"grad_norm": 0.065086729824543,
"kl": 0.34716796875,
"learning_rate": 5.369057478631359e-07,
"loss": 0.0034,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 472
},
{
"clip_ratio": 0.0,
"completion_length": 375.5625,
"epoch": 0.06796221128632494,
"grad_norm": 1.40998113155365,
"kl": 0.39208984375,
"learning_rate": 5.343233001836694e-07,
"loss": 0.1839,
"reward": 0.08750000223517418,
"reward_std": 0.02500000037252903,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 473
},
{
"clip_ratio": 0.0,
"completion_length": 283.4375,
"epoch": 0.06810589460828334,
"grad_norm": 0.18871350586414337,
"kl": 0.42236328125,
"learning_rate": 5.318338178609754e-07,
"loss": 0.004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 474
},
{
"clip_ratio": 0.0,
"completion_length": 331.0625,
"epoch": 0.06824957793024175,
"grad_norm": 0.07475198060274124,
"kl": 0.44384765625,
"learning_rate": 5.294374053487459e-07,
"loss": 0.0044,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 475
},
{
"clip_ratio": 0.0,
"completion_length": 350.0625,
"epoch": 0.06839326125220015,
"grad_norm": 0.08038158714771271,
"kl": 0.361328125,
"learning_rate": 5.271341631956511e-07,
"loss": 0.0035,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 476
},
{
"clip_ratio": 0.0,
"completion_length": 332.75,
"epoch": 0.06853694457415856,
"grad_norm": 1.5739789009094238,
"kl": 0.46044921875,
"learning_rate": 5.249241880411181e-07,
"loss": -0.0099,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 477
},
{
"clip_ratio": 0.0,
"completion_length": 401.1875,
"epoch": 0.06868062789611695,
"grad_norm": 0.0891866534948349,
"kl": 0.38525390625,
"learning_rate": 5.228075726112785e-07,
"loss": 0.0039,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 478
},
{
"clip_ratio": 0.0,
"completion_length": 443.9375,
"epoch": 0.06882431121807536,
"grad_norm": 0.11574111878871918,
"kl": 0.37548828125,
"learning_rate": 5.207844057150768e-07,
"loss": 0.0037,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 479
},
{
"clip_ratio": 0.0,
"completion_length": 369.9375,
"epoch": 0.06896799454003377,
"grad_norm": 0.07223603874444962,
"kl": 0.34716796875,
"learning_rate": 5.188547722405437e-07,
"loss": 0.0034,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 480
},
{
"clip_ratio": 0.0,
"completion_length": 363.375,
"epoch": 0.06911167786199217,
"grad_norm": 0.07493572682142258,
"kl": 0.4140625,
"learning_rate": 5.170187531512351e-07,
"loss": 0.0042,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 481
},
{
"clip_ratio": 0.0,
"completion_length": 386.1875,
"epoch": 0.06925536118395058,
"grad_norm": 0.20151501893997192,
"kl": 0.453125,
"learning_rate": 5.152764254828348e-07,
"loss": 0.005,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 482
},
{
"clip_ratio": 0.0,
"completion_length": 262.3125,
"epoch": 0.06939904450590897,
"grad_norm": 0.200210303068161,
"kl": 0.46435546875,
"learning_rate": 5.136278623399225e-07,
"loss": 0.0048,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 483
},
{
"clip_ratio": 0.0,
"completion_length": 320.75,
"epoch": 0.06954272782786738,
"grad_norm": 0.08599031716585159,
"kl": 0.369140625,
"learning_rate": 5.120731328929058e-07,
"loss": 0.0036,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 484
},
{
"clip_ratio": 0.0,
"completion_length": 469.25,
"epoch": 0.06968641114982578,
"grad_norm": 1.3228610754013062,
"kl": 0.390625,
"learning_rate": 5.106123023751187e-07,
"loss": 0.0797,
"reward": 0.0875000013038516,
"reward_std": 0.014433757402002811,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 485
},
{
"clip_ratio": 0.0,
"completion_length": 443.5,
"epoch": 0.06983009447178419,
"grad_norm": 0.06405351310968399,
"kl": 0.4345703125,
"learning_rate": 5.092454320800833e-07,
"loss": 0.0044,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 486
},
{
"clip_ratio": 0.0,
"completion_length": 313.625,
"epoch": 0.06997377779374259,
"grad_norm": 2.284759998321533,
"kl": 0.3701171875,
"learning_rate": 5.079725793589405e-07,
"loss": 0.0138,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 487
},
{
"clip_ratio": 0.0,
"completion_length": 406.375,
"epoch": 0.070117461115701,
"grad_norm": 0.06436394900083542,
"kl": 0.39208984375,
"learning_rate": 5.067937976180407e-07,
"loss": 0.004,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 488
},
{
"clip_ratio": 0.0,
"completion_length": 413.125,
"epoch": 0.07026114443765939,
"grad_norm": 2.4305238723754883,
"kl": 0.4248046875,
"learning_rate": 5.057091363167046e-07,
"loss": 0.0754,
"reward": 0.08750000223517418,
"reward_std": 0.02500000037252903,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 489
},
{
"clip_ratio": 0.0,
"completion_length": 342.25,
"epoch": 0.0704048277596178,
"grad_norm": 0.06973189115524292,
"kl": 0.3818359375,
"learning_rate": 5.047186409651489e-07,
"loss": 0.0036,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 490
},
{
"clip_ratio": 0.0,
"completion_length": 367.8125,
"epoch": 0.07054851108157621,
"grad_norm": 1.2751497030258179,
"kl": 0.38330078125,
"learning_rate": 5.038223531225742e-07,
"loss": 0.0632,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 491
},
{
"clip_ratio": 0.0,
"completion_length": 378.5625,
"epoch": 0.07069219440353461,
"grad_norm": 2.2283360958099365,
"kl": 0.40185546875,
"learning_rate": 5.030203103954232e-07,
"loss": 0.157,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 492
},
{
"clip_ratio": 0.0,
"completion_length": 359.3125,
"epoch": 0.07083587772549302,
"grad_norm": 0.06642284989356995,
"kl": 0.36181640625,
"learning_rate": 5.023125464358026e-07,
"loss": 0.0036,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 493
},
{
"clip_ratio": 0.0,
"completion_length": 359.125,
"epoch": 0.07097956104745141,
"grad_norm": 1.7237136363983154,
"kl": 0.419921875,
"learning_rate": 5.016990909400709e-07,
"loss": 0.1536,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 494
},
{
"clip_ratio": 0.0,
"completion_length": 412.5625,
"epoch": 0.07112324436940982,
"grad_norm": 0.07103113830089569,
"kl": 0.37353515625,
"learning_rate": 5.011799696475915e-07,
"loss": 0.0039,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 495
},
{
"clip_ratio": 0.0,
"completion_length": 427.0,
"epoch": 0.07126692769136822,
"grad_norm": 0.07387176156044006,
"kl": 0.40576171875,
"learning_rate": 5.007552043396547e-07,
"loss": 0.0041,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 496
},
{
"clip_ratio": 0.0,
"completion_length": 442.5,
"epoch": 0.07141061101332663,
"grad_norm": 2.5220470428466797,
"kl": 0.3955078125,
"learning_rate": 5.004248128385618e-07,
"loss": 0.1158,
"reward": 0.08750000223517418,
"reward_std": 0.02500000037252903,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.875,
"step": 497
},
{
"clip_ratio": 0.0,
"completion_length": 316.625,
"epoch": 0.07155429433528503,
"grad_norm": 0.09226960688829422,
"kl": 0.3818359375,
"learning_rate": 5.001888090068784e-07,
"loss": 0.0037,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 498
},
{
"clip_ratio": 0.0,
"completion_length": 321.375,
"epoch": 0.07169797765724344,
"grad_norm": 1.362955093383789,
"kl": 0.40478515625,
"learning_rate": 5.000472027468528e-07,
"loss": -0.0529,
"reward": 0.09375000186264515,
"reward_std": 0.012500000186264515,
"rewards/code_reward": 0.0,
"rewards/format_reward": 0.9375,
"step": 499
},
{
"clip_ratio": 0.0,
"completion_length": 339.375,
"epoch": 0.07184166097920183,
"grad_norm": 0.08727846294641495,
"kl": 0.38037109375,
"learning_rate": 5.000000000000001e-07,
"loss": 0.0039,
"reward": 0.10000000149011612,
"reward_std": 0.0,
"rewards/code_reward": 0.0,
"rewards/format_reward": 1.0,
"step": 500
},
{
"epoch": 0.07184166097920183,
"step": 500,
"total_flos": 0.0,
"train_loss": 0.05661330611514859,
"train_runtime": 3774.5693,
"train_samples_per_second": 2.119,
"train_steps_per_second": 0.132
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}