diff --git "a/checkpoint-1473/trainer_state.json" "b/checkpoint-1473/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1473/trainer_state.json" @@ -0,0 +1,44286 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1473, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 890.0, + "completions/max_terminated_length": 890.0, + "completions/mean_length": 633.125, + "completions/mean_terminated_length": 633.125, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "entropy": 0.40213863365352154, + "epoch": 0.0006788866259334691, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "learning_rate": 0.0, + "loss": -0.0, + "num_tokens": 8665.0, + "reward": 1.4499999284744263, + "reward_std": 0.978336751461029, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.07500000298023224, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10350984334945679, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 809.0, + "completions/max_terminated_length": 809.0, + "completions/mean_length": 462.0, + "completions/mean_terminated_length": 462.0, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "entropy": 0.48721940256655216, + "epoch": 0.0013577732518669382, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "learning_rate": 1.3513513513513515e-07, + "loss": 0.0, + "num_tokens": 15553.0, + "reward": 1.0833333730697632, + "reward_std": 0.8355209231376648, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2083333432674408, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.23754701018333435, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 720.0, + "completions/max_terminated_length": 720.0, + "completions/mean_length": 383.125, + "completions/mean_terminated_length": 383.125, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "entropy": 0.2951512522995472, + "epoch": 0.002036659877800407, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.46875, + "learning_rate": 2.702702702702703e-07, + "loss": 0.0, + "num_tokens": 21634.0, + "reward": 1.0, + "reward_std": 0.9258201122283936, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.5175492167472839, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 851.0, + "completions/max_terminated_length": 851.0, + "completions/mean_length": 588.25, + "completions/mean_terminated_length": 588.25, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.45288635790348053, + "epoch": 0.0027155465037338763, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "learning_rate": 4.0540540540540546e-07, + "loss": 0.0, + "num_tokens": 29468.0, + "reward": 0.550000011920929, + "reward_std": 0.6023762226104736, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.5175492167472839, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05000000074505806, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1414213627576828, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 769.0, + "completions/max_terminated_length": 769.0, + "completions/mean_length": 467.25, + "completions/mean_terminated_length": 467.25, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.3951845243573189, + "epoch": 0.0033944331296673455, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "learning_rate": 5.405405405405406e-07, + "loss": 0.0, + "num_tokens": 36478.0, + "reward": 1.0833333730697632, + "reward_std": 0.7292092442512512, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0833333358168602, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2357022911310196, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 703.0, + "completions/max_terminated_length": 703.0, + "completions/mean_length": 571.125, + "completions/mean_terminated_length": 571.125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.6701111160218716, + "epoch": 0.004073319755600814, + "frac_reward_zero_std": 0.0, + "grad_norm": 66.5, + "learning_rate": 6.756756756756758e-07, + "loss": -0.0, + "num_tokens": 44455.0, + "reward": 0.5357142686843872, + "reward_std": 0.7889543771743774, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4629100561141968, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0357142873108387, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10101525485515594, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 357.0, + "completions/mean_terminated_length": 357.0, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5524849742650986, + "epoch": 0.0047522063815342835, + "frac_reward_zero_std": 0.0, + "grad_norm": 123.5, + "learning_rate": 8.108108108108109e-07, + "loss": 0.0, + "num_tokens": 50351.0, + "reward": 1.0729167461395264, + "reward_std": 0.8699543476104736, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4629100561141968, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0729166716337204, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1368400603532791, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1026.0, + "completions/max_terminated_length": 1026.0, + "completions/mean_length": 781.0, + "completions/mean_terminated_length": 781.0, + "completions/min_length": 489.0, + "completions/min_terminated_length": 489.0, + "entropy": 0.4235018901526928, + "epoch": 0.005431093007467753, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0078125, + "learning_rate": 9.459459459459461e-07, + "loss": -0.0, + "num_tokens": 60079.0, + "reward": 0.1875, + "reward_std": 0.5303300619125366, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 932.0, + "completions/max_terminated_length": 932.0, + "completions/mean_length": 606.125, + "completions/mean_terminated_length": 606.125, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "entropy": 0.3649282669648528, + "epoch": 0.006109979633401222, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "learning_rate": 1.0810810810810812e-06, + "loss": 0.0, + "num_tokens": 68432.0, + "reward": 1.2000000476837158, + "reward_std": 1.025740146636963, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.20000000298023224, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3082207143306732, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 221.625, + "completions/mean_terminated_length": 221.625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.353199296630919, + "epoch": 0.006788866259334691, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.78125, + "learning_rate": 1.2162162162162164e-06, + "loss": 0.0, + "num_tokens": 73469.0, + "reward": 1.6979167461395264, + "reward_std": 0.7765633463859558, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1979166716337204, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.30190369486808777, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2005.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 846.875, + "completions/mean_terminated_length": 846.875, + "completions/min_length": 380.0, + "completions/min_terminated_length": 380.0, + "entropy": 0.1927033788524568, + "epoch": 0.00746775288526816, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0703125, + "learning_rate": 1.3513513513513515e-06, + "loss": -0.0, + "num_tokens": 86140.0, + "reward": 0.6607142686843872, + "reward_std": 0.3142625093460083, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6607142686843872, + "rewards/fixed_code_pass_all_test_reward/std": 0.3142625391483307, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 798.0, + "completions/max_terminated_length": 798.0, + "completions/mean_length": 588.25, + "completions/mean_terminated_length": 588.25, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "entropy": 0.45569038949906826, + "epoch": 0.008146639511201629, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "learning_rate": 1.4864864864864868e-06, + "loss": -0.0, + "num_tokens": 94262.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 573.0, + "completions/max_terminated_length": 573.0, + "completions/mean_length": 345.125, + "completions/mean_terminated_length": 345.125, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "entropy": 0.425748098641634, + "epoch": 0.008825526137135099, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3125, + "learning_rate": 1.6216216216216219e-06, + "loss": 0.0, + "num_tokens": 100047.0, + "reward": 1.25, + "reward_std": 1.0350983142852783, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.5175492167472839, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.0, + "completions/max_terminated_length": 822.0, + "completions/mean_length": 365.0, + "completions/mean_terminated_length": 365.0, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "entropy": 0.3499553017318249, + "epoch": 0.009504412763068567, + "frac_reward_zero_std": 0.0, + "grad_norm": 13.75, + "learning_rate": 1.756756756756757e-06, + "loss": -0.0, + "num_tokens": 106583.0, + "reward": 0.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.5175492167472839, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1118.0, + "completions/max_terminated_length": 1118.0, + "completions/mean_length": 841.625, + "completions/mean_terminated_length": 841.625, + "completions/min_length": 654.0, + "completions/min_terminated_length": 654.0, + "entropy": 0.34041350334882736, + "epoch": 0.010183299389002037, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "learning_rate": 1.8918918918918922e-06, + "loss": 0.0, + "num_tokens": 116532.0, + "reward": 0.559374988079071, + "reward_std": 0.8740236759185791, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4629100561141968, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05937499925494194, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.13224539160728455, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 747.0, + "completions/max_terminated_length": 747.0, + "completions/mean_length": 422.25, + "completions/mean_terminated_length": 422.25, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "entropy": 0.34066037461161613, + "epoch": 0.010862186014935505, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.328125, + "learning_rate": 2.0270270270270273e-06, + "loss": 0.0, + "num_tokens": 123006.0, + "reward": 0.5, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.5175492167472839, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 877.0, + "completions/max_terminated_length": 877.0, + "completions/mean_length": 423.0, + "completions/mean_terminated_length": 423.0, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "entropy": 0.41253336891531944, + "epoch": 0.011541072640868975, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "learning_rate": 2.1621621621621623e-06, + "loss": 0.0, + "num_tokens": 129830.0, + "reward": 1.03125, + "reward_std": 0.60411536693573, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 747.0, + "completions/max_terminated_length": 747.0, + "completions/mean_length": 527.625, + "completions/mean_terminated_length": 527.625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5557551775127649, + "epoch": 0.012219959266802444, + "frac_reward_zero_std": 0.0, + "grad_norm": 31.5, + "learning_rate": 2.297297297297298e-06, + "loss": -0.0, + "num_tokens": 137259.0, + "reward": 0.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1220.0, + "completions/max_terminated_length": 1220.0, + "completions/mean_length": 787.625, + "completions/mean_terminated_length": 787.625, + "completions/min_length": 572.0, + "completions/min_terminated_length": 572.0, + "entropy": 0.4465682953596115, + "epoch": 0.012898845892735914, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640625, + "learning_rate": 2.432432432432433e-06, + "loss": -0.0, + "num_tokens": 149472.0, + "reward": 0.6120129823684692, + "reward_std": 0.5126845836639404, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3214285969734192, + "rewards/fixed_code_pass_all_test_reward/std": 0.21257825195789337, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4629100561141968, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.040584415197372437, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0758657306432724, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 715.0, + "completions/max_terminated_length": 715.0, + "completions/mean_length": 504.125, + "completions/mean_terminated_length": 504.125, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "entropy": 0.4042566269636154, + "epoch": 0.013577732518669382, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "learning_rate": 2.5675675675675675e-06, + "loss": -0.0, + "num_tokens": 156881.0, + "reward": 1.379166603088379, + "reward_std": 0.6281586289405823, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.117851123213768, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.5175492167472839, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.04583333432674408, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.08533315360546112, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 738.0, + "completions/max_terminated_length": 738.0, + "completions/mean_length": 420.125, + "completions/mean_terminated_length": 420.125, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "entropy": 0.19113011565059423, + "epoch": 0.014256619144602852, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1328125, + "learning_rate": 2.702702702702703e-06, + "loss": -0.0, + "num_tokens": 164658.0, + "reward": 1.4261903762817383, + "reward_std": 0.8017059564590454, + "rewards/fixed_code_pass_all_test_reward/mean": 0.824999988079071, + "rewards/fixed_code_pass_all_test_reward/std": 0.36154431104660034, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1011904776096344, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.11418647319078445, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 793.0, + "completions/max_terminated_length": 793.0, + "completions/mean_length": 551.625, + "completions/mean_terminated_length": 551.625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.4456657748669386, + "epoch": 0.01493550577053632, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8671875, + "learning_rate": 2.837837837837838e-06, + "loss": -0.0, + "num_tokens": 172447.0, + "reward": 0.5625, + "reward_std": 0.6232117414474487, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 994.0, + "completions/max_terminated_length": 994.0, + "completions/mean_length": 709.0, + "completions/mean_terminated_length": 709.0, + "completions/min_length": 387.0, + "completions/min_terminated_length": 387.0, + "entropy": 0.3877583211287856, + "epoch": 0.015614392396469789, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "learning_rate": 2.9729729729729736e-06, + "loss": 0.0, + "num_tokens": 183479.0, + "reward": 0.28125, + "reward_std": 0.5250425338745117, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4629100561141968, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1534.0, + "completions/max_terminated_length": 1534.0, + "completions/mean_length": 909.5, + "completions/mean_terminated_length": 909.5, + "completions/min_length": 602.0, + "completions/min_terminated_length": 602.0, + "entropy": 0.43758431635797024, + "epoch": 0.016293279022403257, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.1081081081081082e-06, + "loss": 0.0, + "num_tokens": 194547.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 976.0, + "completions/max_terminated_length": 976.0, + "completions/mean_length": 494.25, + "completions/mean_terminated_length": 494.25, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "entropy": 0.4515352062880993, + "epoch": 0.01697216564833673, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "learning_rate": 3.2432432432432437e-06, + "loss": -0.0, + "num_tokens": 201741.0, + "reward": 0.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4629100561141968, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 761.0, + "completions/max_terminated_length": 761.0, + "completions/mean_length": 449.625, + "completions/mean_terminated_length": 449.625, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "entropy": 0.2880000276491046, + "epoch": 0.017651052274270197, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "learning_rate": 3.3783783783783788e-06, + "loss": 0.0, + "num_tokens": 209866.0, + "reward": 1.4809027910232544, + "reward_std": 0.9505121111869812, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3399054706096649, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1684027761220932, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.257219135761261, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.0, + "completions/max_terminated_length": 613.0, + "completions/mean_length": 420.0, + "completions/mean_terminated_length": 420.0, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.6037653535604477, + "epoch": 0.018329938900203666, + "frac_reward_zero_std": 0.0, + "grad_norm": 139.0, + "learning_rate": 3.513513513513514e-06, + "loss": 0.0, + "num_tokens": 216490.0, + "reward": 1.0833333730697632, + "reward_std": 0.7292092442512512, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0833333358168602, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2357022911310196, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1024.0, + "completions/mean_length": 775.875, + "completions/mean_terminated_length": 775.875, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "entropy": 0.4370947852730751, + "epoch": 0.019008825526137134, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.53125, + "learning_rate": 3.648648648648649e-06, + "loss": 0.0, + "num_tokens": 226937.0, + "reward": 0.535714328289032, + "reward_std": 0.7071068286895752, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2857142984867096, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 767.0, + "completions/max_terminated_length": 767.0, + "completions/mean_length": 530.75, + "completions/mean_terminated_length": 530.75, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "entropy": 0.444356651045382, + "epoch": 0.019687712152070606, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "learning_rate": 3.7837837837837844e-06, + "loss": 0.0, + "num_tokens": 234879.0, + "reward": 1.2026515007019043, + "reward_std": 0.9371897578239441, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.07765151560306549, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.16307373344898224, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 796.0, + "completions/max_terminated_length": 796.0, + "completions/mean_length": 385.375, + "completions/mean_terminated_length": 385.375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.48405807465314865, + "epoch": 0.020366598778004074, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.625, + "learning_rate": 3.918918918918919e-06, + "loss": -0.0, + "num_tokens": 241018.0, + "reward": 0.78125, + "reward_std": 0.4898523688316345, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.5175492167472839, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 373.125, + "completions/mean_terminated_length": 373.125, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "entropy": 0.46748646907508373, + "epoch": 0.021045485403937542, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.375, + "learning_rate": 4.0540540540540545e-06, + "loss": -0.0, + "num_tokens": 247091.0, + "reward": 1.1875, + "reward_std": 0.752970278263092, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4629100561141968, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1182.0, + "completions/max_terminated_length": 1182.0, + "completions/mean_length": 962.625, + "completions/mean_terminated_length": 962.625, + "completions/min_length": 843.0, + "completions/min_terminated_length": 843.0, + "entropy": 0.41946573927998543, + "epoch": 0.02172437202987101, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "learning_rate": 4.189189189189189e-06, + "loss": 0.0, + "num_tokens": 261432.0, + "reward": 1.0071429014205933, + "reward_std": 0.6755626201629639, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7142857313156128, + "rewards/fixed_code_pass_all_test_reward/std": 0.33284708857536316, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4629100561141968, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.04285714402794838, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.08081220835447311, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 716.0, + "completions/max_terminated_length": 716.0, + "completions/mean_length": 592.75, + "completions/mean_terminated_length": 592.75, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "entropy": 0.6089797131717205, + "epoch": 0.02240325865580448, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "learning_rate": 4.324324324324325e-06, + "loss": -0.0, + "num_tokens": 269414.0, + "reward": 0.78125, + "reward_std": 0.4898523688316345, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 897.0, + "completions/max_terminated_length": 897.0, + "completions/mean_length": 682.875, + "completions/mean_terminated_length": 682.875, + "completions/min_length": 428.0, + "completions/min_terminated_length": 428.0, + "entropy": 0.5007057599723339, + "epoch": 0.02308214528173795, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "learning_rate": 4.45945945945946e-06, + "loss": 0.0, + "num_tokens": 278229.0, + "reward": 0.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.5175492167472839, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.0, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 357.5, + "completions/mean_terminated_length": 357.5, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.3946137595921755, + "epoch": 0.02376103190767142, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.609375, + "learning_rate": 4.594594594594596e-06, + "loss": -0.0, + "num_tokens": 284113.0, + "reward": 0.875, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1033.0, + "completions/max_terminated_length": 1033.0, + "completions/mean_length": 671.125, + "completions/mean_terminated_length": 671.125, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "entropy": 0.41237878799438477, + "epoch": 0.024439918533604887, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "learning_rate": 4.72972972972973e-06, + "loss": -0.0, + "num_tokens": 293930.0, + "reward": 1.0499999523162842, + "reward_std": 0.4750939905643463, + "rewards/fixed_code_pass_all_test_reward/mean": 0.675000011920929, + "rewards/fixed_code_pass_all_test_reward/std": 0.36936238408088684, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.5175492167472839, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 763.0, + "completions/max_terminated_length": 763.0, + "completions/mean_length": 564.375, + "completions/mean_terminated_length": 564.375, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "entropy": 0.42290690913796425, + "epoch": 0.025118805159538356, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "learning_rate": 4.864864864864866e-06, + "loss": 0.0, + "num_tokens": 301701.0, + "reward": 1.1916667222976685, + "reward_std": 0.7487027049064636, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4629100561141968, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.06666667014360428, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.12848322093486786, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 644.0, + "completions/max_terminated_length": 644.0, + "completions/mean_length": 319.125, + "completions/mean_terminated_length": 319.125, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.41047218441963196, + "epoch": 0.025797691785471828, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "learning_rate": 5e-06, + "loss": -0.0, + "num_tokens": 307246.0, + "reward": 0.875, + "reward_std": 0.8345229625701904, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 795.0, + "completions/max_terminated_length": 795.0, + "completions/mean_length": 417.25, + "completions/mean_terminated_length": 417.25, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "entropy": 0.322323614731431, + "epoch": 0.026476578411405296, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "learning_rate": 5.135135135135135e-06, + "loss": -0.0, + "num_tokens": 313680.0, + "reward": 1.5, + "reward_std": 0.9258201122283936, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 861.0, + "completions/max_terminated_length": 861.0, + "completions/mean_length": 599.5, + "completions/mean_terminated_length": 599.5, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.5022741565480828, + "epoch": 0.027155465037338764, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8359375, + "learning_rate": 5.2702702702702705e-06, + "loss": 0.0, + "num_tokens": 321756.0, + "reward": 0.5, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 631.0, + "completions/max_terminated_length": 631.0, + "completions/mean_length": 332.75, + "completions/mean_terminated_length": 332.75, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.31443312019109726, + "epoch": 0.027834351663272233, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8671875, + "learning_rate": 5.405405405405406e-06, + "loss": 0.0, + "num_tokens": 327602.0, + "reward": 0.78125, + "reward_std": 0.7727212905883789, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 889.0, + "completions/max_terminated_length": 889.0, + "completions/mean_length": 698.5, + "completions/mean_terminated_length": 698.5, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "entropy": 0.4371532369405031, + "epoch": 0.028513238289205704, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "learning_rate": 5.540540540540541e-06, + "loss": 0.0, + "num_tokens": 336894.0, + "reward": 0.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 718.0, + "completions/max_terminated_length": 718.0, + "completions/mean_length": 384.375, + "completions/mean_terminated_length": 384.375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "entropy": 0.36207926645874977, + "epoch": 0.029192124915139173, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.21875, + "learning_rate": 5.675675675675676e-06, + "loss": -0.0, + "num_tokens": 344273.0, + "reward": 1.0809524059295654, + "reward_std": 0.6699240803718567, + "rewards/fixed_code_pass_all_test_reward/mean": 0.32499998807907104, + "rewards/fixed_code_pass_all_test_reward/std": 0.4652188718318939, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.130952388048172, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1813279390335083, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 728.0, + "completions/max_terminated_length": 728.0, + "completions/mean_length": 552.25, + "completions/mean_terminated_length": 552.25, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "entropy": 0.3683111499994993, + "epoch": 0.02987101154107264, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "learning_rate": 5.810810810810811e-06, + "loss": -0.0, + "num_tokens": 352795.0, + "reward": 1.359375, + "reward_std": 0.4974825084209442, + "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, + "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.5175492167472839, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.015625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.04419417306780815, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 588.0, + "completions/max_terminated_length": 588.0, + "completions/mean_length": 298.375, + "completions/mean_terminated_length": 298.375, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.42081066966056824, + "epoch": 0.03054989816700611, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.703125, + "learning_rate": 5.945945945945947e-06, + "loss": -0.0, + "num_tokens": 358422.0, + "reward": 0.675000011920929, + "reward_std": 0.8137215971946716, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.5175492167472839, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05000000074505806, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09258200973272324, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 873.0, + "completions/max_terminated_length": 873.0, + "completions/mean_length": 590.375, + "completions/mean_terminated_length": 590.375, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "entropy": 0.3734747637063265, + "epoch": 0.031228784792939578, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "learning_rate": 6.081081081081082e-06, + "loss": -0.0, + "num_tokens": 366513.0, + "reward": 1.056249976158142, + "reward_std": 0.606769859790802, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4629100561141968, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05624999850988388, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10500850528478622, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1362.0, + "completions/max_terminated_length": 1362.0, + "completions/mean_length": 758.625, + "completions/mean_terminated_length": 758.625, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "entropy": 0.2996700187213719, + "epoch": 0.031907671418873046, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.03125, + "learning_rate": 6.2162162162162164e-06, + "loss": 0.0, + "num_tokens": 377510.0, + "reward": 0.8854166865348816, + "reward_std": 0.7694333791732788, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, + "rewards/fixed_code_pass_all_test_reward/std": 0.47715675830841064, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.5175492167472839, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0729166716337204, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.10803020745515823, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 785.0, + "completions/max_terminated_length": 785.0, + "completions/mean_length": 481.625, + "completions/mean_terminated_length": 481.625, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.43572198040783405, + "epoch": 0.032586558044806514, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8671875, + "learning_rate": 6.351351351351351e-06, + "loss": 0.0, + "num_tokens": 384859.0, + "reward": 0.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 866.0, + "completions/max_terminated_length": 866.0, + "completions/mean_length": 508.5, + "completions/mean_terminated_length": 508.5, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 0.18101190589368343, + "epoch": 0.03326544467073999, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.109375, + "learning_rate": 6.486486486486487e-06, + "loss": -0.0, + "num_tokens": 394199.0, + "reward": 1.1744047403335571, + "reward_std": 0.5458797216415405, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.29940474033355713, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.29570335149765015, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 191.75, + "completions/mean_terminated_length": 191.75, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.24147845432162285, + "epoch": 0.03394433129667346, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.265625, + "learning_rate": 6.621621621621622e-06, + "loss": 0.0, + "num_tokens": 398901.0, + "reward": 1.3416666984558105, + "reward_std": 1.0042370557785034, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.21666666865348816, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.20158106088638306, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.0, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 390.25, + "completions/mean_terminated_length": 390.25, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "entropy": 0.332314838655293, + "epoch": 0.034623217922606926, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "learning_rate": 6.7567567567567575e-06, + "loss": -0.0, + "num_tokens": 406047.0, + "reward": 1.066666603088379, + "reward_std": 0.49022185802459717, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666269302368, + "rewards/fixed_code_pass_all_test_reward/std": 0.17817415297031403, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 635.0, + "completions/max_terminated_length": 635.0, + "completions/mean_length": 358.25, + "completions/mean_terminated_length": 358.25, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "entropy": 0.24127027858048677, + "epoch": 0.035302104548540394, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "learning_rate": 6.891891891891892e-06, + "loss": -0.0, + "num_tokens": 412489.0, + "reward": 1.4166666269302368, + "reward_std": 0.7292091250419617, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7708333134651184, + "rewards/fixed_code_pass_all_test_reward/std": 0.3204349875450134, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02083333395421505, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0589255727827549, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 561.0, + "completions/max_terminated_length": 561.0, + "completions/mean_length": 363.75, + "completions/mean_terminated_length": 363.75, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "entropy": 0.18655581679195166, + "epoch": 0.03598099117447386, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "learning_rate": 7.027027027027028e-06, + "loss": -0.0, + "num_tokens": 420511.0, + "reward": 1.4736607074737549, + "reward_std": 0.9764032959938049, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5892857313156128, + "rewards/fixed_code_pass_all_test_reward/std": 0.43490222096443176, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.13437500596046448, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.20219223201274872, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1029.0, + "completions/max_terminated_length": 1029.0, + "completions/mean_length": 466.125, + "completions/mean_terminated_length": 466.125, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "entropy": 0.2667345628142357, + "epoch": 0.03665987780040733, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "learning_rate": 7.162162162162163e-06, + "loss": -0.0, + "num_tokens": 427328.0, + "reward": 1.910416603088379, + "reward_std": 0.5594737529754639, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.16041666269302368, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.175693079829216, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 942.0, + "completions/max_terminated_length": 942.0, + "completions/mean_length": 381.25, + "completions/mean_terminated_length": 381.25, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "entropy": 0.27860070299357176, + "epoch": 0.0373387644263408, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "learning_rate": 7.297297297297298e-06, + "loss": -0.0, + "num_tokens": 433586.0, + "reward": 1.4723213911056519, + "reward_std": 1.0072075128555298, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.22232142090797424, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.17692866921424866, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 877.0, + "completions/max_terminated_length": 877.0, + "completions/mean_length": 519.0, + "completions/mean_terminated_length": 519.0, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "entropy": 0.4064774829894304, + "epoch": 0.03801765105227427, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3125, + "learning_rate": 7.4324324324324324e-06, + "loss": 0.0, + "num_tokens": 440834.0, + "reward": 0.47142860293388367, + "reward_std": 0.655632734298706, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.5175492167472839, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.09642857313156128, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.15569837391376495, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1031.0, + "completions/max_terminated_length": 1031.0, + "completions/mean_length": 474.75, + "completions/mean_terminated_length": 474.75, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.2492697569541633, + "epoch": 0.038696537678207736, + "frac_reward_zero_std": 0.0, + "grad_norm": 158.0, + "learning_rate": 7.567567567567569e-06, + "loss": -0.0, + "num_tokens": 450936.0, + "reward": 1.7020833492279053, + "reward_std": 0.8942691683769226, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7250000238418579, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.22708332538604736, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2007797509431839, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 282.75, + "completions/mean_terminated_length": 282.75, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "entropy": 0.2521714773029089, + "epoch": 0.03937542430414121, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4375, + "learning_rate": 7.702702702702704e-06, + "loss": 0.0, + "num_tokens": 456534.0, + "reward": 2.258333444595337, + "reward_std": 1.1282448768615723, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6333333253860474, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4201284646987915, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 140.0, + "completions/max_terminated_length": 140.0, + "completions/mean_length": 105.625, + "completions/mean_terminated_length": 105.625, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.08102542627602816, + "epoch": 0.04005431093007468, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.625, + "learning_rate": 7.837837837837838e-06, + "loss": -0.0, + "num_tokens": 460739.0, + "reward": 2.4375, + "reward_std": 0.1767766922712326, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4375, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 271.0, + "completions/mean_terminated_length": 271.0, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "entropy": 0.13168837316334248, + "epoch": 0.04073319755600815, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "learning_rate": 7.972972972972974e-06, + "loss": -0.0, + "num_tokens": 466835.0, + "reward": 0.8357142806053162, + "reward_std": 0.6430158615112305, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4000000059604645, + "rewards/fixed_code_pass_all_test_reward/std": 0.21380901336669922, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.5175492167472839, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.06071428954601288, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.11473128199577332, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 357.5, + "completions/mean_terminated_length": 357.5, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.17302474100142717, + "epoch": 0.041412084181941616, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "learning_rate": 8.108108108108109e-06, + "loss": -0.0, + "num_tokens": 474039.0, + "reward": 0.8999999761581421, + "reward_std": 0.37032803893089294, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 283.25, + "completions/mean_terminated_length": 283.25, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.09238436678424478, + "epoch": 0.042090970807875085, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "learning_rate": 8.243243243243245e-06, + "loss": -0.0, + "num_tokens": 480905.0, + "reward": 1.488541603088379, + "reward_std": 0.7430623173713684, + "rewards/fixed_code_pass_all_test_reward/mean": 0.484375, + "rewards/fixed_code_pass_all_test_reward/std": 0.2259652018547058, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2541666626930237, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.19755448400974274, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 372.875, + "completions/mean_terminated_length": 372.875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.06192499818280339, + "epoch": 0.04276985743380855, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "learning_rate": 8.378378378378378e-06, + "loss": 0.0, + "num_tokens": 489120.0, + "reward": 1.954545497894287, + "reward_std": 0.04859290271997452, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9545454978942871, + "rewards/fixed_code_pass_all_test_reward/std": 0.0485929399728775, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 199.625, + "completions/mean_terminated_length": 199.625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.15581836737692356, + "epoch": 0.04344874405974202, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4375, + "learning_rate": 8.513513513513514e-06, + "loss": 0.0, + "num_tokens": 493837.0, + "reward": 2.214583396911621, + "reward_std": 0.654498279094696, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4645833671092987, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2778085470199585, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 713.0, + "completions/max_terminated_length": 713.0, + "completions/mean_length": 496.5, + "completions/mean_terminated_length": 496.5, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "entropy": 0.23688332177698612, + "epoch": 0.04412763068567549, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "learning_rate": 8.64864864864865e-06, + "loss": 0.0, + "num_tokens": 502857.0, + "reward": 1.0428571701049805, + "reward_std": 0.0808122381567955, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.04285714402794838, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.08081220835447311, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 323.25, + "completions/mean_terminated_length": 323.25, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "entropy": 0.15722941514104605, + "epoch": 0.04480651731160896, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453125, + "learning_rate": 8.783783783783785e-06, + "loss": 0.0, + "num_tokens": 509059.0, + "reward": 0.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 202.0, + "completions/max_terminated_length": 202.0, + "completions/mean_length": 148.125, + "completions/mean_terminated_length": 148.125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.15436407551169395, + "epoch": 0.04548540393754243, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.625, + "learning_rate": 8.91891891891892e-06, + "loss": 0.0, + "num_tokens": 513500.0, + "reward": 2.2875001430511475, + "reward_std": 0.5453810095787048, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4125000238418579, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.23363077640533447, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 202.0, + "completions/max_terminated_length": 202.0, + "completions/mean_length": 125.5, + "completions/mean_terminated_length": 125.5, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.11615594383329153, + "epoch": 0.0461642905634759, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 9.054054054054054e-06, + "loss": 0.0, + "num_tokens": 517608.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 739.0, + "completions/max_terminated_length": 739.0, + "completions/mean_length": 469.25, + "completions/mean_terminated_length": 469.25, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "entropy": 0.13107811007648706, + "epoch": 0.04684317718940937, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0, + "learning_rate": 9.189189189189191e-06, + "loss": 0.0, + "num_tokens": 526130.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 214.0, + "completions/max_terminated_length": 214.0, + "completions/mean_length": 142.0, + "completions/mean_terminated_length": 142.0, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.15522114373743534, + "epoch": 0.04752206381534284, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5625, + "learning_rate": 9.324324324324325e-06, + "loss": 0.0, + "num_tokens": 530474.0, + "reward": 1.5, + "reward_std": 0.4960158169269562, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.19416078925132751, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 134.5, + "completions/mean_terminated_length": 134.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "entropy": 0.16009862814098597, + "epoch": 0.048200950441276307, + "frac_reward_zero_std": 0.0, + "grad_norm": 37.0, + "learning_rate": 9.45945945945946e-06, + "loss": -0.0, + "num_tokens": 534606.0, + "reward": 1.837499976158142, + "reward_std": 0.763333797454834, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.08749999850988388, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.18077215552330017, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 253.5, + "completions/mean_terminated_length": 253.5, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "entropy": 0.13566663209348917, + "epoch": 0.048879837067209775, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "learning_rate": 9.594594594594594e-06, + "loss": -0.0, + "num_tokens": 540146.0, + "reward": 1.4553570747375488, + "reward_std": 0.629321813583374, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.5175492167472839, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0803571417927742, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1157275140285492, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 161.125, + "completions/mean_terminated_length": 161.125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.21239402517676353, + "epoch": 0.04955872369314324, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.84375, + "learning_rate": 9.729729729729732e-06, + "loss": -0.0, + "num_tokens": 544683.0, + "reward": 1.4562499523162842, + "reward_std": 0.6668415069580078, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.20625001192092896, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.25006943941116333, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 183.375, + "completions/mean_terminated_length": 183.375, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.08463525306433439, + "epoch": 0.05023761031907671, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "learning_rate": 9.864864864864865e-06, + "loss": 0.0, + "num_tokens": 549878.0, + "reward": 2.0833334922790527, + "reward_std": 0.46929535269737244, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.2083333432674408, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.18898224830627441, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 237.0, + "completions/max_terminated_length": 237.0, + "completions/mean_length": 165.125, + "completions/mean_terminated_length": 165.125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.1347042741253972, + "epoch": 0.05091649694501019, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.75, + "learning_rate": 1e-05, + "loss": 0.0, + "num_tokens": 554399.0, + "reward": 2.0625, + "reward_std": 0.1767766922712326, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 222.5, + "completions/mean_terminated_length": 222.5, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "entropy": 0.10731179267168045, + "epoch": 0.051595383570943655, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.671875, + "learning_rate": 1.0135135135135136e-05, + "loss": -0.0, + "num_tokens": 559603.0, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 333.125, + "completions/mean_terminated_length": 333.125, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.13763546478003263, + "epoch": 0.05227427019687712, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "learning_rate": 1.027027027027027e-05, + "loss": -0.0, + "num_tokens": 566868.0, + "reward": 2.0958333015441895, + "reward_std": 0.36250340938568115, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.4225771427154541, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5958333611488342, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.12141691148281097, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 153.125, + "completions/mean_terminated_length": 153.125, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.12349202670156956, + "epoch": 0.05295315682281059, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.890625, + "learning_rate": 1.0405405405405407e-05, + "loss": -0.0, + "num_tokens": 571157.0, + "reward": 1.25, + "reward_std": 0.8864052295684814, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 209.0, + "completions/max_terminated_length": 209.0, + "completions/mean_length": 161.0, + "completions/mean_terminated_length": 161.0, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.13015724625438452, + "epoch": 0.05363204344874406, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.0540540540540541e-05, + "loss": 0.0, + "num_tokens": 575677.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 196.375, + "completions/mean_terminated_length": 196.375, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.11832852475345135, + "epoch": 0.05431093007467753, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "learning_rate": 1.0675675675675677e-05, + "loss": 0.0, + "num_tokens": 580384.0, + "reward": 1.212499976158142, + "reward_std": 0.760521411895752, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.4625000059604645, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3159452974796295, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 244.625, + "completions/mean_terminated_length": 244.625, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "entropy": 0.10070886136963964, + "epoch": 0.054989816700611, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "learning_rate": 1.0810810810810812e-05, + "loss": 0.0, + "num_tokens": 585981.0, + "reward": 0.9375, + "reward_std": 0.6662945747375488, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.12400397658348083, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 322.75, + "completions/mean_terminated_length": 322.75, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "entropy": 0.12286796048283577, + "epoch": 0.055668703326544465, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.0945945945945946e-05, + "loss": 0.0, + "num_tokens": 592643.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 177.75, + "completions/mean_terminated_length": 177.75, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.1339035602286458, + "epoch": 0.05634758995247793, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.90625, + "learning_rate": 1.1081081081081081e-05, + "loss": -0.0, + "num_tokens": 597249.0, + "reward": 1.693750023841858, + "reward_std": 1.0695621967315674, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3187499940395355, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.36747071146965027, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 365.0, + "completions/mean_terminated_length": 365.0, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "entropy": 0.13325223326683044, + "epoch": 0.05702647657841141, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.1216216216216219e-05, + "loss": 0.0, + "num_tokens": 604713.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 633.0, + "completions/max_terminated_length": 633.0, + "completions/mean_length": 428.125, + "completions/mean_terminated_length": 428.125, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "entropy": 0.07309685787186027, + "epoch": 0.05770536320434488, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "learning_rate": 1.1351351351351352e-05, + "loss": -0.0, + "num_tokens": 614834.0, + "reward": 1.8297618627548218, + "reward_std": 0.7953521013259888, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8214285969734192, + "rewards/fixed_code_pass_all_test_reward/std": 0.22587695717811584, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25833332538604736, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.17861904203891754, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 219.125, + "completions/mean_terminated_length": 219.125, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "entropy": 0.053572315722703934, + "epoch": 0.058384249830278345, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "learning_rate": 1.1486486486486488e-05, + "loss": 0.0, + "num_tokens": 621619.0, + "reward": 2.909090995788574, + "reward_std": 0.2571297883987427, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9090908765792847, + "rewards/fixed_code_pass_all_test_reward/std": 0.2571297585964203, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 151.25, + "completions/mean_terminated_length": 151.25, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.23708575032651424, + "epoch": 0.059063136456211814, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.984375, + "learning_rate": 1.1621621621621622e-05, + "loss": 0.0, + "num_tokens": 625925.0, + "reward": 0.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 341.125, + "completions/mean_terminated_length": 341.125, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.146900518797338, + "epoch": 0.05974202308214528, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "learning_rate": 1.1756756756756757e-05, + "loss": 0.0, + "num_tokens": 632598.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 197.625, + "completions/mean_terminated_length": 197.625, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.27483534440398216, + "epoch": 0.06042090970807875, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.78125, + "learning_rate": 1.1891891891891894e-05, + "loss": -0.0, + "num_tokens": 637435.0, + "reward": 1.7999999523162842, + "reward_std": 0.6553807258605957, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.17500001192092896, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.24348658323287964, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 268.25, + "completions/mean_terminated_length": 268.25, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 0.05631826026365161, + "epoch": 0.06109979633401222, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1953125, + "learning_rate": 1.2027027027027028e-05, + "loss": -0.0, + "num_tokens": 644637.0, + "reward": 2.5568182468414307, + "reward_std": 0.21115268766880035, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9318181872367859, + "rewards/fixed_code_pass_all_test_reward/std": 0.19284729659557343, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1178511381149292, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 219.0, + "completions/max_terminated_length": 219.0, + "completions/mean_length": 167.75, + "completions/mean_terminated_length": 167.75, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.11315709352493286, + "epoch": 0.06177868295994569, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.796875, + "learning_rate": 1.2162162162162164e-05, + "loss": -0.0, + "num_tokens": 649251.0, + "reward": 0.8999999761581421, + "reward_std": 0.37032803893089294, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 245.5, + "completions/mean_terminated_length": 245.5, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "entropy": 0.09401944745332003, + "epoch": 0.062457569585879155, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "learning_rate": 1.2297297297297299e-05, + "loss": -0.0, + "num_tokens": 656479.0, + "reward": 1.3214285373687744, + "reward_std": 0.44361361861228943, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3214285671710968, + "rewards/fixed_code_pass_all_test_reward/std": 0.44361358880996704, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 303.25, + "completions/mean_terminated_length": 303.25, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "entropy": 0.0514632654376328, + "epoch": 0.06313645621181263, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "learning_rate": 1.2432432432432433e-05, + "loss": 0.0, + "num_tokens": 663305.0, + "reward": 2.261805534362793, + "reward_std": 0.15215781331062317, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.26180553436279297, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.15215782821178436, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 183.625, + "completions/mean_terminated_length": 183.625, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.14608134049922228, + "epoch": 0.06381534283774609, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.2567567567567568e-05, + "loss": 0.0, + "num_tokens": 668590.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 218.875, + "completions/mean_terminated_length": 218.875, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "entropy": 0.097068106289953, + "epoch": 0.06449422946367957, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "learning_rate": 1.2702702702702702e-05, + "loss": 0.0, + "num_tokens": 675053.0, + "reward": 2.0875000953674316, + "reward_std": 0.31038394570350647, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.24306972324848175, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3375000059604645, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.12368355691432953, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 179.0, + "completions/max_terminated_length": 179.0, + "completions/mean_length": 143.125, + "completions/mean_terminated_length": 143.125, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.07860143668949604, + "epoch": 0.06517311608961303, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.265625, + "learning_rate": 1.283783783783784e-05, + "loss": -0.0, + "num_tokens": 679438.0, + "reward": 2.0333333015441895, + "reward_std": 0.5294501185417175, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.28333336114883423, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3075762987136841, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 271.625, + "completions/mean_terminated_length": 271.625, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "entropy": 0.062403345480561256, + "epoch": 0.0658520027155465, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "learning_rate": 1.2972972972972975e-05, + "loss": 0.0, + "num_tokens": 685547.0, + "reward": 2.9000000953674316, + "reward_std": 0.15118584036827087, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.8999999761581421, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1511857807636261, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 317.375, + "completions/mean_terminated_length": 317.375, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "entropy": 0.11283600656315684, + "epoch": 0.06653088934147998, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "learning_rate": 1.3108108108108109e-05, + "loss": -0.0, + "num_tokens": 693230.0, + "reward": 1.5511904954910278, + "reward_std": 0.6588498950004578, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4642857313156128, + "rewards/fixed_code_pass_all_test_reward/std": 0.1428571492433548, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3369047939777374, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.24302050471305847, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 298.375, + "completions/mean_terminated_length": 298.375, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "entropy": 0.14692620281130075, + "epoch": 0.06720977596741344, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "learning_rate": 1.3243243243243244e-05, + "loss": 0.0, + "num_tokens": 698977.0, + "reward": 1.765625, + "reward_std": 0.4745180904865265, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.015625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.04419417306780815, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 207.5, + "completions/mean_terminated_length": 207.5, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "entropy": 0.1271934425458312, + "epoch": 0.06788866259334692, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.828125, + "learning_rate": 1.3378378378378381e-05, + "loss": -0.0, + "num_tokens": 703989.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 212.625, + "completions/mean_terminated_length": 212.625, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.09632621146738529, + "epoch": 0.06856754921928038, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "learning_rate": 1.3513513513513515e-05, + "loss": -0.0, + "num_tokens": 708986.0, + "reward": 1.5499999523162842, + "reward_std": 0.4869731366634369, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.05000000074505806, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09258200973272324, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 225.0, + "completions/max_terminated_length": 225.0, + "completions/mean_length": 178.375, + "completions/mean_terminated_length": 178.375, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.11827277857810259, + "epoch": 0.06924643584521385, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.921875, + "learning_rate": 1.364864864864865e-05, + "loss": 0.0, + "num_tokens": 713597.0, + "reward": 1.6041667461395264, + "reward_std": 0.6542748212814331, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1041666716337204, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.15268756449222565, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 191.5, + "completions/mean_terminated_length": 191.5, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.1445675678551197, + "epoch": 0.06992532247114731, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.3783783783783784e-05, + "loss": 0.0, + "num_tokens": 718809.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 597.0, + "completions/max_terminated_length": 597.0, + "completions/mean_length": 268.5, + "completions/mean_terminated_length": 268.5, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "entropy": 0.15129757719114423, + "epoch": 0.07060420909708079, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "learning_rate": 1.391891891891892e-05, + "loss": 0.0, + "num_tokens": 725501.0, + "reward": 1.5, + "reward_std": 0.3598114550113678, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.06681530922651291, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 158.0, + "completions/max_terminated_length": 158.0, + "completions/mean_length": 130.125, + "completions/mean_terminated_length": 130.125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.0953492745757103, + "epoch": 0.07128309572301425, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.03125, + "learning_rate": 1.4054054054054055e-05, + "loss": 0.0, + "num_tokens": 729806.0, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9375, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 207.75, + "completions/mean_terminated_length": 207.75, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.09533433010801673, + "epoch": 0.07196198234894773, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.765625, + "learning_rate": 1.4189189189189189e-05, + "loss": -0.0, + "num_tokens": 735012.0, + "reward": 2.625, + "reward_std": 0.4940117299556732, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2182178944349289, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 186.0, + "completions/mean_terminated_length": 186.0, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.092557767406106, + "epoch": 0.0726408689748812, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453125, + "learning_rate": 1.4324324324324326e-05, + "loss": 0.0, + "num_tokens": 740492.0, + "reward": 2.107142925262451, + "reward_std": 0.5918954014778137, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8571428060531616, + "rewards/fixed_code_pass_all_test_reward/std": 0.2020305097103119, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.17251639068126678, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 196.25, + "completions/mean_terminated_length": 196.25, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.2892663460224867, + "epoch": 0.07331975560081466, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.765625, + "learning_rate": 1.4459459459459462e-05, + "loss": 0.0, + "num_tokens": 745470.0, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 211.125, + "completions/mean_terminated_length": 211.125, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.12156647210940719, + "epoch": 0.07399864222674814, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "learning_rate": 1.4594594594594596e-05, + "loss": 0.0, + "num_tokens": 751463.0, + "reward": 1.6375000476837158, + "reward_std": 0.19955304265022278, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5750000476837158, + "rewards/fixed_code_pass_all_test_reward/std": 0.070710688829422, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 226.25, + "completions/mean_terminated_length": 226.25, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.11055296938866377, + "epoch": 0.0746775288526816, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.546875, + "learning_rate": 1.4729729729729731e-05, + "loss": -0.0, + "num_tokens": 756897.0, + "reward": 1.524999976158142, + "reward_std": 0.7778174877166748, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 277.25, + "completions/mean_terminated_length": 277.25, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 0.038223384879529476, + "epoch": 0.07535641547861507, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.578125, + "learning_rate": 1.4864864864864865e-05, + "loss": 0.0, + "num_tokens": 763395.0, + "reward": 2.5416667461395264, + "reward_std": 0.19416078925132751, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5416666865348816, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.19416078925132751, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 209.0, + "completions/mean_terminated_length": 209.0, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "entropy": 0.1549054579809308, + "epoch": 0.07603530210454854, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.53125, + "learning_rate": 1.5000000000000002e-05, + "loss": -0.0, + "num_tokens": 769027.0, + "reward": 1.4583332538604736, + "reward_std": 0.8486683964729309, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8541666269302368, + "rewards/fixed_code_pass_all_test_reward/std": 0.3500283360481262, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1041666716337204, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.19795581698417664, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 170.125, + "completions/mean_terminated_length": 170.125, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.1332483682781458, + "epoch": 0.07671418873048201, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.59375, + "learning_rate": 1.5135135135135138e-05, + "loss": 0.0, + "num_tokens": 774076.0, + "reward": 1.4375, + "reward_std": 1.1160356998443604, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3720119297504425, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 137.25, + "completions/mean_terminated_length": 137.25, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.1638177689164877, + "epoch": 0.07739307535641547, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.875, + "learning_rate": 1.527027027027027e-05, + "loss": -0.0, + "num_tokens": 778198.0, + "reward": 2.049999952316284, + "reward_std": 0.9365590214729309, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.550000011920929, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.49856939911842346, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 182.0, + "completions/max_terminated_length": 182.0, + "completions/mean_length": 121.875, + "completions/mean_terminated_length": 121.875, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.219361437484622, + "epoch": 0.07807196198234895, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.40625, + "learning_rate": 1.540540540540541e-05, + "loss": 0.0, + "num_tokens": 782253.0, + "reward": 1.3854167461395264, + "reward_std": 0.42007532715797424, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.1354166716337204, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.19889327883720398, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 317.0, + "completions/mean_terminated_length": 317.0, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "entropy": 0.08035129262134433, + "epoch": 0.07875084860828242, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "learning_rate": 1.554054054054054e-05, + "loss": -0.0, + "num_tokens": 788909.0, + "reward": 2.2041666507720947, + "reward_std": 0.5428284406661987, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6499999761581421, + "rewards/fixed_code_pass_all_test_reward/std": 0.38172540068626404, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5541666746139526, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2598457932472229, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 746.0, + "completions/max_terminated_length": 746.0, + "completions/mean_length": 261.625, + "completions/mean_terminated_length": 261.625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.0929252477362752, + "epoch": 0.07942973523421588, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8984375, + "learning_rate": 1.5675675675675676e-05, + "loss": 0.0, + "num_tokens": 794186.0, + "reward": 1.0916666984558105, + "reward_std": 0.17066630721092224, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.09166666865348816, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.17066630721092224, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 607.0, + "completions/max_terminated_length": 607.0, + "completions/mean_length": 270.5, + "completions/mean_terminated_length": 270.5, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "entropy": 0.1819993769749999, + "epoch": 0.08010862186014936, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.65625, + "learning_rate": 1.581081081081081e-05, + "loss": -0.0, + "num_tokens": 799878.0, + "reward": 1.087499976158142, + "reward_std": 0.18077214062213898, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.08749999850988388, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.18077215552330017, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 304.5, + "completions/mean_terminated_length": 304.5, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "entropy": 0.058641504030674696, + "epoch": 0.08078750848608282, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "learning_rate": 1.5945945945945947e-05, + "loss": -0.0, + "num_tokens": 806706.0, + "reward": 2.5645833015441895, + "reward_std": 0.34161585569381714, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5645833015441895, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.34161585569381714, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 276.625, + "completions/mean_terminated_length": 276.625, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "entropy": 0.11894779466092587, + "epoch": 0.0814663951120163, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "learning_rate": 1.6081081081081083e-05, + "loss": 0.0, + "num_tokens": 812551.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 198.75, + "completions/mean_terminated_length": 198.75, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.07181581668555737, + "epoch": 0.08214528173794976, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.734375, + "learning_rate": 1.6216216216216218e-05, + "loss": -0.0, + "num_tokens": 817517.0, + "reward": 2.3249998092651367, + "reward_std": 0.30472469329833984, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.32500001788139343, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.30472469329833984, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 237.625, + "completions/mean_terminated_length": 237.625, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.13512822892516851, + "epoch": 0.08282416836388323, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "learning_rate": 1.6351351351351354e-05, + "loss": -0.0, + "num_tokens": 823210.0, + "reward": 1.0708333253860474, + "reward_std": 0.098299041390419, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.07083333283662796, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.09829902648925781, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 228.375, + "completions/mean_terminated_length": 228.375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.08480434771627188, + "epoch": 0.0835030549898167, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9140625, + "learning_rate": 1.648648648648649e-05, + "loss": 0.0, + "num_tokens": 828285.0, + "reward": 2.1750001907348633, + "reward_std": 0.594084620475769, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.800000011920929, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2916836738586426, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 341.0, + "completions/mean_terminated_length": 341.0, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "entropy": 0.12959323124960065, + "epoch": 0.08418194161575017, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.662162162162162e-05, + "loss": 0.0, + "num_tokens": 835541.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 252.625, + "completions/mean_terminated_length": 252.625, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.06366240326315165, + "epoch": 0.08486082824168364, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "learning_rate": 1.6756756756756757e-05, + "loss": -0.0, + "num_tokens": 841922.0, + "reward": 2.2395832538604736, + "reward_std": 0.1632840782403946, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3645833134651184, + "rewards/fixed_code_pass_all_test_reward/std": 0.1886538565158844, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2314550280570984, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 440.125, + "completions/mean_terminated_length": 440.125, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "entropy": 0.10514781018719077, + "epoch": 0.0855397148676171, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "learning_rate": 1.6891891891891896e-05, + "loss": 0.0, + "num_tokens": 850291.0, + "reward": 1.53125, + "reward_std": 0.3750595152378082, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.53125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3750595450401306, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 182.0, + "completions/max_terminated_length": 182.0, + "completions/mean_length": 161.25, + "completions/mean_terminated_length": 161.25, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.1252982271835208, + "epoch": 0.08621860149355058, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.53125, + "learning_rate": 1.7027027027027028e-05, + "loss": 0.0, + "num_tokens": 854765.0, + "reward": 1.15625, + "reward_std": 0.6935609579086304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 269.75, + "completions/mean_terminated_length": 269.75, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.06566974520683289, + "epoch": 0.08689748811948404, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "learning_rate": 1.7162162162162163e-05, + "loss": -0.0, + "num_tokens": 861299.0, + "reward": 1.5520832538604736, + "reward_std": 0.24372075498104095, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4895833134651184, + "rewards/fixed_code_pass_all_test_reward/std": 0.17501415312290192, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.12400397658348083, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 673.0, + "completions/max_terminated_length": 673.0, + "completions/mean_length": 543.375, + "completions/mean_terminated_length": 543.375, + "completions/min_length": 434.0, + "completions/min_terminated_length": 434.0, + "entropy": 0.12522254325449467, + "epoch": 0.08757637474541752, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "learning_rate": 1.72972972972973e-05, + "loss": -0.0, + "num_tokens": 871966.0, + "reward": 2.1458332538604736, + "reward_std": 0.3431587517261505, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.2070196568965912, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3958333432674408, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3666396141052246, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 286.625, + "completions/mean_terminated_length": 286.625, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "entropy": 0.163420214317739, + "epoch": 0.08825526137135098, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "learning_rate": 1.7432432432432434e-05, + "loss": -0.0, + "num_tokens": 877939.0, + "reward": 1.9854166507720947, + "reward_std": 0.4245854616165161, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3604166507720947, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.29866600036621094, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 267.0, + "completions/mean_terminated_length": 267.0, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.024705966003239155, + "epoch": 0.08893414799728445, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "learning_rate": 1.756756756756757e-05, + "loss": -0.0, + "num_tokens": 884483.0, + "reward": 2.8291666507720947, + "reward_std": 0.3675044775009155, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9125000238418579, + "rewards/fixed_code_pass_all_test_reward/std": 0.2474873811006546, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9166666865348816, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.15430334210395813, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 654.0, + "completions/max_terminated_length": 654.0, + "completions/mean_length": 420.25, + "completions/mean_terminated_length": 420.25, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "entropy": 0.1089058571960777, + "epoch": 0.08961303462321792, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "learning_rate": 1.7702702702702702e-05, + "loss": 0.0, + "num_tokens": 893029.0, + "reward": 2.1129465103149414, + "reward_std": 0.19129861891269684, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.15152287483215332, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.48794645071029663, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.11027258634567261, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 978.0, + "completions/max_terminated_length": 978.0, + "completions/mean_length": 498.5, + "completions/mean_terminated_length": 498.5, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "entropy": 0.28384515829384327, + "epoch": 0.09029192124915139, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "learning_rate": 1.783783783783784e-05, + "loss": -0.0, + "num_tokens": 900705.0, + "reward": 0.8999999761581421, + "reward_std": 0.37032803893089294, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02500000037252903, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0707106813788414, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 253.5, + "completions/mean_terminated_length": 253.5, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "entropy": 0.04758477327413857, + "epoch": 0.09097080787508487, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "learning_rate": 1.7972972972972976e-05, + "loss": 0.0, + "num_tokens": 907077.0, + "reward": 2.3958334922790527, + "reward_std": 1.0155048370361328, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6458333730697632, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4027435779571533, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 259.875, + "completions/mean_terminated_length": 259.875, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.04968078434467316, + "epoch": 0.09164969450101833, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "learning_rate": 1.8108108108108108e-05, + "loss": -0.0, + "num_tokens": 914364.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 202.0, + "completions/max_terminated_length": 202.0, + "completions/mean_length": 137.625, + "completions/mean_terminated_length": 137.625, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.15637117624282837, + "epoch": 0.0923285811269518, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.25, + "learning_rate": 1.8243243243243244e-05, + "loss": 0.0, + "num_tokens": 918713.0, + "reward": 1.4208333492279053, + "reward_std": 0.6153944730758667, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.17083333432674408, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2675209641456604, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 134.75, + "completions/mean_terminated_length": 134.75, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.12832927331328392, + "epoch": 0.09300746775288526, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.671875, + "learning_rate": 1.8378378378378383e-05, + "loss": -0.0, + "num_tokens": 923279.0, + "reward": 2.1472220420837402, + "reward_std": 0.627296507358551, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.6472222208976746, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.38607826828956604, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 168.0, + "completions/max_terminated_length": 168.0, + "completions/mean_length": 134.375, + "completions/mean_terminated_length": 134.375, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.06418356252834201, + "epoch": 0.09368635437881874, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.8513513513513515e-05, + "loss": 0.0, + "num_tokens": 928250.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 105.375, + "completions/mean_terminated_length": 105.375, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.15013590827584267, + "epoch": 0.0943652410047522, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.28125, + "learning_rate": 1.864864864864865e-05, + "loss": -0.0, + "num_tokens": 932285.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 237.0, + "completions/max_terminated_length": 237.0, + "completions/mean_length": 166.75, + "completions/mean_terminated_length": 166.75, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.08057058416306973, + "epoch": 0.09504412763068568, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "learning_rate": 1.8783783783783786e-05, + "loss": 0.0, + "num_tokens": 937035.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 112.0, + "completions/mean_length": 349.25, + "completions/mean_terminated_length": 106.5714340209961, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.07423369376920164, + "epoch": 0.09572301425661914, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.84375, + "learning_rate": 1.891891891891892e-05, + "loss": 0.0, + "num_tokens": 943061.0, + "reward": 0.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 178.0, + "completions/max_terminated_length": 178.0, + "completions/mean_length": 138.375, + "completions/mean_terminated_length": 138.375, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.062284584157168865, + "epoch": 0.09640190088255261, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.140625, + "learning_rate": 1.9054054054054057e-05, + "loss": -0.0, + "num_tokens": 948064.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 229.0, + "completions/max_terminated_length": 229.0, + "completions/mean_length": 100.75, + "completions/mean_terminated_length": 100.75, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.07618905929848552, + "epoch": 0.09708078750848609, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6875, + "learning_rate": 1.918918918918919e-05, + "loss": -0.0, + "num_tokens": 952110.0, + "reward": 2.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 220.875, + "completions/mean_terminated_length": 220.875, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.07020694715902209, + "epoch": 0.09775967413441955, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9324324324324328e-05, + "loss": 0.0, + "num_tokens": 958381.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 88.25, + "completions/mean_terminated_length": 88.25, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "entropy": 0.07306768978014588, + "epoch": 0.09843856076035302, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.375, + "learning_rate": 1.9459459459459463e-05, + "loss": 0.0, + "num_tokens": 962151.0, + "reward": 2.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 145.0, + "completions/max_terminated_length": 145.0, + "completions/mean_length": 132.625, + "completions/mean_terminated_length": 132.625, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.06474465597420931, + "epoch": 0.09911744738628649, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.84375, + "learning_rate": 1.9594594594594595e-05, + "loss": -0.0, + "num_tokens": 966740.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 71.0, + "completions/max_terminated_length": 71.0, + "completions/mean_length": 67.5, + "completions/mean_terminated_length": 67.5, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.06335726520046592, + "epoch": 0.09979633401221996, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.972972972972973e-05, + "loss": 0.0, + "num_tokens": 970304.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 136.0, + "completions/max_terminated_length": 136.0, + "completions/mean_length": 85.5, + "completions/mean_terminated_length": 85.5, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.14347989484667778, + "epoch": 0.10047522063815342, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9864864864864866e-05, + "loss": 0.0, + "num_tokens": 974108.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 194.0, + "completions/max_terminated_length": 194.0, + "completions/mean_length": 173.75, + "completions/mean_terminated_length": 173.75, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "entropy": 0.062342855613678694, + "epoch": 0.1011541072640869, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.84375, + "learning_rate": 2e-05, + "loss": 0.0, + "num_tokens": 979442.0, + "reward": 2.125, + "reward_std": 0.8345229625701904, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 330.25, + "completions/mean_terminated_length": 330.25, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "entropy": 0.03864629892632365, + "epoch": 0.10183299389002037, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.890625, + "learning_rate": 1.999997189149227e-05, + "loss": 0.0, + "num_tokens": 987228.0, + "reward": 1.03125, + "reward_std": 0.0883883461356163, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.03125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0883883461356163, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 321.75, + "completions/mean_terminated_length": 321.75, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "entropy": 0.03964217263273895, + "epoch": 0.10251188051595383, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "learning_rate": 1.999988756612709e-05, + "loss": 0.0, + "num_tokens": 995402.0, + "reward": 2.15625, + "reward_std": 0.35197147727012634, + "rewards/fixed_code_pass_all_test_reward/mean": 0.15625, + "rewards/fixed_code_pass_all_test_reward/std": 0.35197150707244873, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 166.75, + "completions/mean_terminated_length": 166.75, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.0510701450984925, + "epoch": 0.10319076714188731, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8046875, + "learning_rate": 1.9999747024378516e-05, + "loss": -0.0, + "num_tokens": 1000416.0, + "reward": 2.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 165.375, + "completions/mean_terminated_length": 165.375, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.0711862719617784, + "epoch": 0.10386965376782077, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "learning_rate": 1.9999550267036634e-05, + "loss": 0.0, + "num_tokens": 1005595.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 116.0, + "completions/max_terminated_length": 116.0, + "completions/mean_length": 114.625, + "completions/mean_terminated_length": 114.625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.06987747130915523, + "epoch": 0.10454854039375425, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.999929729520755e-05, + "loss": 0.0, + "num_tokens": 1009872.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 120.0, + "completions/max_terminated_length": 120.0, + "completions/mean_length": 89.625, + "completions/mean_terminated_length": 89.625, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.03780437004752457, + "epoch": 0.10522742701968771, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "learning_rate": 1.99989881103134e-05, + "loss": 0.0, + "num_tokens": 1013853.0, + "reward": 2.9583334922790527, + "reward_std": 0.11785107105970383, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9583333730697632, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.117851123213768, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 255.0, + "completions/max_terminated_length": 255.0, + "completions/mean_length": 255.0, + "completions/mean_terminated_length": 255.0, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "entropy": 0.006107321009039879, + "epoch": 0.10590631364562118, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9998622714092328e-05, + "loss": 0.0, + "num_tokens": 1020293.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 72.0, + "completions/max_terminated_length": 72.0, + "completions/mean_length": 69.625, + "completions/mean_terminated_length": 69.625, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.07531640492379665, + "epoch": 0.10658520027155464, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9998201108598477e-05, + "loss": 0.0, + "num_tokens": 1023946.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 206.0, + "completions/mean_terminated_length": 206.0, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "entropy": 0.03529926063492894, + "epoch": 0.10726408689748812, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "learning_rate": 1.9997723296201997e-05, + "loss": -0.0, + "num_tokens": 1029738.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 294.75, + "completions/mean_terminated_length": 294.75, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "entropy": 0.027123197447508574, + "epoch": 0.1079429735234216, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "learning_rate": 1.9997189279589003e-05, + "loss": 0.0, + "num_tokens": 1037408.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 129.0, + "completions/max_terminated_length": 129.0, + "completions/mean_length": 119.0, + "completions/mean_terminated_length": 119.0, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.042470349464565516, + "epoch": 0.10862186014935506, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.78125, + "learning_rate": 1.9996599061761575e-05, + "loss": 0.0, + "num_tokens": 1042280.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 114.0, + "completions/max_terminated_length": 114.0, + "completions/mean_length": 102.125, + "completions/mean_terminated_length": 102.125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.04984563961625099, + "epoch": 0.10930074677528853, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.484375, + "learning_rate": 1.9995952646037743e-05, + "loss": 0.0, + "num_tokens": 1046441.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 139.0, + "completions/max_terminated_length": 139.0, + "completions/mean_length": 103.25, + "completions/mean_terminated_length": 103.25, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.06849029986187816, + "epoch": 0.109979633401222, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5625, + "learning_rate": 1.9995250036051462e-05, + "loss": 0.0, + "num_tokens": 1050675.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 234.0, + "completions/max_terminated_length": 234.0, + "completions/mean_length": 191.375, + "completions/mean_terminated_length": 191.375, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.024978973204270005, + "epoch": 0.11065852002715547, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.078125, + "learning_rate": 1.9994491235752595e-05, + "loss": -0.0, + "num_tokens": 1056510.0, + "reward": 2.8214285373687744, + "reward_std": 0.3306500315666199, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8214285373687744, + "rewards/fixed_code_pass_all_test_reward/std": 0.33065006136894226, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 113.0, + "completions/max_terminated_length": 113.0, + "completions/mean_length": 109.875, + "completions/mean_terminated_length": 109.875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.03540743584744632, + "epoch": 0.11133740665308893, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9993676249406895e-05, + "loss": 0.0, + "num_tokens": 1060901.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 110.0, + "completions/max_terminated_length": 110.0, + "completions/mean_length": 101.0, + "completions/mean_terminated_length": 101.0, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.07029630057513714, + "epoch": 0.1120162932790224, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.875, + "learning_rate": 1.999280508159597e-05, + "loss": 0.0, + "num_tokens": 1065045.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 154.0, + "completions/max_terminated_length": 154.0, + "completions/mean_length": 114.25, + "completions/mean_terminated_length": 114.25, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.04848286882042885, + "epoch": 0.11269517990495587, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.375, + "learning_rate": 1.999187773721726e-05, + "loss": 0.0, + "num_tokens": 1069423.0, + "reward": 2.5, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 77.0, + "completions/max_terminated_length": 77.0, + "completions/mean_length": 73.5, + "completions/mean_terminated_length": 73.5, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.0937038529664278, + "epoch": 0.11337406653088934, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.09375, + "learning_rate": 1.9990894221484027e-05, + "loss": -0.0, + "num_tokens": 1073107.0, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 151.0, + "completions/max_terminated_length": 151.0, + "completions/mean_length": 145.375, + "completions/mean_terminated_length": 145.375, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.04907804913818836, + "epoch": 0.11405295315682282, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.953125, + "learning_rate": 1.9989854539925296e-05, + "loss": 0.0, + "num_tokens": 1077942.0, + "reward": 2.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 238.625, + "completions/mean_terminated_length": 238.625, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.01483443018514663, + "epoch": 0.11473183978275628, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9988758698385854e-05, + "loss": 0.0, + "num_tokens": 1084267.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 127.0, + "completions/max_terminated_length": 127.0, + "completions/mean_length": 109.25, + "completions/mean_terminated_length": 109.25, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.06854783603921533, + "epoch": 0.11541072640868975, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.015625, + "learning_rate": 1.9987606703026187e-05, + "loss": 0.0, + "num_tokens": 1088565.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 95.0, + "completions/max_terminated_length": 95.0, + "completions/mean_length": 92.625, + "completions/mean_terminated_length": 92.625, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.03099416079930961, + "epoch": 0.11608961303462322, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.78125, + "learning_rate": 1.9986398560322476e-05, + "loss": -0.0, + "num_tokens": 1092578.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 103.0, + "completions/max_terminated_length": 103.0, + "completions/mean_length": 101.125, + "completions/mean_terminated_length": 101.125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.02786101191304624, + "epoch": 0.11676849966055669, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9985134277066533e-05, + "loss": 0.0, + "num_tokens": 1096779.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 193.0, + "completions/max_terminated_length": 193.0, + "completions/mean_length": 189.125, + "completions/mean_terminated_length": 189.125, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "entropy": 0.02895908593200147, + "epoch": 0.11744738628649015, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.998381386036578e-05, + "loss": 0.0, + "num_tokens": 1102372.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 110.0, + "completions/max_terminated_length": 110.0, + "completions/mean_length": 98.0, + "completions/mean_terminated_length": 98.0, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.05383358057588339, + "epoch": 0.11812627291242363, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.171875, + "learning_rate": 1.9982437317643218e-05, + "loss": -0.0, + "num_tokens": 1106292.0, + "reward": 2.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 241.375, + "completions/mean_terminated_length": 241.375, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.02336695638950914, + "epoch": 0.11880515953835709, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "learning_rate": 1.9981004656637344e-05, + "loss": -0.0, + "num_tokens": 1113063.0, + "reward": 1.5138888359069824, + "reward_std": 0.2444263994693756, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5138888955116272, + "rewards/fixed_code_pass_all_test_reward/std": 0.2444264143705368, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 161.0, + "completions/max_terminated_length": 161.0, + "completions/mean_length": 160.0, + "completions/mean_terminated_length": 160.0, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.02944644633680582, + "epoch": 0.11948404616429056, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9979515885402156e-05, + "loss": 0.0, + "num_tokens": 1118135.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 117.0, + "completions/max_terminated_length": 117.0, + "completions/mean_length": 113.25, + "completions/mean_terminated_length": 113.25, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.0337059882003814, + "epoch": 0.12016293279022404, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9977971012307085e-05, + "loss": 0.0, + "num_tokens": 1122561.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 193.0, + "completions/max_terminated_length": 193.0, + "completions/mean_length": 188.0, + "completions/mean_terminated_length": 188.0, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "entropy": 0.01251135824713856, + "epoch": 0.1208418194161575, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "learning_rate": 1.9976370046036947e-05, + "loss": -0.0, + "num_tokens": 1128121.0, + "reward": 2.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 156.0, + "completions/max_terminated_length": 156.0, + "completions/mean_length": 137.75, + "completions/mean_terminated_length": 137.75, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.060274966061115265, + "epoch": 0.12152070604209098, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "learning_rate": 1.9974712995591887e-05, + "loss": -0.0, + "num_tokens": 1132759.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 302.625, + "completions/mean_terminated_length": 302.625, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "entropy": 0.026927365455776453, + "epoch": 0.12219959266802444, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "learning_rate": 1.9972999870287357e-05, + "loss": 0.0, + "num_tokens": 1140436.0, + "reward": 2.7395834922790527, + "reward_std": 0.5408648252487183, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8645833134651184, + "rewards/fixed_code_pass_all_test_reward/std": 0.25173014402389526, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 165.0, + "completions/max_terminated_length": 165.0, + "completions/mean_length": 155.125, + "completions/mean_terminated_length": 155.125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.011356945033185184, + "epoch": 0.12287847929395791, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "learning_rate": 1.997123067975404e-05, + "loss": -0.0, + "num_tokens": 1145509.0, + "reward": 1.96875, + "reward_std": 0.0883883461356163, + "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, + "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 120.0, + "completions/max_terminated_length": 120.0, + "completions/mean_length": 116.0, + "completions/mean_terminated_length": 116.0, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.03719848836772144, + "epoch": 0.12355736591989137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.996940543393778e-05, + "loss": 0.0, + "num_tokens": 1149813.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 73.0, + "completions/max_terminated_length": 73.0, + "completions/mean_length": 72.375, + "completions/mean_terminated_length": 72.375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.028450862504541874, + "epoch": 0.12423625254582485, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9967524143099583e-05, + "loss": 0.0, + "num_tokens": 1153400.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 148.0, + "completions/max_terminated_length": 148.0, + "completions/mean_length": 136.25, + "completions/mean_terminated_length": 136.25, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.04874406987801194, + "epoch": 0.12491513917175831, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9965586817815494e-05, + "loss": 0.0, + "num_tokens": 1158354.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 54.375, + "completions/mean_terminated_length": 54.375, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.06884664436802268, + "epoch": 0.12559402579769177, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9963593468976583e-05, + "loss": 0.0, + "num_tokens": 1161829.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 165.0, + "completions/max_terminated_length": 165.0, + "completions/mean_length": 111.75, + "completions/mean_terminated_length": 111.75, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.06350117921829224, + "epoch": 0.12627291242362526, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.921875, + "learning_rate": 1.9961544107788855e-05, + "loss": 0.0, + "num_tokens": 1165915.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 101.0, + "completions/max_terminated_length": 101.0, + "completions/mean_length": 85.375, + "completions/mean_terminated_length": 85.375, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.11480604950338602, + "epoch": 0.12695179904955872, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.59375, + "learning_rate": 1.9959438745773216e-05, + "loss": -0.0, + "num_tokens": 1170070.0, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 125.0, + "completions/max_terminated_length": 125.0, + "completions/mean_length": 115.375, + "completions/mean_terminated_length": 115.375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.04991080705076456, + "epoch": 0.12763068567549218, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9957277394765377e-05, + "loss": 0.0, + "num_tokens": 1174785.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 163.0, + "completions/max_terminated_length": 163.0, + "completions/mean_length": 112.25, + "completions/mean_terminated_length": 112.25, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.023421406280249357, + "epoch": 0.12830957230142567, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.995506006691581e-05, + "loss": 0.0, + "num_tokens": 1179555.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 83.0, + "completions/max_terminated_length": 83.0, + "completions/mean_length": 78.625, + "completions/mean_terminated_length": 78.625, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.0779186524450779, + "epoch": 0.12898845892735913, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9952786774689667e-05, + "loss": 0.0, + "num_tokens": 1183624.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 116.0, + "completions/max_terminated_length": 116.0, + "completions/mean_length": 113.75, + "completions/mean_terminated_length": 113.75, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.024235277203842998, + "epoch": 0.1296673455532926, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.375, + "learning_rate": 1.9950457530866726e-05, + "loss": -0.0, + "num_tokens": 1188030.0, + "reward": 2.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 135.0, + "completions/max_terminated_length": 135.0, + "completions/mean_length": 116.25, + "completions/mean_terminated_length": 116.25, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.027389248367398977, + "epoch": 0.13034623217922606, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "learning_rate": 1.9948072348541294e-05, + "loss": -0.0, + "num_tokens": 1192208.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 89.0, + "completions/max_terminated_length": 89.0, + "completions/mean_length": 86.375, + "completions/mean_terminated_length": 86.375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.031602269038558006, + "epoch": 0.13102511880515955, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9945631241122158e-05, + "loss": 0.0, + "num_tokens": 1196147.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 80.0, + "completions/max_terminated_length": 80.0, + "completions/mean_length": 76.125, + "completions/mean_terminated_length": 76.125, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.08630556054413319, + "epoch": 0.131704005431093, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.46875, + "learning_rate": 1.9943134222332493e-05, + "loss": -0.0, + "num_tokens": 1199948.0, + "reward": 2.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 217.75, + "completions/mean_terminated_length": 217.75, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.07903834339231253, + "epoch": 0.13238289205702647, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "learning_rate": 1.994058130620979e-05, + "loss": 0.0, + "num_tokens": 1205962.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 87.0, + "completions/max_terminated_length": 87.0, + "completions/mean_length": 82.375, + "completions/mean_terminated_length": 82.375, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.06530685583129525, + "epoch": 0.13306177868295996, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.03125, + "learning_rate": 1.9937972507105793e-05, + "loss": 0.0, + "num_tokens": 1209837.0, + "reward": 2.0, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 108.0, + "completions/max_terminated_length": 108.0, + "completions/mean_length": 103.125, + "completions/mean_terminated_length": 103.125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.046963199973106384, + "epoch": 0.13374066530889342, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.6875, + "learning_rate": 1.993530783968638e-05, + "loss": 0.0, + "num_tokens": 1213982.0, + "reward": 2.375, + "reward_std": 1.0606601238250732, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 111.0, + "completions/max_terminated_length": 111.0, + "completions/mean_length": 93.5, + "completions/mean_terminated_length": 93.5, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.09216509386897087, + "epoch": 0.13441955193482688, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.993258731893152e-05, + "loss": 0.0, + "num_tokens": 1217970.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 164.0, + "completions/max_terminated_length": 164.0, + "completions/mean_length": 155.5, + "completions/mean_terminated_length": 155.5, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.010635877726599574, + "epoch": 0.13509843856076034, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.992981096013517e-05, + "loss": 0.0, + "num_tokens": 1223086.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 102.0, + "completions/max_terminated_length": 102.0, + "completions/mean_length": 91.75, + "completions/mean_terminated_length": 91.75, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.08575577288866043, + "epoch": 0.13577732518669383, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.125, + "learning_rate": 1.9926978778905193e-05, + "loss": 0.0, + "num_tokens": 1226980.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 90.0, + "completions/max_terminated_length": 90.0, + "completions/mean_length": 86.75, + "completions/mean_terminated_length": 86.75, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.07871610764414072, + "epoch": 0.1364562118126273, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.34375, + "learning_rate": 1.992409079116326e-05, + "loss": 0.0, + "num_tokens": 1230898.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 69.5, + "completions/mean_terminated_length": 69.5, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.06334522133693099, + "epoch": 0.13713509843856075, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.59375, + "learning_rate": 1.9921147013144782e-05, + "loss": 0.0, + "num_tokens": 1234566.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 209.0, + "completions/max_terminated_length": 209.0, + "completions/mean_length": 189.0, + "completions/mean_terminated_length": 189.0, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.034989220555871725, + "epoch": 0.13781398506449424, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5, + "learning_rate": 1.9918147461398796e-05, + "loss": -0.0, + "num_tokens": 1240086.0, + "reward": 1.9166666269302368, + "reward_std": 0.5657789707183838, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.20573779940605164, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 236.25, + "completions/mean_terminated_length": 236.25, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.015031340532004833, + "epoch": 0.1384928716904277, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9915092152787888e-05, + "loss": 0.0, + "num_tokens": 1246264.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 81.0, + "completions/max_terminated_length": 81.0, + "completions/mean_length": 73.125, + "completions/mean_terminated_length": 73.125, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "entropy": 0.05455721355974674, + "epoch": 0.13917175831636117, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.6875, + "learning_rate": 1.991198110448809e-05, + "loss": -0.0, + "num_tokens": 1249881.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 117.0, + "completions/max_terminated_length": 117.0, + "completions/mean_length": 106.5, + "completions/mean_terminated_length": 106.5, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.038839657325297594, + "epoch": 0.13985064494229463, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "learning_rate": 1.9908814333988794e-05, + "loss": 0.0, + "num_tokens": 1253877.0, + "reward": 2.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 102.0, + "completions/max_terminated_length": 102.0, + "completions/mean_length": 91.25, + "completions/mean_terminated_length": 91.25, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.03963353531435132, + "epoch": 0.14052953156822812, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.990559185909263e-05, + "loss": 0.0, + "num_tokens": 1257879.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 68.25, + "completions/mean_terminated_length": 68.25, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.026947764679789543, + "epoch": 0.14120841819416158, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9902313697915395e-05, + "loss": 0.0, + "num_tokens": 1261641.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 134.0, + "completions/max_terminated_length": 134.0, + "completions/mean_length": 120.875, + "completions/mean_terminated_length": 120.875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.10003570653498173, + "epoch": 0.14188730482009504, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.78125, + "learning_rate": 1.9898979868885933e-05, + "loss": -0.0, + "num_tokens": 1265912.0, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 178.75, + "completions/mean_terminated_length": 178.75, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.031031151534989476, + "epoch": 0.1425661914460285, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.578125, + "learning_rate": 1.989559039074603e-05, + "loss": 0.0, + "num_tokens": 1271198.0, + "reward": 2.921875, + "reward_std": 0.22097086906433105, + "rewards/fixed_code_pass_all_test_reward/mean": 0.921875, + "rewards/fixed_code_pass_all_test_reward/std": 0.22097088396549225, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 300.875, + "completions/mean_terminated_length": 300.875, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "entropy": 0.013214790145866573, + "epoch": 0.143245078071962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.989214528255033e-05, + "loss": 0.0, + "num_tokens": 1278461.0, + "reward": 1.3333333730697632, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3333333432674408, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 65.75, + "completions/mean_terminated_length": 65.75, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "entropy": 0.03855268331244588, + "epoch": 0.14392396469789545, + "frac_reward_zero_std": 0.0, + "grad_norm": 13.6875, + "learning_rate": 1.9888644563666194e-05, + "loss": 0.0, + "num_tokens": 1282075.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 201.5, + "completions/mean_terminated_length": 201.5, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "entropy": 0.042058383114635944, + "epoch": 0.1446028513238289, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.578125, + "learning_rate": 1.9885088253773623e-05, + "loss": 0.0, + "num_tokens": 1287535.0, + "reward": 2.0500001907348633, + "reward_std": 0.5928140878677368, + "rewards/fixed_code_pass_all_test_reward/mean": 0.30000001192092896, + "rewards/fixed_code_pass_all_test_reward/std": 0.2828427255153656, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 219.0, + "completions/max_terminated_length": 219.0, + "completions/mean_length": 181.625, + "completions/mean_terminated_length": 181.625, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.03320292220450938, + "epoch": 0.1452817379497624, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "learning_rate": 1.988147637286513e-05, + "loss": 0.0, + "num_tokens": 1293028.0, + "reward": 1.5416667461395264, + "reward_std": 0.3959116041660309, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4166666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 317.625, + "completions/mean_terminated_length": 317.625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.0128429364413023, + "epoch": 0.14596062457569586, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.987780894124563e-05, + "loss": 0.0, + "num_tokens": 1300641.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 416.625, + "completions/mean_terminated_length": 416.625, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.03136526886373758, + "epoch": 0.14663951120162932, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453125, + "learning_rate": 1.987408597953233e-05, + "loss": 0.0, + "num_tokens": 1310110.0, + "reward": 2.8375000953674316, + "reward_std": 0.2199837565422058, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8374999761581421, + "rewards/fixed_code_pass_all_test_reward/std": 0.219983771443367, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 57.125, + "completions/mean_terminated_length": 57.125, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "entropy": 0.02553011034615338, + "epoch": 0.14731839782756279, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.987030750865461e-05, + "loss": 0.0, + "num_tokens": 1313607.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 212.25, + "completions/mean_terminated_length": 212.25, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.04503213532734662, + "epoch": 0.14799728445349628, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.28125, + "learning_rate": 1.9866473549853904e-05, + "loss": -0.0, + "num_tokens": 1321249.0, + "reward": 1.6964285373687744, + "reward_std": 0.24669833481311798, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6964285969734192, + "rewards/fixed_code_pass_all_test_reward/std": 0.2466983050107956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 111.0, + "completions/max_terminated_length": 111.0, + "completions/mean_length": 105.75, + "completions/mean_terminated_length": 105.75, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.022715769009664655, + "epoch": 0.14867617107942974, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9862584124683587e-05, + "loss": 0.0, + "num_tokens": 1325439.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 311.625, + "completions/mean_terminated_length": 311.625, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "entropy": 0.02056844183243811, + "epoch": 0.1493550577053632, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9296875, + "learning_rate": 1.9858639255008844e-05, + "loss": -0.0, + "num_tokens": 1332892.0, + "reward": 2.125, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 294.5, + "completions/mean_terminated_length": 294.5, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "entropy": 0.03644117899239063, + "epoch": 0.1500339443312967, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.21875, + "learning_rate": 1.9854638963006552e-05, + "loss": 0.0, + "num_tokens": 1340240.0, + "reward": 1.09375, + "reward_std": 1.060133934020996, + "rewards/fixed_code_pass_all_test_reward/mean": 0.34375, + "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 237.0, + "completions/max_terminated_length": 237.0, + "completions/mean_length": 225.5, + "completions/mean_terminated_length": 225.5, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "entropy": 0.02476617321372032, + "epoch": 0.15071283095723015, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "learning_rate": 1.9850583271165166e-05, + "loss": -0.0, + "num_tokens": 1346332.0, + "reward": 2.642857074737549, + "reward_std": 0.38180169463157654, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6428571343421936, + "rewards/fixed_code_pass_all_test_reward/std": 0.3818017840385437, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 58.625, + "completions/mean_terminated_length": 58.625, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "entropy": 0.08807905670255423, + "epoch": 0.1513917175831636, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.46875, + "learning_rate": 1.9846472202284574e-05, + "loss": -0.0, + "num_tokens": 1350065.0, + "reward": 2.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 114.0, + "completions/max_terminated_length": 114.0, + "completions/mean_length": 103.875, + "completions/mean_terminated_length": 103.875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.028961456147953868, + "epoch": 0.15207060420909707, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.984230577947597e-05, + "loss": 0.0, + "num_tokens": 1354232.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 116.0, + "completions/max_terminated_length": 116.0, + "completions/mean_length": 113.75, + "completions/mean_terminated_length": 113.75, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.03760439460165799, + "epoch": 0.15274949083503056, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.90625, + "learning_rate": 1.9838084026161746e-05, + "loss": 0.0, + "num_tokens": 1358590.0, + "reward": 2.0, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 174.0, + "completions/max_terminated_length": 174.0, + "completions/mean_length": 147.375, + "completions/mean_terminated_length": 147.375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.053790890611708164, + "epoch": 0.15342837746096402, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9833806966075343e-05, + "loss": 0.0, + "num_tokens": 1363497.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 109.0, + "completions/max_terminated_length": 109.0, + "completions/mean_length": 94.5, + "completions/mean_terminated_length": 94.5, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.0835356218740344, + "epoch": 0.15410726408689748, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.84375, + "learning_rate": 1.9829474623261106e-05, + "loss": 0.0, + "num_tokens": 1367525.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 95.0, + "completions/max_terminated_length": 95.0, + "completions/mean_length": 81.5, + "completions/mean_terminated_length": 81.5, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.0343251540325582, + "epoch": 0.15478615071283094, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9825087022074182e-05, + "loss": 0.0, + "num_tokens": 1371329.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 134.0, + "completions/max_terminated_length": 134.0, + "completions/mean_length": 129.5, + "completions/mean_terminated_length": 129.5, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.026921316282823682, + "epoch": 0.15546503733876443, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.578125, + "learning_rate": 1.9820644187180354e-05, + "loss": 0.0, + "num_tokens": 1375861.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 106.0, + "completions/max_terminated_length": 106.0, + "completions/mean_length": 89.875, + "completions/mean_terminated_length": 89.875, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.02502172254025936, + "epoch": 0.1561439239646979, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.981614614355591e-05, + "loss": 0.0, + "num_tokens": 1379844.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 83.0, + "completions/max_terminated_length": 83.0, + "completions/mean_length": 81.0, + "completions/mean_terminated_length": 81.0, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.033004665514454246, + "epoch": 0.15682281059063136, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.981159291648751e-05, + "loss": 0.0, + "num_tokens": 1383700.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 139.0, + "completions/max_terminated_length": 139.0, + "completions/mean_length": 137.5, + "completions/mean_terminated_length": 137.5, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.029004631796851754, + "epoch": 0.15750169721656485, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9806984531572038e-05, + "loss": 0.0, + "num_tokens": 1388536.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 129.0, + "completions/max_terminated_length": 129.0, + "completions/mean_length": 89.75, + "completions/mean_terminated_length": 89.75, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.04491503653116524, + "epoch": 0.1581805838424983, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.234375, + "learning_rate": 1.9802321014716465e-05, + "loss": -0.0, + "num_tokens": 1392558.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 111.0, + "completions/max_terminated_length": 111.0, + "completions/mean_length": 90.125, + "completions/mean_terminated_length": 90.125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.050404710695147514, + "epoch": 0.15885947046843177, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9797602392137678e-05, + "loss": 0.0, + "num_tokens": 1396623.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 346.625, + "completions/mean_terminated_length": 346.625, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "entropy": 0.0162679695058614, + "epoch": 0.15953835709436523, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "learning_rate": 1.9792828690362377e-05, + "loss": 0.0, + "num_tokens": 1404604.0, + "reward": 1.9659091234207153, + "reward_std": 0.09642363339662552, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9659091234207153, + "rewards/fixed_code_pass_all_test_reward/std": 0.09642364084720612, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 155.0, + "completions/max_terminated_length": 155.0, + "completions/mean_length": 146.5, + "completions/mean_terminated_length": 146.5, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.021264452021569014, + "epoch": 0.16021724372029872, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9787999936226877e-05, + "loss": 0.0, + "num_tokens": 1409600.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 339.75, + "completions/mean_terminated_length": 339.75, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "entropy": 0.012292444240301847, + "epoch": 0.16089613034623218, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "learning_rate": 1.9783116156877008e-05, + "loss": 0.0, + "num_tokens": 1417518.0, + "reward": 1.9772727489471436, + "reward_std": 0.042082689702510834, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9772727489471436, + "rewards/fixed_code_pass_all_test_reward/std": 0.04208271950483322, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 99.0, + "completions/max_terminated_length": 99.0, + "completions/mean_length": 83.0, + "completions/mean_terminated_length": 83.0, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.031746637308970094, + "epoch": 0.16157501697216564, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9778177379767903e-05, + "loss": 0.0, + "num_tokens": 1421246.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 180.0, + "completions/max_terminated_length": 180.0, + "completions/mean_length": 157.0, + "completions/mean_terminated_length": 157.0, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.05210646474733949, + "epoch": 0.16225390359809913, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "learning_rate": 1.9773183632663907e-05, + "loss": -0.0, + "num_tokens": 1426286.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 115.0, + "completions/max_terminated_length": 115.0, + "completions/mean_length": 103.25, + "completions/mean_terminated_length": 103.25, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.020215536933392286, + "epoch": 0.1629327902240326, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9768134943638367e-05, + "loss": 0.0, + "num_tokens": 1430328.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 148.0, + "completions/max_terminated_length": 148.0, + "completions/mean_length": 118.0, + "completions/mean_terminated_length": 118.0, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.0342337426263839, + "epoch": 0.16361167684996605, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9763031341073512e-05, + "loss": 0.0, + "num_tokens": 1434560.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 225.625, + "completions/mean_terminated_length": 225.625, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "entropy": 0.06160185765475035, + "epoch": 0.16429056347589951, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.125, + "learning_rate": 1.9757872853660265e-05, + "loss": -0.0, + "num_tokens": 1440885.0, + "reward": 2.21875, + "reward_std": 0.6999680995941162, + "rewards/fixed_code_pass_all_test_reward/mean": 0.59375, + "rewards/fixed_code_pass_all_test_reward/std": 0.3764851689338684, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 110.0, + "completions/max_terminated_length": 110.0, + "completions/mean_length": 91.25, + "completions/mean_terminated_length": 91.25, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.04878190439194441, + "epoch": 0.164969450101833, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.609375, + "learning_rate": 1.975265951039811e-05, + "loss": -0.0, + "num_tokens": 1444679.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 253.0, + "completions/mean_terminated_length": 253.0, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "entropy": 0.03578205150552094, + "epoch": 0.16564833672776647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.97473913405949e-05, + "loss": 0.0, + "num_tokens": 1451223.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 99.0, + "completions/max_terminated_length": 99.0, + "completions/mean_length": 89.5, + "completions/mean_terminated_length": 89.5, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.05918336706236005, + "epoch": 0.16632722335369993, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.625, + "learning_rate": 1.974206837386672e-05, + "loss": -0.0, + "num_tokens": 1455203.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 90.0, + "completions/max_terminated_length": 90.0, + "completions/mean_length": 75.125, + "completions/mean_terminated_length": 75.125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.02153742383234203, + "epoch": 0.1670061099796334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9736690640137696e-05, + "loss": 0.0, + "num_tokens": 1459156.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 145.0, + "completions/max_terminated_length": 145.0, + "completions/mean_length": 115.5, + "completions/mean_terminated_length": 115.5, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.06047332426533103, + "epoch": 0.16768499660556688, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.640625, + "learning_rate": 1.9731258169639846e-05, + "loss": -0.0, + "num_tokens": 1463312.0, + "reward": 2.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 229.0, + "completions/max_terminated_length": 229.0, + "completions/mean_length": 218.625, + "completions/mean_terminated_length": 218.625, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "entropy": 0.013830311829224229, + "epoch": 0.16836388323150034, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "learning_rate": 1.9725770992912893e-05, + "loss": 0.0, + "num_tokens": 1469461.0, + "reward": 1.96875, + "reward_std": 0.0883883461356163, + "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, + "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 124.0, + "completions/max_terminated_length": 124.0, + "completions/mean_length": 121.375, + "completions/mean_terminated_length": 121.375, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.030970022082328796, + "epoch": 0.1690427698574338, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.972022914080411e-05, + "loss": 0.0, + "num_tokens": 1473672.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 219.625, + "completions/mean_terminated_length": 219.625, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "entropy": 0.024551305454224348, + "epoch": 0.1697216564833673, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "learning_rate": 1.9714632644468135e-05, + "loss": 0.0, + "num_tokens": 1479845.0, + "reward": 2.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 110.0, + "completions/max_terminated_length": 110.0, + "completions/mean_length": 98.875, + "completions/mean_terminated_length": 98.875, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.045385925099253654, + "epoch": 0.17040054310930075, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.265625, + "learning_rate": 1.9708981535366797e-05, + "loss": -0.0, + "num_tokens": 1483668.0, + "reward": 2.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 272.25, + "completions/mean_terminated_length": 272.25, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 0.047814551275223494, + "epoch": 0.1710794297352342, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.970327584526895e-05, + "loss": 0.0, + "num_tokens": 1490566.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 190.0, + "completions/max_terminated_length": 190.0, + "completions/mean_length": 136.75, + "completions/mean_terminated_length": 136.75, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.058026759419590235, + "epoch": 0.17175831636116767, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9697515606250276e-05, + "loss": 0.0, + "num_tokens": 1495028.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 170.0, + "completions/max_terminated_length": 170.0, + "completions/mean_length": 122.0, + "completions/mean_terminated_length": 122.0, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.06939646881073713, + "epoch": 0.17243720298710116, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.15625, + "learning_rate": 1.9691700850693126e-05, + "loss": -0.0, + "num_tokens": 1499180.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 85.0, + "completions/max_terminated_length": 85.0, + "completions/mean_length": 83.75, + "completions/mean_terminated_length": 83.75, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.037486160174012184, + "epoch": 0.17311608961303462, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9685831611286312e-05, + "loss": 0.0, + "num_tokens": 1502938.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 92.0, + "completions/max_terminated_length": 92.0, + "completions/mean_length": 85.5, + "completions/mean_terminated_length": 85.5, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.08376667788252234, + "epoch": 0.17379497623896809, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.967990792102495e-05, + "loss": 0.0, + "num_tokens": 1506678.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 194.0, + "completions/max_terminated_length": 194.0, + "completions/mean_length": 186.125, + "completions/mean_terminated_length": 186.125, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 0.03614409361034632, + "epoch": 0.17447386286490157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9673929813210265e-05, + "loss": 0.0, + "num_tokens": 1512223.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 333.0, + "completions/mean_terminated_length": 333.0, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "entropy": 0.038370306603610516, + "epoch": 0.17515274949083504, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "learning_rate": 1.9667897321449387e-05, + "loss": -0.0, + "num_tokens": 1520255.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 82.0, + "completions/max_terminated_length": 82.0, + "completions/mean_length": 79.75, + "completions/mean_terminated_length": 79.75, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.05382959032431245, + "epoch": 0.1758316361167685, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.5625, + "learning_rate": 1.9661810479655184e-05, + "loss": 0.0, + "num_tokens": 1524165.0, + "reward": 2.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 159.0, + "completions/max_terminated_length": 159.0, + "completions/mean_length": 128.375, + "completions/mean_terminated_length": 128.375, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.035490254405885935, + "epoch": 0.17651052274270196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9655669322046068e-05, + "loss": 0.0, + "num_tokens": 1528880.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 95.0, + "completions/max_terminated_length": 95.0, + "completions/mean_length": 79.25, + "completions/mean_terminated_length": 79.25, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.08525615930557251, + "epoch": 0.17718940936863545, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.28125, + "learning_rate": 1.9649473883145792e-05, + "loss": -0.0, + "num_tokens": 1532602.0, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 107.0, + "completions/max_terminated_length": 107.0, + "completions/mean_length": 90.5, + "completions/mean_terminated_length": 90.5, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.0366119546815753, + "epoch": 0.1778682959945689, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9643224197783265e-05, + "loss": 0.0, + "num_tokens": 1536630.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 102.0, + "completions/max_terminated_length": 102.0, + "completions/mean_length": 100.0, + "completions/mean_terminated_length": 100.0, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.01925523462705314, + "epoch": 0.17854718262050237, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9636920301092352e-05, + "loss": 0.0, + "num_tokens": 1540558.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.0, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 535.375, + "completions/mean_terminated_length": 535.375, + "completions/min_length": 489.0, + "completions/min_terminated_length": 489.0, + "entropy": 0.04267650982365012, + "epoch": 0.17922606924643583, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "learning_rate": 1.9630562228511682e-05, + "loss": -0.0, + "num_tokens": 1551401.0, + "reward": 2.1964285373687744, + "reward_std": 0.5175492763519287, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6964285373687744, + "rewards/fixed_code_pass_all_test_reward/std": 0.2901442348957062, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 438.375, + "completions/mean_terminated_length": 438.375, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "entropy": 0.0744494921527803, + "epoch": 0.17990495587236932, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.962415001578444e-05, + "loss": 0.0, + "num_tokens": 1561252.0, + "reward": 2.5999999046325684, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6000000238418579, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 117.0, + "completions/max_terminated_length": 117.0, + "completions/mean_length": 112.0, + "completions/mean_terminated_length": 112.0, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.072756327688694, + "epoch": 0.18058384249830278, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9617683698958168e-05, + "loss": 0.0, + "num_tokens": 1565348.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 118.0, + "completions/max_terminated_length": 118.0, + "completions/mean_length": 117.25, + "completions/mean_terminated_length": 117.25, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.018114945152774453, + "epoch": 0.18126272912423624, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9611163314384574e-05, + "loss": 0.0, + "num_tokens": 1569550.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 144.0, + "completions/max_terminated_length": 144.0, + "completions/mean_length": 108.125, + "completions/mean_terminated_length": 108.125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.045144837349653244, + "epoch": 0.18194161575016973, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9604588898719314e-05, + "loss": 0.0, + "num_tokens": 1573519.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 85.0, + "completions/max_terminated_length": 85.0, + "completions/mean_length": 78.125, + "completions/mean_terminated_length": 78.125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.09496352914720774, + "epoch": 0.1826205023761032, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9597960488921785e-05, + "loss": 0.0, + "num_tokens": 1577544.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 174.0, + "completions/max_terminated_length": 174.0, + "completions/mean_length": 100.75, + "completions/mean_terminated_length": 100.75, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.09399104584008455, + "epoch": 0.18329938900203666, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.875, + "learning_rate": 1.9591278122254938e-05, + "loss": 0.0, + "num_tokens": 1581494.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 96.0, + "completions/max_terminated_length": 96.0, + "completions/mean_length": 91.25, + "completions/mean_terminated_length": 91.25, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.05996898515149951, + "epoch": 0.18397827562797012, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9584541836285035e-05, + "loss": 0.0, + "num_tokens": 1585352.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 315.5, + "completions/mean_terminated_length": 315.5, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "entropy": 0.03379065846092999, + "epoch": 0.1846571622539036, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.957775166888147e-05, + "loss": 0.0, + "num_tokens": 1592692.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 123.0, + "completions/max_terminated_length": 123.0, + "completions/mean_length": 82.25, + "completions/mean_terminated_length": 82.25, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.07197279017418623, + "epoch": 0.18533604887983707, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.25, + "learning_rate": 1.957090765821654e-05, + "loss": -0.0, + "num_tokens": 1596366.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 209.875, + "completions/mean_terminated_length": 209.875, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.044796328293159604, + "epoch": 0.18601493550577053, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.171875, + "learning_rate": 1.9564009842765225e-05, + "loss": 0.0, + "num_tokens": 1601597.0, + "reward": 1.0, + "reward_std": 1.0690449476242065, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 343.0, + "completions/mean_terminated_length": 343.0, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "entropy": 0.036600218852981925, + "epoch": 0.18669382213170402, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "learning_rate": 1.955705826130499e-05, + "loss": -0.0, + "num_tokens": 1609925.0, + "reward": 1.740384578704834, + "reward_std": 0.31565922498703003, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7403846383094788, + "rewards/fixed_code_pass_all_test_reward/std": 0.3156592547893524, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 139.0, + "completions/max_terminated_length": 139.0, + "completions/mean_length": 136.375, + "completions/mean_terminated_length": 136.375, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.02586209448054433, + "epoch": 0.18737270875763748, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "learning_rate": 1.9550052952915545e-05, + "loss": 0.0, + "num_tokens": 1614544.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 171.0, + "completions/max_terminated_length": 171.0, + "completions/mean_length": 167.0, + "completions/mean_terminated_length": 167.0, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.03599099209532142, + "epoch": 0.18805159538357094, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9542993956978647e-05, + "loss": 0.0, + "num_tokens": 1619904.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 106.0, + "completions/max_terminated_length": 106.0, + "completions/mean_length": 92.0, + "completions/mean_terminated_length": 92.0, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.055113062262535095, + "epoch": 0.1887304820095044, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.71875, + "learning_rate": 1.9535881313177864e-05, + "loss": -0.0, + "num_tokens": 1623712.0, + "reward": 2.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 91.0, + "completions/max_terminated_length": 91.0, + "completions/mean_length": 89.625, + "completions/mean_terminated_length": 89.625, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.03858232032507658, + "epoch": 0.1894093686354379, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.28125, + "learning_rate": 1.9528715061498355e-05, + "loss": 0.0, + "num_tokens": 1627517.0, + "reward": 1.625, + "reward_std": 0.9161254167556763, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 454.0, + "completions/mean_terminated_length": 454.0, + "completions/min_length": 443.0, + "completions/min_terminated_length": 443.0, + "entropy": 0.028234433149918914, + "epoch": 0.19008825526137135, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8046875, + "learning_rate": 1.9521495242226648e-05, + "loss": -0.0, + "num_tokens": 1637149.0, + "reward": 1.9375, + "reward_std": 0.45650067925453186, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1875, + "rewards/fixed_code_pass_all_test_reward/std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 106.0, + "completions/max_terminated_length": 106.0, + "completions/mean_length": 101.875, + "completions/mean_terminated_length": 101.875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.03720711078494787, + "epoch": 0.19076714188730481, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "learning_rate": 1.9514221895950416e-05, + "loss": 0.0, + "num_tokens": 1641228.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 130.0, + "completions/max_terminated_length": 130.0, + "completions/mean_length": 123.0, + "completions/mean_terminated_length": 123.0, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.05074124364182353, + "epoch": 0.19144602851323828, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.950689506355824e-05, + "loss": 0.0, + "num_tokens": 1645468.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 117.0, + "completions/max_terminated_length": 117.0, + "completions/mean_length": 116.5, + "completions/mean_terminated_length": 116.5, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.01687573315575719, + "epoch": 0.19212491513917176, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.949951478623938e-05, + "loss": 0.0, + "num_tokens": 1649664.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 169.0, + "completions/max_terminated_length": 169.0, + "completions/mean_length": 163.875, + "completions/mean_terminated_length": 163.875, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "entropy": 0.05306277936324477, + "epoch": 0.19280380176510523, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453125, + "learning_rate": 1.949208110548356e-05, + "loss": 0.0, + "num_tokens": 1654351.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 151.0, + "completions/max_terminated_length": 151.0, + "completions/mean_length": 138.625, + "completions/mean_terminated_length": 138.625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.08211056794971228, + "epoch": 0.1934826883910387, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.375, + "learning_rate": 1.948459406308071e-05, + "loss": -0.0, + "num_tokens": 1658692.0, + "reward": 2.0, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 218.0, + "completions/max_terminated_length": 218.0, + "completions/mean_length": 187.625, + "completions/mean_terminated_length": 187.625, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.12029063701629639, + "epoch": 0.19416157501697218, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.875, + "learning_rate": 1.9477053701120746e-05, + "loss": 0.0, + "num_tokens": 1663641.0, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 228.0, + "completions/max_terminated_length": 228.0, + "completions/mean_length": 200.75, + "completions/mean_terminated_length": 200.75, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "entropy": 0.057263683527708054, + "epoch": 0.19484046164290564, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.796875, + "learning_rate": 1.9469460061993336e-05, + "loss": 0.0, + "num_tokens": 1669239.0, + "reward": 2.0, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 229.0, + "completions/max_terminated_length": 229.0, + "completions/mean_length": 139.0, + "completions/mean_terminated_length": 139.0, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.07944484893232584, + "epoch": 0.1955193482688391, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9461813188387652e-05, + "loss": 0.0, + "num_tokens": 1673623.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 269.125, + "completions/mean_terminated_length": 269.125, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "entropy": 0.043128138640895486, + "epoch": 0.19619823489477256, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1015625, + "learning_rate": 1.9454113123292133e-05, + "loss": 0.0, + "num_tokens": 1679800.0, + "reward": 2.3541667461395264, + "reward_std": 0.2077372819185257, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3541666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.2077372521162033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 153.75, + "completions/mean_terminated_length": 153.75, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.08693261258304119, + "epoch": 0.19687712152070605, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9446359909994253e-05, + "loss": 0.0, + "num_tokens": 1684462.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 312.125, + "completions/mean_terminated_length": 312.125, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "entropy": 0.03681220579892397, + "epoch": 0.1975560081466395, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "learning_rate": 1.9438553592080257e-05, + "loss": 0.0, + "num_tokens": 1692183.0, + "reward": 1.9659091234207153, + "reward_std": 0.09642363339662552, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9659091234207153, + "rewards/fixed_code_pass_all_test_reward/std": 0.09642364084720612, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 204.25, + "completions/mean_terminated_length": 204.25, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 0.08966750046238303, + "epoch": 0.19823489477257297, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9430694213434936e-05, + "loss": 0.0, + "num_tokens": 1697593.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 185.0, + "completions/max_terminated_length": 185.0, + "completions/mean_length": 129.5, + "completions/mean_terminated_length": 129.5, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.04149002628400922, + "epoch": 0.19891378139850646, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.921875, + "learning_rate": 1.942278181824137e-05, + "loss": -0.0, + "num_tokens": 1702565.0, + "reward": 2.1875, + "reward_std": 0.4955156147480011, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, + "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 210.25, + "completions/mean_terminated_length": 210.25, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "entropy": 0.06707863416522741, + "epoch": 0.19959266802443992, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.765625, + "learning_rate": 1.9414816450980686e-05, + "loss": 0.0, + "num_tokens": 1708775.0, + "reward": 2.6875, + "reward_std": 0.3204349875450134, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3204349875450134, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 123.0, + "completions/max_terminated_length": 123.0, + "completions/mean_length": 106.375, + "completions/mean_terminated_length": 106.375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.04134741902817041, + "epoch": 0.20027155465037338, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.94067981564318e-05, + "loss": 0.0, + "num_tokens": 1713010.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 240.375, + "completions/mean_terminated_length": 240.375, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "entropy": 0.05806305631995201, + "epoch": 0.20095044127630685, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.40625, + "learning_rate": 1.9398726979671174e-05, + "loss": 0.0, + "num_tokens": 1719349.0, + "reward": 1.671875, + "reward_std": 0.2106272429227829, + "rewards/fixed_code_pass_all_test_reward/mean": 0.671875, + "rewards/fixed_code_pass_all_test_reward/std": 0.2106272578239441, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 109.0, + "completions/max_terminated_length": 109.0, + "completions/mean_length": 108.875, + "completions/mean_terminated_length": 108.875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.01956355758011341, + "epoch": 0.20162932790224034, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9390602966072548e-05, + "loss": 0.0, + "num_tokens": 1723460.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 126.0, + "completions/mean_terminated_length": 126.0, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.04810475930571556, + "epoch": 0.2023082145281738, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.03125, + "learning_rate": 1.9382426161306712e-05, + "loss": 0.0, + "num_tokens": 1727700.0, + "reward": 2.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 183.0, + "completions/max_terminated_length": 183.0, + "completions/mean_length": 138.875, + "completions/mean_terminated_length": 138.875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.06472568772733212, + "epoch": 0.20298710115410726, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.734375, + "learning_rate": 1.9374196611341212e-05, + "loss": -0.0, + "num_tokens": 1732251.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 132.0, + "completions/max_terminated_length": 132.0, + "completions/mean_length": 126.5, + "completions/mean_terminated_length": 126.5, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.018159526865929365, + "epoch": 0.20366598778004075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9365914362440125e-05, + "loss": 0.0, + "num_tokens": 1736583.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 217.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 192.5, + "completions/mean_terminated_length": 192.5, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.04344137362204492, + "epoch": 0.2043448744059742, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "learning_rate": 1.9357579461163783e-05, + "loss": 0.0, + "num_tokens": 1741899.0, + "reward": 2.3214285373687744, + "reward_std": 1.0209182500839233, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5714285373687744, + "rewards/fixed_code_pass_all_test_reward/std": 0.464481920003891, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 209.0, + "completions/max_terminated_length": 209.0, + "completions/mean_length": 187.875, + "completions/mean_terminated_length": 187.875, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "entropy": 0.06356910709291697, + "epoch": 0.20502376103190767, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453125, + "learning_rate": 1.9349191954368515e-05, + "loss": 0.0, + "num_tokens": 1747426.0, + "reward": 2.2291667461395264, + "reward_std": 0.4448782503604889, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8541666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.20773723721504211, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 244.75, + "completions/mean_terminated_length": 244.75, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "entropy": 0.08265692042186856, + "epoch": 0.20570264765784113, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8984375, + "learning_rate": 1.9340751889206378e-05, + "loss": 0.0, + "num_tokens": 1753352.0, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 313.5, + "completions/mean_terminated_length": 313.5, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "entropy": 0.03501248057000339, + "epoch": 0.20638153428377462, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.875, + "learning_rate": 1.93322593131249e-05, + "loss": 0.0, + "num_tokens": 1760788.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 139.0, + "completions/max_terminated_length": 139.0, + "completions/mean_length": 128.25, + "completions/mean_terminated_length": 128.25, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.06978957680985332, + "epoch": 0.20706042090970808, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.59375, + "learning_rate": 1.932371427386681e-05, + "loss": 0.0, + "num_tokens": 1765342.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 163.0, + "completions/max_terminated_length": 163.0, + "completions/mean_length": 129.0, + "completions/mean_terminated_length": 129.0, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.07782715698704123, + "epoch": 0.20773930753564154, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.734375, + "learning_rate": 1.931511681946977e-05, + "loss": -0.0, + "num_tokens": 1769598.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 182.0, + "completions/max_terminated_length": 182.0, + "completions/mean_length": 159.25, + "completions/mean_terminated_length": 159.25, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.07246410427615047, + "epoch": 0.208418194161575, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.265625, + "learning_rate": 1.9306466998266102e-05, + "loss": 0.0, + "num_tokens": 1774440.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 188.125, + "completions/mean_terminated_length": 188.125, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.05745515413582325, + "epoch": 0.2090970807875085, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9375, + "learning_rate": 1.9297764858882516e-05, + "loss": 0.0, + "num_tokens": 1781257.0, + "reward": 1.5357142686843872, + "reward_std": 0.10101523250341415, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5357142686843872, + "rewards/fixed_code_pass_all_test_reward/std": 0.10101525485515594, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 260.75, + "completions/mean_terminated_length": 260.75, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "entropy": 0.017283402499742806, + "epoch": 0.20977596741344195, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9289010450239843e-05, + "loss": 0.0, + "num_tokens": 1787807.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 216.75, + "completions/mean_terminated_length": 216.75, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "entropy": 0.11775861494243145, + "epoch": 0.21045485403937542, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.953125, + "learning_rate": 1.928020382155276e-05, + "loss": -0.0, + "num_tokens": 1793205.0, + "reward": 2.0, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 292.5, + "completions/mean_terminated_length": 292.5, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "entropy": 0.12922955304384232, + "epoch": 0.2111337406653089, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9271345022329502e-05, + "loss": 0.0, + "num_tokens": 1799521.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 307.5, + "completions/mean_terminated_length": 307.5, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "entropy": 0.06560094049200416, + "epoch": 0.21181262729124237, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9262434102371596e-05, + "loss": 0.0, + "num_tokens": 1806829.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 223.125, + "completions/mean_terminated_length": 223.125, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.06759773194789886, + "epoch": 0.21249151391717583, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.453125, + "learning_rate": 1.9253471111773572e-05, + "loss": 0.0, + "num_tokens": 1813294.0, + "reward": 2.847222328186035, + "reward_std": 0.25845497846603394, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8472222089767456, + "rewards/fixed_code_pass_all_test_reward/std": 0.25845491886138916, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 129.0, + "completions/max_terminated_length": 129.0, + "completions/mean_length": 124.5, + "completions/mean_terminated_length": 124.5, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.04212288232520223, + "epoch": 0.2131704005431093, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.924445610092269e-05, + "loss": 0.0, + "num_tokens": 1817354.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 235.625, + "completions/mean_terminated_length": 235.625, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.10384846292436123, + "epoch": 0.21384928716904278, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.609375, + "learning_rate": 1.9235389120498645e-05, + "loss": -0.0, + "num_tokens": 1822727.0, + "reward": 2.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 139.0, + "completions/max_terminated_length": 139.0, + "completions/mean_length": 112.5, + "completions/mean_terminated_length": 112.5, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.03845132002606988, + "epoch": 0.21452817379497624, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9226270221473302e-05, + "loss": 0.0, + "num_tokens": 1826731.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 215.125, + "completions/mean_terminated_length": 215.125, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "entropy": 0.03485893807373941, + "epoch": 0.2152070604209097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.921709945511039e-05, + "loss": 0.0, + "num_tokens": 1832460.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 234.5, + "completions/mean_terminated_length": 234.5, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 0.1028092484921217, + "epoch": 0.2158859470468432, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.796875, + "learning_rate": 1.9207876872965217e-05, + "loss": -0.0, + "num_tokens": 1837816.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 220.875, + "completions/mean_terminated_length": 220.875, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "entropy": 0.15017211344093084, + "epoch": 0.21656483367277665, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6875, + "learning_rate": 1.9198602526884388e-05, + "loss": -0.0, + "num_tokens": 1843215.0, + "reward": 2.125, + "reward_std": 0.9910312294960022, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 390.25, + "completions/mean_terminated_length": 390.25, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "entropy": 0.05205815797671676, + "epoch": 0.2172437202987101, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.859375, + "learning_rate": 1.9189276469005508e-05, + "loss": 0.0, + "num_tokens": 1852281.0, + "reward": 2.012500047683716, + "reward_std": 0.41209399700164795, + "rewards/fixed_code_pass_all_test_reward/mean": 0.13750000298023224, + "rewards/fixed_code_pass_all_test_reward/std": 0.07440238445997238, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 213.625, + "completions/mean_terminated_length": 213.625, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "entropy": 0.0689094322733581, + "epoch": 0.21792260692464357, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.917989875175689e-05, + "loss": 0.0, + "num_tokens": 1857238.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 225.0, + "completions/max_terminated_length": 225.0, + "completions/mean_length": 197.625, + "completions/mean_terminated_length": 197.625, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "entropy": 0.04457589378580451, + "epoch": 0.21860149355057706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9170469427857264e-05, + "loss": 0.0, + "num_tokens": 1862115.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 228.625, + "completions/mean_terminated_length": 228.625, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "entropy": 0.041729350574314594, + "epoch": 0.21928038017651053, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "learning_rate": 1.9160988550315475e-05, + "loss": -0.0, + "num_tokens": 1868216.0, + "reward": 2.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 189.0, + "completions/max_terminated_length": 189.0, + "completions/mean_length": 177.0, + "completions/mean_terminated_length": 177.0, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.023223794298246503, + "epoch": 0.219959266802444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9151456172430186e-05, + "loss": 0.0, + "num_tokens": 1873192.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 173.625, + "completions/mean_terminated_length": 173.625, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "entropy": 0.05698779132217169, + "epoch": 0.22063815342837745, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.703125, + "learning_rate": 1.914187234778958e-05, + "loss": 0.0, + "num_tokens": 1878709.0, + "reward": 1.875, + "reward_std": 0.8345229625701904, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 275.5, + "completions/mean_terminated_length": 275.5, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "entropy": 0.12405252177268267, + "epoch": 0.22131704005431094, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.375, + "learning_rate": 1.913223713027106e-05, + "loss": -0.0, + "num_tokens": 1884617.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 199.0, + "completions/max_terminated_length": 199.0, + "completions/mean_length": 182.25, + "completions/mean_terminated_length": 182.25, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.027563789626583457, + "epoch": 0.2219959266802444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9122550574040942e-05, + "loss": 0.0, + "num_tokens": 1889411.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 181.0, + "completions/max_terminated_length": 181.0, + "completions/mean_length": 176.0, + "completions/mean_terminated_length": 176.0, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "entropy": 0.05370080238208175, + "epoch": 0.22267481330617786, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "learning_rate": 1.9112812733554155e-05, + "loss": -0.0, + "num_tokens": 1894163.0, + "reward": 2.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 210.625, + "completions/mean_terminated_length": 210.625, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "entropy": 0.09333140589296818, + "epoch": 0.22335369993211135, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.703125, + "learning_rate": 1.910302366355393e-05, + "loss": -0.0, + "num_tokens": 1898952.0, + "reward": 2.125, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 230.5, + "completions/mean_terminated_length": 230.5, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "entropy": 0.05855082580819726, + "epoch": 0.2240325865580448, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.90931834190715e-05, + "loss": 0.0, + "num_tokens": 1904716.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 177.75, + "completions/mean_terminated_length": 177.75, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.08473899774253368, + "epoch": 0.22471147318397827, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.625, + "learning_rate": 1.9083292055425783e-05, + "loss": -0.0, + "num_tokens": 1909610.0, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 316.625, + "completions/mean_terminated_length": 316.625, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.08378634741529822, + "epoch": 0.22539035980991173, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "learning_rate": 1.907334962822307e-05, + "loss": 0.0, + "num_tokens": 1918079.0, + "reward": 2.232142925262451, + "reward_std": 0.1308750957250595, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2321428656578064, + "rewards/fixed_code_pass_all_test_reward/std": 0.13087505102157593, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 225.0, + "completions/max_terminated_length": 225.0, + "completions/mean_length": 189.75, + "completions/mean_terminated_length": 189.75, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.06544020678848028, + "epoch": 0.22606924643584522, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.906335619335672e-05, + "loss": 0.0, + "num_tokens": 1922805.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 197.5, + "completions/mean_terminated_length": 197.5, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.08480233186855912, + "epoch": 0.22674813306177868, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9053311807006845e-05, + "loss": 0.0, + "num_tokens": 1927617.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 217.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 185.0, + "completions/mean_terminated_length": 185.0, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 0.0539171420969069, + "epoch": 0.22742701968771215, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.904321652563998e-05, + "loss": 0.0, + "num_tokens": 1932305.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 221.375, + "completions/mean_terminated_length": 221.375, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "entropy": 0.08433426916599274, + "epoch": 0.22810590631364563, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "learning_rate": 1.903307040600879e-05, + "loss": 0.0, + "num_tokens": 1937396.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 219.0, + "completions/max_terminated_length": 219.0, + "completions/mean_length": 191.25, + "completions/mean_terminated_length": 191.25, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "entropy": 0.08307742513716221, + "epoch": 0.2287847929395791, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.734375, + "learning_rate": 1.902287350515173e-05, + "loss": -0.0, + "num_tokens": 1942374.0, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 159.0, + "completions/max_terminated_length": 159.0, + "completions/mean_length": 152.5, + "completions/mean_terminated_length": 152.5, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "entropy": 0.0349551672115922, + "epoch": 0.22946367956551256, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9012625880392733e-05, + "loss": 0.0, + "num_tokens": 1946682.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 196.25, + "completions/mean_terminated_length": 196.25, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "entropy": 0.09286962915211916, + "epoch": 0.23014256619144602, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.65625, + "learning_rate": 1.900232758934089e-05, + "loss": 0.0, + "num_tokens": 1951772.0, + "reward": 2.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 225.0, + "completions/max_terminated_length": 225.0, + "completions/mean_length": 177.75, + "completions/mean_terminated_length": 177.75, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "entropy": 0.0401443219743669, + "epoch": 0.2308214528173795, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.899197868989011e-05, + "loss": 0.0, + "num_tokens": 1956906.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 225.5, + "completions/mean_terminated_length": 225.5, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "entropy": 0.10647087544202805, + "epoch": 0.23150033944331297, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.34375, + "learning_rate": 1.898157924021883e-05, + "loss": -0.0, + "num_tokens": 1961822.0, + "reward": 2.25, + "reward_std": 0.8864052295684814, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 168.0, + "completions/max_terminated_length": 168.0, + "completions/mean_length": 137.625, + "completions/mean_terminated_length": 137.625, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.06849431153386831, + "epoch": 0.23217922606924643, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.8971129298789644e-05, + "loss": 0.0, + "num_tokens": 1965987.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.0, + "completions/max_terminated_length": 530.0, + "completions/mean_length": 465.0, + "completions/mean_terminated_length": 465.0, + "completions/min_length": 392.0, + "completions/min_terminated_length": 392.0, + "entropy": 0.1154327979311347, + "epoch": 0.2328581126951799, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "learning_rate": 1.8960628924349006e-05, + "loss": -0.0, + "num_tokens": 1975067.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 216.0, + "completions/mean_terminated_length": 216.0, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "entropy": 0.04719087341800332, + "epoch": 0.23353699932111338, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.8950078175926886e-05, + "loss": 0.0, + "num_tokens": 1980787.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 199.0, + "completions/max_terminated_length": 199.0, + "completions/mean_length": 187.375, + "completions/mean_terminated_length": 187.375, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "entropy": 0.0680823945440352, + "epoch": 0.23421588594704684, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.8939477112836445e-05, + "loss": 0.0, + "num_tokens": 1985710.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 208.875, + "completions/mean_terminated_length": 208.875, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "entropy": 0.10352581646293402, + "epoch": 0.2348947725729803, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.390625, + "learning_rate": 1.892882579467369e-05, + "loss": 0.0, + "num_tokens": 1990741.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 166.875, + "completions/mean_terminated_length": 166.875, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.08224901277571917, + "epoch": 0.2355736591989138, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0625, + "learning_rate": 1.8918124281317162e-05, + "loss": -0.0, + "num_tokens": 1995244.0, + "reward": 2.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 535.0, + "completions/max_terminated_length": 535.0, + "completions/mean_length": 405.375, + "completions/mean_terminated_length": 405.375, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "entropy": 0.1158347800374031, + "epoch": 0.23625254582484725, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "learning_rate": 1.8907372632927573e-05, + "loss": -0.0, + "num_tokens": 2003159.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 178.0, + "completions/max_terminated_length": 178.0, + "completions/mean_length": 170.5, + "completions/mean_terminated_length": 170.5, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "entropy": 0.0793031700886786, + "epoch": 0.23693143245078072, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.8896570909947477e-05, + "loss": 0.0, + "num_tokens": 2007883.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 168.0, + "completions/mean_terminated_length": 168.0, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.127309899777174, + "epoch": 0.23761031907671418, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.171875, + "learning_rate": 1.8885719173100937e-05, + "loss": -0.0, + "num_tokens": 2012203.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 275.25, + "completions/mean_terminated_length": 275.25, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "entropy": 0.03309164522215724, + "epoch": 0.23828920570264767, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.887481748339318e-05, + "loss": 0.0, + "num_tokens": 2018637.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 243.0, + "completions/mean_terminated_length": 243.0, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "entropy": 0.05568704567849636, + "epoch": 0.23896809232858113, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.8863865902110253e-05, + "loss": 0.0, + "num_tokens": 2024805.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 171.5, + "completions/mean_terminated_length": 171.5, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "entropy": 0.09123982861638069, + "epoch": 0.2396469789545146, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.296875, + "learning_rate": 1.8852864490818678e-05, + "loss": -0.0, + "num_tokens": 2029361.0, + "reward": 2.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 190.125, + "completions/mean_terminated_length": 190.125, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "entropy": 0.08292779233306646, + "epoch": 0.24032586558044808, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.046875, + "learning_rate": 1.8841813311365105e-05, + "loss": -0.0, + "num_tokens": 2034330.0, + "reward": 2.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 243.375, + "completions/mean_terminated_length": 243.375, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.12956679053604603, + "epoch": 0.24100475220638154, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.015625, + "learning_rate": 1.8830712425875964e-05, + "loss": -0.0, + "num_tokens": 2041389.0, + "reward": 2.0625, + "reward_std": 0.4172614812850952, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9375, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 219.0, + "completions/max_terminated_length": 219.0, + "completions/mean_length": 175.5, + "completions/mean_terminated_length": 175.5, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "entropy": 0.0687949163839221, + "epoch": 0.241683638832315, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.15625, + "learning_rate": 1.8819561896757124e-05, + "loss": 0.0, + "num_tokens": 2046137.0, + "reward": 2.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 223.5, + "completions/mean_terminated_length": 223.5, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "entropy": 0.082338429056108, + "epoch": 0.24236252545824846, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.734375, + "learning_rate": 1.8808361786693533e-05, + "loss": -0.0, + "num_tokens": 2051485.0, + "reward": 2.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 273.625, + "completions/mean_terminated_length": 273.625, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "entropy": 0.08644285053014755, + "epoch": 0.24304141208418195, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.640625, + "learning_rate": 1.879711215864886e-05, + "loss": 0.0, + "num_tokens": 2057730.0, + "reward": 2.2083334922790527, + "reward_std": 0.3053751587867737, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2083333432674408, + "rewards/fixed_code_pass_all_test_reward/std": 0.3053751289844513, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 236.0, + "completions/mean_terminated_length": 236.0, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "entropy": 0.12417812552303076, + "epoch": 0.2437202987101154, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.328125, + "learning_rate": 1.8785813075865164e-05, + "loss": 0.0, + "num_tokens": 2063082.0, + "reward": 2.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 238.75, + "completions/mean_terminated_length": 238.75, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "entropy": 0.1339465482160449, + "epoch": 0.24439918533604887, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.328125, + "learning_rate": 1.877446460186251e-05, + "loss": -0.0, + "num_tokens": 2068248.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 211.0, + "completions/max_terminated_length": 211.0, + "completions/mean_length": 207.125, + "completions/mean_terminated_length": 207.125, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "entropy": 0.04863814264535904, + "epoch": 0.24507807196198234, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.8763066800438638e-05, + "loss": 0.0, + "num_tokens": 2073585.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 235.75, + "completions/mean_terminated_length": 235.75, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "entropy": 0.04723559692502022, + "epoch": 0.24575695858791582, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.875161973566858e-05, + "loss": 0.0, + "num_tokens": 2079495.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 377.75, + "completions/mean_terminated_length": 377.75, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "entropy": 0.07612746069207788, + "epoch": 0.24643584521384929, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "learning_rate": 1.874012347190432e-05, + "loss": -0.0, + "num_tokens": 2088437.0, + "reward": 2.1624999046325684, + "reward_std": 0.07440241426229477, + "rewards/fixed_code_pass_all_test_reward/mean": 0.16249999403953552, + "rewards/fixed_code_pass_all_test_reward/std": 0.07440238445997238, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 217.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 186.75, + "completions/mean_terminated_length": 186.75, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "entropy": 0.08486952725797892, + "epoch": 0.24711473183978275, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9921875, + "learning_rate": 1.8728578073774427e-05, + "loss": 0.0, + "num_tokens": 2093419.0, + "reward": 2.0, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 224.5, + "completions/mean_terminated_length": 224.5, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.06508130906149745, + "epoch": 0.24779361846571624, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.8716983606183673e-05, + "loss": 0.0, + "num_tokens": 2098511.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 367.25, + "completions/mean_terminated_length": 367.25, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "entropy": 0.06202725553885102, + "epoch": 0.2484725050916497, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.87053401343127e-05, + "loss": 0.0, + "num_tokens": 2105721.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 217.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 189.5, + "completions/mean_terminated_length": 189.5, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "entropy": 0.07700490904971957, + "epoch": 0.24915139171758316, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.8693647723617637e-05, + "loss": 0.0, + "num_tokens": 2110469.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 181.5, + "completions/mean_terminated_length": 181.5, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "entropy": 0.06152817793190479, + "epoch": 0.24983027834351662, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.625, + "learning_rate": 1.8681906439829716e-05, + "loss": -0.0, + "num_tokens": 2115137.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 346.375, + "completions/mean_terminated_length": 346.375, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "entropy": 0.041742518078535795, + "epoch": 0.2505091649694501, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "learning_rate": 1.8670116348954945e-05, + "loss": 0.0, + "num_tokens": 2123500.0, + "reward": 1.817307710647583, + "reward_std": 0.12292228639125824, + "rewards/fixed_code_pass_all_test_reward/mean": 0.817307710647583, + "rewards/fixed_code_pass_all_test_reward/std": 0.12292228639125824, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 270.125, + "completions/mean_terminated_length": 270.125, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "entropy": 0.13209902122616768, + "epoch": 0.25118805159538354, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "learning_rate": 1.865827751727368e-05, + "loss": -0.0, + "num_tokens": 2129381.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 179.75, + "completions/mean_terminated_length": 179.75, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.046899959444999695, + "epoch": 0.25186693822131706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.864639001134031e-05, + "loss": 0.0, + "num_tokens": 2134507.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 175.0, + "completions/max_terminated_length": 175.0, + "completions/mean_length": 167.5, + "completions/mean_terminated_length": 167.5, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.0577500881627202, + "epoch": 0.2525458248472505, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "learning_rate": 1.863445389798284e-05, + "loss": 0.0, + "num_tokens": 2139127.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 231.125, + "completions/mean_terminated_length": 231.125, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "entropy": 0.06450037378817797, + "epoch": 0.253224711473184, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.265625, + "learning_rate": 1.8622469244302542e-05, + "loss": -0.0, + "num_tokens": 2144848.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 197.625, + "completions/mean_terminated_length": 197.625, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "entropy": 0.05902037117630243, + "epoch": 0.25390359809911744, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.8610436117673557e-05, + "loss": 0.0, + "num_tokens": 2149741.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 157.0, + "completions/max_terminated_length": 157.0, + "completions/mean_length": 136.0, + "completions/mean_terminated_length": 136.0, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.04024791670963168, + "epoch": 0.2545824847250509, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.28125, + "learning_rate": 1.8598354585742537e-05, + "loss": 0.0, + "num_tokens": 2154013.0, + "reward": 2.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 245.0, + "completions/max_terminated_length": 245.0, + "completions/mean_length": 231.375, + "completions/mean_terminated_length": 231.375, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "entropy": 0.043174607679247856, + "epoch": 0.25526137135098437, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.858622471642824e-05, + "loss": 0.0, + "num_tokens": 2159872.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 228.0, + "completions/max_terminated_length": 228.0, + "completions/mean_length": 218.875, + "completions/mean_terminated_length": 218.875, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "entropy": 0.04491899721324444, + "epoch": 0.25594025797691783, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.8574046577921182e-05, + "loss": 0.0, + "num_tokens": 2164863.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 253.25, + "completions/mean_terminated_length": 253.25, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "entropy": 0.14451793301850557, + "epoch": 0.25661914460285135, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.578125, + "learning_rate": 1.8561820238683216e-05, + "loss": 0.0, + "num_tokens": 2170985.0, + "reward": 1.0, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 165.0, + "completions/max_terminated_length": 165.0, + "completions/mean_length": 157.75, + "completions/mean_terminated_length": 157.75, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "entropy": 0.07329763192683458, + "epoch": 0.2572980312287848, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.609375, + "learning_rate": 1.8549545767447174e-05, + "loss": -0.0, + "num_tokens": 2175351.0, + "reward": 2.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 125.0, + "completions/max_terminated_length": 125.0, + "completions/mean_length": 119.5, + "completions/mean_terminated_length": 119.5, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.04977906960994005, + "epoch": 0.25797691785471827, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.853722323321647e-05, + "loss": 0.0, + "num_tokens": 2179419.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 121.0, + "completions/max_terminated_length": 121.0, + "completions/mean_length": 117.25, + "completions/mean_terminated_length": 117.25, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.04489452810958028, + "epoch": 0.25865580448065173, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.8524852705264716e-05, + "loss": 0.0, + "num_tokens": 2183437.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 285.875, + "completions/mean_terminated_length": 285.875, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "entropy": 0.0470429384149611, + "epoch": 0.2593346911065852, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.953125, + "learning_rate": 1.8512434253135324e-05, + "loss": 0.0, + "num_tokens": 2189892.0, + "reward": 2.6041667461395264, + "reward_std": 0.17677675187587738, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6041666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.0, + "completions/max_terminated_length": 152.0, + "completions/mean_length": 139.75, + "completions/mean_terminated_length": 139.75, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.07873268146067858, + "epoch": 0.26001357773251865, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.8499967946641127e-05, + "loss": 0.0, + "num_tokens": 2194258.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 245.375, + "completions/mean_terminated_length": 245.375, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "entropy": 0.04758663475513458, + "epoch": 0.2606924643584521, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "learning_rate": 1.848745385586398e-05, + "loss": -0.0, + "num_tokens": 2201957.0, + "reward": 2.578125, + "reward_std": 0.4952339828014374, + "rewards/fixed_code_pass_all_test_reward/mean": 0.578125, + "rewards/fixed_code_pass_all_test_reward/std": 0.4952339828014374, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 265.0, + "completions/mean_terminated_length": 265.0, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "entropy": 0.09630326088517904, + "epoch": 0.26137135098438563, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.484375, + "learning_rate": 1.8474892051154366e-05, + "loss": 0.0, + "num_tokens": 2208077.0, + "reward": 2.734375, + "reward_std": 0.19408094882965088, + "rewards/fixed_code_pass_all_test_reward/mean": 0.734375, + "rewards/fixed_code_pass_all_test_reward/std": 0.19408094882965088, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 156.0, + "completions/max_terminated_length": 156.0, + "completions/mean_length": 144.5, + "completions/mean_terminated_length": 144.5, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.08041145419701934, + "epoch": 0.2620502376103191, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "learning_rate": 1.8462282603131005e-05, + "loss": 0.0, + "num_tokens": 2212361.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 199.0, + "completions/mean_terminated_length": 199.0, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "entropy": 0.07909099664539099, + "epoch": 0.26272912423625255, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "learning_rate": 1.8449625582680445e-05, + "loss": 0.0, + "num_tokens": 2217505.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 149.0, + "completions/max_terminated_length": 149.0, + "completions/mean_length": 141.75, + "completions/mean_terminated_length": 141.75, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.06990398140624166, + "epoch": 0.263408010862186, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.65625, + "learning_rate": 1.843692106095668e-05, + "loss": -0.0, + "num_tokens": 2221783.0, + "reward": 2.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 234.0, + "completions/max_terminated_length": 234.0, + "completions/mean_length": 211.25, + "completions/mean_terminated_length": 211.25, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "entropy": 0.08789980597794056, + "epoch": 0.2640868974881195, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "learning_rate": 1.842416910938074e-05, + "loss": -0.0, + "num_tokens": 2226817.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 245.0, + "completions/max_terminated_length": 245.0, + "completions/mean_length": 224.75, + "completions/mean_terminated_length": 224.75, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "entropy": 0.06863742880523205, + "epoch": 0.26476578411405294, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.25, + "learning_rate": 1.841136979964029e-05, + "loss": 0.0, + "num_tokens": 2232543.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 129.0, + "completions/max_terminated_length": 129.0, + "completions/mean_length": 124.125, + "completions/mean_terminated_length": 124.125, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.09061278309673071, + "epoch": 0.2654446707399864, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.40625, + "learning_rate": 1.8398523203689235e-05, + "loss": -0.0, + "num_tokens": 2236616.0, + "reward": 0.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 260.375, + "completions/mean_terminated_length": 260.375, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.11232329346239567, + "epoch": 0.2661235573659199, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.8385629393747292e-05, + "loss": 0.0, + "num_tokens": 2242499.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 174.0, + "completions/mean_terminated_length": 174.0, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "entropy": 0.09821562003344297, + "epoch": 0.2668024439918534, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.837268844229962e-05, + "loss": 0.0, + "num_tokens": 2247259.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 188.875, + "completions/mean_terminated_length": 188.875, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "entropy": 0.16587267909199, + "epoch": 0.26748133061778684, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.8359700422096385e-05, + "loss": 0.0, + "num_tokens": 2251994.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 165.125, + "completions/mean_terminated_length": 165.125, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.10131550021469593, + "epoch": 0.2681602172437203, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.796875, + "learning_rate": 1.8346665406152362e-05, + "loss": 0.0, + "num_tokens": 2256595.0, + "reward": 2.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 143.0, + "completions/max_terminated_length": 143.0, + "completions/mean_length": 133.75, + "completions/mean_terminated_length": 133.75, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.05954999150708318, + "epoch": 0.26883910386965376, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.8125, + "learning_rate": 1.8333583467746515e-05, + "loss": -0.0, + "num_tokens": 2260897.0, + "reward": 2.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 243.875, + "completions/mean_terminated_length": 243.875, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "entropy": 0.08692301250994205, + "epoch": 0.2695179904955872, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "learning_rate": 1.83204546804216e-05, + "loss": 0.0, + "num_tokens": 2266864.0, + "reward": 2.7083334922790527, + "reward_std": 0.11785111576318741, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7083333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.11785111576318741, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 274.5, + "completions/mean_terminated_length": 274.5, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "entropy": 0.08521929755806923, + "epoch": 0.2701968771215207, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "learning_rate": 1.8307279117983744e-05, + "loss": 0.0, + "num_tokens": 2273084.0, + "reward": 2.8333334922790527, + "reward_std": 0.17817412316799164, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8333333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.17817415297031403, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 176.75, + "completions/mean_terminated_length": 176.75, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 0.06568845454603434, + "epoch": 0.2708757637474542, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "learning_rate": 1.829405685450202e-05, + "loss": -0.0, + "num_tokens": 2277674.0, + "reward": 2.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 280.5, + "completions/mean_terminated_length": 280.5, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "entropy": 0.04120262851938605, + "epoch": 0.27155465037338766, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.375, + "learning_rate": 1.828078796430805e-05, + "loss": -0.0, + "num_tokens": 2284126.0, + "reward": 2.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 236.875, + "completions/mean_terminated_length": 236.875, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 0.12331179715692997, + "epoch": 0.2722335369993211, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6875, + "learning_rate": 1.826747252199558e-05, + "loss": 0.0, + "num_tokens": 2290021.0, + "reward": 2.075000047683716, + "reward_std": 0.8811518549919128, + "rewards/fixed_code_pass_all_test_reward/mean": 0.574999988079071, + "rewards/fixed_code_pass_all_test_reward/std": 0.345377653837204, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 150.0, + "completions/max_terminated_length": 150.0, + "completions/mean_length": 138.5, + "completions/mean_terminated_length": 138.5, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.06388229411095381, + "epoch": 0.2729124236252546, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.796875, + "learning_rate": 1.8254110602420047e-05, + "loss": 0.0, + "num_tokens": 2294217.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 228.0, + "completions/max_terminated_length": 228.0, + "completions/mean_length": 213.875, + "completions/mean_terminated_length": 213.875, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "entropy": 0.07360357465222478, + "epoch": 0.27359131025118805, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.8240702280698176e-05, + "loss": 0.0, + "num_tokens": 2299792.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 169.0, + "completions/max_terminated_length": 169.0, + "completions/mean_length": 158.0, + "completions/mean_terminated_length": 158.0, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "entropy": 0.08301210403442383, + "epoch": 0.2742701968771215, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4375, + "learning_rate": 1.822724763220755e-05, + "loss": -0.0, + "num_tokens": 2304288.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 185.0, + "completions/max_terminated_length": 185.0, + "completions/mean_length": 178.25, + "completions/mean_terminated_length": 178.25, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "entropy": 0.07697890792042017, + "epoch": 0.27494908350305497, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.421875, + "learning_rate": 1.8213746732586186e-05, + "loss": 0.0, + "num_tokens": 2309394.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 304.5, + "completions/mean_terminated_length": 304.5, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "entropy": 0.08701994083821774, + "epoch": 0.2756279701289885, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.8200199657732115e-05, + "loss": 0.0, + "num_tokens": 2316574.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 123.0, + "completions/max_terminated_length": 123.0, + "completions/mean_length": 111.5, + "completions/mean_terminated_length": 111.5, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.07085479702800512, + "epoch": 0.27630685675492195, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.8186606483802945e-05, + "loss": 0.0, + "num_tokens": 2320498.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 141.0, + "completions/max_terminated_length": 141.0, + "completions/mean_length": 135.875, + "completions/mean_terminated_length": 135.875, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.05542711168527603, + "epoch": 0.2769857433808554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.817296728721545e-05, + "loss": 0.0, + "num_tokens": 2324689.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 396.625, + "completions/mean_terminated_length": 396.625, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "entropy": 0.1278435904532671, + "epoch": 0.27766463000678887, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "learning_rate": 1.815928214464511e-05, + "loss": -0.0, + "num_tokens": 2333054.0, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.0, + "completions/max_terminated_length": 541.0, + "completions/mean_length": 454.125, + "completions/mean_terminated_length": 454.125, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "entropy": 0.09295008983463049, + "epoch": 0.27834351663272233, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "learning_rate": 1.814555113302573e-05, + "loss": -0.0, + "num_tokens": 2342383.0, + "reward": 1.908653736114502, + "reward_std": 0.04079460725188255, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9086538553237915, + "rewards/fixed_code_pass_all_test_reward/std": 0.04079463332891464, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 178.0, + "completions/max_terminated_length": 178.0, + "completions/mean_length": 166.75, + "completions/mean_terminated_length": 166.75, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.07046977197751403, + "epoch": 0.2790224032586558, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3125, + "learning_rate": 1.813177432954894e-05, + "loss": -0.0, + "num_tokens": 2346829.0, + "reward": 2.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 127.0, + "completions/max_terminated_length": 127.0, + "completions/mean_length": 116.0, + "completions/mean_terminated_length": 116.0, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.08017634134739637, + "epoch": 0.27970128988458925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.811795181166383e-05, + "loss": 0.0, + "num_tokens": 2350949.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 144.875, + "completions/mean_terminated_length": 144.875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.08231574995443225, + "epoch": 0.2803801765105227, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6875, + "learning_rate": 1.8104083657076466e-05, + "loss": 0.0, + "num_tokens": 2355612.0, + "reward": 2.375, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 216.625, + "completions/mean_terminated_length": 216.625, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.07155280746519566, + "epoch": 0.28105906313645623, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "learning_rate": 1.8090169943749477e-05, + "loss": -0.0, + "num_tokens": 2361553.0, + "reward": 1.7864582538604736, + "reward_std": 0.374296635389328, + "rewards/fixed_code_pass_all_test_reward/mean": 0.890625, + "rewards/fixed_code_pass_all_test_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.02083333395421505, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0589255727827549, + "step": 414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 212.0, + "completions/max_terminated_length": 212.0, + "completions/mean_length": 158.375, + "completions/mean_terminated_length": 158.375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.07857662159949541, + "epoch": 0.2817379497623897, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.625, + "learning_rate": 1.80762107499016e-05, + "loss": 0.0, + "num_tokens": 2366180.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 164.75, + "completions/mean_terminated_length": 164.75, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.11160101648420095, + "epoch": 0.28241683638832316, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.484375, + "learning_rate": 1.8062206154007267e-05, + "loss": 0.0, + "num_tokens": 2370810.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 196.0, + "completions/max_terminated_length": 196.0, + "completions/mean_length": 185.75, + "completions/mean_terminated_length": 185.75, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.06576718902215362, + "epoch": 0.2830957230142566, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.8048156234796124e-05, + "loss": 0.0, + "num_tokens": 2376312.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 149.0, + "completions/max_terminated_length": 149.0, + "completions/mean_length": 144.125, + "completions/mean_terminated_length": 144.125, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.03741574566811323, + "epoch": 0.2837746096401901, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.8034061071252632e-05, + "loss": 0.0, + "num_tokens": 2380593.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 119.0, + "completions/max_terminated_length": 119.0, + "completions/mean_length": 110.375, + "completions/mean_terminated_length": 110.375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.060843697283416986, + "epoch": 0.28445349626612354, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.8019920742615596e-05, + "loss": 0.0, + "num_tokens": 2384572.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 172.0, + "completions/max_terminated_length": 172.0, + "completions/mean_length": 163.625, + "completions/mean_terminated_length": 163.625, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "entropy": 0.0874741431325674, + "epoch": 0.285132382892057, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.8005735328377718e-05, + "loss": 0.0, + "num_tokens": 2389153.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 150.0, + "completions/max_terminated_length": 150.0, + "completions/mean_length": 136.5, + "completions/mean_terminated_length": 136.5, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.04866040498018265, + "epoch": 0.2858112695179905, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "learning_rate": 1.7991504908285162e-05, + "loss": 0.0, + "num_tokens": 2393725.0, + "reward": 2.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 172.0, + "completions/max_terminated_length": 172.0, + "completions/mean_length": 162.25, + "completions/mean_terminated_length": 162.25, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.06404763692989945, + "epoch": 0.286490156143924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.7977229562337104e-05, + "loss": 0.0, + "num_tokens": 2398159.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 140.0, + "completions/max_terminated_length": 140.0, + "completions/mean_length": 130.875, + "completions/mean_terminated_length": 130.875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.08717668987810612, + "epoch": 0.28716904276985744, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.7962909370785283e-05, + "loss": 0.0, + "num_tokens": 2402366.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 169.0, + "completions/max_terminated_length": 169.0, + "completions/mean_length": 160.0, + "completions/mean_terminated_length": 160.0, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "entropy": 0.09095563367009163, + "epoch": 0.2878479293957909, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.421875, + "learning_rate": 1.7948544414133534e-05, + "loss": 0.0, + "num_tokens": 2406966.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 129.0, + "completions/max_terminated_length": 129.0, + "completions/mean_length": 115.625, + "completions/mean_terminated_length": 115.625, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.05710427463054657, + "epoch": 0.28852681602172436, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.890625, + "learning_rate": 1.7934134773137364e-05, + "loss": -0.0, + "num_tokens": 2411339.0, + "reward": 2.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 102.0, + "completions/max_terminated_length": 102.0, + "completions/mean_length": 101.75, + "completions/mean_terminated_length": 101.75, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.01786201330833137, + "epoch": 0.2892057026476578, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.7919680528803468e-05, + "loss": 0.0, + "num_tokens": 2415289.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 264.875, + "completions/mean_terminated_length": 264.875, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "entropy": 0.030130391474813223, + "epoch": 0.2898845892735913, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.7905181762389298e-05, + "loss": 0.0, + "num_tokens": 2421688.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 208.125, + "completions/mean_terminated_length": 208.125, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "entropy": 0.054072245955467224, + "epoch": 0.2905634758995248, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "learning_rate": 1.7890638555402585e-05, + "loss": 0.0, + "num_tokens": 2426969.0, + "reward": 2.375, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 170.0, + "completions/max_terminated_length": 170.0, + "completions/mean_length": 165.125, + "completions/mean_terminated_length": 165.125, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.08146559819579124, + "epoch": 0.29124236252545826, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.7876050989600908e-05, + "loss": 0.0, + "num_tokens": 2431562.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 202.125, + "completions/mean_terminated_length": 202.125, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "entropy": 0.07595767872408032, + "epoch": 0.2919212491513917, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.265625, + "learning_rate": 1.7861419146991204e-05, + "loss": 0.0, + "num_tokens": 2436515.0, + "reward": 2.25, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 178.375, + "completions/mean_terminated_length": 178.375, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "entropy": 0.08013419527560472, + "epoch": 0.2926001357773252, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.7846743109829318e-05, + "loss": 0.0, + "num_tokens": 2441118.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 158.0, + "completions/max_terminated_length": 158.0, + "completions/mean_length": 141.375, + "completions/mean_terminated_length": 141.375, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.08432691264897585, + "epoch": 0.29327902240325865, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.328125, + "learning_rate": 1.7832022960619562e-05, + "loss": -0.0, + "num_tokens": 2445433.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 168.0, + "completions/max_terminated_length": 168.0, + "completions/mean_length": 146.625, + "completions/mean_terminated_length": 146.625, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.06273604743182659, + "epoch": 0.2939579090291921, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.7817258782114216e-05, + "loss": 0.0, + "num_tokens": 2449958.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 205.0, + "completions/max_terminated_length": 205.0, + "completions/mean_length": 184.375, + "completions/mean_terminated_length": 184.375, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.14235163014382124, + "epoch": 0.29463679565512557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.7802450657313086e-05, + "loss": 0.0, + "num_tokens": 2454913.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 290.0, + "completions/mean_terminated_length": 290.0, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "entropy": 0.11650931928306818, + "epoch": 0.2953156822810591, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.7787598669463027e-05, + "loss": 0.0, + "num_tokens": 2461209.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 151.0, + "completions/max_terminated_length": 151.0, + "completions/mean_length": 149.125, + "completions/mean_terminated_length": 149.125, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.02227119216695428, + "epoch": 0.29599456890699255, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.777270290205749e-05, + "loss": 0.0, + "num_tokens": 2465538.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 185.375, + "completions/mean_terminated_length": 185.375, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "entropy": 0.08627740852534771, + "epoch": 0.296673455532926, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.359375, + "learning_rate": 1.7757763438836027e-05, + "loss": -0.0, + "num_tokens": 2471117.0, + "reward": 2.7124998569488525, + "reward_std": 0.16420802474021912, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7125000357627869, + "rewards/fixed_code_pass_all_test_reward/std": 0.1642080694437027, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 212.0, + "completions/max_terminated_length": 212.0, + "completions/mean_length": 197.375, + "completions/mean_terminated_length": 197.375, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.08697156794369221, + "epoch": 0.2973523421588595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.7742780363783843e-05, + "loss": 0.0, + "num_tokens": 2476168.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 167.0, + "completions/max_terminated_length": 167.0, + "completions/mean_length": 155.75, + "completions/mean_terminated_length": 155.75, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.09812935255467892, + "epoch": 0.29803122878479293, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.7727753761131312e-05, + "loss": 0.0, + "num_tokens": 2480534.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 412.875, + "completions/mean_terminated_length": 412.875, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "entropy": 0.04836875991895795, + "epoch": 0.2987101154107264, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.046875, + "learning_rate": 1.7712683715353514e-05, + "loss": -0.0, + "num_tokens": 2490005.0, + "reward": 2.9375, + "reward_std": 0.1767766922712326, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, + "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 193.0, + "completions/mean_terminated_length": 193.0, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "entropy": 0.06183534534648061, + "epoch": 0.29938900203665986, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "learning_rate": 1.7697570311169746e-05, + "loss": 0.0, + "num_tokens": 2495197.0, + "reward": 2.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 301.25, + "completions/mean_terminated_length": 301.25, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "entropy": 0.05814493494108319, + "epoch": 0.3000678886625934, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.7682413633543057e-05, + "loss": 0.0, + "num_tokens": 2502815.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 177.0, + "completions/max_terminated_length": 177.0, + "completions/mean_length": 143.375, + "completions/mean_terminated_length": 143.375, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.07629174273461103, + "epoch": 0.30074677528852684, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.296875, + "learning_rate": 1.766721376767976e-05, + "loss": 0.0, + "num_tokens": 2507226.0, + "reward": 2.0, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 191.25, + "completions/mean_terminated_length": 191.25, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "entropy": 0.09764100052416325, + "epoch": 0.3014256619144603, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.515625, + "learning_rate": 1.7651970799028976e-05, + "loss": 0.0, + "num_tokens": 2512764.0, + "reward": 2.1785714626312256, + "reward_std": 0.4691658616065979, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9285714626312256, + "rewards/fixed_code_pass_all_test_reward/std": 0.07636035233736038, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 199.125, + "completions/mean_terminated_length": 199.125, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 0.060885429847985506, + "epoch": 0.30210454854039376, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.8125, + "learning_rate": 1.7636684813282113e-05, + "loss": 0.0, + "num_tokens": 2518005.0, + "reward": 2.375, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 273.25, + "completions/mean_terminated_length": 273.25, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 0.12670390959829092, + "epoch": 0.3027834351663272, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.7621355896372424e-05, + "loss": 0.0, + "num_tokens": 2524087.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 160.0, + "completions/max_terminated_length": 160.0, + "completions/mean_length": 138.75, + "completions/mean_terminated_length": 138.75, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.05900596734136343, + "epoch": 0.3034623217922607, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9375, + "learning_rate": 1.760598413447451e-05, + "loss": 0.0, + "num_tokens": 2528757.0, + "reward": 2.200000047683716, + "reward_std": 0.46598589420318604, + "rewards/fixed_code_pass_all_test_reward/mean": 0.699999988079071, + "rewards/fixed_code_pass_all_test_reward/std": 0.32071349024772644, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 172.0, + "completions/max_terminated_length": 172.0, + "completions/mean_length": 158.0, + "completions/mean_terminated_length": 158.0, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.03903109743259847, + "epoch": 0.30414120841819414, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.7590569614003825e-05, + "loss": 0.0, + "num_tokens": 2534125.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 159.0, + "completions/max_terminated_length": 159.0, + "completions/mean_length": 155.5, + "completions/mean_terminated_length": 155.5, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "entropy": 0.05271232035011053, + "epoch": 0.3048200950441276, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.28125, + "learning_rate": 1.7575112421616203e-05, + "loss": 0.0, + "num_tokens": 2538617.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 142.0, + "completions/max_terminated_length": 142.0, + "completions/mean_length": 129.625, + "completions/mean_terminated_length": 129.625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.0384956831112504, + "epoch": 0.3054989816700611, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.7559612644207364e-05, + "loss": 0.0, + "num_tokens": 2542782.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 221.125, + "completions/mean_terminated_length": 221.125, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "entropy": 0.03622567281126976, + "epoch": 0.3061778682959946, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "learning_rate": 1.7544070368912435e-05, + "loss": -0.0, + "num_tokens": 2548879.0, + "reward": 1.921875, + "reward_std": 0.0646936446428299, + "rewards/fixed_code_pass_all_test_reward/mean": 0.921875, + "rewards/fixed_code_pass_all_test_reward/std": 0.06469365209341049, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 388.25, + "completions/mean_terminated_length": 388.25, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "entropy": 0.054282717406749725, + "epoch": 0.30685675492192804, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "learning_rate": 1.7528485683105444e-05, + "loss": 0.0, + "num_tokens": 2557273.0, + "reward": 2.78125, + "reward_std": 0.41052013635635376, + "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, + "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 177.0, + "completions/max_terminated_length": 177.0, + "completions/mean_length": 172.875, + "completions/mean_terminated_length": 172.875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.0631052409298718, + "epoch": 0.3075356415478615, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.90625, + "learning_rate": 1.751285867439885e-05, + "loss": -0.0, + "num_tokens": 2561888.0, + "reward": 2.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, + "step": 453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 216.125, + "completions/mean_terminated_length": 216.125, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "entropy": 0.12077120132744312, + "epoch": 0.30821452817379497, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.765625, + "learning_rate": 1.7497189430643025e-05, + "loss": 0.0, + "num_tokens": 2566961.0, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 411.0, + "completions/mean_terminated_length": 411.0, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 0.068185078445822, + "epoch": 0.3088934147997284, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.03125, + "learning_rate": 1.7481478039925784e-05, + "loss": 0.0, + "num_tokens": 2576025.0, + "reward": 2.0714287757873535, + "reward_std": 0.3818017840385437, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9464285969734192, + "rewards/fixed_code_pass_all_test_reward/std": 0.07393559068441391, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 204.0, + "completions/max_terminated_length": 204.0, + "completions/mean_length": 171.25, + "completions/mean_terminated_length": 171.25, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.06827958533540368, + "epoch": 0.3095723014256619, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.203125, + "learning_rate": 1.746572459057188e-05, + "loss": 0.0, + "num_tokens": 2580603.0, + "reward": 1.125, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 253.625, + "completions/mean_terminated_length": 253.625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.09161295369267464, + "epoch": 0.3102511880515954, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9765625, + "learning_rate": 1.7449929171142495e-05, + "loss": -0.0, + "num_tokens": 2588528.0, + "reward": 2.5357141494750977, + "reward_std": 0.21257825195789337, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5357142686843872, + "rewards/fixed_code_pass_all_test_reward/std": 0.21257825195789337, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 322.0, + "completions/mean_terminated_length": 322.0, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "entropy": 0.08320469036698341, + "epoch": 0.31093007467752887, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.7434091870434772e-05, + "loss": 0.0, + "num_tokens": 2597376.0, + "reward": 2.5999999046325684, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6000000238418579, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 220.375, + "completions/mean_terminated_length": 220.375, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "entropy": 0.041240944527089596, + "epoch": 0.31160896130346233, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.741821277748128e-05, + "loss": 0.0, + "num_tokens": 2602611.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 235.75, + "completions/mean_terminated_length": 235.75, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "entropy": 0.04695710772648454, + "epoch": 0.3122878479293958, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "learning_rate": 1.740229198154955e-05, + "loss": 0.0, + "num_tokens": 2608537.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 115.0, + "completions/max_terminated_length": 115.0, + "completions/mean_length": 92.375, + "completions/mean_terminated_length": 92.375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.04123246343806386, + "epoch": 0.31296673455532925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.738632957214154e-05, + "loss": 0.0, + "num_tokens": 2612412.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 141.0, + "completions/max_terminated_length": 141.0, + "completions/mean_length": 136.75, + "completions/mean_terminated_length": 136.75, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.06342339888215065, + "epoch": 0.3136456211812627, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.53125, + "learning_rate": 1.737032563899315e-05, + "loss": -0.0, + "num_tokens": 2616562.0, + "reward": 2.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 214.0, + "completions/max_terminated_length": 214.0, + "completions/mean_length": 183.625, + "completions/mean_terminated_length": 183.625, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 0.04811230581253767, + "epoch": 0.3143245078071962, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.875, + "learning_rate": 1.7354280272073718e-05, + "loss": -0.0, + "num_tokens": 2621911.0, + "reward": 1.875, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 302.75, + "completions/mean_terminated_length": 302.75, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "entropy": 0.06339940801262856, + "epoch": 0.3150033944331297, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "learning_rate": 1.7338193561585507e-05, + "loss": -0.0, + "num_tokens": 2628821.0, + "reward": 2.75, + "reward_std": 0.4178554117679596, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.2920915186405182, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 178.5, + "completions/mean_terminated_length": 178.5, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.05791258532553911, + "epoch": 0.31568228105906315, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "learning_rate": 1.7322065597963206e-05, + "loss": 0.0, + "num_tokens": 2633729.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 146.0, + "completions/max_terminated_length": 146.0, + "completions/mean_length": 120.75, + "completions/mean_terminated_length": 120.75, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.07732136873528361, + "epoch": 0.3163611676849966, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.09375, + "learning_rate": 1.730589647187341e-05, + "loss": 0.0, + "num_tokens": 2637703.0, + "reward": 2.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 136.0, + "completions/max_terminated_length": 136.0, + "completions/mean_length": 124.25, + "completions/mean_terminated_length": 124.25, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.048830126877874136, + "epoch": 0.3170400543109301, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.53125, + "learning_rate": 1.7289686274214116e-05, + "loss": 0.0, + "num_tokens": 2641769.0, + "reward": 2.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 218.0, + "completions/max_terminated_length": 218.0, + "completions/mean_length": 203.0, + "completions/mean_terminated_length": 203.0, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "entropy": 0.04089064942672849, + "epoch": 0.31771894093686354, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.7273435096114223e-05, + "loss": 0.0, + "num_tokens": 2647033.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 166.0, + "completions/max_terminated_length": 166.0, + "completions/mean_length": 156.5, + "completions/mean_terminated_length": 156.5, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "entropy": 0.06483722059056163, + "epoch": 0.318397827562797, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.7257143028933004e-05, + "loss": 0.0, + "num_tokens": 2651533.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 243.5, + "completions/mean_terminated_length": 243.5, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "entropy": 0.023649835726246238, + "epoch": 0.31907671418873046, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "learning_rate": 1.7240810164259597e-05, + "loss": 0.0, + "num_tokens": 2657881.0, + "reward": 2.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 242.0, + "completions/max_terminated_length": 242.0, + "completions/mean_length": 225.0, + "completions/mean_terminated_length": 225.0, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "entropy": 0.09029651340097189, + "epoch": 0.319755600814664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.722443659391249e-05, + "loss": 0.0, + "num_tokens": 2663233.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 159.0, + "completions/max_terminated_length": 159.0, + "completions/mean_length": 149.625, + "completions/mean_terminated_length": 149.625, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.03758882568217814, + "epoch": 0.32043448744059744, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "learning_rate": 1.7208022409939012e-05, + "loss": -0.0, + "num_tokens": 2667702.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 354.25, + "completions/mean_terminated_length": 354.25, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "entropy": 0.06241106940433383, + "epoch": 0.3211133740665309, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "learning_rate": 1.7191567704614806e-05, + "loss": 0.0, + "num_tokens": 2675736.0, + "reward": 1.8303571939468384, + "reward_std": 0.2587745785713196, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8303571939468384, + "rewards/fixed_code_pass_all_test_reward/std": 0.25877460837364197, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 140.0, + "completions/max_terminated_length": 140.0, + "completions/mean_length": 133.375, + "completions/mean_terminated_length": 133.375, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.056640565395355225, + "epoch": 0.32179226069246436, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.717507257044331e-05, + "loss": 0.0, + "num_tokens": 2679899.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 149.0, + "completions/max_terminated_length": 149.0, + "completions/mean_length": 143.375, + "completions/mean_terminated_length": 143.375, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.09125719266012311, + "epoch": 0.3224711473183978, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.3125, + "learning_rate": 1.7158537100155256e-05, + "loss": 0.0, + "num_tokens": 2684318.0, + "reward": 0.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 185.0, + "completions/max_terminated_length": 185.0, + "completions/mean_length": 161.625, + "completions/mean_terminated_length": 161.625, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.07897145766764879, + "epoch": 0.3231500339443313, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.265625, + "learning_rate": 1.714196138670811e-05, + "loss": -0.0, + "num_tokens": 2688835.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 300.25, + "completions/mean_terminated_length": 300.25, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.046984167071059346, + "epoch": 0.32382892057026474, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.7125345523285598e-05, + "loss": 0.0, + "num_tokens": 2696221.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 160.0, + "completions/max_terminated_length": 160.0, + "completions/mean_length": 143.5, + "completions/mean_terminated_length": 143.5, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.07721525803208351, + "epoch": 0.32450780719619826, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.265625, + "learning_rate": 1.7108689603297134e-05, + "loss": -0.0, + "num_tokens": 2700705.0, + "reward": 2.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 149.0, + "completions/max_terminated_length": 149.0, + "completions/mean_length": 141.625, + "completions/mean_terminated_length": 141.625, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.02691166941076517, + "epoch": 0.3251866938221317, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.7091993720377336e-05, + "loss": 0.0, + "num_tokens": 2705078.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 166.0, + "completions/max_terminated_length": 166.0, + "completions/mean_length": 142.25, + "completions/mean_terminated_length": 142.25, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.052031297236680984, + "epoch": 0.3258655804480652, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.140625, + "learning_rate": 1.7075257968385472e-05, + "loss": -0.0, + "num_tokens": 2709472.0, + "reward": 2.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 129.0, + "completions/max_terminated_length": 129.0, + "completions/mean_length": 111.125, + "completions/mean_terminated_length": 111.125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.04639151692390442, + "epoch": 0.32654446707399865, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.7058482441404946e-05, + "loss": 0.0, + "num_tokens": 2713473.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 195.0, + "completions/max_terminated_length": 195.0, + "completions/mean_length": 158.75, + "completions/mean_terminated_length": 158.75, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.09368814900517464, + "epoch": 0.3272233536999321, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9375, + "learning_rate": 1.7041667233742763e-05, + "loss": 0.0, + "num_tokens": 2717935.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 120.0, + "completions/max_terminated_length": 120.0, + "completions/mean_length": 94.0, + "completions/mean_terminated_length": 94.0, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "entropy": 0.03158421581611037, + "epoch": 0.32790224032586557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.7024812439929004e-05, + "loss": 0.0, + "num_tokens": 2721847.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 254.5, + "completions/mean_terminated_length": 254.5, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "entropy": 0.054124286863952875, + "epoch": 0.32858112695179903, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.7007918154716286e-05, + "loss": 0.0, + "num_tokens": 2728443.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 217.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 202.625, + "completions/mean_terminated_length": 202.625, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.04601641371846199, + "epoch": 0.3292600135777325, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5625, + "learning_rate": 1.6990984473079245e-05, + "loss": -0.0, + "num_tokens": 2733616.0, + "reward": 2.5, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 250.0, + "completions/mean_terminated_length": 250.0, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "entropy": 0.02938453876413405, + "epoch": 0.329938900203666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.6974011490213976e-05, + "loss": 0.0, + "num_tokens": 2740064.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 177.0, + "completions/max_terminated_length": 177.0, + "completions/mean_length": 149.125, + "completions/mean_terminated_length": 149.125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.0769786462187767, + "epoch": 0.33061778682959947, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "learning_rate": 1.6956999301537533e-05, + "loss": 0.0, + "num_tokens": 2744553.0, + "reward": 1.9166667461395264, + "reward_std": 0.23570223152637482, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 79.0, + "completions/max_terminated_length": 79.0, + "completions/mean_length": 72.375, + "completions/mean_terminated_length": 72.375, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.04711468797177076, + "epoch": 0.33129667345553293, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.6939948002687352e-05, + "loss": 0.0, + "num_tokens": 2748212.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 163.0, + "completions/max_terminated_length": 163.0, + "completions/mean_length": 153.75, + "completions/mean_terminated_length": 153.75, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.03385374881327152, + "epoch": 0.3319755600814664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.692285768952076e-05, + "loss": 0.0, + "num_tokens": 2752554.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.625, + "completions/mean_terminated_length": 269.625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.04587622079998255, + "epoch": 0.33265444670739985, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "learning_rate": 1.6905728458114384e-05, + "loss": -0.0, + "num_tokens": 2759175.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 185.625, + "completions/mean_terminated_length": 185.625, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 0.10464992513880134, + "epoch": 0.3333333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "learning_rate": 1.6888560404763656e-05, + "loss": -0.0, + "num_tokens": 2763972.0, + "reward": 2.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 165.0, + "completions/max_terminated_length": 165.0, + "completions/mean_length": 123.75, + "completions/mean_terminated_length": 123.75, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.07443650905042887, + "epoch": 0.3340122199592668, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.921875, + "learning_rate": 1.687135362598225e-05, + "loss": -0.0, + "num_tokens": 2768210.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 148.5, + "completions/mean_terminated_length": 148.5, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.10057664709165692, + "epoch": 0.3346911065852003, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.25, + "learning_rate": 1.6854108218501534e-05, + "loss": -0.0, + "num_tokens": 2775278.0, + "reward": 2.8214285373687744, + "reward_std": 0.16642355918884277, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8214285969734192, + "rewards/fixed_code_pass_all_test_reward/std": 0.16642354428768158, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 211.5, + "completions/mean_terminated_length": 211.5, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "entropy": 0.1446738326922059, + "epoch": 0.33536999321113375, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.171875, + "learning_rate": 1.6836824279270053e-05, + "loss": -0.0, + "num_tokens": 2780538.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 111.0, + "completions/max_terminated_length": 111.0, + "completions/mean_length": 98.875, + "completions/mean_terminated_length": 98.875, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.032744155963882804, + "epoch": 0.3360488798370672, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.6819501905452945e-05, + "loss": 0.0, + "num_tokens": 2784465.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 158.0, + "completions/max_terminated_length": 158.0, + "completions/mean_length": 150.0, + "completions/mean_terminated_length": 150.0, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.06475615315139294, + "epoch": 0.3367277664630007, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.680214119443143e-05, + "loss": 0.0, + "num_tokens": 2788889.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 210.125, + "completions/mean_terminated_length": 210.125, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "entropy": 0.06889946572482586, + "epoch": 0.33740665308893414, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.859375, + "learning_rate": 1.6784742243802242e-05, + "loss": -0.0, + "num_tokens": 2794378.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 233.625, + "completions/mean_terminated_length": 233.625, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "entropy": 0.053773445542901754, + "epoch": 0.3380855397148676, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "learning_rate": 1.676730515137709e-05, + "loss": 0.0, + "num_tokens": 2800239.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.0, + "completions/max_terminated_length": 152.0, + "completions/mean_length": 143.0, + "completions/mean_terminated_length": 143.0, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.05142463184893131, + "epoch": 0.33876442634080106, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.6749830015182106e-05, + "loss": 0.0, + "num_tokens": 2804583.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 231.25, + "completions/mean_terminated_length": 231.25, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "entropy": 0.04364967648871243, + "epoch": 0.3394433129667346, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "learning_rate": 1.673231693345729e-05, + "loss": -0.0, + "num_tokens": 2810617.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 500 + }, + { + "epoch": 0.3394433129667346, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 224.87262872628727, + "eval_completions/max_terminated_length": 224.87262872628727, + "eval_completions/mean_length": 194.9481707317073, + "eval_completions/mean_terminated_length": 194.9481707317073, + "eval_completions/min_length": 164.2520325203252, + "eval_completions/min_terminated_length": 164.2520325203252, + "eval_entropy": 0.05976972838505335, + "eval_frac_reward_zero_std": 0.4742547425474255, + "eval_num_tokens": 2810617.0, + "eval_reward": 1.9986720918639889, + "eval_reward_std": 0.20666620964159163, + "eval_rewards/fixed_code_pass_all_test_reward/mean": 0.6794534814632359, + "eval_rewards/fixed_code_pass_all_test_reward/std": 0.12145954475493288, + "eval_rewards/format_reward/mean": 0.9915311653116531, + "eval_rewards/format_reward/std": 0.01811231022604759, + "eval_rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3276874435545629, + "eval_rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.08238629815055103, + "eval_train_loss": -0.003072693943977356, + "eval_train_runtime": 1018.8467, + "eval_train_samples_per_second": 0.362, + "eval_train_steps_per_second": 0.046, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 179.0, + "completions/max_terminated_length": 179.0, + "completions/mean_length": 147.5, + "completions/mean_terminated_length": 147.5, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.04556558933109045, + "epoch": 0.34012219959266804, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.78125, + "learning_rate": 1.6714766004655952e-05, + "loss": 0.0, + "num_tokens": 2815309.0, + "reward": 2.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 218.0, + "completions/max_terminated_length": 218.0, + "completions/mean_length": 214.875, + "completions/mean_terminated_length": 214.875, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "entropy": 0.017006220878101885, + "epoch": 0.3408010862186015, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.87890625, + "learning_rate": 1.6697177327444185e-05, + "loss": 0.0, + "num_tokens": 2821332.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 221.875, + "completions/mean_terminated_length": 221.875, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "entropy": 0.029887779848650098, + "epoch": 0.34147997284453496, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "learning_rate": 1.6679551000700277e-05, + "loss": 0.0, + "num_tokens": 2827763.0, + "reward": 2.34375, + "reward_std": 0.0578637570142746, + "rewards/fixed_code_pass_all_test_reward/mean": 0.34375, + "rewards/fixed_code_pass_all_test_reward/std": 0.0578637570142746, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 214.125, + "completions/mean_terminated_length": 214.125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "entropy": 0.07232411252334714, + "epoch": 0.3421588594704684, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.6661887123514183e-05, + "loss": 0.0, + "num_tokens": 2832828.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 133.0, + "completions/max_terminated_length": 133.0, + "completions/mean_length": 120.0, + "completions/mean_terminated_length": 120.0, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.07752927113324404, + "epoch": 0.3428377460964019, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.6644185795186946e-05, + "loss": 0.0, + "num_tokens": 2836932.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 171.0, + "completions/max_terminated_length": 171.0, + "completions/mean_length": 149.375, + "completions/mean_terminated_length": 149.375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.053660436533391476, + "epoch": 0.34351663272233535, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.662644711523014e-05, + "loss": 0.0, + "num_tokens": 2841335.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 315.625, + "completions/mean_terminated_length": 315.625, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "entropy": 0.044008327182382345, + "epoch": 0.34419551934826886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.660867118336535e-05, + "loss": 0.0, + "num_tokens": 2848388.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 232.75, + "completions/mean_terminated_length": 232.75, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "entropy": 0.08285042317584157, + "epoch": 0.3448744059742023, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.921875, + "learning_rate": 1.6590858099523545e-05, + "loss": 0.0, + "num_tokens": 2854106.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 131.0, + "completions/max_terminated_length": 131.0, + "completions/mean_length": 110.0, + "completions/mean_terminated_length": 110.0, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.051351450849324465, + "epoch": 0.3455532926001358, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.657300796384457e-05, + "loss": 0.0, + "num_tokens": 2858162.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 139.875, + "completions/mean_terminated_length": 139.875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.0495811328291893, + "epoch": 0.34623217922606925, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.71875, + "learning_rate": 1.6555120876676557e-05, + "loss": -0.0, + "num_tokens": 2862753.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 157.0, + "completions/max_terminated_length": 157.0, + "completions/mean_length": 137.0, + "completions/mean_terminated_length": 137.0, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.04050322901457548, + "epoch": 0.3469110658520027, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.6537196938575376e-05, + "loss": 0.0, + "num_tokens": 2867369.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 141.0, + "completions/max_terminated_length": 141.0, + "completions/mean_length": 131.5, + "completions/mean_terminated_length": 131.5, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.06908958125859499, + "epoch": 0.34758995247793617, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.6519236250304058e-05, + "loss": 0.0, + "num_tokens": 2871837.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 194.75, + "completions/mean_terminated_length": 194.75, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.06147625343874097, + "epoch": 0.34826883910386963, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5625, + "learning_rate": 1.6501238912832226e-05, + "loss": 0.0, + "num_tokens": 2877115.0, + "reward": 2.7916667461395264, + "reward_std": 0.17251639068126678, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7916666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.17251639068126678, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 193.25, + "completions/mean_terminated_length": 193.25, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "entropy": 0.06666107149794698, + "epoch": 0.34894772572980315, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.828125, + "learning_rate": 1.648320502733555e-05, + "loss": 0.0, + "num_tokens": 2882445.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 151.0, + "completions/max_terminated_length": 151.0, + "completions/mean_length": 142.25, + "completions/mean_terminated_length": 142.25, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.05938845733180642, + "epoch": 0.3496266123557366, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.646513469519514e-05, + "loss": 0.0, + "num_tokens": 2886767.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 185.0, + "completions/max_terminated_length": 185.0, + "completions/mean_length": 177.5, + "completions/mean_terminated_length": 177.5, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "entropy": 0.037987842690199614, + "epoch": 0.35030549898167007, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.644702801799702e-05, + "loss": 0.0, + "num_tokens": 2892043.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 254.75, + "completions/mean_terminated_length": 254.75, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.021649083821102977, + "epoch": 0.35098438560760353, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9453125, + "learning_rate": 1.6428885097531524e-05, + "loss": 0.0, + "num_tokens": 2898497.0, + "reward": 2.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 171.0, + "completions/max_terminated_length": 171.0, + "completions/mean_length": 162.875, + "completions/mean_terminated_length": 162.875, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "entropy": 0.08299192041158676, + "epoch": 0.351663272233537, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.641070603579273e-05, + "loss": 0.0, + "num_tokens": 2903064.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 191.0, + "completions/max_terminated_length": 191.0, + "completions/mean_length": 176.125, + "completions/mean_terminated_length": 176.125, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.06629437580704689, + "epoch": 0.35234215885947046, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9375, + "learning_rate": 1.63924909349779e-05, + "loss": 0.0, + "num_tokens": 2907785.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 171.0, + "completions/mean_terminated_length": 171.0, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.05698850331827998, + "epoch": 0.3530210454854039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.63742398974869e-05, + "loss": 0.0, + "num_tokens": 2912497.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 201.0, + "completions/mean_terminated_length": 201.0, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "entropy": 0.040736530791036785, + "epoch": 0.35369993211133743, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.6355953025921606e-05, + "loss": 0.0, + "num_tokens": 2917745.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 160.25, + "completions/mean_terminated_length": 160.25, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.09599796775728464, + "epoch": 0.3543788187372709, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.59375, + "learning_rate": 1.633763042308536e-05, + "loss": 0.0, + "num_tokens": 2922379.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 143.0, + "completions/max_terminated_length": 143.0, + "completions/mean_length": 128.0, + "completions/mean_terminated_length": 128.0, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.08285710355266929, + "epoch": 0.35505770536320436, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.6319272191982364e-05, + "loss": 0.0, + "num_tokens": 2926643.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 149.0, + "completions/max_terminated_length": 149.0, + "completions/mean_length": 130.25, + "completions/mean_terminated_length": 130.25, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.05148724466562271, + "epoch": 0.3557365919891378, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.25, + "learning_rate": 1.6300878435817115e-05, + "loss": 0.0, + "num_tokens": 2931189.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 216.25, + "completions/mean_terminated_length": 216.25, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "entropy": 0.08176072407513857, + "epoch": 0.3564154786150713, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.703125, + "learning_rate": 1.6282449257993814e-05, + "loss": -0.0, + "num_tokens": 2936751.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 229.875, + "completions/mean_terminated_length": 229.875, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.04647241858765483, + "epoch": 0.35709436524100474, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.46875, + "learning_rate": 1.626398476211581e-05, + "loss": -0.0, + "num_tokens": 2943334.0, + "reward": 2.2638888359069824, + "reward_std": 0.5276733040809631, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8888888955116272, + "rewards/fixed_code_pass_all_test_reward/std": 0.20573779940605164, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 178.0, + "completions/max_terminated_length": 178.0, + "completions/mean_length": 130.5, + "completions/mean_terminated_length": 130.5, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.05590016767382622, + "epoch": 0.3577732518669382, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.546875, + "learning_rate": 1.624548505198498e-05, + "loss": 0.0, + "num_tokens": 2947610.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 340.75, + "completions/mean_terminated_length": 340.75, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "entropy": 0.033854235894978046, + "epoch": 0.35845213849287166, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "learning_rate": 1.622695023160117e-05, + "loss": 0.0, + "num_tokens": 2955552.0, + "reward": 2.21875, + "reward_std": 0.8705242872238159, + "rewards/fixed_code_pass_all_test_reward/mean": 0.71875, + "rewards/fixed_code_pass_all_test_reward/std": 0.38816189765930176, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, + "step": 528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 269.5, + "completions/mean_terminated_length": 269.5, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "entropy": 0.03326621069572866, + "epoch": 0.3591310251188052, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "learning_rate": 1.6208380405161623e-05, + "loss": 0.0, + "num_tokens": 2962468.0, + "reward": 2.9250001907348633, + "reward_std": 0.14880476891994476, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9249999523162842, + "rewards/fixed_code_pass_all_test_reward/std": 0.14880475401878357, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 137.0, + "completions/max_terminated_length": 137.0, + "completions/mean_length": 118.5, + "completions/mean_terminated_length": 118.5, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.0327159590087831, + "epoch": 0.35980991174473864, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.6189775677060347e-05, + "loss": 0.0, + "num_tokens": 2967104.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 191.0, + "completions/max_terminated_length": 191.0, + "completions/mean_length": 160.125, + "completions/mean_terminated_length": 160.125, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.057204945711418986, + "epoch": 0.3604887983706721, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "learning_rate": 1.6171136151887577e-05, + "loss": 0.0, + "num_tokens": 2971609.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 138.0, + "completions/max_terminated_length": 138.0, + "completions/mean_length": 115.5, + "completions/mean_terminated_length": 115.5, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.029368214774876833, + "epoch": 0.36116768499660556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.6152461934429154e-05, + "loss": 0.0, + "num_tokens": 2975621.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 259.25, + "completions/mean_terminated_length": 259.25, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.030925868079066277, + "epoch": 0.361846571622539, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.6133753129665968e-05, + "loss": 0.0, + "num_tokens": 2982615.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 238.625, + "completions/mean_terminated_length": 238.625, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "entropy": 0.02856113645248115, + "epoch": 0.3625254582484725, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.6115009842773322e-05, + "loss": 0.0, + "num_tokens": 2988836.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 182.5, + "completions/mean_terminated_length": 182.5, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.057596494909375906, + "epoch": 0.36320434487440595, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.296875, + "learning_rate": 1.6096232179120388e-05, + "loss": 0.0, + "num_tokens": 2994040.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 107.0, + "completions/max_terminated_length": 107.0, + "completions/mean_length": 94.625, + "completions/mean_terminated_length": 94.625, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.05044847633689642, + "epoch": 0.36388323150033947, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.6077420244269585e-05, + "loss": 0.0, + "num_tokens": 2998293.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 190.0, + "completions/max_terminated_length": 190.0, + "completions/mean_length": 159.625, + "completions/mean_terminated_length": 159.625, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.08904850669205189, + "epoch": 0.3645621181262729, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4375, + "learning_rate": 1.6058574143975995e-05, + "loss": -0.0, + "num_tokens": 3003394.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 153.0, + "completions/max_terminated_length": 153.0, + "completions/mean_length": 129.875, + "completions/mean_terminated_length": 129.875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.05029344651848078, + "epoch": 0.3652410047522064, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.25, + "learning_rate": 1.603969398418677e-05, + "loss": -0.0, + "num_tokens": 3007625.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 114.0, + "completions/max_terminated_length": 114.0, + "completions/mean_length": 80.75, + "completions/mean_terminated_length": 80.75, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.045220422092825174, + "epoch": 0.36591989137813985, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.1875, + "learning_rate": 1.6020779871040538e-05, + "loss": 0.0, + "num_tokens": 3011431.0, + "reward": 2.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 343.25, + "completions/mean_terminated_length": 343.25, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "entropy": 0.09786451607942581, + "epoch": 0.3665987780040733, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "learning_rate": 1.6001831910866795e-05, + "loss": -0.0, + "num_tokens": 3019889.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 110.0, + "completions/max_terminated_length": 110.0, + "completions/mean_length": 102.0, + "completions/mean_terminated_length": 102.0, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.058186124078929424, + "epoch": 0.3672776646300068, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.59375, + "learning_rate": 1.5982850210185313e-05, + "loss": -0.0, + "num_tokens": 3023961.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 123.0, + "completions/max_terminated_length": 123.0, + "completions/mean_length": 112.75, + "completions/mean_terminated_length": 112.75, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.050074026454240084, + "epoch": 0.36795655125594023, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.5963834875705556e-05, + "loss": 0.0, + "num_tokens": 3028015.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 196.0, + "completions/max_terminated_length": 196.0, + "completions/mean_length": 138.125, + "completions/mean_terminated_length": 138.125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.04945247480645776, + "epoch": 0.36863543788187375, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.09375, + "learning_rate": 1.5944786014326053e-05, + "loss": 0.0, + "num_tokens": 3033024.0, + "reward": 2.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 191.0, + "completions/max_terminated_length": 191.0, + "completions/mean_length": 183.0, + "completions/mean_terminated_length": 183.0, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.0497220391407609, + "epoch": 0.3693143245078072, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.5925703733133823e-05, + "loss": 0.0, + "num_tokens": 3038064.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 146.0, + "completions/max_terminated_length": 146.0, + "completions/mean_length": 98.625, + "completions/mean_terminated_length": 98.625, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.060638073831796646, + "epoch": 0.3699932111337407, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.046875, + "learning_rate": 1.5906588139403752e-05, + "loss": 0.0, + "num_tokens": 3042069.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 107.0, + "completions/max_terminated_length": 107.0, + "completions/mean_length": 89.125, + "completions/mean_terminated_length": 89.125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.03359708562493324, + "epoch": 0.37067209775967414, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.5887439340598002e-05, + "loss": 0.0, + "num_tokens": 3045998.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 202.625, + "completions/mean_terminated_length": 202.625, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "entropy": 0.06563668977469206, + "epoch": 0.3713509843856076, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "learning_rate": 1.5868257444365408e-05, + "loss": -0.0, + "num_tokens": 3051579.0, + "reward": 2.9166665077209473, + "reward_std": 0.12598812580108643, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666269302368, + "rewards/fixed_code_pass_all_test_reward/std": 0.1259881556034088, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 116.0, + "completions/max_terminated_length": 116.0, + "completions/mean_length": 90.875, + "completions/mean_terminated_length": 90.875, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.10051706479862332, + "epoch": 0.37202987101154106, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.875, + "learning_rate": 1.5849042558540863e-05, + "loss": 0.0, + "num_tokens": 3055402.0, + "reward": 1.75, + "reward_std": 1.0350983142852783, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 221.0, + "completions/max_terminated_length": 221.0, + "completions/mean_length": 159.5, + "completions/mean_terminated_length": 159.5, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.08418468851596117, + "epoch": 0.3727087576374745, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.21875, + "learning_rate": 1.5829794791144723e-05, + "loss": 0.0, + "num_tokens": 3060038.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 150.0, + "completions/max_terminated_length": 150.0, + "completions/mean_length": 107.625, + "completions/mean_terminated_length": 107.625, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.08815432898700237, + "epoch": 0.37338764426340804, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.9375, + "learning_rate": 1.581051425038219e-05, + "loss": 0.0, + "num_tokens": 3064123.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 139.0, + "completions/max_terminated_length": 139.0, + "completions/mean_length": 97.875, + "completions/mean_terminated_length": 97.875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.030117509653791785, + "epoch": 0.3740665308893415, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.5791201044642707e-05, + "loss": 0.0, + "num_tokens": 3067986.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 142.25, + "completions/mean_terminated_length": 142.25, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.05088119860738516, + "epoch": 0.37474541751527496, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.21875, + "learning_rate": 1.577185528249936e-05, + "loss": -0.0, + "num_tokens": 3073108.0, + "reward": 2.875, + "reward_std": 0.1157275140285492, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.1157275140285492, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 198.5, + "completions/mean_terminated_length": 198.5, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "entropy": 0.020021514268592, + "epoch": 0.3754243041412084, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "learning_rate": 1.5752477072708247e-05, + "loss": 0.0, + "num_tokens": 3078896.0, + "reward": 2.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, + "step": 553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 163.5, + "completions/mean_terminated_length": 163.5, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.11634811153635383, + "epoch": 0.3761031907671419, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.515625, + "learning_rate": 1.5733066524207875e-05, + "loss": 0.0, + "num_tokens": 3083404.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 175.0, + "completions/max_terminated_length": 175.0, + "completions/mean_length": 128.375, + "completions/mean_terminated_length": 128.375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.08624893147498369, + "epoch": 0.37678207739307534, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.5713623746118558e-05, + "loss": 0.0, + "num_tokens": 3087663.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 99.0, + "completions/max_terminated_length": 99.0, + "completions/mean_length": 79.25, + "completions/mean_terminated_length": 79.25, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.03172642434947193, + "epoch": 0.3774609640190088, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.5694148847741793e-05, + "loss": 0.0, + "num_tokens": 3091553.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 157.0, + "completions/max_terminated_length": 157.0, + "completions/mean_length": 115.5, + "completions/mean_terminated_length": 115.5, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.06022269558161497, + "epoch": 0.3781398506449423, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.5674641938559644e-05, + "loss": 0.0, + "num_tokens": 3095981.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 184.0, + "completions/max_terminated_length": 184.0, + "completions/mean_length": 165.25, + "completions/mean_terminated_length": 165.25, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.04809736763127148, + "epoch": 0.3788187372708758, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "learning_rate": 1.5655103128234134e-05, + "loss": 0.0, + "num_tokens": 3100423.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 155.0, + "completions/max_terminated_length": 155.0, + "completions/mean_length": 118.25, + "completions/mean_terminated_length": 118.25, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.019930118694901466, + "epoch": 0.37949762389680924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.5635532526606625e-05, + "loss": 0.0, + "num_tokens": 3104593.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 354.125, + "completions/mean_terminated_length": 354.125, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "entropy": 0.015033581294119358, + "epoch": 0.3801765105227427, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.5615930243697196e-05, + "loss": 0.0, + "num_tokens": 3112714.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 116.0, + "completions/max_terminated_length": 116.0, + "completions/mean_length": 106.375, + "completions/mean_terminated_length": 106.375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.04087465349584818, + "epoch": 0.38085539714867617, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.559629638970403e-05, + "loss": 0.0, + "num_tokens": 3116949.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 170.0, + "completions/max_terminated_length": 170.0, + "completions/mean_length": 139.0, + "completions/mean_terminated_length": 139.0, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.04973500198684633, + "epoch": 0.38153428377460963, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.53125, + "learning_rate": 1.5576631075002796e-05, + "loss": -0.0, + "num_tokens": 3121301.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 196.5, + "completions/mean_terminated_length": 196.5, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.03898005420342088, + "epoch": 0.3822131704005431, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.5556934410146024e-05, + "loss": 0.0, + "num_tokens": 3126841.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.0, + "completions/max_terminated_length": 152.0, + "completions/mean_length": 105.25, + "completions/mean_terminated_length": 105.25, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.040712617337703705, + "epoch": 0.38289205702647655, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.125, + "learning_rate": 1.5537206505862486e-05, + "loss": 0.0, + "num_tokens": 3130899.0, + "reward": 2.0, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 234.0, + "completions/max_terminated_length": 234.0, + "completions/mean_length": 211.75, + "completions/mean_terminated_length": 211.75, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.02091029309667647, + "epoch": 0.38357094365241007, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.890625, + "learning_rate": 1.5517447473056568e-05, + "loss": -0.0, + "num_tokens": 3136817.0, + "reward": 2.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 160.625, + "completions/mean_terminated_length": 160.625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.09499801602214575, + "epoch": 0.38424983027834353, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.890625, + "learning_rate": 1.549765742280766e-05, + "loss": 0.0, + "num_tokens": 3141334.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 162.0, + "completions/max_terminated_length": 162.0, + "completions/mean_length": 135.625, + "completions/mean_terminated_length": 135.625, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.03466602601110935, + "epoch": 0.384928716904277, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.5477836466369522e-05, + "loss": 0.0, + "num_tokens": 3145859.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 158.0, + "completions/max_terminated_length": 158.0, + "completions/mean_length": 120.125, + "completions/mean_terminated_length": 120.125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.07435816619545221, + "epoch": 0.38560760353021045, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.875, + "learning_rate": 1.5457984715169643e-05, + "loss": 0.0, + "num_tokens": 3150124.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 314.5, + "completions/mean_terminated_length": 314.5, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "entropy": 0.032860161969438195, + "epoch": 0.3862864901561439, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "learning_rate": 1.5438102280808653e-05, + "loss": -0.0, + "num_tokens": 3157192.0, + "reward": 1.7321428060531616, + "reward_std": 0.36967799067497253, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7321428060531616, + "rewards/fixed_code_pass_all_test_reward/std": 0.36967799067497253, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 165.0, + "completions/max_terminated_length": 165.0, + "completions/mean_length": 127.75, + "completions/mean_terminated_length": 127.75, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.07669706363230944, + "epoch": 0.3869653767820774, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.359375, + "learning_rate": 1.541818927505966e-05, + "loss": 0.0, + "num_tokens": 3161358.0, + "reward": 2.0, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 165.0, + "completions/max_terminated_length": 165.0, + "completions/mean_length": 137.125, + "completions/mean_terminated_length": 137.125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.08569513633847237, + "epoch": 0.38764426340801084, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.90625, + "learning_rate": 1.5398245809867643e-05, + "loss": -0.0, + "num_tokens": 3165751.0, + "reward": 2.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 138.0, + "completions/max_terminated_length": 138.0, + "completions/mean_length": 109.125, + "completions/mean_terminated_length": 109.125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.06928400369361043, + "epoch": 0.38832315003394435, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.537827199734881e-05, + "loss": 0.0, + "num_tokens": 3169896.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 149.0, + "completions/max_terminated_length": 149.0, + "completions/mean_length": 119.375, + "completions/mean_terminated_length": 119.375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.07419133419170976, + "epoch": 0.3890020366598778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.5358267949789968e-05, + "loss": 0.0, + "num_tokens": 3174003.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 196.0, + "completions/max_terminated_length": 196.0, + "completions/mean_length": 177.625, + "completions/mean_terminated_length": 177.625, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.04335285141132772, + "epoch": 0.3896809232858113, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "learning_rate": 1.533823377964791e-05, + "loss": 0.0, + "num_tokens": 3179096.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 164.25, + "completions/mean_terminated_length": 164.25, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.0432195570319891, + "epoch": 0.39035980991174474, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.09375, + "learning_rate": 1.5318169599548755e-05, + "loss": 0.0, + "num_tokens": 3183874.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 161.0, + "completions/max_terminated_length": 161.0, + "completions/mean_length": 151.875, + "completions/mean_terminated_length": 151.875, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "entropy": 0.04805750073865056, + "epoch": 0.3910386965376782, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.46875, + "learning_rate": 1.529807552228734e-05, + "loss": 0.0, + "num_tokens": 3188401.0, + "reward": 2.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 104.0, + "completions/max_terminated_length": 104.0, + "completions/mean_length": 98.25, + "completions/mean_terminated_length": 98.25, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.019100099802017212, + "epoch": 0.39171758316361166, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.5277951660826568e-05, + "loss": 0.0, + "num_tokens": 3192443.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 241.375, + "completions/mean_terminated_length": 241.375, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.02586966985836625, + "epoch": 0.3923964697895451, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "learning_rate": 1.5257798128296783e-05, + "loss": -0.0, + "num_tokens": 3198686.0, + "reward": 1.75, + "reward_std": 0.20701964199543, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.2070196568965912, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 390.25, + "completions/mean_terminated_length": 390.25, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "entropy": 0.020898035378195345, + "epoch": 0.39307535641547864, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "learning_rate": 1.5237615037995129e-05, + "loss": -0.0, + "num_tokens": 3207496.0, + "reward": 1.918269157409668, + "reward_std": 0.013598186895251274, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9182692766189575, + "rewards/fixed_code_pass_all_test_reward/std": 0.013598217628896236, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 148.0, + "completions/max_terminated_length": 148.0, + "completions/mean_length": 125.75, + "completions/mean_terminated_length": 125.75, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.030841318890452385, + "epoch": 0.3937542430414121, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.796875, + "learning_rate": 1.5217402503384914e-05, + "loss": 0.0, + "num_tokens": 3212150.0, + "reward": 2.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 144.0, + "completions/max_terminated_length": 144.0, + "completions/mean_length": 120.375, + "completions/mean_terminated_length": 120.375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.04560302849858999, + "epoch": 0.39443312966734556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.5197160638094981e-05, + "loss": 0.0, + "num_tokens": 3216569.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 115.0, + "completions/max_terminated_length": 115.0, + "completions/mean_length": 106.375, + "completions/mean_terminated_length": 106.375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.07175793964415789, + "epoch": 0.395112016293279, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.517688955591905e-05, + "loss": 0.0, + "num_tokens": 3220652.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 292.375, + "completions/mean_terminated_length": 292.375, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "entropy": 0.032216466031968594, + "epoch": 0.3957909029192125, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "learning_rate": 1.5156589370815096e-05, + "loss": -0.0, + "num_tokens": 3227783.0, + "reward": 2.0, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 342.375, + "completions/mean_terminated_length": 342.375, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "entropy": 0.03005579300224781, + "epoch": 0.39646978954514595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.5136260196904704e-05, + "loss": 0.0, + "num_tokens": 3235618.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 121.0, + "completions/max_terminated_length": 121.0, + "completions/mean_length": 115.625, + "completions/mean_terminated_length": 115.625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.019725291756913066, + "epoch": 0.3971486761710794, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.609375, + "learning_rate": 1.5115902148472418e-05, + "loss": 0.0, + "num_tokens": 3239919.0, + "reward": 2.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 299.125, + "completions/mean_terminated_length": 299.125, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "entropy": 0.025002469774335623, + "epoch": 0.3978275627970129, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "learning_rate": 1.5095515339965117e-05, + "loss": -0.0, + "num_tokens": 3247600.0, + "reward": 2.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, + "step": 586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 141.0, + "completions/max_terminated_length": 141.0, + "completions/mean_length": 96.25, + "completions/mean_terminated_length": 96.25, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.12215751688927412, + "epoch": 0.3985064494229464, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.5, + "learning_rate": 1.5075099885991345e-05, + "loss": 0.0, + "num_tokens": 3251562.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 118.0, + "completions/max_terminated_length": 118.0, + "completions/mean_length": 110.125, + "completions/mean_terminated_length": 110.125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.03420641226693988, + "epoch": 0.39918533604887985, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.5054655901320697e-05, + "loss": 0.0, + "num_tokens": 3255587.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 213.5, + "completions/mean_terminated_length": 213.5, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "entropy": 0.06157191004604101, + "epoch": 0.3998642226748133, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.21875, + "learning_rate": 1.5034183500883153e-05, + "loss": 0.0, + "num_tokens": 3260975.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 85.0, + "completions/max_terminated_length": 85.0, + "completions/mean_length": 84.125, + "completions/mean_terminated_length": 84.125, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.017459457740187645, + "epoch": 0.40054310930074677, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.5013682799768435e-05, + "loss": 0.0, + "num_tokens": 3264864.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 121.0, + "completions/max_terminated_length": 121.0, + "completions/mean_length": 120.0, + "completions/mean_terminated_length": 120.0, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.02103763446211815, + "epoch": 0.40122199592668023, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.4993153913225374e-05, + "loss": 0.0, + "num_tokens": 3269136.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 155.0, + "completions/max_terminated_length": 155.0, + "completions/mean_length": 150.25, + "completions/mean_terminated_length": 150.25, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.0377178059425205, + "epoch": 0.4019008825526137, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "learning_rate": 1.4972596956661229e-05, + "loss": 0.0, + "num_tokens": 3273642.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 117.0, + "completions/max_terminated_length": 117.0, + "completions/mean_length": 96.0, + "completions/mean_terminated_length": 96.0, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.04918831679970026, + "epoch": 0.4025797691785472, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6875, + "learning_rate": 1.495201204564109e-05, + "loss": -0.0, + "num_tokens": 3277570.0, + "reward": 2.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 112.0, + "completions/max_terminated_length": 112.0, + "completions/mean_length": 111.25, + "completions/mean_terminated_length": 111.25, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.013917091069743037, + "epoch": 0.40325865580448067, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.4931399295887172e-05, + "loss": 0.0, + "num_tokens": 3281900.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 314.0, + "completions/mean_terminated_length": 314.0, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "entropy": 0.016182406223379076, + "epoch": 0.40393754243041413, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.4910758823278208e-05, + "loss": 0.0, + "num_tokens": 3289132.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 250.625, + "completions/mean_terminated_length": 250.625, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "entropy": 0.012484532431699336, + "epoch": 0.4046164290563476, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.4890090743848774e-05, + "loss": 0.0, + "num_tokens": 3295609.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 129.0, + "completions/max_terminated_length": 129.0, + "completions/mean_length": 120.875, + "completions/mean_terminated_length": 120.875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.04672857653349638, + "epoch": 0.40529531568228105, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.671875, + "learning_rate": 1.4869395173788642e-05, + "loss": 0.0, + "num_tokens": 3299688.0, + "reward": 2.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 91.0, + "completions/max_terminated_length": 91.0, + "completions/mean_length": 86.25, + "completions/mean_terminated_length": 86.25, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.03325698827393353, + "epoch": 0.4059742023082145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.4848672229442132e-05, + "loss": 0.0, + "num_tokens": 3303602.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 326.5, + "completions/mean_terminated_length": 326.5, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "entropy": 0.022395170526579022, + "epoch": 0.406653088934148, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "learning_rate": 1.482792202730745e-05, + "loss": -0.0, + "num_tokens": 3311398.0, + "reward": 2.580357074737549, + "reward_std": 0.3975829482078552, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7053571939468384, + "rewards/fixed_code_pass_all_test_reward/std": 0.12334916740655899, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 236.125, + "completions/mean_terminated_length": 236.125, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "entropy": 0.026530831586569548, + "epoch": 0.4073319755600815, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "learning_rate": 1.4807144684036044e-05, + "loss": 0.0, + "num_tokens": 3317511.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 355.25, + "completions/mean_terminated_length": 355.25, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "entropy": 0.012836731621064246, + "epoch": 0.40801086218601496, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.4786340316431931e-05, + "loss": 0.0, + "num_tokens": 3325617.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 255.125, + "completions/mean_terminated_length": 255.125, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "entropy": 0.012336520594544709, + "epoch": 0.4086897488119484, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.476550904145106e-05, + "loss": 0.0, + "num_tokens": 3332034.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 150.0, + "completions/max_terminated_length": 150.0, + "completions/mean_length": 130.5, + "completions/mean_terminated_length": 130.5, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.02567755780182779, + "epoch": 0.4093686354378819, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.890625, + "learning_rate": 1.4744650976200643e-05, + "loss": -0.0, + "num_tokens": 3336374.0, + "reward": 2.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 205.625, + "completions/mean_terminated_length": 205.625, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.016870889579877257, + "epoch": 0.41004752206381534, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.4723766237938495e-05, + "loss": 0.0, + "num_tokens": 3342339.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 81.0, + "completions/max_terminated_length": 81.0, + "completions/mean_length": 79.125, + "completions/mean_terminated_length": 79.125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.06487983511760831, + "epoch": 0.4107264086897488, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.4702854944072383e-05, + "loss": 0.0, + "num_tokens": 3346076.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 91.0, + "completions/max_terminated_length": 91.0, + "completions/mean_length": 89.625, + "completions/mean_terminated_length": 89.625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.04705160157755017, + "epoch": 0.41140529531568226, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.4681917212159358e-05, + "loss": 0.0, + "num_tokens": 3349953.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 204.25, + "completions/mean_terminated_length": 204.25, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "entropy": 0.06749026523903012, + "epoch": 0.4120841819416157, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.59375, + "learning_rate": 1.46609531599051e-05, + "loss": -0.0, + "num_tokens": 3355131.0, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 82.0, + "completions/max_terminated_length": 82.0, + "completions/mean_length": 62.875, + "completions/mean_terminated_length": 62.875, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "entropy": 0.032150683691725135, + "epoch": 0.41276306856754924, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.78125, + "learning_rate": 1.4639962905163258e-05, + "loss": -0.0, + "num_tokens": 3358690.0, + "reward": 2.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 213.5, + "completions/mean_terminated_length": 213.5, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 0.07843731855973601, + "epoch": 0.4134419551934827, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.375, + "learning_rate": 1.4618946565934775e-05, + "loss": 0.0, + "num_tokens": 3363998.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 285.375, + "completions/mean_terminated_length": 285.375, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "entropy": 0.024500891799107194, + "epoch": 0.41412084181941616, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "learning_rate": 1.4597904260367239e-05, + "loss": -0.0, + "num_tokens": 3370777.0, + "reward": 1.5, + "reward_std": 1.6035674810409546, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, + "step": 610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 143.0, + "completions/max_terminated_length": 143.0, + "completions/mean_length": 130.75, + "completions/mean_terminated_length": 130.75, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.03512542974203825, + "epoch": 0.4147997284453496, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.4576836106754213e-05, + "loss": 0.0, + "num_tokens": 3375055.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 96.0, + "completions/max_terminated_length": 96.0, + "completions/mean_length": 93.5, + "completions/mean_terminated_length": 93.5, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.039333144668489695, + "epoch": 0.4154786150712831, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.455574222353457e-05, + "loss": 0.0, + "num_tokens": 3378939.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 139.0, + "completions/max_terminated_length": 139.0, + "completions/mean_length": 120.75, + "completions/mean_terminated_length": 120.75, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.05375869292765856, + "epoch": 0.41615750169721655, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.453462272929182e-05, + "loss": 0.0, + "num_tokens": 3383209.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 138.0, + "completions/max_terminated_length": 138.0, + "completions/mean_length": 112.125, + "completions/mean_terminated_length": 112.125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.03742718417197466, + "epoch": 0.41683638832315, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.65625, + "learning_rate": 1.4513477742753465e-05, + "loss": 0.0, + "num_tokens": 3387466.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 240.25, + "completions/mean_terminated_length": 240.25, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.023294531973078847, + "epoch": 0.4175152749490835, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.44923073827903e-05, + "loss": 0.0, + "num_tokens": 3393548.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 155.0, + "completions/max_terminated_length": 155.0, + "completions/mean_length": 139.25, + "completions/mean_terminated_length": 139.25, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.05156797170639038, + "epoch": 0.418194161575017, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.734375, + "learning_rate": 1.4471111768415777e-05, + "loss": 0.0, + "num_tokens": 3397854.0, + "reward": 2.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 90.0, + "completions/max_terminated_length": 90.0, + "completions/mean_length": 90.0, + "completions/mean_terminated_length": 90.0, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.008384114131331444, + "epoch": 0.41887304820095045, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.444989101878531e-05, + "loss": 0.0, + "num_tokens": 3401630.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 106.0, + "completions/max_terminated_length": 106.0, + "completions/mean_length": 79.25, + "completions/mean_terminated_length": 79.25, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.08051333297044039, + "epoch": 0.4195519348268839, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.4428645253195621e-05, + "loss": 0.0, + "num_tokens": 3405480.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 92.0, + "completions/max_terminated_length": 92.0, + "completions/mean_length": 90.5, + "completions/mean_terminated_length": 90.5, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.04123400757089257, + "epoch": 0.42023082145281737, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.4407374591084064e-05, + "loss": 0.0, + "num_tokens": 3409412.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 411.375, + "completions/mean_terminated_length": 411.375, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "entropy": 0.031261581694707274, + "epoch": 0.42090970807875083, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "learning_rate": 1.4386079152027952e-05, + "loss": -0.0, + "num_tokens": 3419055.0, + "reward": 2.4749999046325684, + "reward_std": 0.3535533547401428, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6000000238418579, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 169.0, + "completions/max_terminated_length": 169.0, + "completions/mean_length": 121.5, + "completions/mean_terminated_length": 121.5, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.0838325722143054, + "epoch": 0.4215885947046843, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.640625, + "learning_rate": 1.4364759055743888e-05, + "loss": 0.0, + "num_tokens": 3423139.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 123.0, + "completions/mean_terminated_length": 123.0, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.03678737487643957, + "epoch": 0.4222674813306178, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.4343414422087093e-05, + "loss": 0.0, + "num_tokens": 3427379.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 191.0, + "completions/max_terminated_length": 191.0, + "completions/mean_length": 159.75, + "completions/mean_terminated_length": 159.75, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.0567446150816977, + "epoch": 0.4229463679565513, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.828125, + "learning_rate": 1.4322045371050722e-05, + "loss": 0.0, + "num_tokens": 3432369.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 108.0, + "completions/max_terminated_length": 108.0, + "completions/mean_length": 105.75, + "completions/mean_terminated_length": 105.75, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.03274666238576174, + "epoch": 0.42362525458248473, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.4300652022765207e-05, + "loss": 0.0, + "num_tokens": 3436327.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 188.875, + "completions/mean_terminated_length": 188.875, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "entropy": 0.021755606052465737, + "epoch": 0.4243041412084182, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.4279234497497563e-05, + "loss": 0.0, + "num_tokens": 3442166.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 266.25, + "completions/mean_terminated_length": 266.25, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "entropy": 0.011844113236293197, + "epoch": 0.42498302783435166, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "learning_rate": 1.4257792915650728e-05, + "loss": 0.0, + "num_tokens": 3450672.0, + "reward": 1.9583333730697632, + "reward_std": 0.11785109341144562, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.117851123213768, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 138.0, + "completions/max_terminated_length": 138.0, + "completions/mean_length": 131.5, + "completions/mean_terminated_length": 131.5, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.025760697433725, + "epoch": 0.4256619144602851, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5, + "learning_rate": 1.4236327397762874e-05, + "loss": 0.0, + "num_tokens": 3454956.0, + "reward": 2.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 72.0, + "completions/max_terminated_length": 72.0, + "completions/mean_length": 68.0, + "completions/mean_terminated_length": 68.0, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "entropy": 0.028030681889504194, + "epoch": 0.4263408010862186, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.4214838064506738e-05, + "loss": 0.0, + "num_tokens": 3458564.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 190.375, + "completions/mean_terminated_length": 190.375, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.060217094607651234, + "epoch": 0.4270196877121521, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.419332503668894e-05, + "loss": 0.0, + "num_tokens": 3463919.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.0, + "completions/max_terminated_length": 152.0, + "completions/mean_length": 146.375, + "completions/mean_terminated_length": 146.375, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.0345245657954365, + "epoch": 0.42769857433808556, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8125, + "learning_rate": 1.417178843524929e-05, + "loss": 0.0, + "num_tokens": 3468546.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 311.875, + "completions/mean_terminated_length": 311.875, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "entropy": 0.04785546218045056, + "epoch": 0.428377460964019, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.415022838126015e-05, + "loss": 0.0, + "num_tokens": 3476161.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 159.0, + "completions/max_terminated_length": 159.0, + "completions/mean_length": 116.0, + "completions/mean_terminated_length": 116.0, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.023954134434461594, + "epoch": 0.4290563475899525, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.4128644995925696e-05, + "loss": 0.0, + "num_tokens": 3480321.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 228.875, + "completions/mean_terminated_length": 228.875, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "entropy": 0.03332836041226983, + "epoch": 0.42973523421588594, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "learning_rate": 1.4107038400581288e-05, + "loss": 0.0, + "num_tokens": 3486368.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 308.125, + "completions/mean_terminated_length": 308.125, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "entropy": 0.009603998390957713, + "epoch": 0.4304141208418194, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.408540871669275e-05, + "loss": 0.0, + "num_tokens": 3493497.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 162.0, + "completions/max_terminated_length": 162.0, + "completions/mean_length": 149.25, + "completions/mean_terminated_length": 149.25, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.028482120716944337, + "epoch": 0.43109300746775286, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.4063756065855714e-05, + "loss": 0.0, + "num_tokens": 3498419.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 190.0, + "completions/max_terminated_length": 190.0, + "completions/mean_length": 184.875, + "completions/mean_terminated_length": 184.875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 0.041112816194072366, + "epoch": 0.4317718940936864, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "learning_rate": 1.4042080569794916e-05, + "loss": 0.0, + "num_tokens": 3503906.0, + "reward": 2.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 116.0, + "completions/max_terminated_length": 116.0, + "completions/mean_length": 115.375, + "completions/mean_terminated_length": 115.375, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.02200512622948736, + "epoch": 0.43245078071961984, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.4020382350363528e-05, + "loss": 0.0, + "num_tokens": 3508085.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 226.0, + "completions/mean_terminated_length": 226.0, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "entropy": 0.006985355954384431, + "epoch": 0.4331296673455533, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.3998661529542463e-05, + "loss": 0.0, + "num_tokens": 3514125.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 110.0, + "completions/max_terminated_length": 110.0, + "completions/mean_length": 85.75, + "completions/mean_terminated_length": 85.75, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.0492265815846622, + "epoch": 0.43380855397148677, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.3976918229439698e-05, + "loss": 0.0, + "num_tokens": 3517915.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 135.0, + "completions/max_terminated_length": 135.0, + "completions/mean_length": 118.875, + "completions/mean_terminated_length": 118.875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.02465058397501707, + "epoch": 0.4344874405974202, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.40625, + "learning_rate": 1.3955152572289568e-05, + "loss": 0.0, + "num_tokens": 3522034.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 198.25, + "completions/mean_terminated_length": 198.25, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.042808918049559, + "epoch": 0.4351663272233537, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "learning_rate": 1.3933364680452106e-05, + "loss": -0.0, + "num_tokens": 3527596.0, + "reward": 2.9791665077209473, + "reward_std": 0.058925628662109375, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9791666269302368, + "rewards/fixed_code_pass_all_test_reward/std": 0.0589255727827549, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 125.0, + "completions/max_terminated_length": 125.0, + "completions/mean_length": 121.375, + "completions/mean_terminated_length": 121.375, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.05737858219072223, + "epoch": 0.43584521384928715, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.391155467641234e-05, + "loss": 0.0, + "num_tokens": 3531767.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 175.0, + "completions/max_terminated_length": 175.0, + "completions/mean_length": 109.25, + "completions/mean_terminated_length": 109.25, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.08171014487743378, + "epoch": 0.4365241004752206, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.3889722682779598e-05, + "loss": 0.0, + "num_tokens": 3536041.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 145.0, + "completions/max_terminated_length": 145.0, + "completions/mean_length": 117.75, + "completions/mean_terminated_length": 117.75, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.10145062673836946, + "epoch": 0.43720298710115413, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.3867868822286838e-05, + "loss": 0.0, + "num_tokens": 3540071.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 174.0, + "completions/max_terminated_length": 174.0, + "completions/mean_length": 151.75, + "completions/mean_terminated_length": 151.75, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.05416981549933553, + "epoch": 0.4378818737270876, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.3845993217789937e-05, + "loss": 0.0, + "num_tokens": 3544709.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 254.25, + "completions/mean_terminated_length": 254.25, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.013641998521052301, + "epoch": 0.43856076035302105, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.3824095992267017e-05, + "loss": 0.0, + "num_tokens": 3551223.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 309.875, + "completions/mean_terminated_length": 309.875, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "entropy": 0.02465397259220481, + "epoch": 0.4392396469789545, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.3802177268817742e-05, + "loss": 0.0, + "num_tokens": 3558678.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 122.0, + "completions/max_terminated_length": 122.0, + "completions/mean_length": 119.125, + "completions/mean_terminated_length": 119.125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.016694259480573237, + "epoch": 0.439918533604888, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.3780237170662638e-05, + "loss": 0.0, + "num_tokens": 3562863.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 125.0, + "completions/max_terminated_length": 125.0, + "completions/mean_length": 116.0, + "completions/mean_terminated_length": 116.0, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.021779146045446396, + "epoch": 0.44059742023082143, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.046875, + "learning_rate": 1.3758275821142382e-05, + "loss": -0.0, + "num_tokens": 3566991.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 111.0, + "completions/max_terminated_length": 111.0, + "completions/mean_length": 110.0, + "completions/mean_terminated_length": 110.0, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.011041645426303148, + "epoch": 0.4412763068567549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.3736293343717134e-05, + "loss": 0.0, + "num_tokens": 3571287.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 346.5, + "completions/mean_terminated_length": 346.5, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "entropy": 0.039802387123927474, + "epoch": 0.4419551934826884, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "learning_rate": 1.3714289861965816e-05, + "loss": -0.0, + "num_tokens": 3579683.0, + "reward": 2.125, + "reward_std": 0.2314550280570984, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 121.0, + "completions/max_terminated_length": 121.0, + "completions/mean_length": 108.125, + "completions/mean_terminated_length": 108.125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.0409787162207067, + "epoch": 0.4426340801086219, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.21875, + "learning_rate": 1.3692265499585438e-05, + "loss": -0.0, + "num_tokens": 3583684.0, + "reward": 2.125, + "reward_std": 0.9910312294960022, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 226.125, + "completions/mean_terminated_length": 226.125, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.08081695158034563, + "epoch": 0.44331296673455534, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.3670220380390389e-05, + "loss": 0.0, + "num_tokens": 3588989.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 268.75, + "completions/mean_terminated_length": 268.75, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "entropy": 0.048724310006946325, + "epoch": 0.4439918533604888, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.859375, + "learning_rate": 1.3648154628311754e-05, + "loss": 0.0, + "num_tokens": 3595347.0, + "reward": 2.25, + "reward_std": 1.0350983142852783, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 108.0, + "completions/max_terminated_length": 108.0, + "completions/mean_length": 104.125, + "completions/mean_terminated_length": 104.125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.023676771903410554, + "epoch": 0.44467073998642226, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.3626068367396603e-05, + "loss": 0.0, + "num_tokens": 3599460.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 127.75, + "completions/mean_terminated_length": 127.75, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.0415635141544044, + "epoch": 0.4453496266123557, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.265625, + "learning_rate": 1.3603961721807304e-05, + "loss": -0.0, + "num_tokens": 3603698.0, + "reward": 2.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 88.0, + "completions/max_terminated_length": 88.0, + "completions/mean_length": 74.125, + "completions/mean_terminated_length": 74.125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.047901921439915895, + "epoch": 0.4460285132382892, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.3581834815820817e-05, + "loss": 0.0, + "num_tokens": 3607539.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 221.875, + "completions/mean_terminated_length": 221.875, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "entropy": 0.023064984008669853, + "epoch": 0.4467073998642227, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.3559687773828012e-05, + "loss": 0.0, + "num_tokens": 3613722.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 136.0, + "completions/max_terminated_length": 136.0, + "completions/mean_length": 123.375, + "completions/mean_terminated_length": 123.375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.022970449412241578, + "epoch": 0.44738628649015616, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.3537520720332943e-05, + "loss": 0.0, + "num_tokens": 3618021.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 227.875, + "completions/mean_terminated_length": 227.875, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "entropy": 0.08673943532630801, + "epoch": 0.4480651731160896, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "learning_rate": 1.3515333779952169e-05, + "loss": -0.0, + "num_tokens": 3624404.0, + "reward": 1.765625, + "reward_std": 0.1043153703212738, + "rewards/fixed_code_pass_all_test_reward/mean": 0.765625, + "rewards/fixed_code_pass_all_test_reward/std": 0.1043153703212738, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 151.0, + "completions/max_terminated_length": 151.0, + "completions/mean_length": 117.375, + "completions/mean_terminated_length": 117.375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.07286445517092943, + "epoch": 0.4487440597420231, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.40625, + "learning_rate": 1.3493127077414046e-05, + "loss": 0.0, + "num_tokens": 3628479.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 233.125, + "completions/mean_terminated_length": 233.125, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "entropy": 0.05186600983142853, + "epoch": 0.44942294636795654, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.3470900737558032e-05, + "loss": 0.0, + "num_tokens": 3635872.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 225.0, + "completions/mean_terminated_length": 225.0, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "entropy": 0.0088185541681014, + "epoch": 0.45010183299389, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.3448654885333974e-05, + "loss": 0.0, + "num_tokens": 3641848.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 114.0, + "completions/max_terminated_length": 114.0, + "completions/mean_length": 93.0, + "completions/mean_terminated_length": 93.0, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.08682233095169067, + "epoch": 0.45078071961982347, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.53125, + "learning_rate": 1.3426389645801415e-05, + "loss": -0.0, + "num_tokens": 3646104.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 173.0, + "completions/max_terminated_length": 173.0, + "completions/mean_length": 165.0, + "completions/mean_terminated_length": 165.0, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "entropy": 0.057350332383066416, + "epoch": 0.451459606245757, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.3404105144128885e-05, + "loss": 0.0, + "num_tokens": 3651056.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 162.0, + "completions/max_terminated_length": 162.0, + "completions/mean_length": 135.5, + "completions/mean_terminated_length": 135.5, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.04245053534395993, + "epoch": 0.45213849287169044, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.3381801505593201e-05, + "loss": 0.0, + "num_tokens": 3655588.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 217.5, + "completions/mean_terminated_length": 217.5, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "entropy": 0.0375438358169049, + "epoch": 0.4528173794976239, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.3359478855578764e-05, + "loss": 0.0, + "num_tokens": 3661744.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 195.625, + "completions/mean_terminated_length": 195.625, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "entropy": 0.09043617080897093, + "epoch": 0.45349626612355737, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "learning_rate": 1.333713731957685e-05, + "loss": -0.0, + "num_tokens": 3666997.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 170.0, + "completions/max_terminated_length": 170.0, + "completions/mean_length": 132.25, + "completions/mean_terminated_length": 132.25, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.053803281392902136, + "epoch": 0.45417515274949083, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.3314777023184907e-05, + "loss": 0.0, + "num_tokens": 3671535.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 150.0, + "completions/max_terminated_length": 150.0, + "completions/mean_length": 136.625, + "completions/mean_terminated_length": 136.625, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.018220534548163414, + "epoch": 0.4548540393754243, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.3292398092105842e-05, + "loss": 0.0, + "num_tokens": 3676020.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 93.0, + "completions/max_terminated_length": 93.0, + "completions/mean_length": 76.375, + "completions/mean_terminated_length": 76.375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.025097382487729192, + "epoch": 0.45553292600135775, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.3270000652147339e-05, + "loss": 0.0, + "num_tokens": 3679751.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 89.0, + "completions/max_terminated_length": 89.0, + "completions/mean_length": 78.0, + "completions/mean_terminated_length": 78.0, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.01580260810442269, + "epoch": 0.45621181262729127, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.3247584829221104e-05, + "loss": 0.0, + "num_tokens": 3683519.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 82.0, + "completions/max_terminated_length": 82.0, + "completions/mean_length": 78.625, + "completions/mean_terminated_length": 78.625, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.02852762583643198, + "epoch": 0.45689069925322473, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.21875, + "learning_rate": 1.3225150749342222e-05, + "loss": -0.0, + "num_tokens": 3687188.0, + "reward": 2.0, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 149.0, + "completions/max_terminated_length": 149.0, + "completions/mean_length": 137.125, + "completions/mean_terminated_length": 137.125, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.0470118778757751, + "epoch": 0.4575695858791582, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.3202698538628376e-05, + "loss": 0.0, + "num_tokens": 3691525.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 144.0, + "completions/max_terminated_length": 144.0, + "completions/mean_length": 108.5, + "completions/mean_terminated_length": 108.5, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.0804237388074398, + "epoch": 0.45824847250509165, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.318022832329921e-05, + "loss": 0.0, + "num_tokens": 3695505.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 107.0, + "completions/max_terminated_length": 107.0, + "completions/mean_length": 97.75, + "completions/mean_terminated_length": 97.75, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.035810349974781275, + "epoch": 0.4589273591310251, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.3157740229675557e-05, + "loss": 0.0, + "num_tokens": 3699639.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 102.0, + "completions/max_terminated_length": 102.0, + "completions/mean_length": 92.875, + "completions/mean_terminated_length": 92.875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.04038177663460374, + "epoch": 0.4596062457569586, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.3135234384178772e-05, + "loss": 0.0, + "num_tokens": 3703646.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 77.0, + "completions/max_terminated_length": 77.0, + "completions/mean_length": 70.875, + "completions/mean_terminated_length": 70.875, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.041391176637262106, + "epoch": 0.46028513238289204, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.0, + "learning_rate": 1.311271091333e-05, + "loss": 0.0, + "num_tokens": 3707301.0, + "reward": 2.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, + "step": 678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 344.625, + "completions/mean_terminated_length": 344.625, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, + "entropy": 0.015988030936568975, + "epoch": 0.4609640190088255, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.3090169943749475e-05, + "loss": 0.0, + "num_tokens": 3715154.0, + "reward": 2.3636364936828613, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3636363744735718, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 202.375, + "completions/mean_terminated_length": 202.375, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "entropy": 0.04510409012436867, + "epoch": 0.461642905634759, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.3067611602155799e-05, + "loss": 0.0, + "num_tokens": 3720749.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 214.0, + "completions/max_terminated_length": 214.0, + "completions/mean_length": 206.75, + "completions/mean_terminated_length": 206.75, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "entropy": 0.01797198981512338, + "epoch": 0.4623217922606925, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "learning_rate": 1.3045036015365233e-05, + "loss": 0.0, + "num_tokens": 3726931.0, + "reward": 2.25, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 173.0, + "completions/max_terminated_length": 173.0, + "completions/mean_length": 130.875, + "completions/mean_terminated_length": 130.875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.05983824376016855, + "epoch": 0.46300067888662594, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.890625, + "learning_rate": 1.3022443310290993e-05, + "loss": -0.0, + "num_tokens": 3731218.0, + "reward": 2.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 158.0, + "completions/max_terminated_length": 158.0, + "completions/mean_length": 147.625, + "completions/mean_terminated_length": 147.625, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.03132125805132091, + "epoch": 0.4636795655125594, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.299983361394252e-05, + "loss": 0.0, + "num_tokens": 3735855.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.0, + "completions/max_terminated_length": 152.0, + "completions/mean_length": 122.625, + "completions/mean_terminated_length": 122.625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.04651427315548062, + "epoch": 0.46435845213849286, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.2977207053424781e-05, + "loss": 0.0, + "num_tokens": 3739940.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 113.0, + "completions/max_terminated_length": 113.0, + "completions/mean_length": 102.375, + "completions/mean_terminated_length": 102.375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.055594713892787695, + "epoch": 0.4650373387644263, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0, + "learning_rate": 1.2954563755937546e-05, + "loss": 0.0, + "num_tokens": 3744183.0, + "reward": 2.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 405.625, + "completions/mean_terminated_length": 405.625, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "entropy": 0.02527920249849558, + "epoch": 0.4657162253903598, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.2931903848774676e-05, + "loss": 0.0, + "num_tokens": 3753524.0, + "reward": 2.200000047683716, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.20000000298023224, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 151.0, + "completions/max_terminated_length": 151.0, + "completions/mean_length": 135.75, + "completions/mean_terminated_length": 135.75, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.08573331777006388, + "epoch": 0.4663951120162933, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "learning_rate": 1.2909227459323403e-05, + "loss": -0.0, + "num_tokens": 3760474.0, + "reward": 2.8214285373687744, + "reward_std": 0.22587695717811584, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8214285969734192, + "rewards/fixed_code_pass_all_test_reward/std": 0.22587695717811584, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 213.875, + "completions/mean_terminated_length": 213.875, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "entropy": 0.02615616819821298, + "epoch": 0.46707399864222676, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "learning_rate": 1.2886534715063626e-05, + "loss": 0.0, + "num_tokens": 3766273.0, + "reward": 2.515625, + "reward_std": 0.32346823811531067, + "rewards/fixed_code_pass_all_test_reward/mean": 0.515625, + "rewards/fixed_code_pass_all_test_reward/std": 0.32346823811531067, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 539.0, + "completions/max_terminated_length": 539.0, + "completions/mean_length": 346.5, + "completions/mean_terminated_length": 346.5, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "entropy": 0.13608831726014614, + "epoch": 0.4677528852681602, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.265625, + "learning_rate": 1.2863825743567174e-05, + "loss": 0.0, + "num_tokens": 3773317.0, + "reward": 2.0, + "reward_std": 1.0690449476242065, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 211.0, + "completions/max_terminated_length": 211.0, + "completions/mean_length": 164.25, + "completions/mean_terminated_length": 164.25, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.08343358803540468, + "epoch": 0.4684317718940937, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.578125, + "learning_rate": 1.2841100672497116e-05, + "loss": -0.0, + "num_tokens": 3778023.0, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 112.0, + "completions/max_terminated_length": 112.0, + "completions/mean_length": 93.125, + "completions/mean_terminated_length": 93.125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.05809004930779338, + "epoch": 0.46911065852002715, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.2818359629607008e-05, + "loss": 0.0, + "num_tokens": 3782024.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 257.25, + "completions/mean_terminated_length": 257.25, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "entropy": 0.01703940413426608, + "epoch": 0.4697895451459606, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.2795602742740217e-05, + "loss": 0.0, + "num_tokens": 3788274.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 136.0, + "completions/max_terminated_length": 136.0, + "completions/mean_length": 101.875, + "completions/mean_terminated_length": 101.875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.04470214154571295, + "epoch": 0.47046843177189407, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.277283013982916e-05, + "loss": 0.0, + "num_tokens": 3792169.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 154.0, + "completions/max_terminated_length": 154.0, + "completions/mean_length": 125.875, + "completions/mean_terminated_length": 125.875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.0648745964281261, + "epoch": 0.4711473183978276, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.2750041948894621e-05, + "loss": 0.0, + "num_tokens": 3796408.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 208.0, + "completions/max_terminated_length": 208.0, + "completions/mean_length": 189.25, + "completions/mean_terminated_length": 189.25, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "entropy": 0.09087196364998817, + "epoch": 0.47182620502376105, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.2727238298045002e-05, + "loss": 0.0, + "num_tokens": 3801690.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 169.0, + "completions/max_terminated_length": 169.0, + "completions/mean_length": 124.75, + "completions/mean_terminated_length": 124.75, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.0700354422442615, + "epoch": 0.4725050916496945, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.53125, + "learning_rate": 1.2704419315475629e-05, + "loss": -0.0, + "num_tokens": 3805960.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 200.0, + "completions/max_terminated_length": 200.0, + "completions/mean_length": 150.625, + "completions/mean_terminated_length": 150.625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.06748683424666524, + "epoch": 0.47318397827562797, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.2681585129468003e-05, + "loss": 0.0, + "num_tokens": 3810213.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 122.0, + "completions/max_terminated_length": 122.0, + "completions/mean_length": 120.5, + "completions/mean_terminated_length": 120.5, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.0440788296982646, + "epoch": 0.47386286490156143, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.2658735868389113e-05, + "loss": 0.0, + "num_tokens": 3814433.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 90.0, + "completions/max_terminated_length": 90.0, + "completions/mean_length": 64.875, + "completions/mean_terminated_length": 64.875, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "entropy": 0.025556961772963405, + "epoch": 0.4745417515274949, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.2635871660690677e-05, + "loss": 0.0, + "num_tokens": 3818176.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 191.625, + "completions/mean_terminated_length": 191.625, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "entropy": 0.08459043968468904, + "epoch": 0.47522063815342835, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.2612992634908454e-05, + "loss": 0.0, + "num_tokens": 3823413.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 196.0, + "completions/max_terminated_length": 196.0, + "completions/mean_length": 162.375, + "completions/mean_terminated_length": 162.375, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.04112151707522571, + "epoch": 0.47589952477936187, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "learning_rate": 1.259009891966149e-05, + "loss": -0.0, + "num_tokens": 3828152.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 147.75, + "completions/mean_terminated_length": 147.75, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.11485203076153994, + "epoch": 0.47657841140529533, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.625, + "learning_rate": 1.2567190643651426e-05, + "loss": -0.0, + "num_tokens": 3832646.0, + "reward": 2.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, + "step": 702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 322.0, + "completions/mean_terminated_length": 322.0, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.024960620794445276, + "epoch": 0.4772572980312288, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "learning_rate": 1.2544267935661751e-05, + "loss": -0.0, + "num_tokens": 3840174.0, + "reward": 2.3249998092651367, + "reward_std": 0.41661909222602844, + "rewards/fixed_code_pass_all_test_reward/mean": 0.32500001788139343, + "rewards/fixed_code_pass_all_test_reward/std": 0.41661903262138367, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 182.0, + "completions/max_terminated_length": 182.0, + "completions/mean_length": 158.25, + "completions/mean_terminated_length": 158.25, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.062057614559307694, + "epoch": 0.47793618465716226, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.2521330924557087e-05, + "loss": 0.0, + "num_tokens": 3844552.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 154.0, + "completions/max_terminated_length": 154.0, + "completions/mean_length": 117.25, + "completions/mean_terminated_length": 117.25, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.04860835149884224, + "epoch": 0.4786150712830957, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.2498379739282465e-05, + "loss": 0.0, + "num_tokens": 3848698.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 242.0, + "completions/max_terminated_length": 242.0, + "completions/mean_length": 213.125, + "completions/mean_terminated_length": 213.125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.033762684324756265, + "epoch": 0.4792939579090292, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.2475414508862598e-05, + "loss": 0.0, + "num_tokens": 3854675.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 161.0, + "completions/mean_terminated_length": 161.0, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.04638125002384186, + "epoch": 0.47997284453496264, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.53125, + "learning_rate": 1.2452435362401161e-05, + "loss": 0.0, + "num_tokens": 3859595.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 75.0, + "completions/max_terminated_length": 75.0, + "completions/mean_length": 70.25, + "completions/mean_terminated_length": 70.25, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "entropy": 0.03012160398066044, + "epoch": 0.48065173116089616, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.2429442429080054e-05, + "loss": 0.0, + "num_tokens": 3863397.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 232.5, + "completions/mean_terminated_length": 232.5, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "entropy": 0.02008997299708426, + "epoch": 0.4813306177868296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.2406435838158686e-05, + "loss": 0.0, + "num_tokens": 3874049.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 126.0, + "completions/max_terminated_length": 126.0, + "completions/mean_length": 94.5, + "completions/mean_terminated_length": 94.5, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.03085462236776948, + "epoch": 0.4820095044127631, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.2383415718973245e-05, + "loss": 0.0, + "num_tokens": 3878021.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 242.75, + "completions/mean_terminated_length": 242.75, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "entropy": 0.03501976956613362, + "epoch": 0.48268839103869654, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.2360382200935971e-05, + "loss": 0.0, + "num_tokens": 3884267.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 170.0, + "completions/mean_terminated_length": 170.0, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.1122683584690094, + "epoch": 0.48336727766463, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3125, + "learning_rate": 1.2337335413534428e-05, + "loss": 0.0, + "num_tokens": 3888931.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 81.0, + "completions/max_terminated_length": 81.0, + "completions/mean_length": 75.0, + "completions/mean_terminated_length": 75.0, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.02884731814265251, + "epoch": 0.48404616429056346, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.1875, + "learning_rate": 1.2314275486330778e-05, + "loss": -0.0, + "num_tokens": 3892627.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 197.75, + "completions/mean_terminated_length": 197.75, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.0856550857424736, + "epoch": 0.4847250509164969, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.328125, + "learning_rate": 1.2291202548961042e-05, + "loss": 0.0, + "num_tokens": 3897905.0, + "reward": 1.875, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 155.0, + "completions/max_terminated_length": 155.0, + "completions/mean_length": 126.875, + "completions/mean_terminated_length": 126.875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.0559721770696342, + "epoch": 0.48540393754243044, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.22681167311344e-05, + "loss": 0.0, + "num_tokens": 3902264.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 189.0, + "completions/max_terminated_length": 189.0, + "completions/mean_length": 144.75, + "completions/mean_terminated_length": 144.75, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.09684339864179492, + "epoch": 0.4860828241683639, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.2245018162632421e-05, + "loss": 0.0, + "num_tokens": 3907118.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 144.0, + "completions/max_terminated_length": 144.0, + "completions/mean_length": 137.5, + "completions/mean_terminated_length": 137.5, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.04584557842463255, + "epoch": 0.48676171079429736, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.2221906973308365e-05, + "loss": 0.0, + "num_tokens": 3911522.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 389.625, + "completions/mean_terminated_length": 389.625, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "entropy": 0.12988235335797071, + "epoch": 0.4874405974202308, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "learning_rate": 1.2198783293086442e-05, + "loss": -0.0, + "num_tokens": 3920423.0, + "reward": 2.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 109.0, + "completions/max_terminated_length": 109.0, + "completions/mean_length": 97.0, + "completions/mean_terminated_length": 97.0, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.03626663563773036, + "epoch": 0.4881194840461643, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.217564725196108e-05, + "loss": 0.0, + "num_tokens": 3924511.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 159.0, + "completions/max_terminated_length": 159.0, + "completions/mean_length": 101.25, + "completions/mean_terminated_length": 101.25, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.050255367532372475, + "epoch": 0.48879837067209775, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.5625, + "learning_rate": 1.2152498979996195e-05, + "loss": 0.0, + "num_tokens": 3928489.0, + "reward": 2.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 180.0, + "completions/max_terminated_length": 180.0, + "completions/mean_length": 106.0, + "completions/mean_terminated_length": 106.0, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.10353991389274597, + "epoch": 0.4894772572980312, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.2129338607324468e-05, + "loss": 0.0, + "num_tokens": 3932633.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 159.0, + "completions/max_terminated_length": 159.0, + "completions/mean_length": 118.125, + "completions/mean_terminated_length": 118.125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.06880290480330586, + "epoch": 0.49015614392396467, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.875, + "learning_rate": 1.2106166264146598e-05, + "loss": -0.0, + "num_tokens": 3936754.0, + "reward": 2.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 193.875, + "completions/mean_terminated_length": 193.875, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "entropy": 0.0662290845066309, + "epoch": 0.4908350305498982, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.265625, + "learning_rate": 1.2082982080730583e-05, + "loss": 0.0, + "num_tokens": 3942129.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 238.0, + "completions/max_terminated_length": 238.0, + "completions/mean_length": 187.375, + "completions/mean_terminated_length": 187.375, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "entropy": 0.06920376792550087, + "epoch": 0.49151391717583165, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6875, + "learning_rate": 1.2059786187410984e-05, + "loss": -0.0, + "num_tokens": 3946980.0, + "reward": 2.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 129.0, + "completions/max_terminated_length": 129.0, + "completions/mean_length": 125.75, + "completions/mean_terminated_length": 125.75, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.038067944813519716, + "epoch": 0.4921928038017651, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.2036578714588191e-05, + "loss": 0.0, + "num_tokens": 3951482.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 186.0, + "completions/max_terminated_length": 186.0, + "completions/mean_length": 164.125, + "completions/mean_terminated_length": 164.125, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.06174566503614187, + "epoch": 0.49287169042769857, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "learning_rate": 1.2013359792727688e-05, + "loss": -0.0, + "num_tokens": 3956019.0, + "reward": 2.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 322.75, + "completions/mean_terminated_length": 322.75, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "entropy": 0.03196876007132232, + "epoch": 0.49355057705363203, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.1990129552359326e-05, + "loss": 0.0, + "num_tokens": 3964737.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 227.375, + "completions/mean_terminated_length": 227.375, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "entropy": 0.04362545418553054, + "epoch": 0.4942294636795655, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.453125, + "learning_rate": 1.1966888124076584e-05, + "loss": -0.0, + "num_tokens": 3971356.0, + "reward": 1.875, + "reward_std": 0.25495100021362305, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.25495097041130066, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 168.0, + "completions/max_terminated_length": 168.0, + "completions/mean_length": 136.0, + "completions/mean_terminated_length": 136.0, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.06535260938107967, + "epoch": 0.49490835030549896, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.1943635638535827e-05, + "loss": 0.0, + "num_tokens": 3975748.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 200.0, + "completions/max_terminated_length": 200.0, + "completions/mean_length": 186.75, + "completions/mean_terminated_length": 186.75, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.01673401170410216, + "epoch": 0.4955872369314325, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.19203722264556e-05, + "loss": 0.0, + "num_tokens": 3981242.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 134.0, + "completions/max_terminated_length": 134.0, + "completions/mean_length": 107.875, + "completions/mean_terminated_length": 107.875, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.050440984312444925, + "epoch": 0.49626612355736593, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.28125, + "learning_rate": 1.1897098018615854e-05, + "loss": 0.0, + "num_tokens": 3985297.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 198.375, + "completions/mean_terminated_length": 198.375, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.05595476645976305, + "epoch": 0.4969450101832994, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "learning_rate": 1.187381314585725e-05, + "loss": -0.0, + "num_tokens": 3991420.0, + "reward": 1.6875, + "reward_std": 0.19594092667102814, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, + "rewards/fixed_code_pass_all_test_reward/std": 0.19594095647335052, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 259.375, + "completions/mean_terminated_length": 259.375, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "entropy": 0.043723273323848844, + "epoch": 0.49762389680923286, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "learning_rate": 1.1850517739080381e-05, + "loss": -0.0, + "num_tokens": 3998047.0, + "reward": 2.84375, + "reward_std": 0.2651650309562683, + "rewards/fixed_code_pass_all_test_reward/mean": 0.84375, + "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 176.0, + "completions/max_terminated_length": 176.0, + "completions/mean_length": 164.75, + "completions/mean_terminated_length": 164.75, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.101027418859303, + "epoch": 0.4983027834351663, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.34375, + "learning_rate": 1.1827211929245075e-05, + "loss": 0.0, + "num_tokens": 4002669.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 217.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 153.625, + "completions/mean_terminated_length": 153.625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.16557116620242596, + "epoch": 0.4989816700610998, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.75, + "learning_rate": 1.1803895847369645e-05, + "loss": -0.0, + "num_tokens": 4007402.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 135.0, + "completions/max_terminated_length": 135.0, + "completions/mean_length": 124.0, + "completions/mean_terminated_length": 124.0, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.05493178544566035, + "epoch": 0.49966055668703324, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.1780569624530135e-05, + "loss": 0.0, + "num_tokens": 4011658.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 149.0, + "completions/max_terminated_length": 149.0, + "completions/mean_length": 106.125, + "completions/mean_terminated_length": 106.125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.058956142980605364, + "epoch": 0.5003394433129668, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.21875, + "learning_rate": 1.1757233391859617e-05, + "loss": 0.0, + "num_tokens": 4015547.0, + "reward": 2.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 190.0, + "completions/max_terminated_length": 190.0, + "completions/mean_length": 145.625, + "completions/mean_terminated_length": 145.625, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.04606771096587181, + "epoch": 0.5010183299389002, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.1733887280547425e-05, + "loss": 0.0, + "num_tokens": 4020016.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 179.0, + "completions/max_terminated_length": 179.0, + "completions/mean_length": 121.125, + "completions/mean_terminated_length": 121.125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.1013864791020751, + "epoch": 0.5016972165648337, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.1710531421838422e-05, + "loss": 0.0, + "num_tokens": 4024513.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 183.0, + "completions/max_terminated_length": 183.0, + "completions/mean_length": 107.375, + "completions/mean_terminated_length": 107.375, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.06689020246267319, + "epoch": 0.5023761031907671, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.515625, + "learning_rate": 1.1687165947032285e-05, + "loss": -0.0, + "num_tokens": 4028660.0, + "reward": 2.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 160.0, + "completions/max_terminated_length": 160.0, + "completions/mean_length": 130.25, + "completions/mean_terminated_length": 130.25, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.06759583251550794, + "epoch": 0.5030549898167006, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.1663790987482729e-05, + "loss": 0.0, + "num_tokens": 4032998.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 142.0, + "completions/max_terminated_length": 142.0, + "completions/mean_length": 112.5, + "completions/mean_terminated_length": 112.5, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.07074190396815538, + "epoch": 0.5037338764426341, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.390625, + "learning_rate": 1.1640406674596807e-05, + "loss": 0.0, + "num_tokens": 4037058.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 177.0, + "completions/max_terminated_length": 177.0, + "completions/mean_length": 152.875, + "completions/mean_terminated_length": 152.875, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.033243893878534436, + "epoch": 0.5044127630685675, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.1617013139834148e-05, + "loss": 0.0, + "num_tokens": 4041473.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 277.0, + "completions/mean_terminated_length": 277.0, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "entropy": 0.033465131651610136, + "epoch": 0.505091649694501, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "learning_rate": 1.1593610514706217e-05, + "loss": -0.0, + "num_tokens": 4048889.0, + "reward": 1.9464285373687744, + "reward_std": 0.09919504076242447, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9464285373687744, + "rewards/fixed_code_pass_all_test_reward/std": 0.09919501841068268, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 247.375, + "completions/mean_terminated_length": 247.375, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "entropy": 0.01844643836375326, + "epoch": 0.5057705363204344, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.265625, + "learning_rate": 1.1570198930775594e-05, + "loss": 0.0, + "num_tokens": 4055012.0, + "reward": 2.3500001430511475, + "reward_std": 0.09258199483156204, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3500000238418579, + "rewards/fixed_code_pass_all_test_reward/std": 0.09258200973272324, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 199.625, + "completions/mean_terminated_length": 199.625, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.05799560155719519, + "epoch": 0.506449422946368, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8828125, + "learning_rate": 1.1546778519655209e-05, + "loss": -0.0, + "num_tokens": 4060441.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 218.25, + "completions/mean_terminated_length": 218.25, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 0.04173798905685544, + "epoch": 0.5071283095723014, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.1523349413007633e-05, + "loss": 0.0, + "num_tokens": 4066203.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 191.0, + "completions/max_terminated_length": 191.0, + "completions/mean_length": 106.125, + "completions/mean_terminated_length": 106.125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.026790649397298694, + "epoch": 0.5078071961982349, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.1499911742544304e-05, + "loss": 0.0, + "num_tokens": 4070876.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 196.0, + "completions/max_terminated_length": 196.0, + "completions/mean_length": 161.0, + "completions/mean_terminated_length": 161.0, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.05811635544523597, + "epoch": 0.5084860828241684, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.1476465640024814e-05, + "loss": 0.0, + "num_tokens": 4075348.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 274.375, + "completions/mean_terminated_length": 274.375, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.05909173237159848, + "epoch": 0.5091649694501018, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "learning_rate": 1.1453011237256152e-05, + "loss": -0.0, + "num_tokens": 4082407.0, + "reward": 2.625, + "reward_std": 0.4825679659843445, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.48256784677505493, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 153.0, + "completions/max_terminated_length": 153.0, + "completions/mean_length": 134.375, + "completions/mean_terminated_length": 134.375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.03886709990911186, + "epoch": 0.5098438560760353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.1429548666091969e-05, + "loss": 0.0, + "num_tokens": 4086770.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 157.0, + "completions/max_terminated_length": 157.0, + "completions/mean_length": 111.5, + "completions/mean_terminated_length": 111.5, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.058395545929670334, + "epoch": 0.5105227427019687, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.171875, + "learning_rate": 1.140607805843184e-05, + "loss": 0.0, + "num_tokens": 4090790.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 156.0, + "completions/max_terminated_length": 156.0, + "completions/mean_length": 139.125, + "completions/mean_terminated_length": 139.125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.032309852074831724, + "epoch": 0.5112016293279023, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.1382599546220516e-05, + "loss": 0.0, + "num_tokens": 4095447.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 115.0, + "completions/max_terminated_length": 115.0, + "completions/mean_length": 92.5, + "completions/mean_terminated_length": 92.5, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.0683764317072928, + "epoch": 0.5118805159538357, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.1359113261447183e-05, + "loss": 0.0, + "num_tokens": 4099291.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 213.125, + "completions/mean_terminated_length": 213.125, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "entropy": 0.03968081600032747, + "epoch": 0.5125594025797692, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.133561933614473e-05, + "loss": 0.0, + "num_tokens": 4105764.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 166.0, + "completions/max_terminated_length": 166.0, + "completions/mean_length": 134.125, + "completions/mean_terminated_length": 134.125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.05837314995005727, + "epoch": 0.5132382892057027, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.421875, + "learning_rate": 1.1312117902388986e-05, + "loss": -0.0, + "num_tokens": 4110005.0, + "reward": 2.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 151.0, + "completions/max_terminated_length": 151.0, + "completions/mean_length": 117.0, + "completions/mean_terminated_length": 117.0, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.11437533749267459, + "epoch": 0.5139171758316361, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.921875, + "learning_rate": 1.1288609092298004e-05, + "loss": 0.0, + "num_tokens": 4114357.0, + "reward": 2.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 177.625, + "completions/mean_terminated_length": 177.625, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.06471283501014113, + "epoch": 0.5145960624575696, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.1265093038031294e-05, + "loss": 0.0, + "num_tokens": 4119658.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 98.0, + "completions/max_terminated_length": 98.0, + "completions/mean_length": 92.75, + "completions/mean_terminated_length": 92.75, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.06100855814293027, + "epoch": 0.515274949083503, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.71875, + "learning_rate": 1.1241569871789096e-05, + "loss": 0.0, + "num_tokens": 4123480.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, + "step": 759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 127.0, + "completions/max_terminated_length": 127.0, + "completions/mean_length": 122.125, + "completions/mean_terminated_length": 122.125, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.0679061939008534, + "epoch": 0.5159538357094365, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0625, + "learning_rate": 1.1218039725811626e-05, + "loss": -0.0, + "num_tokens": 4127697.0, + "reward": 0.875, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 185.0, + "completions/max_terminated_length": 185.0, + "completions/mean_length": 183.75, + "completions/mean_terminated_length": 183.75, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 0.020937865017913282, + "epoch": 0.5166327223353699, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.1194502732378349e-05, + "loss": 0.0, + "num_tokens": 4133671.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 207.375, + "completions/mean_terminated_length": 207.375, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "entropy": 0.015055299969390035, + "epoch": 0.5173116089613035, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.1170959023807216e-05, + "loss": 0.0, + "num_tokens": 4139978.0, + "reward": 2.375, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 309.125, + "completions/mean_terminated_length": 309.125, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "entropy": 0.02946487721055746, + "epoch": 0.517990495587237, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.1147408732453926e-05, + "loss": 0.0, + "num_tokens": 4147099.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 184.375, + "completions/mean_terminated_length": 184.375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "entropy": 0.032556057907640934, + "epoch": 0.5186693822131704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.112385199071119e-05, + "loss": 0.0, + "num_tokens": 4152310.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 186.25, + "completions/mean_terminated_length": 186.25, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 0.036190603859722614, + "epoch": 0.5193482688391039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.1100288931007982e-05, + "loss": 0.0, + "num_tokens": 4157656.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 193.0, + "completions/max_terminated_length": 193.0, + "completions/mean_length": 187.25, + "completions/mean_terminated_length": 187.25, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "entropy": 0.04345138184726238, + "epoch": 0.5200271554650373, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8046875, + "learning_rate": 1.1076719685808786e-05, + "loss": 0.0, + "num_tokens": 4163178.0, + "reward": 2.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 189.125, + "completions/mean_terminated_length": 189.125, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.04096515872515738, + "epoch": 0.5207060420909708, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.105314438761287e-05, + "loss": 0.0, + "num_tokens": 4168715.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 119.0, + "completions/max_terminated_length": 119.0, + "completions/mean_length": 112.25, + "completions/mean_terminated_length": 112.25, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.028937431052327156, + "epoch": 0.5213849287169042, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.102956316895352e-05, + "loss": 0.0, + "num_tokens": 4173229.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 151.0, + "completions/max_terminated_length": 151.0, + "completions/mean_length": 147.0, + "completions/mean_terminated_length": 147.0, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.03585202293470502, + "epoch": 0.5220638153428377, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.1005976162397309e-05, + "loss": 0.0, + "num_tokens": 4177549.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 184.625, + "completions/mean_terminated_length": 184.625, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "entropy": 0.030628055799752474, + "epoch": 0.5227427019687713, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.0982383500543355e-05, + "loss": 0.0, + "num_tokens": 4183226.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 154.0, + "completions/max_terminated_length": 154.0, + "completions/mean_length": 136.625, + "completions/mean_terminated_length": 136.625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.05688394093886018, + "epoch": 0.5234215885947047, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.0958785316022551e-05, + "loss": 0.0, + "num_tokens": 4187559.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 199.875, + "completions/mean_terminated_length": 199.875, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "entropy": 0.030420107068493962, + "epoch": 0.5241004752206382, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "learning_rate": 1.0935181741496858e-05, + "loss": 0.0, + "num_tokens": 4192886.0, + "reward": 1.875, + "reward_std": 0.10350986570119858, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.1035098284482956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 243.625, + "completions/mean_terminated_length": 243.625, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "entropy": 0.06591624300926924, + "epoch": 0.5247793618465716, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.171875, + "learning_rate": 1.0911572909658524e-05, + "loss": -0.0, + "num_tokens": 4199955.0, + "reward": 2.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 142.0, + "completions/max_terminated_length": 142.0, + "completions/mean_length": 130.875, + "completions/mean_terminated_length": 130.875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.056005898863077164, + "epoch": 0.5254582484725051, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.0625, + "learning_rate": 1.0887958953229349e-05, + "loss": -0.0, + "num_tokens": 4204130.0, + "reward": 2.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 172.0, + "completions/max_terminated_length": 172.0, + "completions/mean_length": 152.75, + "completions/mean_terminated_length": 152.75, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.0647927368991077, + "epoch": 0.5261371350984385, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.171875, + "learning_rate": 1.0864340004959957e-05, + "loss": -0.0, + "num_tokens": 4209320.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 151.0, + "completions/max_terminated_length": 151.0, + "completions/mean_length": 145.375, + "completions/mean_terminated_length": 145.375, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.05679185036569834, + "epoch": 0.526816021724372, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.625, + "learning_rate": 1.084071619762902e-05, + "loss": -0.0, + "num_tokens": 4214203.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 125.0, + "completions/max_terminated_length": 125.0, + "completions/mean_length": 94.625, + "completions/mean_terminated_length": 94.625, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.04140359256416559, + "epoch": 0.5274949083503055, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.0817087664042536e-05, + "loss": 0.0, + "num_tokens": 4218152.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 189.0, + "completions/max_terminated_length": 189.0, + "completions/mean_length": 126.25, + "completions/mean_terminated_length": 126.25, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.09763273131102324, + "epoch": 0.528173794976239, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.0793454537033068e-05, + "loss": 0.0, + "num_tokens": 4222370.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 228.75, + "completions/mean_terminated_length": 228.75, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.04697759263217449, + "epoch": 0.5288526816021725, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8125, + "learning_rate": 1.0769816949459002e-05, + "loss": 0.0, + "num_tokens": 4228688.0, + "reward": 2.6500000953674316, + "reward_std": 0.26726123690605164, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6499999761581421, + "rewards/fixed_code_pass_all_test_reward/std": 0.26726123690605164, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 161.0, + "completions/max_terminated_length": 161.0, + "completions/mean_length": 138.125, + "completions/mean_terminated_length": 138.125, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.04557292116805911, + "epoch": 0.5295315682281059, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.0746175034203799e-05, + "loss": 0.0, + "num_tokens": 4233057.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 183.0, + "completions/max_terminated_length": 183.0, + "completions/mean_length": 101.0, + "completions/mean_terminated_length": 101.0, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.05933442106470466, + "epoch": 0.5302104548540394, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.453125, + "learning_rate": 1.0722528924175254e-05, + "loss": -0.0, + "num_tokens": 4237161.0, + "reward": 2.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 184.875, + "completions/mean_terminated_length": 184.875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.052812352776527405, + "epoch": 0.5308893414799728, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.0698878752304738e-05, + "loss": 0.0, + "num_tokens": 4242456.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 218.0, + "completions/max_terminated_length": 218.0, + "completions/mean_length": 150.625, + "completions/mean_terminated_length": 150.625, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.07757335249334574, + "epoch": 0.5315682281059063, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.953125, + "learning_rate": 1.0675224651546459e-05, + "loss": -0.0, + "num_tokens": 4247149.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 154.0, + "completions/mean_terminated_length": 154.0, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.04613668704405427, + "epoch": 0.5322471147318398, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "learning_rate": 1.0651566754876715e-05, + "loss": -0.0, + "num_tokens": 4251917.0, + "reward": 2.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 225.0, + "completions/max_terminated_length": 225.0, + "completions/mean_length": 180.0, + "completions/mean_terminated_length": 180.0, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "entropy": 0.05881976243108511, + "epoch": 0.5329260013577732, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.0627905195293135e-05, + "loss": 0.0, + "num_tokens": 4256885.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 179.0, + "completions/max_terminated_length": 179.0, + "completions/mean_length": 149.375, + "completions/mean_terminated_length": 149.375, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.04891295824199915, + "epoch": 0.5336048879837068, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.0604240105813948e-05, + "loss": 0.0, + "num_tokens": 4261520.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 287.875, + "completions/mean_terminated_length": 287.875, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "entropy": 0.05009777005761862, + "epoch": 0.5342837746096402, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.109375, + "learning_rate": 1.0580571619477225e-05, + "loss": -0.0, + "num_tokens": 4268583.0, + "reward": 2.375, + "reward_std": 0.5496752262115479, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.36645016074180603, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 258.75, + "completions/mean_terminated_length": 258.75, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "entropy": 0.03557685250416398, + "epoch": 0.5349626612355737, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.15625, + "learning_rate": 1.0556899869340127e-05, + "loss": -0.0, + "num_tokens": 4275069.0, + "reward": 2.9791665077209473, + "reward_std": 0.058925628662109375, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9791666269302368, + "rewards/fixed_code_pass_all_test_reward/std": 0.0589255727827549, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 166.0, + "completions/max_terminated_length": 166.0, + "completions/mean_length": 123.125, + "completions/mean_terminated_length": 123.125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.08843094576150179, + "epoch": 0.5356415478615071, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.0533224988478176e-05, + "loss": 0.0, + "num_tokens": 4279182.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 205.875, + "completions/mean_terminated_length": 205.875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 0.08234369801357388, + "epoch": 0.5363204344874406, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.65625, + "learning_rate": 1.0509547109984484e-05, + "loss": 0.0, + "num_tokens": 4284333.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 193.75, + "completions/mean_terminated_length": 193.75, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "entropy": 0.030832044780254364, + "epoch": 0.5369993211133741, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.53125, + "learning_rate": 1.0485866366969012e-05, + "loss": 0.0, + "num_tokens": 4289875.0, + "reward": 2.953125, + "reward_std": 0.13258251547813416, + "rewards/fixed_code_pass_all_test_reward/mean": 0.953125, + "rewards/fixed_code_pass_all_test_reward/std": 0.13258251547813416, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 163.0, + "completions/max_terminated_length": 163.0, + "completions/mean_length": 147.75, + "completions/mean_terminated_length": 147.75, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.06975599378347397, + "epoch": 0.5376782077393075, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.328125, + "learning_rate": 1.0462182892557834e-05, + "loss": -0.0, + "num_tokens": 4294137.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 104.0, + "completions/max_terminated_length": 104.0, + "completions/mean_length": 100.125, + "completions/mean_terminated_length": 100.125, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.02164299343712628, + "epoch": 0.538357094365241, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.0438496819892376e-05, + "loss": 0.0, + "num_tokens": 4298146.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 184.0, + "completions/max_terminated_length": 184.0, + "completions/mean_length": 169.25, + "completions/mean_terminated_length": 169.25, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.07684734044596553, + "epoch": 0.5390359809911744, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "learning_rate": 1.0414808282128668e-05, + "loss": -0.0, + "num_tokens": 4302668.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 160.0, + "completions/max_terminated_length": 160.0, + "completions/mean_length": 142.5, + "completions/mean_terminated_length": 142.5, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.06801590882241726, + "epoch": 0.539714867617108, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.03911174124366e-05, + "loss": 0.0, + "num_tokens": 4307712.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 181.0, + "completions/max_terminated_length": 181.0, + "completions/mean_length": 167.75, + "completions/mean_terminated_length": 167.75, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.057615553960204124, + "epoch": 0.5403937542430414, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.0367424343999164e-05, + "loss": 0.0, + "num_tokens": 4312526.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 195.25, + "completions/mean_terminated_length": 195.25, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "entropy": 0.040979793295264244, + "epoch": 0.5410726408689749, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.453125, + "learning_rate": 1.0343729210011731e-05, + "loss": 0.0, + "num_tokens": 4317536.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 138.0, + "completions/max_terminated_length": 138.0, + "completions/mean_length": 122.0, + "completions/mean_terminated_length": 122.0, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.06204876606352627, + "epoch": 0.5417515274949084, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.0320032143681262e-05, + "loss": 0.0, + "num_tokens": 4321784.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 240.875, + "completions/mean_terminated_length": 240.875, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "entropy": 0.055161413038149476, + "epoch": 0.5424304141208418, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.375, + "learning_rate": 1.0296333278225599e-05, + "loss": -0.0, + "num_tokens": 4327735.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 183.0, + "completions/max_terminated_length": 183.0, + "completions/mean_length": 147.375, + "completions/mean_terminated_length": 147.375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.0742807905189693, + "epoch": 0.5431093007467753, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.03125, + "learning_rate": 1.0272632746872687e-05, + "loss": 0.0, + "num_tokens": 4332042.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 241.375, + "completions/mean_terminated_length": 241.375, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "entropy": 0.049888757057487965, + "epoch": 0.5437881873727087, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.46875, + "learning_rate": 1.0248930682859839e-05, + "loss": -0.0, + "num_tokens": 4337965.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 151.5, + "completions/mean_terminated_length": 151.5, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.050024800933897495, + "epoch": 0.5444670739986422, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "learning_rate": 1.0225227219432988e-05, + "loss": -0.0, + "num_tokens": 4343345.0, + "reward": 1.84375, + "reward_std": 0.0883883461356163, + "rewards/fixed_code_pass_all_test_reward/mean": 0.84375, + "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 156.0, + "completions/max_terminated_length": 156.0, + "completions/mean_length": 135.625, + "completions/mean_terminated_length": 135.625, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.07001071749255061, + "epoch": 0.5451459606245757, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "learning_rate": 1.0201522489845927e-05, + "loss": 0.0, + "num_tokens": 4347694.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 344.625, + "completions/mean_terminated_length": 344.625, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "entropy": 0.032825993141159415, + "epoch": 0.5458248472505092, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8515625, + "learning_rate": 1.0177816627359575e-05, + "loss": -0.0, + "num_tokens": 4355355.0, + "reward": 2.6624999046325684, + "reward_std": 0.46579432487487793, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6625000238418579, + "rewards/fixed_code_pass_all_test_reward/std": 0.46579426527023315, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 146.0, + "completions/max_terminated_length": 146.0, + "completions/mean_length": 125.375, + "completions/mean_terminated_length": 125.375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.056563244201242924, + "epoch": 0.5465037338764427, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.0154109765241214e-05, + "loss": 0.0, + "num_tokens": 4359550.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 169.0, + "completions/max_terminated_length": 169.0, + "completions/mean_length": 151.25, + "completions/mean_terminated_length": 151.25, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.08645900897681713, + "epoch": 0.5471826205023761, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.28125, + "learning_rate": 1.0130402036763747e-05, + "loss": -0.0, + "num_tokens": 4364032.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 221.625, + "completions/mean_terminated_length": 221.625, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.051528535317629576, + "epoch": 0.5478615071283096, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.0106693575204947e-05, + "loss": 0.0, + "num_tokens": 4369149.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 187.5, + "completions/mean_terminated_length": 187.5, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.06195195019245148, + "epoch": 0.548540393754243, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.0082984513846713e-05, + "loss": 0.0, + "num_tokens": 4374433.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 157.0, + "completions/max_terminated_length": 157.0, + "completions/mean_length": 139.75, + "completions/mean_terminated_length": 139.75, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.07225760025903583, + "epoch": 0.5492192803801765, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.609375, + "learning_rate": 1.0059274985974305e-05, + "loss": 0.0, + "num_tokens": 4378959.0, + "reward": 0.875, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 115.0, + "completions/max_terminated_length": 115.0, + "completions/mean_length": 73.875, + "completions/mean_terminated_length": 73.875, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "entropy": 0.04130575666204095, + "epoch": 0.5498981670061099, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.375, + "learning_rate": 1.0035565124875623e-05, + "loss": -0.0, + "num_tokens": 4382598.0, + "reward": 2.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 176.0, + "completions/max_terminated_length": 176.0, + "completions/mean_length": 158.25, + "completions/mean_terminated_length": 158.25, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.08188450988382101, + "epoch": 0.5505770536320435, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.578125, + "learning_rate": 1.0011855063840416e-05, + "loss": -0.0, + "num_tokens": 4387088.0, + "reward": 2.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 184.0, + "completions/max_terminated_length": 184.0, + "completions/mean_length": 126.125, + "completions/mean_terminated_length": 126.125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.06766916019842029, + "epoch": 0.551255940257977, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 9.988144936159582e-06, + "loss": 0.0, + "num_tokens": 4392089.0, + "reward": 2.875, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 162.0, + "completions/max_terminated_length": 162.0, + "completions/mean_length": 142.625, + "completions/mean_terminated_length": 142.625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.07640710193663836, + "epoch": 0.5519348268839104, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.015625, + "learning_rate": 9.96443487512438e-06, + "loss": 0.0, + "num_tokens": 4396510.0, + "reward": 2.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 234.0, + "completions/max_terminated_length": 234.0, + "completions/mean_length": 177.125, + "completions/mean_terminated_length": 177.125, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.07171546202152967, + "epoch": 0.5526137135098439, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.625, + "learning_rate": 9.940725014025696e-06, + "loss": 0.0, + "num_tokens": 4401407.0, + "reward": 2.0, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 221.125, + "completions/mean_terminated_length": 221.125, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "entropy": 0.05010488163679838, + "epoch": 0.5532926001357773, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "learning_rate": 9.91701548615329e-06, + "loss": 0.0, + "num_tokens": 4407464.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 180.875, + "completions/mean_terminated_length": 180.875, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.07103995932266116, + "epoch": 0.5539714867617108, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5625, + "learning_rate": 9.893306424795055e-06, + "loss": -0.0, + "num_tokens": 4412431.0, + "reward": 2.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 339.125, + "completions/mean_terminated_length": 339.125, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "entropy": 0.0642473828047514, + "epoch": 0.5546503733876442, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "learning_rate": 9.869597963236253e-06, + "loss": -0.0, + "num_tokens": 4420120.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 150.0, + "completions/max_terminated_length": 150.0, + "completions/mean_length": 117.75, + "completions/mean_terminated_length": 117.75, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.06086629815399647, + "epoch": 0.5553292600135777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 9.845890234758789e-06, + "loss": 0.0, + "num_tokens": 4424294.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 232.0, + "completions/mean_terminated_length": 232.0, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "entropy": 0.0363885962869972, + "epoch": 0.5560081466395111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 9.822183372640426e-06, + "loss": 0.0, + "num_tokens": 4430718.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 160.0, + "completions/max_terminated_length": 160.0, + "completions/mean_length": 117.0, + "completions/mean_terminated_length": 117.0, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.048767969477921724, + "epoch": 0.5566870332654447, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 9.798477510154075e-06, + "loss": 0.0, + "num_tokens": 4435102.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 313.375, + "completions/mean_terminated_length": 313.375, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "entropy": 0.04720056499354541, + "epoch": 0.5573659198913782, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "learning_rate": 9.774772780567017e-06, + "loss": 0.0, + "num_tokens": 4442409.0, + "reward": 2.5, + "reward_std": 1.0690449476242065, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 180.0, + "completions/max_terminated_length": 180.0, + "completions/mean_length": 173.375, + "completions/mean_terminated_length": 173.375, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "entropy": 0.054787850473076105, + "epoch": 0.5580448065173116, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 9.751069317140163e-06, + "loss": 0.0, + "num_tokens": 4447876.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 130.0, + "completions/max_terminated_length": 130.0, + "completions/mean_length": 101.0, + "completions/mean_terminated_length": 101.0, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.05901277018710971, + "epoch": 0.5587236931432451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 9.727367253127315e-06, + "loss": 0.0, + "num_tokens": 4452116.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 182.0, + "completions/max_terminated_length": 182.0, + "completions/mean_length": 162.625, + "completions/mean_terminated_length": 162.625, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.052709260024130344, + "epoch": 0.5594025797691785, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 9.703666721774403e-06, + "loss": 0.0, + "num_tokens": 4456617.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 225.0, + "completions/max_terminated_length": 225.0, + "completions/mean_length": 177.875, + "completions/mean_terminated_length": 177.875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.0534027679823339, + "epoch": 0.560081466395112, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 9.679967856318741e-06, + "loss": 0.0, + "num_tokens": 4462128.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 183.0, + "completions/max_terminated_length": 183.0, + "completions/mean_length": 158.125, + "completions/mean_terminated_length": 158.125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.09539056662470102, + "epoch": 0.5607603530210454, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 9.656270789988274e-06, + "loss": 0.0, + "num_tokens": 4466729.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 204.0, + "completions/mean_terminated_length": 204.0, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.0635853330604732, + "epoch": 0.561439239646979, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "learning_rate": 9.632575656000837e-06, + "loss": -0.0, + "num_tokens": 4472473.0, + "reward": 2.7249999046325684, + "reward_std": 0.30118808150291443, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7250000238418579, + "rewards/fixed_code_pass_all_test_reward/std": 0.3011881411075592, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 159.0, + "completions/max_terminated_length": 159.0, + "completions/mean_length": 135.375, + "completions/mean_terminated_length": 135.375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.05248894030228257, + "epoch": 0.5621181262729125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 9.608882587563404e-06, + "loss": 0.0, + "num_tokens": 4476940.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 231.875, + "completions/mean_terminated_length": 231.875, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.024271421716548502, + "epoch": 0.5627970128988459, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 9.585191717871336e-06, + "loss": 0.0, + "num_tokens": 4482931.0, + "reward": 2.4000000953674316, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4000000059604645, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 176.125, + "completions/mean_terminated_length": 176.125, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.06745913159102201, + "epoch": 0.5634758995247794, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 9.561503180107626e-06, + "loss": 0.0, + "num_tokens": 4487588.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 144.0, + "completions/max_terminated_length": 144.0, + "completions/mean_length": 138.75, + "completions/mean_terminated_length": 138.75, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.046886586118489504, + "epoch": 0.5641547861507128, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 9.53781710744217e-06, + "loss": 0.0, + "num_tokens": 4491962.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 237.0, + "completions/max_terminated_length": 237.0, + "completions/mean_length": 191.375, + "completions/mean_terminated_length": 191.375, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.09951631911098957, + "epoch": 0.5648336727766463, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 9.514133633030987e-06, + "loss": 0.0, + "num_tokens": 4496925.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 424.625, + "completions/mean_terminated_length": 424.625, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "entropy": 0.019229991012252867, + "epoch": 0.5655125594025797, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 9.49045289001552e-06, + "loss": 0.0, + "num_tokens": 4505978.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 120.0, + "completions/max_terminated_length": 120.0, + "completions/mean_length": 94.25, + "completions/mean_terminated_length": 94.25, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.05636278260499239, + "epoch": 0.5661914460285132, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 9.466775011521825e-06, + "loss": 0.0, + "num_tokens": 4509828.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 197.375, + "completions/mean_terminated_length": 197.375, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "entropy": 0.03414254495874047, + "epoch": 0.5668703326544468, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 9.443100130659876e-06, + "loss": 0.0, + "num_tokens": 4515431.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 150.0, + "completions/max_terminated_length": 150.0, + "completions/mean_length": 139.875, + "completions/mean_terminated_length": 139.875, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.02369208401069045, + "epoch": 0.5675492192803802, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.15625, + "learning_rate": 9.41942838052278e-06, + "loss": -0.0, + "num_tokens": 4519934.0, + "reward": 2.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 115.0, + "completions/max_terminated_length": 115.0, + "completions/mean_length": 96.125, + "completions/mean_terminated_length": 96.125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.050153948366642, + "epoch": 0.5682281059063137, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.875, + "learning_rate": 9.395759894186054e-06, + "loss": 0.0, + "num_tokens": 4523679.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 255.875, + "completions/mean_terminated_length": 255.875, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "entropy": 0.036000629188492894, + "epoch": 0.5689069925322471, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 9.372094804706867e-06, + "loss": 0.0, + "num_tokens": 4530342.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 184.0, + "completions/max_terminated_length": 184.0, + "completions/mean_length": 162.25, + "completions/mean_terminated_length": 162.25, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.06715370435267687, + "epoch": 0.5695858791581806, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 9.34843324512329e-06, + "loss": 0.0, + "num_tokens": 4535176.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 194.0, + "completions/max_terminated_length": 194.0, + "completions/mean_length": 126.125, + "completions/mean_terminated_length": 126.125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.05983061483129859, + "epoch": 0.570264765784114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 9.324775348453543e-06, + "loss": 0.0, + "num_tokens": 4540121.0, + "reward": 1.875, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 202.0, + "completions/max_terminated_length": 202.0, + "completions/mean_length": 160.125, + "completions/mean_terminated_length": 160.125, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.09151466190814972, + "epoch": 0.5709436524100475, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.28125, + "learning_rate": 9.301121247695265e-06, + "loss": 0.0, + "num_tokens": 4544754.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 161.0, + "completions/max_terminated_length": 161.0, + "completions/mean_length": 134.375, + "completions/mean_terminated_length": 134.375, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.06438526068814099, + "epoch": 0.571622539035981, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 9.277471075824747e-06, + "loss": 0.0, + "num_tokens": 4549133.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 334.375, + "completions/mean_terminated_length": 334.375, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "entropy": 0.032213454600423574, + "epoch": 0.5723014256619144, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 9.253824965796203e-06, + "loss": 0.0, + "num_tokens": 4556784.0, + "reward": 1.875, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 194.75, + "completions/mean_terminated_length": 194.75, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.09624125668779016, + "epoch": 0.572980312287848, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8828125, + "learning_rate": 9.230183050541001e-06, + "loss": 0.0, + "num_tokens": 4562246.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 186.5, + "completions/mean_terminated_length": 186.5, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.10141824604943395, + "epoch": 0.5736591989137814, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.265625, + "learning_rate": 9.206545462966935e-06, + "loss": 0.0, + "num_tokens": 4567050.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 150.0, + "completions/max_terminated_length": 150.0, + "completions/mean_length": 140.375, + "completions/mean_terminated_length": 140.375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.04522315924987197, + "epoch": 0.5743380855397149, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 9.18291233595747e-06, + "loss": 0.0, + "num_tokens": 4571261.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 180.0, + "completions/mean_terminated_length": 180.0, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "entropy": 0.0619788053445518, + "epoch": 0.5750169721656483, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.34375, + "learning_rate": 9.159283802370981e-06, + "loss": 0.0, + "num_tokens": 4575941.0, + "reward": 2.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 72.0, + "completions/max_terminated_length": 72.0, + "completions/mean_length": 67.25, + "completions/mean_terminated_length": 67.25, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.03905652277171612, + "epoch": 0.5756958587915818, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 9.135659995040046e-06, + "loss": 0.0, + "num_tokens": 4579559.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 199.25, + "completions/mean_terminated_length": 199.25, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.03857954638078809, + "epoch": 0.5763747454175153, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.90625, + "learning_rate": 9.112041046770653e-06, + "loss": 0.0, + "num_tokens": 4585185.0, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, + "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 238.0, + "completions/max_terminated_length": 238.0, + "completions/mean_length": 208.0, + "completions/mean_terminated_length": 208.0, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "entropy": 0.026656872825697064, + "epoch": 0.5770536320434487, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 9.088427090341483e-06, + "loss": 0.0, + "num_tokens": 4591161.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 144.0, + "completions/max_terminated_length": 144.0, + "completions/mean_length": 115.625, + "completions/mean_terminated_length": 115.625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.041646760888397694, + "epoch": 0.5777325186693822, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.578125, + "learning_rate": 9.064818258503145e-06, + "loss": 0.0, + "num_tokens": 4595622.0, + "reward": 2.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 212.0, + "completions/max_terminated_length": 212.0, + "completions/mean_length": 184.125, + "completions/mean_terminated_length": 184.125, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "entropy": 0.03903352376073599, + "epoch": 0.5784114052953157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 9.041214683977449e-06, + "loss": 0.0, + "num_tokens": 4600871.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 230.625, + "completions/mean_terminated_length": 230.625, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "entropy": 0.029688776936382055, + "epoch": 0.5790902919212492, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 9.017616499456647e-06, + "loss": 0.0, + "num_tokens": 4606900.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 170.0, + "completions/max_terminated_length": 170.0, + "completions/mean_length": 161.0, + "completions/mean_terminated_length": 161.0, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.07806071871891618, + "epoch": 0.5797691785471826, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.875, + "learning_rate": 8.994023837602694e-06, + "loss": 0.0, + "num_tokens": 4611372.0, + "reward": 2.5, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 163.0, + "completions/max_terminated_length": 163.0, + "completions/mean_length": 145.375, + "completions/mean_terminated_length": 145.375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.08304271660745144, + "epoch": 0.5804480651731161, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 8.970436831046484e-06, + "loss": 0.0, + "num_tokens": 4615703.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 174.0, + "completions/max_terminated_length": 174.0, + "completions/mean_length": 125.875, + "completions/mean_terminated_length": 125.875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.03566870419308543, + "epoch": 0.5811269517990496, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 8.946855612387134e-06, + "loss": 0.0, + "num_tokens": 4620350.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 182.25, + "completions/mean_terminated_length": 182.25, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "entropy": 0.068370102904737, + "epoch": 0.581805838424983, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.1875, + "learning_rate": 8.923280314191215e-06, + "loss": 0.0, + "num_tokens": 4625504.0, + "reward": 2.25, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 238.0, + "completions/max_terminated_length": 238.0, + "completions/mean_length": 203.5, + "completions/mean_terminated_length": 203.5, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "entropy": 0.08393561094999313, + "epoch": 0.5824847250509165, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.21875, + "learning_rate": 8.899711068992023e-06, + "loss": 0.0, + "num_tokens": 4630516.0, + "reward": 2.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 159.0, + "completions/mean_terminated_length": 159.0, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.07910141348838806, + "epoch": 0.5831636116768499, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 8.876148009288813e-06, + "loss": 0.0, + "num_tokens": 4635068.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 115.0, + "completions/max_terminated_length": 115.0, + "completions/mean_length": 93.5, + "completions/mean_terminated_length": 93.5, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.04618065990507603, + "epoch": 0.5838424983027835, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 8.852591267546077e-06, + "loss": 0.0, + "num_tokens": 4639160.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 155.0, + "completions/max_terminated_length": 155.0, + "completions/mean_length": 129.25, + "completions/mean_terminated_length": 129.25, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.038697126088663936, + "epoch": 0.5845213849287169, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 8.829040976192789e-06, + "loss": 0.0, + "num_tokens": 4643394.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 162.0, + "completions/mean_terminated_length": 162.0, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "entropy": 0.08845762442797422, + "epoch": 0.5852002715546504, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 8.805497267621653e-06, + "loss": 0.0, + "num_tokens": 4648146.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 117.0, + "completions/max_terminated_length": 117.0, + "completions/mean_length": 93.5, + "completions/mean_terminated_length": 93.5, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.05717147933319211, + "epoch": 0.5858791581805839, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 8.781960274188376e-06, + "loss": 0.0, + "num_tokens": 4651990.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 181.0, + "completions/max_terminated_length": 181.0, + "completions/mean_length": 175.75, + "completions/mean_terminated_length": 175.75, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.02556638070382178, + "epoch": 0.5865580448065173, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "learning_rate": 8.758430128210908e-06, + "loss": 0.0, + "num_tokens": 4657476.0, + "reward": 2.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 99.0, + "completions/max_terminated_length": 99.0, + "completions/mean_length": 98.0, + "completions/mean_terminated_length": 98.0, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.0459773913025856, + "epoch": 0.5872369314324508, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.296875, + "learning_rate": 8.734906961968713e-06, + "loss": 0.0, + "num_tokens": 4661708.0, + "reward": 2.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 214.0, + "completions/max_terminated_length": 214.0, + "completions/mean_length": 195.625, + "completions/mean_terminated_length": 195.625, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 0.05903433682397008, + "epoch": 0.5879158180583842, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 8.711390907702001e-06, + "loss": 0.0, + "num_tokens": 4667233.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 163.375, + "completions/mean_terminated_length": 163.375, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.055525312665849924, + "epoch": 0.5885947046843177, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 8.687882097611016e-06, + "loss": 0.0, + "num_tokens": 4672020.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 161.0, + "completions/max_terminated_length": 161.0, + "completions/mean_length": 149.0, + "completions/mean_terminated_length": 149.0, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.06552033172920346, + "epoch": 0.5892735913102511, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.375, + "learning_rate": 8.664380663855272e-06, + "loss": 0.0, + "num_tokens": 4676268.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 177.0, + "completions/max_terminated_length": 177.0, + "completions/mean_length": 148.625, + "completions/mean_terminated_length": 148.625, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.05239802412688732, + "epoch": 0.5899524779361847, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.453125, + "learning_rate": 8.64088673855282e-06, + "loss": -0.0, + "num_tokens": 4680905.0, + "reward": 2.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 149.0, + "completions/max_terminated_length": 149.0, + "completions/mean_length": 145.125, + "completions/mean_terminated_length": 145.125, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.05006753979250789, + "epoch": 0.5906313645621182, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.171875, + "learning_rate": 8.617400453779487e-06, + "loss": 0.0, + "num_tokens": 4687674.0, + "reward": 2.038461685180664, + "reward_std": 0.358450710773468, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9134615659713745, + "rewards/fixed_code_pass_all_test_reward/std": 0.027196412906050682, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 197.0, + "completions/max_terminated_length": 197.0, + "completions/mean_length": 165.625, + "completions/mean_terminated_length": 165.625, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "entropy": 0.08606166671961546, + "epoch": 0.5913102511880516, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.34375, + "learning_rate": 8.593921941568165e-06, + "loss": 0.0, + "num_tokens": 4693055.0, + "reward": 2.125, + "reward_std": 0.8345229625701904, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 331.625, + "completions/mean_terminated_length": 331.625, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "entropy": 0.05833883211016655, + "epoch": 0.5919891378139851, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "learning_rate": 8.570451333908033e-06, + "loss": -0.0, + "num_tokens": 4701636.0, + "reward": 2.205357074737549, + "reward_std": 0.17677675187587738, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2678571343421936, + "rewards/fixed_code_pass_all_test_reward/std": 0.05050762742757797, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.9375, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.1767766922712326, + "step": 872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 150.0, + "completions/max_terminated_length": 150.0, + "completions/mean_length": 141.125, + "completions/mean_terminated_length": 141.125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.07068101689219475, + "epoch": 0.5926680244399185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 8.546988762743852e-06, + "loss": 0.0, + "num_tokens": 4706093.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 354.625, + "completions/mean_terminated_length": 354.625, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "entropy": 0.05495765432715416, + "epoch": 0.593346911065852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 8.52353435997519e-06, + "loss": 0.0, + "num_tokens": 4714314.0, + "reward": 2.5, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 156.0, + "completions/max_terminated_length": 156.0, + "completions/mean_length": 126.5, + "completions/mean_terminated_length": 126.5, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.0695712030865252, + "epoch": 0.5940257976917854, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 8.5000882574557e-06, + "loss": 0.0, + "num_tokens": 4718654.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 288.875, + "completions/mean_terminated_length": 288.875, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "entropy": 0.05437454581260681, + "epoch": 0.594704684317719, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 8.476650586992372e-06, + "loss": 0.0, + "num_tokens": 4725589.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 186.0, + "completions/max_terminated_length": 186.0, + "completions/mean_length": 159.5, + "completions/mean_terminated_length": 159.5, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.053944073617458344, + "epoch": 0.5953835709436525, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 8.453221480344791e-06, + "loss": 0.0, + "num_tokens": 4730121.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 209.0, + "completions/max_terminated_length": 209.0, + "completions/mean_length": 153.875, + "completions/mean_terminated_length": 153.875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.12104702088981867, + "epoch": 0.5960624575695859, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 8.429801069224411e-06, + "loss": 0.0, + "num_tokens": 4734560.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 204.0, + "completions/max_terminated_length": 204.0, + "completions/mean_length": 151.875, + "completions/mean_terminated_length": 151.875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.06245470978319645, + "epoch": 0.5967413441955194, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 8.406389485293786e-06, + "loss": 0.0, + "num_tokens": 4739151.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 130.0, + "completions/max_terminated_length": 130.0, + "completions/mean_length": 105.5, + "completions/mean_terminated_length": 105.5, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.031613563653081656, + "epoch": 0.5974202308214528, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 8.382986860165859e-06, + "loss": 0.0, + "num_tokens": 4743355.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 238.0, + "completions/max_terminated_length": 238.0, + "completions/mean_length": 190.125, + "completions/mean_terminated_length": 190.125, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "entropy": 0.11220237705856562, + "epoch": 0.5980991174473863, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.21875, + "learning_rate": 8.359593325403195e-06, + "loss": 0.0, + "num_tokens": 4748268.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 182.125, + "completions/mean_terminated_length": 182.125, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.06048412760719657, + "epoch": 0.5987780040733197, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4375, + "learning_rate": 8.336209012517273e-06, + "loss": -0.0, + "num_tokens": 4753429.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 198.375, + "completions/mean_terminated_length": 198.375, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.028818948660045862, + "epoch": 0.5994568906992532, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 8.31283405296772e-06, + "loss": 0.0, + "num_tokens": 4759128.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 131.0, + "completions/max_terminated_length": 131.0, + "completions/mean_length": 107.375, + "completions/mean_terminated_length": 107.375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.05501343263313174, + "epoch": 0.6001357773251867, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0, + "learning_rate": 8.289468578161581e-06, + "loss": -0.0, + "num_tokens": 4763403.0, + "reward": 2.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.0, + "completions/max_terminated_length": 419.0, + "completions/mean_length": 324.625, + "completions/mean_terminated_length": 324.625, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "entropy": 0.03536238381639123, + "epoch": 0.6008146639511202, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "learning_rate": 8.266112719452579e-06, + "loss": 0.0, + "num_tokens": 4771328.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 132.0, + "completions/max_terminated_length": 132.0, + "completions/mean_length": 108.5, + "completions/mean_terminated_length": 108.5, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.0552833154797554, + "epoch": 0.6014935505770537, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.734375, + "learning_rate": 8.242766608140383e-06, + "loss": -0.0, + "num_tokens": 4775188.0, + "reward": 2.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 265.625, + "completions/mean_terminated_length": 265.625, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "entropy": 0.03690002113580704, + "epoch": 0.6021724372029871, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0859375, + "learning_rate": 8.219430375469863e-06, + "loss": 0.0, + "num_tokens": 4781825.0, + "reward": 2.8958334922790527, + "reward_std": 0.29462775588035583, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8958333134651184, + "rewards/fixed_code_pass_all_test_reward/std": 0.294627845287323, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 114.0, + "completions/max_terminated_length": 114.0, + "completions/mean_length": 91.25, + "completions/mean_terminated_length": 91.25, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.03606024198234081, + "epoch": 0.6028513238289206, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0625, + "learning_rate": 8.19610415263036e-06, + "loss": -0.0, + "num_tokens": 4785563.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 150.0, + "completions/mean_terminated_length": 150.0, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.057377586141228676, + "epoch": 0.603530210454854, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 8.172788070754927e-06, + "loss": 0.0, + "num_tokens": 4790019.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 84.0, + "completions/max_terminated_length": 84.0, + "completions/mean_length": 75.375, + "completions/mean_terminated_length": 75.375, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.04337291792035103, + "epoch": 0.6042090970807875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 8.149482260919625e-06, + "loss": 0.0, + "num_tokens": 4793686.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 285.875, + "completions/mean_terminated_length": 285.875, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "entropy": 0.04639269458130002, + "epoch": 0.604887983706721, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "learning_rate": 8.126186854142752e-06, + "loss": -0.0, + "num_tokens": 4801005.0, + "reward": 2.8181817531585693, + "reward_std": 0.3366619050502777, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8181818127632141, + "rewards/fixed_code_pass_all_test_reward/std": 0.33666184544563293, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 242.375, + "completions/mean_terminated_length": 242.375, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "entropy": 0.01723166659940034, + "epoch": 0.6055668703326544, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 8.102901981384146e-06, + "loss": 0.0, + "num_tokens": 4807072.0, + "reward": 2.4000000953674316, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4000000059604645, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 219.0, + "completions/max_terminated_length": 219.0, + "completions/mean_length": 193.125, + "completions/mean_terminated_length": 193.125, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.08192705176770687, + "epoch": 0.606245756958588, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 8.079627773544403e-06, + "loss": 0.0, + "num_tokens": 4812081.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 229.0, + "completions/mean_terminated_length": 229.0, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "entropy": 0.040311204735189676, + "epoch": 0.6069246435845214, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "learning_rate": 8.056364361464176e-06, + "loss": -0.0, + "num_tokens": 4818121.0, + "reward": 2.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 133.0, + "completions/max_terminated_length": 133.0, + "completions/mean_length": 101.5, + "completions/mean_terminated_length": 101.5, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.044932120479643345, + "epoch": 0.6076035302104549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 8.033111875923421e-06, + "loss": 0.0, + "num_tokens": 4822285.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 242.75, + "completions/mean_terminated_length": 242.75, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "entropy": 0.0368472533300519, + "epoch": 0.6082824168363883, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.625, + "learning_rate": 8.009870447640676e-06, + "loss": -0.0, + "num_tokens": 4828611.0, + "reward": 2.875, + "reward_std": 0.2314550280570984, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.0, + "completions/max_terminated_length": 152.0, + "completions/mean_length": 110.625, + "completions/mean_terminated_length": 110.625, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.02511978044640273, + "epoch": 0.6089613034623218, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 7.986640207272312e-06, + "loss": 0.0, + "num_tokens": 4832912.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 304.25, + "completions/mean_terminated_length": 304.25, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "entropy": 0.026552781695500016, + "epoch": 0.6096401900882552, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 7.963421285411812e-06, + "loss": 0.0, + "num_tokens": 4839994.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 266.625, + "completions/mean_terminated_length": 266.625, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "entropy": 0.06595556903630495, + "epoch": 0.6103190767141887, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "learning_rate": 7.940213812589018e-06, + "loss": 0.0, + "num_tokens": 4847255.0, + "reward": 1.3035714626312256, + "reward_std": 0.5423063635826111, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4285714328289032, + "rewards/fixed_code_pass_all_test_reward/std": 0.2159797102212906, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 205.625, + "completions/mean_terminated_length": 205.625, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "entropy": 0.05004977295175195, + "epoch": 0.6109979633401222, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.15625, + "learning_rate": 7.91701791926942e-06, + "loss": 0.0, + "num_tokens": 4854500.0, + "reward": 2.413461685180664, + "reward_std": 0.5248475074768066, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9134615659713745, + "rewards/fixed_code_pass_all_test_reward/std": 0.027196412906050682, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, + "step": 900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 215.75, + "completions/mean_terminated_length": 215.75, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "entropy": 0.02198021998628974, + "epoch": 0.6116768499660556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 7.893833735853404e-06, + "loss": 0.0, + "num_tokens": 4860498.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 95.0, + "completions/max_terminated_length": 95.0, + "completions/mean_length": 92.75, + "completions/mean_terminated_length": 92.75, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.04383000638335943, + "epoch": 0.6123557365919892, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 7.870661392675535e-06, + "loss": 0.0, + "num_tokens": 4864760.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 318.875, + "completions/mean_terminated_length": 318.875, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "entropy": 0.046564546413719654, + "epoch": 0.6130346232179226, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 7.847501020003806e-06, + "loss": 0.0, + "num_tokens": 4872167.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 170.25, + "completions/mean_terminated_length": 170.25, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.03181818872690201, + "epoch": 0.6137135098438561, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 7.824352748038924e-06, + "loss": 0.0, + "num_tokens": 4877633.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 186.75, + "completions/mean_terminated_length": 186.75, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.048190067522227764, + "epoch": 0.6143923964697895, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.578125, + "learning_rate": 7.801216706913563e-06, + "loss": -0.0, + "num_tokens": 4882591.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 164.0, + "completions/max_terminated_length": 164.0, + "completions/mean_length": 124.625, + "completions/mean_terminated_length": 124.625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.05867015942931175, + "epoch": 0.615071283095723, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.96875, + "learning_rate": 7.778093026691636e-06, + "loss": 0.0, + "num_tokens": 4886820.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 152.5, + "completions/mean_terminated_length": 152.5, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.058279364835470915, + "epoch": 0.6157501697216565, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 7.75498183736758e-06, + "loss": 0.0, + "num_tokens": 4891496.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 234.75, + "completions/mean_terminated_length": 234.75, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "entropy": 0.03309832373633981, + "epoch": 0.6164290563475899, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 7.731883268865601e-06, + "loss": 0.0, + "num_tokens": 4897670.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 120.0, + "completions/max_terminated_length": 120.0, + "completions/mean_length": 116.625, + "completions/mean_terminated_length": 116.625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.04822292411699891, + "epoch": 0.6171079429735234, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 7.70879745103896e-06, + "loss": 0.0, + "num_tokens": 4902683.0, + "reward": 1.875, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 75.0, + "completions/max_terminated_length": 75.0, + "completions/mean_length": 74.125, + "completions/mean_terminated_length": 74.125, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.03923192294314504, + "epoch": 0.6177868295994569, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 7.685724513669227e-06, + "loss": 0.0, + "num_tokens": 4906308.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 158.0, + "completions/max_terminated_length": 158.0, + "completions/mean_length": 138.125, + "completions/mean_terminated_length": 138.125, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.029026684118434787, + "epoch": 0.6184657162253904, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 7.662664586465574e-06, + "loss": 0.0, + "num_tokens": 4911101.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 78.0, + "completions/max_terminated_length": 78.0, + "completions/mean_length": 76.25, + "completions/mean_terminated_length": 76.25, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.015714952372945845, + "epoch": 0.6191446028513238, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.375, + "learning_rate": 7.63961779906403e-06, + "loss": -0.0, + "num_tokens": 4914807.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 138.0, + "completions/max_terminated_length": 138.0, + "completions/mean_length": 135.125, + "completions/mean_terminated_length": 135.125, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.018774282885715365, + "epoch": 0.6198234894772573, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 7.616584281026759e-06, + "loss": 0.0, + "num_tokens": 4919152.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 194.0, + "completions/max_terminated_length": 194.0, + "completions/mean_length": 166.625, + "completions/mean_terminated_length": 166.625, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "entropy": 0.04295201087370515, + "epoch": 0.6205023761031908, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 7.593564161841318e-06, + "loss": 0.0, + "num_tokens": 4923701.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 162.0, + "completions/max_terminated_length": 162.0, + "completions/mean_length": 145.0, + "completions/mean_terminated_length": 145.0, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.07025658199563622, + "epoch": 0.6211812627291242, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 7.57055757091995e-06, + "loss": 0.0, + "num_tokens": 4928141.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 138.0, + "completions/mean_terminated_length": 138.0, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.08920403476804495, + "epoch": 0.6218601493550577, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5625, + "learning_rate": 7.5475646375988395e-06, + "loss": -0.0, + "num_tokens": 4932389.0, + "reward": 1.0714285373687744, + "reward_std": 0.2020304799079895, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0714285746216774, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.2020305097103119, + "step": 916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 193.25, + "completions/mean_terminated_length": 193.25, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.028667444130405784, + "epoch": 0.6225390359809911, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "learning_rate": 7.524585491137404e-06, + "loss": 0.0, + "num_tokens": 4937239.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 168.0, + "completions/max_terminated_length": 168.0, + "completions/mean_length": 122.625, + "completions/mean_terminated_length": 122.625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.054681753274053335, + "epoch": 0.6232179226069247, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.65625, + "learning_rate": 7.501620260717538e-06, + "loss": 0.0, + "num_tokens": 4941628.0, + "reward": 2.125, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 175.0, + "completions/max_terminated_length": 175.0, + "completions/mean_length": 135.375, + "completions/mean_terminated_length": 135.375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.09519746527075768, + "epoch": 0.6238968092328581, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 7.478669075442917e-06, + "loss": 0.0, + "num_tokens": 4946055.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 194.0, + "completions/max_terminated_length": 194.0, + "completions/mean_length": 163.5, + "completions/mean_terminated_length": 163.5, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.021858258987776935, + "epoch": 0.6245756958587916, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 7.455732064338255e-06, + "loss": 0.0, + "num_tokens": 4951027.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 207.875, + "completions/mean_terminated_length": 207.875, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.04795250901952386, + "epoch": 0.6252545824847251, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 7.432809356348576e-06, + "loss": 0.0, + "num_tokens": 4957914.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 169.0, + "completions/max_terminated_length": 169.0, + "completions/mean_length": 157.0, + "completions/mean_terminated_length": 157.0, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "entropy": 0.04438210651278496, + "epoch": 0.6259334691106585, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 7.409901080338512e-06, + "loss": 0.0, + "num_tokens": 4962826.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 108.0, + "completions/max_terminated_length": 108.0, + "completions/mean_length": 99.875, + "completions/mean_terminated_length": 99.875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.038695571245625615, + "epoch": 0.626612355736592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 7.38700736509155e-06, + "loss": 0.0, + "num_tokens": 4966681.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 113.0, + "completions/max_terminated_length": 113.0, + "completions/mean_length": 82.125, + "completions/mean_terminated_length": 82.125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.024893229361623526, + "epoch": 0.6272912423625254, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.078125, + "learning_rate": 7.364128339309326e-06, + "loss": 0.0, + "num_tokens": 4970330.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 193.125, + "completions/mean_terminated_length": 193.125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.08436478301882744, + "epoch": 0.6279701289884589, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.46875, + "learning_rate": 7.34126413161089e-06, + "loss": 0.0, + "num_tokens": 4976435.0, + "reward": 1.640625, + "reward_std": 0.23563648760318756, + "rewards/fixed_code_pass_all_test_reward/mean": 0.640625, + "rewards/fixed_code_pass_all_test_reward/std": 0.23563650250434875, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 171.0, + "completions/max_terminated_length": 171.0, + "completions/mean_length": 143.875, + "completions/mean_terminated_length": 143.875, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.05935622798278928, + "epoch": 0.6286490156143923, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.40625, + "learning_rate": 7.318414870531996e-06, + "loss": -0.0, + "num_tokens": 4980722.0, + "reward": 2.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 227.875, + "completions/mean_terminated_length": 227.875, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "entropy": 0.061224288772791624, + "epoch": 0.6293279022403259, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "learning_rate": 7.2955806845243734e-06, + "loss": -0.0, + "num_tokens": 4986977.0, + "reward": 2.9124999046325684, + "reward_std": 0.2474873661994934, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9125000238418579, + "rewards/fixed_code_pass_all_test_reward/std": 0.2474873811006546, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 143.0, + "completions/max_terminated_length": 143.0, + "completions/mean_length": 132.5, + "completions/mean_terminated_length": 132.5, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.05595715483650565, + "epoch": 0.6300067888662594, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.65625, + "learning_rate": 7.272761701955e-06, + "loss": -0.0, + "num_tokens": 4991317.0, + "reward": 2.125, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 197.0, + "completions/max_terminated_length": 197.0, + "completions/mean_length": 151.75, + "completions/mean_terminated_length": 151.75, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.0885505573824048, + "epoch": 0.6306856754921928, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 7.249958051105383e-06, + "loss": 0.0, + "num_tokens": 4995827.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 142.0, + "completions/max_terminated_length": 142.0, + "completions/mean_length": 131.375, + "completions/mean_terminated_length": 131.375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.06484300037845969, + "epoch": 0.6313645621181263, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8671875, + "learning_rate": 7.227169860170845e-06, + "loss": 0.0, + "num_tokens": 5000222.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 309.0, + "completions/mean_terminated_length": 309.0, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "entropy": 0.031262818491086364, + "epoch": 0.6320434487440597, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1875, + "learning_rate": 7.2043972572597855e-06, + "loss": 0.0, + "num_tokens": 5007214.0, + "reward": 1.8035714626312256, + "reward_std": 0.3234066367149353, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8035714030265808, + "rewards/fixed_code_pass_all_test_reward/std": 0.3234066069126129, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 173.5, + "completions/mean_terminated_length": 173.5, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.03661538939923048, + "epoch": 0.6327223353699932, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0390625, + "learning_rate": 7.181640370392994e-06, + "loss": 0.0, + "num_tokens": 5012050.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 146.0, + "completions/max_terminated_length": 146.0, + "completions/mean_length": 116.5, + "completions/mean_terminated_length": 116.5, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.07277259835973382, + "epoch": 0.6334012219959266, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.890625, + "learning_rate": 7.1588993275028885e-06, + "loss": 0.0, + "num_tokens": 5016254.0, + "reward": 2.0, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 259.125, + "completions/mean_terminated_length": 259.125, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "entropy": 0.060410027392208576, + "epoch": 0.6340801086218602, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.234375, + "learning_rate": 7.136174256432828e-06, + "loss": 0.0, + "num_tokens": 5023439.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 207.75, + "completions/mean_terminated_length": 207.75, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.1308068847283721, + "epoch": 0.6347589952477937, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "learning_rate": 7.113465284936378e-06, + "loss": 0.0, + "num_tokens": 5028269.0, + "reward": 2.0, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 230.625, + "completions/mean_terminated_length": 230.625, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "entropy": 0.0925988806411624, + "epoch": 0.6354378818737271, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 7.090772540676598e-06, + "loss": 0.0, + "num_tokens": 5034882.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 203.875, + "completions/mean_terminated_length": 203.875, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "entropy": 0.07168037118390203, + "epoch": 0.6361167684996606, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.875, + "learning_rate": 7.0680961512253254e-06, + "loss": -0.0, + "num_tokens": 5040193.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 150.0, + "completions/max_terminated_length": 150.0, + "completions/mean_length": 134.25, + "completions/mean_terminated_length": 134.25, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.056642959360033274, + "epoch": 0.636795655125594, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 7.045436244062458e-06, + "loss": 0.0, + "num_tokens": 5044619.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 200.0, + "completions/max_terminated_length": 200.0, + "completions/mean_length": 163.625, + "completions/mean_terminated_length": 163.625, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.08202037401497364, + "epoch": 0.6374745417515275, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 7.022792946575222e-06, + "loss": 0.0, + "num_tokens": 5049160.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 193.0, + "completions/max_terminated_length": 193.0, + "completions/mean_length": 163.125, + "completions/mean_terminated_length": 163.125, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "entropy": 0.060413535218685865, + "epoch": 0.6381534283774609, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.109375, + "learning_rate": 7.000166386057483e-06, + "loss": 0.0, + "num_tokens": 5053809.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 123.0, + "completions/max_terminated_length": 123.0, + "completions/mean_length": 102.375, + "completions/mean_terminated_length": 102.375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.04014542978256941, + "epoch": 0.6388323150033944, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 6.977556689709009e-06, + "loss": 0.0, + "num_tokens": 5057820.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 133.0, + "completions/max_terminated_length": 133.0, + "completions/mean_length": 116.75, + "completions/mean_terminated_length": 116.75, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.04275320563465357, + "epoch": 0.639511201629328, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 6.954963984634768e-06, + "loss": 0.0, + "num_tokens": 5062522.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 141.0, + "completions/max_terminated_length": 141.0, + "completions/mean_length": 113.5, + "completions/mean_terminated_length": 113.5, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.07519625313580036, + "epoch": 0.6401900882552614, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 6.932388397844204e-06, + "loss": 0.0, + "num_tokens": 5066638.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 194.0, + "completions/max_terminated_length": 194.0, + "completions/mean_length": 171.125, + "completions/mean_terminated_length": 171.125, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "entropy": 0.10464443638920784, + "epoch": 0.6408689748811949, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 6.909830056250527e-06, + "loss": 0.0, + "num_tokens": 5071311.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 195.375, + "completions/mean_terminated_length": 195.375, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.08981845155358315, + "epoch": 0.6415478615071283, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 6.887289086670004e-06, + "loss": 0.0, + "num_tokens": 5076578.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 204.5, + "completions/mean_terminated_length": 204.5, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.0879927696660161, + "epoch": 0.6422267481330618, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.234375, + "learning_rate": 6.864765615821231e-06, + "loss": 0.0, + "num_tokens": 5081630.0, + "reward": 1.5, + "reward_std": 0.9258201122283936, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 311.875, + "completions/mean_terminated_length": 311.875, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "entropy": 0.04189127474091947, + "epoch": 0.6429056347589952, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 6.842259770324447e-06, + "loss": 0.0, + "num_tokens": 5088749.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 258.25, + "completions/mean_terminated_length": 258.25, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "entropy": 0.020852818968705833, + "epoch": 0.6435845213849287, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 6.819771676700794e-06, + "loss": 0.0, + "num_tokens": 5095167.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 155.0, + "completions/max_terminated_length": 155.0, + "completions/mean_length": 135.375, + "completions/mean_terminated_length": 135.375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.07304209750145674, + "epoch": 0.6442634080108622, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.703125, + "learning_rate": 6.797301461371626e-06, + "loss": 0.0, + "num_tokens": 5102162.0, + "reward": 2.857142925262451, + "reward_std": 0.2020304650068283, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8571428656578064, + "rewards/fixed_code_pass_all_test_reward/std": 0.2020305097103119, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 384.125, + "completions/mean_terminated_length": 384.125, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "entropy": 0.02881737658753991, + "epoch": 0.6449422946367956, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 6.774849250657784e-06, + "loss": 0.0, + "num_tokens": 5110963.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 344.375, + "completions/mean_terminated_length": 344.375, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "entropy": 0.0376912287902087, + "epoch": 0.6456211812627292, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 6.752415170778894e-06, + "loss": 0.0, + "num_tokens": 5118806.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 130.0, + "completions/max_terminated_length": 130.0, + "completions/mean_length": 97.375, + "completions/mean_terminated_length": 97.375, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.08059882931411266, + "epoch": 0.6463000678886626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 6.729999347852665e-06, + "loss": 0.0, + "num_tokens": 5122641.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 166.0, + "completions/max_terminated_length": 166.0, + "completions/mean_length": 151.625, + "completions/mean_terminated_length": 151.625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.07680756133049726, + "epoch": 0.6469789545145961, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 6.707601907894159e-06, + "loss": 0.0, + "num_tokens": 5127054.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 168.0, + "completions/max_terminated_length": 168.0, + "completions/mean_length": 154.25, + "completions/mean_terminated_length": 154.25, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.06024423544295132, + "epoch": 0.6476578411405295, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 6.6852229768150976e-06, + "loss": 0.0, + "num_tokens": 5131408.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 350.875, + "completions/mean_terminated_length": 350.875, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "entropy": 0.056971298065036535, + "epoch": 0.648336727766463, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "learning_rate": 6.662862680423153e-06, + "loss": -0.0, + "num_tokens": 5139223.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 78.0, + "completions/max_terminated_length": 78.0, + "completions/mean_length": 78.0, + "completions/mean_terminated_length": 78.0, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.011453297920525074, + "epoch": 0.6490156143923965, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 6.640521144421237e-06, + "loss": 0.0, + "num_tokens": 5142887.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 143.0, + "completions/max_terminated_length": 143.0, + "completions/mean_length": 122.375, + "completions/mean_terminated_length": 122.375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.04991039214655757, + "epoch": 0.6496945010183299, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 6.618198494406802e-06, + "loss": 0.0, + "num_tokens": 5146922.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 245.375, + "completions/mean_terminated_length": 245.375, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "entropy": 0.046746176201850176, + "epoch": 0.6503733876442634, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "learning_rate": 6.595894855871119e-06, + "loss": 0.0, + "num_tokens": 5153085.0, + "reward": 2.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 270.625, + "completions/mean_terminated_length": 270.625, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "entropy": 0.031448905589058995, + "epoch": 0.6510522742701969, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "learning_rate": 6.573610354198587e-06, + "loss": -0.0, + "num_tokens": 5159418.0, + "reward": 2.6875, + "reward_std": 0.25877460837364197, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, + "rewards/fixed_code_pass_all_test_reward/std": 0.25877460837364197, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 330.75, + "completions/mean_terminated_length": 330.75, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "entropy": 0.05657206545583904, + "epoch": 0.6517311608961304, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 6.55134511466603e-06, + "loss": 0.0, + "num_tokens": 5167200.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 209.0, + "completions/max_terminated_length": 209.0, + "completions/mean_length": 183.5, + "completions/mean_terminated_length": 183.5, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.07719372538849711, + "epoch": 0.6524100475220638, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9296875, + "learning_rate": 6.52909926244197e-06, + "loss": 0.0, + "num_tokens": 5172452.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 182.0, + "completions/max_terminated_length": 182.0, + "completions/mean_length": 118.625, + "completions/mean_terminated_length": 118.625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.055858482141047716, + "epoch": 0.6530889341479973, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 6.506872922585956e-06, + "loss": 0.0, + "num_tokens": 5177393.0, + "reward": 2.875, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 142.0, + "completions/max_terminated_length": 142.0, + "completions/mean_length": 102.5, + "completions/mean_terminated_length": 102.5, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.06594048859551549, + "epoch": 0.6537678207739308, + "frac_reward_zero_std": 0.0, + "grad_norm": 10.5, + "learning_rate": 6.484666220047835e-06, + "loss": -0.0, + "num_tokens": 5181301.0, + "reward": 2.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 153.0, + "completions/max_terminated_length": 153.0, + "completions/mean_length": 141.625, + "completions/mean_terminated_length": 141.625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.03669480653479695, + "epoch": 0.6544467073998642, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 6.4624792796670624e-06, + "loss": 0.0, + "num_tokens": 5185666.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 211.0, + "completions/max_terminated_length": 211.0, + "completions/mean_length": 139.875, + "completions/mean_terminated_length": 139.875, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.040769357699900866, + "epoch": 0.6551255940257977, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5625, + "learning_rate": 6.440312226171992e-06, + "loss": 0.0, + "num_tokens": 5190185.0, + "reward": 2.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 244.75, + "completions/mean_terminated_length": 244.75, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "entropy": 0.038649448659271, + "epoch": 0.6558044806517311, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.265625, + "learning_rate": 6.418165184179183e-06, + "loss": -0.0, + "num_tokens": 5196639.0, + "reward": 1.96875, + "reward_std": 0.0883883461356163, + "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, + "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 179.0, + "completions/max_terminated_length": 179.0, + "completions/mean_length": 165.875, + "completions/mean_terminated_length": 165.875, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.06659081997349858, + "epoch": 0.6564833672776647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 6.396038278192698e-06, + "loss": 0.0, + "num_tokens": 5201246.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 141.0, + "completions/max_terminated_length": 141.0, + "completions/mean_length": 111.75, + "completions/mean_terminated_length": 111.75, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.025740576442331076, + "epoch": 0.6571622539035981, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 6.3739316326034005e-06, + "loss": 0.0, + "num_tokens": 5205540.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 163.0, + "completions/max_terminated_length": 163.0, + "completions/mean_length": 140.125, + "completions/mean_terminated_length": 140.125, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.04978251946158707, + "epoch": 0.6578411405295316, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.609375, + "learning_rate": 6.35184537168825e-06, + "loss": 0.0, + "num_tokens": 5209765.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 265.25, + "completions/mean_terminated_length": 265.25, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "entropy": 0.10360525920987129, + "epoch": 0.658520027155465, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8125, + "learning_rate": 6.329779619609615e-06, + "loss": 0.0, + "num_tokens": 5215559.0, + "reward": 2.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 211.25, + "completions/mean_terminated_length": 211.25, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "entropy": 0.07118803868070245, + "epoch": 0.6591989137813985, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "learning_rate": 6.307734500414564e-06, + "loss": -0.0, + "num_tokens": 5221121.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 156.0, + "completions/max_terminated_length": 156.0, + "completions/mean_length": 133.625, + "completions/mean_terminated_length": 133.625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.06685220077633858, + "epoch": 0.659877800407332, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 6.285710138034187e-06, + "loss": 0.0, + "num_tokens": 5225542.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 534.0, + "completions/max_terminated_length": 534.0, + "completions/mean_length": 365.0, + "completions/mean_terminated_length": 365.0, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "entropy": 0.07614701706916094, + "epoch": 0.6605566870332654, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "learning_rate": 6.263706656282869e-06, + "loss": -0.0, + "num_tokens": 5234774.0, + "reward": 2.6999998092651367, + "reward_std": 0.18516404926776886, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7000000476837158, + "rewards/fixed_code_pass_all_test_reward/std": 0.18516401946544647, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 195.25, + "completions/mean_terminated_length": 195.25, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.0702132573351264, + "epoch": 0.6612355736591989, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.859375, + "learning_rate": 6.241724178857621e-06, + "loss": 0.0, + "num_tokens": 5240232.0, + "reward": 1.7000000476837158, + "reward_std": 0.10690444707870483, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7000000476837158, + "rewards/fixed_code_pass_all_test_reward/std": 0.10690449178218842, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 132.0, + "completions/max_terminated_length": 132.0, + "completions/mean_length": 123.25, + "completions/mean_terminated_length": 123.25, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.034311452182009816, + "epoch": 0.6619144602851323, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.90625, + "learning_rate": 6.219762829337367e-06, + "loss": 0.0, + "num_tokens": 5244434.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 164.0, + "completions/max_terminated_length": 164.0, + "completions/mean_length": 139.875, + "completions/mean_terminated_length": 139.875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.07672926783561707, + "epoch": 0.6625933469110659, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.0625, + "learning_rate": 6.197822731182259e-06, + "loss": -0.0, + "num_tokens": 5248817.0, + "reward": 2.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 202.0, + "completions/max_terminated_length": 202.0, + "completions/mean_length": 189.375, + "completions/mean_terminated_length": 189.375, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "entropy": 0.04931910941377282, + "epoch": 0.6632722335369993, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5, + "learning_rate": 6.1759040077329845e-06, + "loss": 0.0, + "num_tokens": 5253596.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 213.125, + "completions/mean_terminated_length": 213.125, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "entropy": 0.017579364706762135, + "epoch": 0.6639511201629328, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "learning_rate": 6.154006782210066e-06, + "loss": 0.0, + "num_tokens": 5259917.0, + "reward": 2.359375, + "reward_std": 0.04419417306780815, + "rewards/fixed_code_pass_all_test_reward/mean": 0.359375, + "rewards/fixed_code_pass_all_test_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 111.0, + "completions/max_terminated_length": 111.0, + "completions/mean_length": 81.5, + "completions/mean_terminated_length": 81.5, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.05019477568566799, + "epoch": 0.6646300067888663, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 6.132131177713165e-06, + "loss": 0.0, + "num_tokens": 5263665.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 98.625, + "completions/mean_terminated_length": 98.625, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.07368154264986515, + "epoch": 0.6653088934147997, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.21875, + "learning_rate": 6.1102773172204034e-06, + "loss": 0.0, + "num_tokens": 5267638.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 539.0, + "completions/max_terminated_length": 539.0, + "completions/mean_length": 334.75, + "completions/mean_terminated_length": 334.75, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "entropy": 0.0644279196858406, + "epoch": 0.6659877800407332, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 6.0884453235876615e-06, + "loss": 0.0, + "num_tokens": 5275468.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 202.0, + "completions/max_terminated_length": 202.0, + "completions/mean_length": 163.625, + "completions/mean_terminated_length": 163.625, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.025917271384969354, + "epoch": 0.6666666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 6.066635319547895e-06, + "loss": 0.0, + "num_tokens": 5280529.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 201.125, + "completions/mean_terminated_length": 201.125, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "entropy": 0.018838520161807537, + "epoch": 0.6673455532926001, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 6.0448474277104365e-06, + "loss": 0.0, + "num_tokens": 5286114.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 421.5, + "completions/mean_terminated_length": 421.5, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "entropy": 0.0307229021564126, + "epoch": 0.6680244399185336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 6.023081770560307e-06, + "loss": 0.0, + "num_tokens": 5295438.0, + "reward": 2.200000047683716, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.20000000298023224, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 156.0, + "completions/max_terminated_length": 156.0, + "completions/mean_length": 121.125, + "completions/mean_terminated_length": 121.125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.05108774080872536, + "epoch": 0.6687033265444671, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 6.0013384704575406e-06, + "loss": 0.0, + "num_tokens": 5299791.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 428.5, + "completions/mean_terminated_length": 428.5, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "entropy": 0.029764720937237144, + "epoch": 0.6693822131704006, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "learning_rate": 5.9796176496364735e-06, + "loss": 0.0, + "num_tokens": 5309187.0, + "reward": 2.1750001907348633, + "reward_std": 0.046291034668684006, + "rewards/fixed_code_pass_all_test_reward/mean": 0.17500001192092896, + "rewards/fixed_code_pass_all_test_reward/std": 0.04629100486636162, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 315.75, + "completions/mean_terminated_length": 315.75, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "entropy": 0.03381944540888071, + "epoch": 0.670061099796334, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.109375, + "learning_rate": 5.957919430205088e-06, + "loss": 0.0, + "num_tokens": 5316697.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 245.0, + "completions/max_terminated_length": 245.0, + "completions/mean_length": 233.75, + "completions/mean_terminated_length": 233.75, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "entropy": 0.033371551195159554, + "epoch": 0.6707399864222675, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "learning_rate": 5.93624393414429e-06, + "loss": -0.0, + "num_tokens": 5323071.0, + "reward": 1.875, + "reward_std": 0.26726123690605164, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.26726123690605164, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 192.125, + "completions/mean_terminated_length": 192.125, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "entropy": 0.03332230914384127, + "epoch": 0.6714188730482009, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 5.9145912833072535e-06, + "loss": 0.0, + "num_tokens": 5328632.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 201.625, + "completions/mean_terminated_length": 201.625, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 0.022967584896832705, + "epoch": 0.6720977596741344, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 5.892961599418716e-06, + "loss": 0.0, + "num_tokens": 5334477.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 131.0, + "completions/max_terminated_length": 131.0, + "completions/mean_length": 121.875, + "completions/mean_terminated_length": 121.875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.056876023299992085, + "epoch": 0.6727766463000678, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 5.871355004074304e-06, + "loss": 0.0, + "num_tokens": 5338668.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 129.0, + "completions/max_terminated_length": 129.0, + "completions/mean_length": 80.125, + "completions/mean_terminated_length": 80.125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.033333154395222664, + "epoch": 0.6734555329260014, + "frac_reward_zero_std": 0.0, + "grad_norm": 14.5, + "learning_rate": 5.849771618739852e-06, + "loss": 0.0, + "num_tokens": 5342381.0, + "reward": 2.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 219.375, + "completions/mean_terminated_length": 219.375, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.05435479525476694, + "epoch": 0.6741344195519349, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3125, + "learning_rate": 5.82821156475071e-06, + "loss": -0.0, + "num_tokens": 5348688.0, + "reward": 2.8125, + "reward_std": 0.1157275140285492, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, + "rewards/fixed_code_pass_all_test_reward/std": 0.1157275140285492, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 344.5, + "completions/mean_terminated_length": 344.5, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "entropy": 0.0387456719763577, + "epoch": 0.6748133061778683, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 5.8066749633110675e-06, + "loss": 0.0, + "num_tokens": 5356588.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 133.0, + "completions/max_terminated_length": 133.0, + "completions/mean_length": 115.625, + "completions/mean_terminated_length": 115.625, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.040065045235678554, + "epoch": 0.6754921928038018, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 5.785161935493266e-06, + "loss": 0.0, + "num_tokens": 5360905.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 237.125, + "completions/mean_terminated_length": 237.125, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "entropy": 0.04246202833019197, + "epoch": 0.6761710794297352, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 5.763672602237129e-06, + "loss": 0.0, + "num_tokens": 5367378.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 188.625, + "completions/mean_terminated_length": 188.625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.019586203852668405, + "epoch": 0.6768499660556687, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 5.742207084349274e-06, + "loss": 0.0, + "num_tokens": 5372831.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 223.875, + "completions/mean_terminated_length": 223.875, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "entropy": 0.06590558681637049, + "epoch": 0.6775288526816021, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0625, + "learning_rate": 5.72076550250244e-06, + "loss": -0.0, + "num_tokens": 5378574.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 237.625, + "completions/mean_terminated_length": 237.625, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.021161453099921346, + "epoch": 0.6782077393075356, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 5.699347977234799e-06, + "loss": 0.0, + "num_tokens": 5386203.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 212.0, + "completions/max_terminated_length": 212.0, + "completions/mean_length": 145.625, + "completions/mean_terminated_length": 145.625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.055856657680124044, + "epoch": 0.6788866259334692, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 5.677954628949281e-06, + "loss": 0.0, + "num_tokens": 5390592.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1000 + }, + { + "epoch": 0.6788866259334692, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 227.21138211382114, + "eval_completions/max_terminated_length": 227.21138211382114, + "eval_completions/mean_length": 191.86957994579944, + "eval_completions/mean_terminated_length": 191.86957994579944, + "eval_completions/min_length": 155.69647696476966, + "eval_completions/min_terminated_length": 155.69647696476966, + "eval_entropy": 0.056565259366238185, + "eval_frac_reward_zero_std": 0.5040650406504065, + "eval_num_tokens": 5390592.0, + "eval_reward": 2.018510837218949, + "eval_reward_std": 0.2031349181159725, + "eval_rewards/fixed_code_pass_all_test_reward/mean": 0.6943238458336207, + "eval_rewards/fixed_code_pass_all_test_reward/std": 0.11917512425723761, + "eval_rewards/format_reward/mean": 0.9888211382113821, + "eval_rewards/format_reward/std": 0.016047745781539256, + "eval_rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3353658536585366, + "eval_rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.08650764031461906, + "eval_train_loss": 0.003139395033940673, + "eval_train_runtime": 1028.3216, + "eval_train_samples_per_second": 0.359, + "eval_train_steps_per_second": 0.046, + "step": 1000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 174.0, + "completions/max_terminated_length": 174.0, + "completions/mean_length": 155.75, + "completions/mean_terminated_length": 155.75, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.09460367681458592, + "epoch": 0.6795655125594026, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.328125, + "learning_rate": 5.656585577912908e-06, + "loss": -0.0, + "num_tokens": 5395086.0, + "reward": 2.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 1001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 227.75, + "completions/mean_terminated_length": 227.75, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "entropy": 0.07179732667282224, + "epoch": 0.6802443991853361, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.578125, + "learning_rate": 5.635240944256113e-06, + "loss": 0.0, + "num_tokens": 5400852.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 1002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 121.0, + "completions/max_terminated_length": 121.0, + "completions/mean_length": 106.0, + "completions/mean_terminated_length": 106.0, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.029686111956834793, + "epoch": 0.6809232858112695, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 5.613920847972052e-06, + "loss": 0.0, + "num_tokens": 5404852.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 187.875, + "completions/mean_terminated_length": 187.875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.07532089110463858, + "epoch": 0.681602172437203, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 5.592625408915939e-06, + "loss": 0.0, + "num_tokens": 5409955.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 183.0, + "completions/max_terminated_length": 183.0, + "completions/mean_length": 171.875, + "completions/mean_terminated_length": 171.875, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.05466371215879917, + "epoch": 0.6822810590631364, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 5.571354746804383e-06, + "loss": 0.0, + "num_tokens": 5414634.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 142.0, + "completions/max_terminated_length": 142.0, + "completions/mean_length": 122.125, + "completions/mean_terminated_length": 122.125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.05604705912992358, + "epoch": 0.6829599456890699, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 5.550108981214692e-06, + "loss": 0.0, + "num_tokens": 5418667.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 180.875, + "completions/mean_terminated_length": 180.875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.06336376816034317, + "epoch": 0.6836388323150034, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "learning_rate": 5.5288882315842265e-06, + "loss": 0.0, + "num_tokens": 5423378.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 170.0, + "completions/max_terminated_length": 170.0, + "completions/mean_length": 154.875, + "completions/mean_terminated_length": 154.875, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.0484875850379467, + "epoch": 0.6843177189409368, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 5.507692617209701e-06, + "loss": 0.0, + "num_tokens": 5427729.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 182.0, + "completions/max_terminated_length": 182.0, + "completions/mean_length": 176.625, + "completions/mean_terminated_length": 176.625, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "entropy": 0.06522973626852036, + "epoch": 0.6849966055668704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 5.486522257246538e-06, + "loss": 0.0, + "num_tokens": 5432478.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 172.0, + "completions/max_terminated_length": 172.0, + "completions/mean_length": 127.125, + "completions/mean_terminated_length": 127.125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.05258332472294569, + "epoch": 0.6856754921928038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 5.465377270708183e-06, + "loss": 0.0, + "num_tokens": 5437183.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 279.875, + "completions/mean_terminated_length": 279.875, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "entropy": 0.03418240649625659, + "epoch": 0.6863543788187373, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "learning_rate": 5.4442577764654334e-06, + "loss": 0.0, + "num_tokens": 5443582.0, + "reward": 2.8541667461395264, + "reward_std": 0.20773717761039734, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8541666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.20773723721504211, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 167.625, + "completions/mean_terminated_length": 167.625, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.05349999386817217, + "epoch": 0.6870332654446707, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453125, + "learning_rate": 5.423163893245786e-06, + "loss": -0.0, + "num_tokens": 5448403.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 236.625, + "completions/mean_terminated_length": 236.625, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 0.02258082269690931, + "epoch": 0.6877121520706042, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "learning_rate": 5.402095739632763e-06, + "loss": -0.0, + "num_tokens": 5454536.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 1013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 165.0, + "completions/max_terminated_length": 165.0, + "completions/mean_length": 147.0, + "completions/mean_terminated_length": 147.0, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.04335205093957484, + "epoch": 0.6883910386965377, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 5.381053434065229e-06, + "loss": 0.0, + "num_tokens": 5458816.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 205.0, + "completions/max_terminated_length": 205.0, + "completions/mean_length": 179.375, + "completions/mean_terminated_length": 179.375, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "entropy": 0.052615988068282604, + "epoch": 0.6890699253224711, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.890625, + "learning_rate": 5.360037094836745e-06, + "loss": -0.0, + "num_tokens": 5463347.0, + "reward": 2.25, + "reward_std": 0.8864052295684814, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 1015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 265.375, + "completions/mean_terminated_length": 265.375, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 0.031195369083434343, + "epoch": 0.6897488119484046, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 5.339046840094899e-06, + "loss": 0.0, + "num_tokens": 5469902.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 161.0, + "completions/max_terminated_length": 161.0, + "completions/mean_length": 144.375, + "completions/mean_terminated_length": 144.375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.0740518462844193, + "epoch": 0.6904276985743381, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 5.318082787840646e-06, + "loss": 0.0, + "num_tokens": 5474257.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 169.0, + "completions/max_terminated_length": 169.0, + "completions/mean_length": 143.0, + "completions/mean_terminated_length": 143.0, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.06399343302473426, + "epoch": 0.6911065852002716, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "learning_rate": 5.297145055927622e-06, + "loss": 0.0, + "num_tokens": 5478505.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 256.125, + "completions/mean_terminated_length": 256.125, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 0.018098314991220832, + "epoch": 0.691785471826205, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "learning_rate": 5.276233762061507e-06, + "loss": 0.0, + "num_tokens": 5484986.0, + "reward": 2.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 1019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 188.375, + "completions/mean_terminated_length": 188.375, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 0.06366366753354669, + "epoch": 0.6924643584521385, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.484375, + "learning_rate": 5.255349023799357e-06, + "loss": -0.0, + "num_tokens": 5490229.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 139.0, + "completions/max_terminated_length": 139.0, + "completions/mean_length": 103.25, + "completions/mean_terminated_length": 103.25, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.06839914806187153, + "epoch": 0.693143245078072, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 5.234490958548941e-06, + "loss": 0.0, + "num_tokens": 5494047.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 150.625, + "completions/mean_terminated_length": 150.625, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.06942394003272057, + "epoch": 0.6938221317040054, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 5.213659683568073e-06, + "loss": 0.0, + "num_tokens": 5499116.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 160.25, + "completions/mean_terminated_length": 160.25, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.04457291937433183, + "epoch": 0.6945010183299389, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 5.192855315963959e-06, + "loss": 0.0, + "num_tokens": 5504526.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 179.0, + "completions/max_terminated_length": 179.0, + "completions/mean_length": 149.5, + "completions/mean_terminated_length": 149.5, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.07382111297920346, + "epoch": 0.6951799049558723, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 5.172077972692553e-06, + "loss": 0.0, + "num_tokens": 5508978.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 187.0, + "completions/mean_terminated_length": 187.0, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.063003228046, + "epoch": 0.6958587915818059, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.859375, + "learning_rate": 5.15132777055787e-06, + "loss": 0.0, + "num_tokens": 5513962.0, + "reward": 2.125, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 137.875, + "completions/mean_terminated_length": 137.875, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.10102774389088154, + "epoch": 0.6965376782077393, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.375, + "learning_rate": 5.130604826211361e-06, + "loss": 0.0, + "num_tokens": 5518305.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 324.875, + "completions/mean_terminated_length": 324.875, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.03406269242987037, + "epoch": 0.6972165648336728, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "learning_rate": 5.109909256151227e-06, + "loss": 0.0, + "num_tokens": 5525776.0, + "reward": 2.950000047683716, + "reward_std": 0.1414213627576828, + "rewards/fixed_code_pass_all_test_reward/mean": 0.949999988079071, + "rewards/fixed_code_pass_all_test_reward/std": 0.1414213478565216, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 202.25, + "completions/mean_terminated_length": 202.25, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.1569840693846345, + "epoch": 0.6978954514596063, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 5.089241176721794e-06, + "loss": 0.0, + "num_tokens": 5530754.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 184.0, + "completions/max_terminated_length": 184.0, + "completions/mean_length": 136.375, + "completions/mean_terminated_length": 136.375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.04225786775350571, + "epoch": 0.6985743380855397, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 5.068600704112832e-06, + "loss": 0.0, + "num_tokens": 5534885.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 173.375, + "completions/mean_terminated_length": 173.375, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.05938868597149849, + "epoch": 0.6992532247114732, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 5.047987954358912e-06, + "loss": 0.0, + "num_tokens": 5539392.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 186.0, + "completions/max_terminated_length": 186.0, + "completions/mean_length": 139.375, + "completions/mean_terminated_length": 139.375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.03899049502797425, + "epoch": 0.6999321113374066, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 5.02740304333877e-06, + "loss": 0.0, + "num_tokens": 5543587.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 120.0, + "completions/max_terminated_length": 120.0, + "completions/mean_length": 98.125, + "completions/mean_terminated_length": 98.125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.04210246494039893, + "epoch": 0.7006109979633401, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 5.006846086774631e-06, + "loss": 0.0, + "num_tokens": 5547580.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 143.0, + "completions/max_terminated_length": 143.0, + "completions/mean_length": 133.5, + "completions/mean_terminated_length": 133.5, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.02648665476590395, + "epoch": 0.7012898845892735, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.625, + "learning_rate": 4.9863172002315675e-06, + "loss": 0.0, + "num_tokens": 5552200.0, + "reward": 2.9000000953674316, + "reward_std": 0.2828427255153656, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8999999761581421, + "rewards/fixed_code_pass_all_test_reward/std": 0.2828427255153656, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 292.5, + "completions/mean_terminated_length": 292.5, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "entropy": 0.033049843506887555, + "epoch": 0.7019687712152071, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.84765625, + "learning_rate": 4.965816499116849e-06, + "loss": 0.0, + "num_tokens": 5559316.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 211.625, + "completions/mean_terminated_length": 211.625, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 0.054559324868023396, + "epoch": 0.7026476578411406, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 4.945344098679302e-06, + "loss": 0.0, + "num_tokens": 5564801.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 205.75, + "completions/mean_terminated_length": 205.75, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "entropy": 0.04275689972564578, + "epoch": 0.703326544467074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 4.924900114008656e-06, + "loss": 0.0, + "num_tokens": 5570343.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 109.0, + "completions/max_terminated_length": 109.0, + "completions/mean_length": 98.375, + "completions/mean_terminated_length": 98.375, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.049867642344906926, + "epoch": 0.7040054310930075, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.859375, + "learning_rate": 4.904484660034887e-06, + "loss": 0.0, + "num_tokens": 5574522.0, + "reward": 2.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 1037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 262.875, + "completions/mean_terminated_length": 262.875, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "entropy": 0.05536046577617526, + "epoch": 0.7046843177189409, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "learning_rate": 4.8840978515275816e-06, + "loss": 0.0, + "num_tokens": 5581161.0, + "reward": 2.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 161.75, + "completions/mean_terminated_length": 161.75, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.06532919779419899, + "epoch": 0.7053632043448744, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 4.863739803095299e-06, + "loss": 0.0, + "num_tokens": 5585679.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 207.25, + "completions/mean_terminated_length": 207.25, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.025774283800274134, + "epoch": 0.7060420909708078, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 4.8434106291849035e-06, + "loss": 0.0, + "num_tokens": 5591577.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 151.0, + "completions/max_terminated_length": 151.0, + "completions/mean_length": 145.375, + "completions/mean_terminated_length": 145.375, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.07337108254432678, + "epoch": 0.7067209775967414, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.546875, + "learning_rate": 4.8231104440809524e-06, + "loss": 0.0, + "num_tokens": 5596052.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 246.75, + "completions/mean_terminated_length": 246.75, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "entropy": 0.030016270466148853, + "epoch": 0.7073998642226749, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1953125, + "learning_rate": 4.802839361905021e-06, + "loss": 0.0, + "num_tokens": 5602530.0, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, + "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 183.0, + "completions/max_terminated_length": 183.0, + "completions/mean_length": 160.875, + "completions/mean_terminated_length": 160.875, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.07069768756628036, + "epoch": 0.7080787508486083, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.765625, + "learning_rate": 4.782597496615088e-06, + "loss": -0.0, + "num_tokens": 5607417.0, + "reward": 2.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 1043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 197.0, + "completions/max_terminated_length": 197.0, + "completions/mean_length": 167.625, + "completions/mean_terminated_length": 167.625, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.06086717499420047, + "epoch": 0.7087576374745418, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.421875, + "learning_rate": 4.762384962004877e-06, + "loss": 0.0, + "num_tokens": 5612206.0, + "reward": 2.3125, + "reward_std": 0.45806270837783813, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.45806270837783813, + "step": 1044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 158.0, + "completions/max_terminated_length": 158.0, + "completions/mean_length": 150.875, + "completions/mean_terminated_length": 150.875, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.04599870974197984, + "epoch": 0.7094365241004752, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "learning_rate": 4.74220187170322e-06, + "loss": 0.0, + "num_tokens": 5616765.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 174.0, + "completions/max_terminated_length": 174.0, + "completions/mean_length": 143.5, + "completions/mean_terminated_length": 143.5, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.05836607748642564, + "epoch": 0.7101154107264087, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 4.7220483391734325e-06, + "loss": 0.0, + "num_tokens": 5621081.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 186.875, + "completions/mean_terminated_length": 186.875, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.0899033397436142, + "epoch": 0.7107942973523421, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.90625, + "learning_rate": 4.701924477712663e-06, + "loss": -0.0, + "num_tokens": 5626024.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 323.25, + "completions/mean_terminated_length": 323.25, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 0.033922820119187236, + "epoch": 0.7114731839782756, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 4.681830400451249e-06, + "loss": 0.0, + "num_tokens": 5634474.0, + "reward": 2.2857143878936768, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2857142984867096, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 154.0, + "completions/max_terminated_length": 154.0, + "completions/mean_length": 120.625, + "completions/mean_terminated_length": 120.625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.06572171626612544, + "epoch": 0.712152070604209, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.390625, + "learning_rate": 4.661766220352098e-06, + "loss": 0.0, + "num_tokens": 5638783.0, + "reward": 2.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 1049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 155.0, + "completions/max_terminated_length": 155.0, + "completions/mean_length": 147.875, + "completions/mean_terminated_length": 147.875, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.049859441351145506, + "epoch": 0.7128309572301426, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 4.641732050210032e-06, + "loss": 0.0, + "num_tokens": 5643278.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 324.125, + "completions/mean_terminated_length": 324.125, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "entropy": 0.030875551281496882, + "epoch": 0.7135098438560761, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 4.621728002651194e-06, + "loss": 0.0, + "num_tokens": 5650959.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 224.0, + "completions/max_terminated_length": 224.0, + "completions/mean_length": 205.25, + "completions/mean_terminated_length": 205.25, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "entropy": 0.04553757677786052, + "epoch": 0.7141887304820095, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 4.6017541901323605e-06, + "loss": 0.0, + "num_tokens": 5656569.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 183.0, + "completions/max_terminated_length": 183.0, + "completions/mean_length": 158.0, + "completions/mean_terminated_length": 158.0, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.05938792508095503, + "epoch": 0.714867617107943, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.46875, + "learning_rate": 4.581810724940343e-06, + "loss": 0.0, + "num_tokens": 5661121.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 1053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 133.0, + "completions/max_terminated_length": 133.0, + "completions/mean_length": 94.375, + "completions/mean_terminated_length": 94.375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.041229897644370794, + "epoch": 0.7155465037338764, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 4.561897719191349e-06, + "loss": 0.0, + "num_tokens": 5664964.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 309.375, + "completions/mean_terminated_length": 309.375, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "entropy": 0.03235103120096028, + "epoch": 0.7162253903598099, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "learning_rate": 4.542015284830358e-06, + "loss": -0.0, + "num_tokens": 5672423.0, + "reward": 2.6363635063171387, + "reward_std": 0.3887436091899872, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6363636255264282, + "rewards/fixed_code_pass_all_test_reward/std": 0.3887436091899872, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 176.0, + "completions/max_terminated_length": 176.0, + "completions/mean_length": 160.5, + "completions/mean_terminated_length": 160.5, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.053834905847907066, + "epoch": 0.7169042769857433, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.4375, + "learning_rate": 4.5221635336304825e-06, + "loss": 0.0, + "num_tokens": 5677051.0, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 238.0, + "completions/max_terminated_length": 238.0, + "completions/mean_length": 192.75, + "completions/mean_terminated_length": 192.75, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "entropy": 0.040678431978449225, + "epoch": 0.7175831636116768, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 4.502342577192342e-06, + "loss": 0.0, + "num_tokens": 5681977.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 139.0, + "completions/max_terminated_length": 139.0, + "completions/mean_length": 129.875, + "completions/mean_terminated_length": 129.875, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.0555617306381464, + "epoch": 0.7182620502376104, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.625, + "learning_rate": 4.482552526943432e-06, + "loss": 0.0, + "num_tokens": 5686280.0, + "reward": 2.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 1058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 339.0, + "completions/mean_terminated_length": 339.0, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "entropy": 0.048259368166327477, + "epoch": 0.7189409368635438, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "learning_rate": 4.4627934941375185e-06, + "loss": 0.0, + "num_tokens": 5694112.0, + "reward": 2.107142925262451, + "reward_std": 0.2503642439842224, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1071428582072258, + "rewards/fixed_code_pass_all_test_reward/std": 0.25036418437957764, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 225.25, + "completions/mean_terminated_length": 225.25, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "entropy": 0.04348608711734414, + "epoch": 0.7196198234894773, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "learning_rate": 4.443065589853977e-06, + "loss": 0.0, + "num_tokens": 5699426.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 201.875, + "completions/mean_terminated_length": 201.875, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.026979003101587296, + "epoch": 0.7202987101154107, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 4.423368924997208e-06, + "loss": 0.0, + "num_tokens": 5705233.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 208.875, + "completions/mean_terminated_length": 208.875, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "entropy": 0.01662625279277563, + "epoch": 0.7209775967413442, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 4.403703610295972e-06, + "loss": 0.0, + "num_tokens": 5711544.0, + "reward": 2.375, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 137.0, + "completions/max_terminated_length": 137.0, + "completions/mean_length": 125.375, + "completions/mean_terminated_length": 125.375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.049123300705105066, + "epoch": 0.7216564833672776, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 4.3840697563028076e-06, + "loss": 0.0, + "num_tokens": 5715771.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 140.0, + "completions/max_terminated_length": 140.0, + "completions/mean_length": 120.5, + "completions/mean_terminated_length": 120.5, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.06565538328140974, + "epoch": 0.7223353699932111, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6875, + "learning_rate": 4.36446747339338e-06, + "loss": 0.0, + "num_tokens": 5719855.0, + "reward": 2.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 1064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 181.0, + "completions/max_terminated_length": 181.0, + "completions/mean_length": 114.125, + "completions/mean_terminated_length": 114.125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.055873916018754244, + "epoch": 0.7230142566191446, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.890625, + "learning_rate": 4.344896871765868e-06, + "loss": 0.0, + "num_tokens": 5724264.0, + "reward": 2.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 1065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 194.0, + "completions/max_terminated_length": 194.0, + "completions/mean_length": 158.0, + "completions/mean_terminated_length": 158.0, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "entropy": 0.05432352190837264, + "epoch": 0.723693143245078, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 4.325358061440356e-06, + "loss": 0.0, + "num_tokens": 5728976.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 220.625, + "completions/mean_terminated_length": 220.625, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "entropy": 0.06329419417306781, + "epoch": 0.7243720298710116, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.171875, + "learning_rate": 4.30585115225821e-06, + "loss": 0.0, + "num_tokens": 5735285.0, + "reward": 2.71875, + "reward_std": 0.31160587072372437, + "rewards/fixed_code_pass_all_test_reward/mean": 0.71875, + "rewards/fixed_code_pass_all_test_reward/std": 0.31160587072372437, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 138.0, + "completions/max_terminated_length": 138.0, + "completions/mean_length": 120.75, + "completions/mean_terminated_length": 120.75, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.05682699754834175, + "epoch": 0.725050916496945, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 4.2863762538814465e-06, + "loss": 0.0, + "num_tokens": 5739539.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 325.875, + "completions/mean_terminated_length": 325.875, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "entropy": 0.03501487663015723, + "epoch": 0.7257298031228785, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 4.2669334757921284e-06, + "loss": 0.0, + "num_tokens": 5747170.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 256.75, + "completions/mean_terminated_length": 256.75, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 0.02034762524999678, + "epoch": 0.7264086897488119, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "learning_rate": 4.2475229272917565e-06, + "loss": 0.0, + "num_tokens": 5753648.0, + "reward": 2.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 1070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 267.625, + "completions/mean_terminated_length": 267.625, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "entropy": 0.06554784090258181, + "epoch": 0.7270875763747454, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 4.228144717500642e-06, + "loss": 0.0, + "num_tokens": 5760725.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 119.0, + "completions/max_terminated_length": 119.0, + "completions/mean_length": 113.25, + "completions/mean_terminated_length": 113.25, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.02224722900427878, + "epoch": 0.7277664630006789, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 4.208798955357295e-06, + "loss": 0.0, + "num_tokens": 5765263.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 174.25, + "completions/mean_terminated_length": 174.25, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.1215749466791749, + "epoch": 0.7284453496266123, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 4.189485749617813e-06, + "loss": 0.0, + "num_tokens": 5770041.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 185.0, + "completions/max_terminated_length": 185.0, + "completions/mean_length": 167.125, + "completions/mean_terminated_length": 167.125, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.0479441003408283, + "epoch": 0.7291242362525459, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.59375, + "learning_rate": 4.170205208855281e-06, + "loss": 0.0, + "num_tokens": 5774730.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 174.0, + "completions/max_terminated_length": 174.0, + "completions/mean_length": 158.125, + "completions/mean_terminated_length": 158.125, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.031790854409337044, + "epoch": 0.7298031228784793, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 4.150957441459139e-06, + "loss": 0.0, + "num_tokens": 5779219.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 225.0, + "completions/mean_terminated_length": 225.0, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.024800655664876103, + "epoch": 0.7304820095044128, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.75, + "learning_rate": 4.131742555634597e-06, + "loss": 0.0, + "num_tokens": 5785099.0, + "reward": 1.8928571939468384, + "reward_std": 0.06612997502088547, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8928571939468384, + "rewards/fixed_code_pass_all_test_reward/std": 0.06613000482320786, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 190.0, + "completions/max_terminated_length": 190.0, + "completions/mean_length": 146.875, + "completions/mean_terminated_length": 146.875, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.07442566379904747, + "epoch": 0.7311608961303462, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "learning_rate": 4.112560659401999e-06, + "loss": -0.0, + "num_tokens": 5789410.0, + "reward": 2.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 193.0, + "completions/max_terminated_length": 193.0, + "completions/mean_length": 158.625, + "completions/mean_terminated_length": 158.625, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "entropy": 0.05083320615813136, + "epoch": 0.7318397827562797, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 4.093411860596253e-06, + "loss": 0.0, + "num_tokens": 5794159.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 241.0, + "completions/mean_terminated_length": 241.0, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "entropy": 0.02780906716361642, + "epoch": 0.7325186693822132, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 4.0742962668661826e-06, + "loss": 0.0, + "num_tokens": 5800383.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 281.125, + "completions/mean_terminated_length": 281.125, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.06524079642258584, + "epoch": 0.7331975560081466, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "learning_rate": 4.055213985673949e-06, + "loss": -0.0, + "num_tokens": 5807816.0, + "reward": 2.5357141494750977, + "reward_std": 0.05399487912654877, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5357142686843872, + "rewards/fixed_code_pass_all_test_reward/std": 0.05399493873119354, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 137.0, + "completions/max_terminated_length": 137.0, + "completions/mean_length": 124.125, + "completions/mean_terminated_length": 124.125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.04657298885285854, + "epoch": 0.7338764426340801, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 4.036165124294445e-06, + "loss": 0.0, + "num_tokens": 5811993.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 174.5, + "completions/mean_terminated_length": 174.5, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.13908050954341888, + "epoch": 0.7345553292600135, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 4.017149789814689e-06, + "loss": 0.0, + "num_tokens": 5816717.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 172.0, + "completions/max_terminated_length": 172.0, + "completions/mean_length": 148.75, + "completions/mean_terminated_length": 148.75, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.07652442436665297, + "epoch": 0.7352342158859471, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.998168089133211e-06, + "loss": 0.0, + "num_tokens": 5821107.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 418.625, + "completions/mean_terminated_length": 418.625, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "entropy": 0.033000505762174726, + "epoch": 0.7359131025118805, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.979220128959463e-06, + "loss": 0.0, + "num_tokens": 5830432.0, + "reward": 2.200000047683716, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.20000000298023224, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 182.0, + "completions/max_terminated_length": 182.0, + "completions/mean_length": 168.375, + "completions/mean_terminated_length": 168.375, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.07138945162296295, + "epoch": 0.736591989137814, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.390625, + "learning_rate": 3.960306015813228e-06, + "loss": 0.0, + "num_tokens": 5835131.0, + "reward": 2.3125, + "reward_std": 0.45806270837783813, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.3125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.45806270837783813, + "step": 1085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 186.0, + "completions/max_terminated_length": 186.0, + "completions/mean_length": 181.75, + "completions/mean_terminated_length": 181.75, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.03901159903034568, + "epoch": 0.7372708757637475, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9296875, + "learning_rate": 3.941425856024007e-06, + "loss": 0.0, + "num_tokens": 5839817.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 273.25, + "completions/mean_terminated_length": 273.25, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "entropy": 0.03441104665398598, + "epoch": 0.7379497623896809, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.92257975573042e-06, + "loss": 0.0, + "num_tokens": 5846835.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 204.0, + "completions/max_terminated_length": 204.0, + "completions/mean_length": 185.0, + "completions/mean_terminated_length": 185.0, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "entropy": 0.0771292713470757, + "epoch": 0.7386286490156144, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.34375, + "learning_rate": 3.9037678208796144e-06, + "loss": 0.0, + "num_tokens": 5851747.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 272.0, + "completions/mean_terminated_length": 272.0, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.03328163595870137, + "epoch": 0.7393075356415478, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "learning_rate": 3.884990157226683e-06, + "loss": 0.0, + "num_tokens": 5858155.0, + "reward": 2.7083334922790527, + "reward_std": 0.2920914888381958, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7083333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.2920915186405182, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 225.0, + "completions/max_terminated_length": 225.0, + "completions/mean_length": 197.125, + "completions/mean_terminated_length": 197.125, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "entropy": 0.06203055148944259, + "epoch": 0.7399864222674813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.866246870334036e-06, + "loss": 0.0, + "num_tokens": 5863068.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 197.0, + "completions/max_terminated_length": 197.0, + "completions/mean_length": 112.125, + "completions/mean_terminated_length": 112.125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.03161480696871877, + "epoch": 0.7406653088934148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.847538065570847e-06, + "loss": 0.0, + "num_tokens": 5867781.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 132.0, + "completions/max_terminated_length": 132.0, + "completions/mean_length": 104.625, + "completions/mean_terminated_length": 104.625, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.04372584540396929, + "epoch": 0.7413441955193483, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.96875, + "learning_rate": 3.828863848112425e-06, + "loss": -0.0, + "num_tokens": 5871706.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 217.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 200.375, + "completions/mean_terminated_length": 200.375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "entropy": 0.03522487124428153, + "epoch": 0.7420230821452818, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.810224322939655e-06, + "loss": 0.0, + "num_tokens": 5877237.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 162.0, + "completions/max_terminated_length": 162.0, + "completions/mean_length": 146.5, + "completions/mean_terminated_length": 146.5, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.041397218592464924, + "epoch": 0.7427019687712152, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.7916195948383817e-06, + "loss": 0.0, + "num_tokens": 5881489.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 167.0, + "completions/max_terminated_length": 167.0, + "completions/mean_length": 130.0, + "completions/mean_terminated_length": 130.0, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.06950572319328785, + "epoch": 0.7433808553971487, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.7730497683988287e-06, + "loss": 0.0, + "num_tokens": 5885801.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.0, + "completions/max_terminated_length": 152.0, + "completions/mean_length": 128.5, + "completions/mean_terminated_length": 128.5, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.04397960612550378, + "epoch": 0.7440597420230821, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.7545149480150224e-06, + "loss": 0.0, + "num_tokens": 5889909.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 164.0, + "completions/max_terminated_length": 164.0, + "completions/mean_length": 138.5, + "completions/mean_terminated_length": 138.5, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.07751855719834566, + "epoch": 0.7447386286490156, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.736015237884193e-06, + "loss": 0.0, + "num_tokens": 5894249.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 249.875, + "completions/mean_terminated_length": 249.875, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "entropy": 0.11444954946637154, + "epoch": 0.745417515274949, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0625, + "learning_rate": 3.7175507420061885e-06, + "loss": -0.0, + "num_tokens": 5899944.0, + "reward": 2.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 367.375, + "completions/mean_terminated_length": 367.375, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "entropy": 0.05010857083834708, + "epoch": 0.7460964019008826, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.6991215641828903e-06, + "loss": 0.0, + "num_tokens": 5908027.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 212.0, + "completions/max_terminated_length": 212.0, + "completions/mean_length": 194.0, + "completions/mean_terminated_length": 194.0, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.0829274570569396, + "epoch": 0.7467752885268161, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.680727808017638e-06, + "loss": 0.0, + "num_tokens": 5913347.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 169.0, + "completions/max_terminated_length": 169.0, + "completions/mean_length": 130.75, + "completions/mean_terminated_length": 130.75, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.04712094506248832, + "epoch": 0.7474541751527495, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.0, + "learning_rate": 3.662369576914642e-06, + "loss": 0.0, + "num_tokens": 5917617.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 196.5, + "completions/mean_terminated_length": 196.5, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.025217097951099277, + "epoch": 0.748133061778683, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.644046974078397e-06, + "loss": 0.0, + "num_tokens": 5923813.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 158.0, + "completions/max_terminated_length": 158.0, + "completions/mean_length": 142.875, + "completions/mean_terminated_length": 142.875, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.08944710437208414, + "epoch": 0.7488119484046164, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.96875, + "learning_rate": 3.625760102513103e-06, + "loss": -0.0, + "num_tokens": 5927948.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 174.0, + "completions/max_terminated_length": 174.0, + "completions/mean_length": 138.75, + "completions/mean_terminated_length": 138.75, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.055567214265465736, + "epoch": 0.7494908350305499, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.71875, + "learning_rate": 3.607509065022101e-06, + "loss": 0.0, + "num_tokens": 5932226.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 224.0, + "completions/max_terminated_length": 224.0, + "completions/mean_length": 192.875, + "completions/mean_terminated_length": 192.875, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "entropy": 0.06994170090183616, + "epoch": 0.7501697216564833, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.359375, + "learning_rate": 3.589293964207271e-06, + "loss": -0.0, + "num_tokens": 5937593.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 176.0, + "completions/max_terminated_length": 176.0, + "completions/mean_length": 152.5, + "completions/mean_terminated_length": 152.5, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.05150497052818537, + "epoch": 0.7508486082824168, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.03125, + "learning_rate": 3.57111490246848e-06, + "loss": 0.0, + "num_tokens": 5942053.0, + "reward": 2.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 1106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 302.125, + "completions/mean_terminated_length": 302.125, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "entropy": 0.035789793357253075, + "epoch": 0.7515274949083504, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.5529719820029785e-06, + "loss": 0.0, + "num_tokens": 5949006.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 197.5, + "completions/mean_terminated_length": 197.5, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "entropy": 0.055672927759587765, + "epoch": 0.7522063815342838, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.96875, + "learning_rate": 3.5348653048048598e-06, + "loss": 0.0, + "num_tokens": 5954418.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 191.375, + "completions/mean_terminated_length": 191.375, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 0.07923690136522055, + "epoch": 0.7528852681602173, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "learning_rate": 3.5167949726644545e-06, + "loss": 0.0, + "num_tokens": 5959053.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.0, + "completions/max_terminated_length": 152.0, + "completions/mean_length": 134.125, + "completions/mean_terminated_length": 134.125, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.03581171575933695, + "epoch": 0.7535641547861507, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.4987610871677746e-06, + "loss": 0.0, + "num_tokens": 5963654.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 137.0, + "completions/max_terminated_length": 137.0, + "completions/mean_length": 119.625, + "completions/mean_terminated_length": 119.625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.03456205199472606, + "epoch": 0.7542430414120842, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.875, + "learning_rate": 3.4807637496959433e-06, + "loss": -0.0, + "num_tokens": 5968331.0, + "reward": 2.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 180.0, + "completions/max_terminated_length": 180.0, + "completions/mean_length": 159.25, + "completions/mean_terminated_length": 159.25, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.05524680111557245, + "epoch": 0.7549219280380176, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.4628030614246266e-06, + "loss": 0.0, + "num_tokens": 5972709.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 185.375, + "completions/mean_terminated_length": 185.375, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "entropy": 0.056924451142549515, + "epoch": 0.7556008146639511, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.4448791233234467e-06, + "loss": 0.0, + "num_tokens": 5977312.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 181.25, + "completions/mean_terminated_length": 181.25, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.05375195760279894, + "epoch": 0.7562797012898846, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.4269920361554342e-06, + "loss": 0.0, + "num_tokens": 5982114.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 320.625, + "completions/mean_terminated_length": 320.625, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "entropy": 0.016106918221339583, + "epoch": 0.756958587915818, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "learning_rate": 3.409141900476457e-06, + "loss": -0.0, + "num_tokens": 5991655.0, + "reward": 2.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 1115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 256.375, + "completions/mean_terminated_length": 256.375, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 0.01662868820130825, + "epoch": 0.7576374745417516, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9375, + "learning_rate": 3.3913288166346525e-06, + "loss": 0.0, + "num_tokens": 5998130.0, + "reward": 2.953125, + "reward_std": 0.13258251547813416, + "rewards/fixed_code_pass_all_test_reward/mean": 0.953125, + "rewards/fixed_code_pass_all_test_reward/std": 0.13258251547813416, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 180.0, + "completions/max_terminated_length": 180.0, + "completions/mean_length": 155.125, + "completions/mean_terminated_length": 155.125, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "entropy": 0.07061495538800955, + "epoch": 0.758316361167685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.3735528847698597e-06, + "loss": 0.0, + "num_tokens": 6002595.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 166.0, + "completions/max_terminated_length": 166.0, + "completions/mean_length": 117.125, + "completions/mean_terminated_length": 117.125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.07740376610308886, + "epoch": 0.7589952477936185, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.890625, + "learning_rate": 3.355814204813058e-06, + "loss": -0.0, + "num_tokens": 6007004.0, + "reward": 2.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 245.0, + "completions/max_terminated_length": 245.0, + "completions/mean_length": 185.625, + "completions/mean_terminated_length": 185.625, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.07518411288037896, + "epoch": 0.7596741344195519, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.40625, + "learning_rate": 3.338112876485821e-06, + "loss": 0.0, + "num_tokens": 6012585.0, + "reward": 2.875, + "reward_std": 0.10350989550352097, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.1035098284482956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 205.375, + "completions/mean_terminated_length": 205.375, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.17964396253228188, + "epoch": 0.7603530210454854, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.3204489992997226e-06, + "loss": 0.0, + "num_tokens": 6017628.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 176.625, + "completions/mean_terminated_length": 176.625, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.049109778832644224, + "epoch": 0.7610319076714189, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.302822672555819e-06, + "loss": 0.0, + "num_tokens": 6022473.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 157.0, + "completions/max_terminated_length": 157.0, + "completions/mean_length": 130.75, + "completions/mean_terminated_length": 130.75, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.06834228057414293, + "epoch": 0.7617107942973523, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.734375, + "learning_rate": 3.285233995344049e-06, + "loss": -0.0, + "num_tokens": 6026639.0, + "reward": 2.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 191.0, + "completions/max_terminated_length": 191.0, + "completions/mean_length": 159.25, + "completions/mean_terminated_length": 159.25, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.04049702361226082, + "epoch": 0.7623896809232859, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.267683066542715e-06, + "loss": 0.0, + "num_tokens": 6031193.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 184.0, + "completions/max_terminated_length": 184.0, + "completions/mean_length": 155.5, + "completions/mean_terminated_length": 155.5, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.03817987139336765, + "epoch": 0.7630685675492193, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.250169984817897e-06, + "loss": 0.0, + "num_tokens": 6035989.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 229.375, + "completions/mean_terminated_length": 229.375, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "entropy": 0.04785697255283594, + "epoch": 0.7637474541751528, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.390625, + "learning_rate": 3.2326948486229105e-06, + "loss": 0.0, + "num_tokens": 6041808.0, + "reward": 2.7750000953674316, + "reward_std": 0.4200340211391449, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7749999761581421, + "rewards/fixed_code_pass_all_test_reward/std": 0.4200340509414673, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 149.375, + "completions/mean_terminated_length": 149.375, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.03661369648762047, + "epoch": 0.7644263408010862, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.40625, + "learning_rate": 3.215257756197758e-06, + "loss": 0.0, + "num_tokens": 6046563.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 183.0, + "completions/mean_terminated_length": 183.0, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 0.06413457496091723, + "epoch": 0.7651052274270197, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4375, + "learning_rate": 3.1978588055685733e-06, + "loss": -0.0, + "num_tokens": 6051155.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 255.75, + "completions/mean_terminated_length": 255.75, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "entropy": 0.10283747594803572, + "epoch": 0.7657841140529531, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.18049809454706e-06, + "loss": 0.0, + "num_tokens": 6056937.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 172.0, + "completions/max_terminated_length": 172.0, + "completions/mean_length": 142.125, + "completions/mean_terminated_length": 142.125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.1147898193448782, + "epoch": 0.7664630006788866, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.90625, + "learning_rate": 3.163175720729954e-06, + "loss": 0.0, + "num_tokens": 6061794.0, + "reward": 2.25, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, + "step": 1129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 170.875, + "completions/mean_terminated_length": 170.875, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "entropy": 0.0754433018155396, + "epoch": 0.7671418873048201, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.65625, + "learning_rate": 3.1458917814984657e-06, + "loss": 0.0, + "num_tokens": 6066681.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 118.0, + "completions/max_terminated_length": 118.0, + "completions/mean_length": 106.0, + "completions/mean_terminated_length": 106.0, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.04454766772687435, + "epoch": 0.7678207739307535, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.109375, + "learning_rate": 3.128646374017754e-06, + "loss": 0.0, + "num_tokens": 6070777.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 339.875, + "completions/mean_terminated_length": 339.875, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "entropy": 0.029199098702520132, + "epoch": 0.7684996605566871, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.96875, + "learning_rate": 3.1114395952363486e-06, + "loss": 0.0, + "num_tokens": 6078336.0, + "reward": 2.625, + "reward_std": 0.38917219638824463, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.25717225670814514, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 139.0, + "completions/max_terminated_length": 139.0, + "completions/mean_length": 123.125, + "completions/mean_terminated_length": 123.125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.10223288089036942, + "epoch": 0.7691785471826205, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.0942715418856184e-06, + "loss": 0.0, + "num_tokens": 6082425.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 268.0, + "completions/mean_terminated_length": 268.0, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.033164044842123985, + "epoch": 0.769857433808554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.0771423104792454e-06, + "loss": 0.0, + "num_tokens": 6089073.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 131.0, + "completions/max_terminated_length": 131.0, + "completions/mean_length": 118.125, + "completions/mean_terminated_length": 118.125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.0335758535657078, + "epoch": 0.7705363204344874, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "learning_rate": 3.060051997312646e-06, + "loss": 0.0, + "num_tokens": 6093226.0, + "reward": 2.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.875, + "completions/mean_terminated_length": 269.875, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.03158295247703791, + "epoch": 0.7712152070604209, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "learning_rate": 3.0430006984624704e-06, + "loss": -0.0, + "num_tokens": 6099881.0, + "reward": 1.96875, + "reward_std": 0.0883883461356163, + "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, + "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 160.0, + "completions/max_terminated_length": 160.0, + "completions/mean_length": 131.75, + "completions/mean_terminated_length": 131.75, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.05773442704230547, + "epoch": 0.7718940936863544, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8359375, + "learning_rate": 3.025988509786023e-06, + "loss": 0.0, + "num_tokens": 6104207.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 166.0, + "completions/max_terminated_length": 166.0, + "completions/mean_length": 122.125, + "completions/mean_terminated_length": 122.125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.06339313322678208, + "epoch": 0.7725729803122878, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.0090155269207575e-06, + "loss": 0.0, + "num_tokens": 6108376.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 104.0, + "completions/max_terminated_length": 104.0, + "completions/mean_length": 79.25, + "completions/mean_terminated_length": 79.25, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.015622854698449373, + "epoch": 0.7732518669382213, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.40625, + "learning_rate": 2.992081845283715e-06, + "loss": 0.0, + "num_tokens": 6112354.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 135.125, + "completions/mean_terminated_length": 135.125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.020465551177039742, + "epoch": 0.7739307535641547, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.975187560070998e-06, + "loss": 0.0, + "num_tokens": 6116963.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 171.0, + "completions/max_terminated_length": 171.0, + "completions/mean_length": 165.25, + "completions/mean_terminated_length": 165.25, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.0760266799479723, + "epoch": 0.7746096401900883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.958332766257237e-06, + "loss": 0.0, + "num_tokens": 6121573.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 158.0, + "completions/max_terminated_length": 158.0, + "completions/mean_length": 148.75, + "completions/mean_terminated_length": 148.75, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.05395929981023073, + "epoch": 0.7752885268160217, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.941517558595056e-06, + "loss": 0.0, + "num_tokens": 6126011.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 290.375, + "completions/mean_terminated_length": 290.375, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "entropy": 0.040367637760937214, + "epoch": 0.7759674134419552, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "learning_rate": 2.9247420316145324e-06, + "loss": 0.0, + "num_tokens": 6132990.0, + "reward": 1.975000023841858, + "reward_std": 0.0707106813788414, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9750000238418579, + "rewards/fixed_code_pass_all_test_reward/std": 0.0707106739282608, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 321.5, + "completions/mean_terminated_length": 321.5, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.04299045028164983, + "epoch": 0.7766463000678887, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.908006279622667e-06, + "loss": 0.0, + "num_tokens": 6140258.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 153.0, + "completions/max_terminated_length": 153.0, + "completions/mean_length": 137.5, + "completions/mean_terminated_length": 137.5, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.08779045380651951, + "epoch": 0.7773251866938221, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.859375, + "learning_rate": 2.8913103967028664e-06, + "loss": 0.0, + "num_tokens": 6144358.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 238.0, + "completions/max_terminated_length": 238.0, + "completions/mean_length": 204.375, + "completions/mean_terminated_length": 204.375, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "entropy": 0.027459141332656145, + "epoch": 0.7780040733197556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.8746544767144056e-06, + "loss": 0.0, + "num_tokens": 6150289.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 200.125, + "completions/mean_terminated_length": 200.125, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.04449179023504257, + "epoch": 0.778682959945689, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.8580386132918916e-06, + "loss": 0.0, + "num_tokens": 6155850.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 238.0, + "completions/max_terminated_length": 238.0, + "completions/mean_length": 196.625, + "completions/mean_terminated_length": 196.625, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "entropy": 0.06664275424554944, + "epoch": 0.7793618465716226, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.841462899844749e-06, + "loss": 0.0, + "num_tokens": 6160799.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 276.5, + "completions/mean_terminated_length": 276.5, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "entropy": 0.0406491719186306, + "epoch": 0.780040733197556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.8249274295566863e-06, + "loss": 0.0, + "num_tokens": 6167507.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 299.625, + "completions/mean_terminated_length": 299.625, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.05521875433623791, + "epoch": 0.7807196198234895, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.8084322953851963e-06, + "loss": 0.0, + "num_tokens": 6174848.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 148.5, + "completions/mean_terminated_length": 148.5, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.07622930407524109, + "epoch": 0.781398506449423, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.79197759006099e-06, + "loss": 0.0, + "num_tokens": 6179548.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 162.0, + "completions/max_terminated_length": 162.0, + "completions/mean_length": 133.25, + "completions/mean_terminated_length": 133.25, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.08072125958278775, + "epoch": 0.7820773930753564, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.46875, + "learning_rate": 2.7755634060875135e-06, + "loss": -0.0, + "num_tokens": 6183798.0, + "reward": 2.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 1152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 190.625, + "completions/mean_terminated_length": 190.625, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.09614286851137877, + "epoch": 0.7827562797012899, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4375, + "learning_rate": 2.7591898357404066e-06, + "loss": -0.0, + "num_tokens": 6189283.0, + "reward": 2.9479165077209473, + "reward_std": 0.14731398224830627, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9479166269302368, + "rewards/fixed_code_pass_all_test_reward/std": 0.1473139226436615, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 181.0, + "completions/max_terminated_length": 181.0, + "completions/mean_length": 174.375, + "completions/mean_terminated_length": 174.375, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 0.06191416038200259, + "epoch": 0.7834351663272233, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.640625, + "learning_rate": 2.742856971066996e-06, + "loss": -0.0, + "num_tokens": 6194438.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 180.0, + "completions/mean_terminated_length": 180.0, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.04340082639828324, + "epoch": 0.7841140529531568, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.7265649038857776e-06, + "loss": 0.0, + "num_tokens": 6199062.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 131.0, + "completions/max_terminated_length": 131.0, + "completions/mean_length": 80.875, + "completions/mean_terminated_length": 80.875, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.05926245450973511, + "epoch": 0.7847929395790902, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.125, + "learning_rate": 2.7103137257858867e-06, + "loss": -0.0, + "num_tokens": 6202733.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 137.0, + "completions/max_terminated_length": 137.0, + "completions/mean_length": 109.25, + "completions/mean_terminated_length": 109.25, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.04237109562382102, + "epoch": 0.7854718262050238, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.21875, + "learning_rate": 2.6941035281265936e-06, + "loss": -0.0, + "num_tokens": 6207007.0, + "reward": 2.8499999046325684, + "reward_std": 0.2777459919452667, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8500000238418579, + "rewards/fixed_code_pass_all_test_reward/std": 0.2777460217475891, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 115.0, + "completions/max_terminated_length": 115.0, + "completions/mean_length": 94.875, + "completions/mean_terminated_length": 94.875, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.04313861997798085, + "epoch": 0.7861507128309573, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.677934402036797e-06, + "loss": 0.0, + "num_tokens": 6211062.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 109.875, + "completions/mean_terminated_length": 109.875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.03325886162929237, + "epoch": 0.7868295994568907, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.6618064384144925e-06, + "loss": 0.0, + "num_tokens": 6215757.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 228.125, + "completions/mean_terminated_length": 228.125, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.027541066287085414, + "epoch": 0.7875084860828242, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.6457197279262835e-06, + "loss": 0.0, + "num_tokens": 6221766.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 160.0, + "completions/max_terminated_length": 160.0, + "completions/mean_length": 135.75, + "completions/mean_terminated_length": 135.75, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.043505198787897825, + "epoch": 0.7881873727087576, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.629674361006851e-06, + "loss": 0.0, + "num_tokens": 6225972.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 209.0, + "completions/max_terminated_length": 209.0, + "completions/mean_length": 183.75, + "completions/mean_terminated_length": 183.75, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.07034529093652964, + "epoch": 0.7888662593346911, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.953125, + "learning_rate": 2.6136704278584624e-06, + "loss": 0.0, + "num_tokens": 6230802.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 137.0, + "completions/max_terminated_length": 137.0, + "completions/mean_length": 115.5, + "completions/mean_terminated_length": 115.5, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.05324462801218033, + "epoch": 0.7895451459606245, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.597708018450453e-06, + "loss": 0.0, + "num_tokens": 6235102.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 181.0, + "completions/max_terminated_length": 181.0, + "completions/mean_length": 122.0, + "completions/mean_terminated_length": 122.0, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.038038093596696854, + "epoch": 0.790224032586558, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.609375, + "learning_rate": 2.58178722251872e-06, + "loss": 0.0, + "num_tokens": 6239670.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 177.0, + "completions/max_terminated_length": 177.0, + "completions/mean_length": 162.5, + "completions/mean_terminated_length": 162.5, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.0353456602897495, + "epoch": 0.7909029192124916, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.5659081295652298e-06, + "loss": 0.0, + "num_tokens": 6244194.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 200.625, + "completions/mean_terminated_length": 200.625, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.06646355940029025, + "epoch": 0.791581805838425, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.515625, + "learning_rate": 2.550070828857506e-06, + "loss": -0.0, + "num_tokens": 6249279.0, + "reward": 2.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 124.0, + "completions/max_terminated_length": 124.0, + "completions/mean_length": 118.5, + "completions/mean_terminated_length": 118.5, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.0543752028606832, + "epoch": 0.7922606924643585, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.5342754094281253e-06, + "loss": 0.0, + "num_tokens": 6253499.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 229.875, + "completions/mean_terminated_length": 229.875, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "entropy": 0.05022946512326598, + "epoch": 0.7929395790902919, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "learning_rate": 2.518521960074217e-06, + "loss": 0.0, + "num_tokens": 6259562.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 199.0, + "completions/max_terminated_length": 199.0, + "completions/mean_length": 171.875, + "completions/mean_terminated_length": 171.875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.043414485175162554, + "epoch": 0.7936184657162254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.502810569356976e-06, + "loss": 0.0, + "num_tokens": 6264297.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 202.0, + "completions/max_terminated_length": 202.0, + "completions/mean_length": 147.5, + "completions/mean_terminated_length": 147.5, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.05755961430259049, + "epoch": 0.7942973523421588, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.4871413256011534e-06, + "loss": 0.0, + "num_tokens": 6269549.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 245.0, + "completions/max_terminated_length": 245.0, + "completions/mean_length": 221.875, + "completions/mean_terminated_length": 221.875, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "entropy": 0.04200444184243679, + "epoch": 0.7949762389680923, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.171875, + "learning_rate": 2.471514316894559e-06, + "loss": -0.0, + "num_tokens": 6275292.0, + "reward": 2.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 139.0, + "completions/max_terminated_length": 139.0, + "completions/mean_length": 132.25, + "completions/mean_terminated_length": 132.25, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.021738025126978755, + "epoch": 0.7956551255940258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.455929631087568e-06, + "loss": 0.0, + "num_tokens": 6279734.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 169.0, + "completions/max_terminated_length": 169.0, + "completions/mean_length": 135.875, + "completions/mean_terminated_length": 135.875, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.033521171659231186, + "epoch": 0.7963340122199593, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "learning_rate": 2.440387355792638e-06, + "loss": 0.0, + "num_tokens": 6284221.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 202.125, + "completions/mean_terminated_length": 202.125, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "entropy": 0.07864878699183464, + "epoch": 0.7970128988458928, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.625, + "learning_rate": 2.424887578383799e-06, + "loss": -0.0, + "num_tokens": 6289326.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 330.5, + "completions/mean_terminated_length": 330.5, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "entropy": 0.025022073416039348, + "epoch": 0.7976917854718262, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "learning_rate": 2.4094303859961774e-06, + "loss": 0.0, + "num_tokens": 6296914.0, + "reward": 2.8125, + "reward_std": 0.1767766922712326, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, + "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 204.125, + "completions/mean_terminated_length": 204.125, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.020753496093675494, + "epoch": 0.7983706720977597, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.394015865525491e-06, + "loss": 0.0, + "num_tokens": 6302883.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 112.0, + "completions/max_terminated_length": 112.0, + "completions/mean_length": 98.0, + "completions/mean_terminated_length": 98.0, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.035069910110905766, + "epoch": 0.7990495587236931, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.3786441036275764e-06, + "loss": 0.0, + "num_tokens": 6306899.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 169.0, + "completions/max_terminated_length": 169.0, + "completions/mean_length": 149.5, + "completions/mean_terminated_length": 149.5, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.07078630151227117, + "epoch": 0.7997284453496266, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.3633151867178915e-06, + "loss": 0.0, + "num_tokens": 6311303.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 246.625, + "completions/mean_terminated_length": 246.625, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "entropy": 0.09753914177417755, + "epoch": 0.8004073319755601, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.375, + "learning_rate": 2.3480292009710282e-06, + "loss": -0.0, + "num_tokens": 6317148.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 208.375, + "completions/mean_terminated_length": 208.375, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "entropy": 0.01735112862661481, + "epoch": 0.8010862186014935, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.3327862323202377e-06, + "loss": 0.0, + "num_tokens": 6323319.0, + "reward": 2.375, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.0, + "completions/max_terminated_length": 152.0, + "completions/mean_length": 139.0, + "completions/mean_terminated_length": 139.0, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.08102944400161505, + "epoch": 0.801765105227427, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.53125, + "learning_rate": 2.3175863664569454e-06, + "loss": 0.0, + "num_tokens": 6327775.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 145.0, + "completions/max_terminated_length": 145.0, + "completions/mean_length": 103.5, + "completions/mean_terminated_length": 103.5, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.058857130352407694, + "epoch": 0.8024439918533605, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.3024296888302565e-06, + "loss": 0.0, + "num_tokens": 6331731.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 229.0, + "completions/max_terminated_length": 229.0, + "completions/mean_length": 193.125, + "completions/mean_terminated_length": 193.125, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 0.03190521849319339, + "epoch": 0.803122878479294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.2873162846464868e-06, + "loss": 0.0, + "num_tokens": 6336996.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 208.0, + "completions/max_terminated_length": 208.0, + "completions/mean_length": 177.875, + "completions/mean_terminated_length": 177.875, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.11243955511599779, + "epoch": 0.8038017651052274, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.296875, + "learning_rate": 2.272246238868687e-06, + "loss": -0.0, + "num_tokens": 6342011.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 311.25, + "completions/mean_terminated_length": 311.25, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "entropy": 0.04063680907711387, + "epoch": 0.8044806517311609, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.2572196362161592e-06, + "loss": 0.0, + "num_tokens": 6349269.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 169.0, + "completions/max_terminated_length": 169.0, + "completions/mean_length": 136.75, + "completions/mean_terminated_length": 136.75, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.07085582660511136, + "epoch": 0.8051595383570944, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.1875, + "learning_rate": 2.242236561163976e-06, + "loss": -0.0, + "num_tokens": 6353635.0, + "reward": 2.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 175.0, + "completions/max_terminated_length": 175.0, + "completions/mean_length": 173.25, + "completions/mean_terminated_length": 173.25, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.0492614540271461, + "epoch": 0.8058384249830278, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.09375, + "learning_rate": 2.227297097942511e-06, + "loss": -0.0, + "num_tokens": 6359085.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 1187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 142.0, + "completions/max_terminated_length": 142.0, + "completions/mean_length": 107.75, + "completions/mean_terminated_length": 107.75, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.030527201481163502, + "epoch": 0.8065173116089613, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.212401330536973e-06, + "loss": 0.0, + "num_tokens": 6363091.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 256.25, + "completions/mean_terminated_length": 256.25, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "entropy": 0.03725535562261939, + "epoch": 0.8071961982348947, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.53125, + "learning_rate": 2.1975493426869155e-06, + "loss": 0.0, + "num_tokens": 6369549.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 264.5, + "completions/mean_terminated_length": 264.5, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "entropy": 0.03545244783163071, + "epoch": 0.8078750848608283, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.015625, + "learning_rate": 2.1827412178857866e-06, + "loss": 0.0, + "num_tokens": 6376537.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 229.0, + "completions/max_terminated_length": 229.0, + "completions/mean_length": 197.875, + "completions/mean_terminated_length": 197.875, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.07140312995761633, + "epoch": 0.8085539714867617, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6875, + "learning_rate": 2.167977039380439e-06, + "loss": 0.0, + "num_tokens": 6381464.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 163.0, + "completions/max_terminated_length": 163.0, + "completions/mean_length": 142.25, + "completions/mean_terminated_length": 142.25, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.0885469326749444, + "epoch": 0.8092328581126952, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.03125, + "learning_rate": 2.153256890170683e-06, + "loss": 0.0, + "num_tokens": 6385690.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 196.0, + "completions/max_terminated_length": 196.0, + "completions/mean_length": 160.0, + "completions/mean_terminated_length": 160.0, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "entropy": 0.05450877780094743, + "epoch": 0.8099117447386287, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.375, + "learning_rate": 2.1385808530088024e-06, + "loss": 0.0, + "num_tokens": 6390402.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 178.25, + "completions/mean_terminated_length": 178.25, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.07823984464630485, + "epoch": 0.8105906313645621, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.734375, + "learning_rate": 2.1239490103990946e-06, + "loss": -0.0, + "num_tokens": 6395244.0, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 177.0, + "completions/max_terminated_length": 177.0, + "completions/mean_length": 158.125, + "completions/mean_terminated_length": 158.125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.03320121788419783, + "epoch": 0.8112695179904956, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.109361444597414e-06, + "loss": 0.0, + "num_tokens": 6399733.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 237.0, + "completions/max_terminated_length": 237.0, + "completions/mean_length": 166.5, + "completions/mean_terminated_length": 166.5, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.06206024717539549, + "epoch": 0.811948404616429, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.71875, + "learning_rate": 2.0948182376107063e-06, + "loss": 0.0, + "num_tokens": 6404313.0, + "reward": 2.5, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 1196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 190.0, + "completions/max_terminated_length": 190.0, + "completions/mean_length": 171.0, + "completions/mean_terminated_length": 171.0, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.054358861874789, + "epoch": 0.8126272912423625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.0803194711965356e-06, + "loss": 0.0, + "num_tokens": 6409145.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.0, + "completions/max_terminated_length": 152.0, + "completions/mean_length": 121.0, + "completions/mean_terminated_length": 121.0, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.057942730374634266, + "epoch": 0.813306177868296, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.15625, + "learning_rate": 2.0658652268626402e-06, + "loss": 0.0, + "num_tokens": 6413425.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 157.0, + "completions/max_terminated_length": 157.0, + "completions/mean_length": 141.5, + "completions/mean_terminated_length": 141.5, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.05780556984245777, + "epoch": 0.8139850644942295, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.0514555858664663e-06, + "loss": 0.0, + "num_tokens": 6417621.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 168.0, + "completions/max_terminated_length": 168.0, + "completions/mean_length": 155.25, + "completions/mean_terminated_length": 155.25, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.05955437617376447, + "epoch": 0.814663951120163, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.037090629214721e-06, + "loss": 0.0, + "num_tokens": 6422271.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 149.0, + "completions/max_terminated_length": 149.0, + "completions/mean_length": 108.875, + "completions/mean_terminated_length": 108.875, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.06297881994396448, + "epoch": 0.8153428377460964, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4375, + "learning_rate": 2.0227704376628987e-06, + "loss": -0.0, + "num_tokens": 6426270.0, + "reward": 2.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 161.0, + "completions/max_terminated_length": 161.0, + "completions/mean_length": 146.75, + "completions/mean_terminated_length": 146.75, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.05374971451237798, + "epoch": 0.8160217243720299, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.0, + "learning_rate": 2.0084950917148403e-06, + "loss": 0.0, + "num_tokens": 6430684.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 187.875, + "completions/mean_terminated_length": 187.875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.10325939115136862, + "epoch": 0.8167006109979633, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.421875, + "learning_rate": 1.9942646716222867e-06, + "loss": -0.0, + "num_tokens": 6435323.0, + "reward": 2.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 162.0, + "completions/max_terminated_length": 162.0, + "completions/mean_length": 136.625, + "completions/mean_terminated_length": 136.625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.051525934133678675, + "epoch": 0.8173794976238968, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.28125, + "learning_rate": 1.980079257384405e-06, + "loss": 0.0, + "num_tokens": 6440224.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 1204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 189.5, + "completions/mean_terminated_length": 189.5, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "entropy": 0.07622804772108793, + "epoch": 0.8180583842498302, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9659389287473675e-06, + "loss": 0.0, + "num_tokens": 6444860.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 220.875, + "completions/mean_terminated_length": 220.875, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "entropy": 0.0970824770629406, + "epoch": 0.8187372708757638, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "learning_rate": 1.9518437652038757e-06, + "loss": 0.0, + "num_tokens": 6451179.0, + "reward": 1.4375, + "reward_std": 0.6196196675300598, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, + "rewards/fixed_code_pass_all_test_reward/std": 0.3133915960788727, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 183.0, + "completions/max_terminated_length": 183.0, + "completions/mean_length": 156.875, + "completions/mean_terminated_length": 156.875, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.056854731403291225, + "epoch": 0.8194161575016972, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.937793845992737e-06, + "loss": 0.0, + "num_tokens": 6455634.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 233.625, + "completions/mean_terminated_length": 233.625, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "entropy": 0.030172571539878845, + "epoch": 0.8200950441276307, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9237892500984022e-06, + "loss": 0.0, + "num_tokens": 6461823.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 126.0, + "completions/max_terminated_length": 126.0, + "completions/mean_length": 101.5, + "completions/mean_terminated_length": 101.5, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.041986153926700354, + "epoch": 0.8207739307535642, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.9098300562505266e-06, + "loss": 0.0, + "num_tokens": 6465747.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 199.875, + "completions/mean_terminated_length": 199.875, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.052354331593960524, + "epoch": 0.8214528173794976, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.895916342923534e-06, + "loss": 0.0, + "num_tokens": 6471058.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 163.0, + "completions/max_terminated_length": 163.0, + "completions/mean_length": 127.25, + "completions/mean_terminated_length": 127.25, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.055486186407506466, + "epoch": 0.8221317040054311, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.882048188336172e-06, + "loss": 0.0, + "num_tokens": 6475308.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 179.25, + "completions/mean_terminated_length": 179.25, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.0853898017667234, + "epoch": 0.8228105906313645, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.71875, + "learning_rate": 1.8682256704510625e-06, + "loss": 0.0, + "num_tokens": 6480182.0, + "reward": 2.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 184.875, + "completions/mean_terminated_length": 184.875, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "entropy": 0.05451425025239587, + "epoch": 0.823489477257298, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.390625, + "learning_rate": 1.8544488669742755e-06, + "loss": 0.0, + "num_tokens": 6485453.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 280.375, + "completions/mean_terminated_length": 280.375, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.0478174164891243, + "epoch": 0.8241683638832314, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "learning_rate": 1.8407178553548876e-06, + "loss": 0.0, + "num_tokens": 6493336.0, + "reward": 2.625, + "reward_std": 0.43404853343963623, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.039811473339796066, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 1214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 238.5, + "completions/mean_terminated_length": 238.5, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.02480868436396122, + "epoch": 0.824847250509165, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.828125, + "learning_rate": 1.8270327127845534e-06, + "loss": -0.0, + "num_tokens": 6499508.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 1215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 148.0, + "completions/max_terminated_length": 148.0, + "completions/mean_length": 140.0, + "completions/mean_terminated_length": 140.0, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.05973181268200278, + "epoch": 0.8255261371350985, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.8133935161970561e-06, + "loss": 0.0, + "num_tokens": 6503732.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 147.0, + "completions/max_terminated_length": 147.0, + "completions/mean_length": 119.625, + "completions/mean_terminated_length": 119.625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.05151199037209153, + "epoch": 0.8262050237610319, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.7998003422678867e-06, + "loss": 0.0, + "num_tokens": 6508017.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 209.0, + "completions/max_terminated_length": 209.0, + "completions/mean_length": 181.875, + "completions/mean_terminated_length": 181.875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.06716850865632296, + "epoch": 0.8268839103869654, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.7862532674138166e-06, + "loss": 0.0, + "num_tokens": 6512704.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 157.0, + "completions/max_terminated_length": 157.0, + "completions/mean_length": 143.0, + "completions/mean_terminated_length": 143.0, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.02405495452694595, + "epoch": 0.8275627970128988, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.772752367792452e-06, + "loss": 0.0, + "num_tokens": 6517136.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 212.0, + "completions/max_terminated_length": 212.0, + "completions/mean_length": 191.75, + "completions/mean_terminated_length": 191.75, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "entropy": 0.03884179727174342, + "epoch": 0.8282416836388323, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "learning_rate": 1.7592977193018268e-06, + "loss": -0.0, + "num_tokens": 6522502.0, + "reward": 2.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 1220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 212.125, + "completions/mean_terminated_length": 212.125, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "entropy": 0.059142252430319786, + "epoch": 0.8289205702647657, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.71875, + "learning_rate": 1.745889397579954e-06, + "loss": 0.0, + "num_tokens": 6527423.0, + "reward": 2.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 141.0, + "completions/max_terminated_length": 141.0, + "completions/mean_length": 127.0, + "completions/mean_terminated_length": 127.0, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.08818131405860186, + "epoch": 0.8295994568906992, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.515625, + "learning_rate": 1.732527478004422e-06, + "loss": 0.0, + "num_tokens": 6531551.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, + "step": 1222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 232.125, + "completions/mean_terminated_length": 232.125, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "entropy": 0.030679827090352774, + "epoch": 0.8302783435166328, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.7192120356919517e-06, + "loss": 0.0, + "num_tokens": 6537632.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 351.25, + "completions/mean_terminated_length": 351.25, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "entropy": 0.04530319105833769, + "epoch": 0.8309572301425662, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.7059431454979825e-06, + "loss": 0.0, + "num_tokens": 6546074.0, + "reward": 2.5, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 186.0, + "completions/max_terminated_length": 186.0, + "completions/mean_length": 165.25, + "completions/mean_terminated_length": 165.25, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "entropy": 0.07944596000015736, + "epoch": 0.8316361167684997, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.6927208820162589e-06, + "loss": 0.0, + "num_tokens": 6550684.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 270.5, + "completions/mean_terminated_length": 270.5, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "entropy": 0.11907055135816336, + "epoch": 0.8323150033944331, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.34375, + "learning_rate": 1.6795453195784017e-06, + "loss": 0.0, + "num_tokens": 6556560.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 157.375, + "completions/mean_terminated_length": 157.375, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.06162241520360112, + "epoch": 0.8329938900203666, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "learning_rate": 1.6664165322534887e-06, + "loss": -0.0, + "num_tokens": 6561035.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 141.0, + "completions/max_terminated_length": 141.0, + "completions/mean_length": 97.625, + "completions/mean_terminated_length": 97.625, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "entropy": 0.04993098718114197, + "epoch": 0.8336727766463, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.653334593847643e-06, + "loss": 0.0, + "num_tokens": 6564856.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 175.0, + "completions/max_terminated_length": 175.0, + "completions/mean_length": 131.75, + "completions/mean_terminated_length": 131.75, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.04275673255324364, + "epoch": 0.8343516632722335, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "learning_rate": 1.6402995779036146e-06, + "loss": -0.0, + "num_tokens": 6569446.0, + "reward": 2.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 1229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 364.25, + "completions/mean_terminated_length": 364.25, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "entropy": 0.021586764021776617, + "epoch": 0.835030549898167, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "learning_rate": 1.6273115577003806e-06, + "loss": 0.0, + "num_tokens": 6577832.0, + "reward": 2.075000047683716, + "reward_std": 0.384522020816803, + "rewards/fixed_code_pass_all_test_reward/mean": 0.949999988079071, + "rewards/fixed_code_pass_all_test_reward/std": 0.09258200973272324, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 214.0, + "completions/max_terminated_length": 214.0, + "completions/mean_length": 200.5, + "completions/mean_terminated_length": 200.5, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "entropy": 0.0508713866584003, + "epoch": 0.8357094365241005, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.34375, + "learning_rate": 1.6143706062527108e-06, + "loss": 0.0, + "num_tokens": 6583372.0, + "reward": 2.0, + "reward_std": 0.8141603469848633, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.1414213627576828, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 1231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 177.0, + "completions/max_terminated_length": 177.0, + "completions/mean_length": 143.375, + "completions/mean_terminated_length": 143.375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.0736389528028667, + "epoch": 0.836388323150034, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "learning_rate": 1.6014767963107715e-06, + "loss": -0.0, + "num_tokens": 6587711.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 209.0, + "completions/max_terminated_length": 209.0, + "completions/mean_length": 142.0, + "completions/mean_terminated_length": 142.0, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.06715327175334096, + "epoch": 0.8370672097759674, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.96875, + "learning_rate": 1.588630200359711e-06, + "loss": 0.0, + "num_tokens": 6592847.0, + "reward": 1.875, + "reward_std": 0.06681530922651291, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.06681530922651291, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 154.0, + "completions/max_terminated_length": 154.0, + "completions/mean_length": 129.0, + "completions/mean_terminated_length": 129.0, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.058211774099618196, + "epoch": 0.8377460964019009, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.484375, + "learning_rate": 1.575830890619261e-06, + "loss": -0.0, + "num_tokens": 6597239.0, + "reward": 2.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.375, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 1234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 156.625, + "completions/mean_terminated_length": 156.625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.047470828518271446, + "epoch": 0.8384249830278343, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.563078939043322e-06, + "loss": 0.0, + "num_tokens": 6601780.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 297.875, + "completions/mean_terminated_length": 297.875, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "entropy": 0.050020713126286864, + "epoch": 0.8391038696537678, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.5503744173195568e-06, + "loss": 0.0, + "num_tokens": 6608883.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 177.125, + "completions/mean_terminated_length": 177.125, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.09777758829295635, + "epoch": 0.8397827562797013, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.5377173968689985e-06, + "loss": 0.0, + "num_tokens": 6613596.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 196.0, + "completions/max_terminated_length": 196.0, + "completions/mean_length": 179.875, + "completions/mean_terminated_length": 179.875, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "entropy": 0.0899508735165, + "epoch": 0.8404616429056347, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.5251079488456367e-06, + "loss": 0.0, + "num_tokens": 6618491.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 193.0, + "completions/max_terminated_length": 193.0, + "completions/mean_length": 168.375, + "completions/mean_terminated_length": 168.375, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.07908450113609433, + "epoch": 0.8411405295315683, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.921875, + "learning_rate": 1.5125461441360223e-06, + "loss": 0.0, + "num_tokens": 6623294.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 157.0, + "completions/max_terminated_length": 157.0, + "completions/mean_length": 141.625, + "completions/mean_terminated_length": 141.625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.07393840188160539, + "epoch": 0.8418194161575017, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.500032053358874e-06, + "loss": 0.0, + "num_tokens": 6627603.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 238.625, + "completions/mean_terminated_length": 238.625, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "entropy": 0.03621608763933182, + "epoch": 0.8424983027834352, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.4875657468646788e-06, + "loss": 0.0, + "num_tokens": 6633832.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 146.0, + "completions/max_terminated_length": 146.0, + "completions/mean_length": 140.875, + "completions/mean_terminated_length": 140.875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.036038106540217996, + "epoch": 0.8431771894093686, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.4751472947352884e-06, + "loss": 0.0, + "num_tokens": 6638223.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 353.125, + "completions/mean_terminated_length": 353.125, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "entropy": 0.047747176606208086, + "epoch": 0.8438560760353021, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "learning_rate": 1.4627767667835336e-06, + "loss": -0.0, + "num_tokens": 6646968.0, + "reward": 2.7142856121063232, + "reward_std": 0.3581618070602417, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8392857313156128, + "rewards/fixed_code_pass_all_test_reward/std": 0.22180677950382233, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 188.5, + "completions/mean_terminated_length": 188.5, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "entropy": 0.04761378560215235, + "epoch": 0.8445349626612356, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.450454232552826e-06, + "loss": 0.0, + "num_tokens": 6651836.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 202.0, + "completions/max_terminated_length": 202.0, + "completions/mean_length": 151.875, + "completions/mean_terminated_length": 151.875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.022800437407568097, + "epoch": 0.845213849287169, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.4381797613167859e-06, + "loss": 0.0, + "num_tokens": 6656619.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 186.0, + "completions/max_terminated_length": 186.0, + "completions/mean_length": 145.5, + "completions/mean_terminated_length": 145.5, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.05551019869744778, + "epoch": 0.8458927359131025, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.328125, + "learning_rate": 1.4259534220788207e-06, + "loss": 0.0, + "num_tokens": 6661183.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 174.0, + "completions/max_terminated_length": 174.0, + "completions/mean_length": 132.375, + "completions/mean_terminated_length": 132.375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.07395130163058639, + "epoch": 0.846571622539036, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.4137752835717622e-06, + "loss": 0.0, + "num_tokens": 6665602.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 112.0, + "completions/max_terminated_length": 112.0, + "completions/mean_length": 91.875, + "completions/mean_terminated_length": 91.875, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.04875294351950288, + "epoch": 0.8472505091649695, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.4016454142574676e-06, + "loss": 0.0, + "num_tokens": 6669561.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 319.5, + "completions/mean_terminated_length": 319.5, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "entropy": 0.027896488085389137, + "epoch": 0.8479293957909029, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "learning_rate": 1.3895638823264447e-06, + "loss": 0.0, + "num_tokens": 6676957.0, + "reward": 1.8055555820465088, + "reward_std": 0.3293411433696747, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6805555820465088, + "rewards/fixed_code_pass_all_test_reward/std": 0.28752732276916504, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 177.875, + "completions/mean_terminated_length": 177.875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.03768895659595728, + "epoch": 0.8486082824168364, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.578125, + "learning_rate": 1.3775307556974616e-06, + "loss": -0.0, + "num_tokens": 6681804.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 268.875, + "completions/mean_terminated_length": 268.875, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "entropy": 0.031340275425463915, + "epoch": 0.8492871690427699, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.3655461020171635e-06, + "loss": 0.0, + "num_tokens": 6688331.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 186.0, + "completions/max_terminated_length": 186.0, + "completions/mean_length": 161.875, + "completions/mean_terminated_length": 161.875, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.04886147053912282, + "epoch": 0.8499660556687033, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.3536099886596933e-06, + "loss": 0.0, + "num_tokens": 6693146.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 197.0, + "completions/mean_terminated_length": 197.0, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "entropy": 0.052328506018966436, + "epoch": 0.8506449422946368, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.3417224827263232e-06, + "loss": 0.0, + "num_tokens": 6698690.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 160.0, + "completions/max_terminated_length": 160.0, + "completions/mean_length": 138.625, + "completions/mean_terminated_length": 138.625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.06357053900137544, + "epoch": 0.8513238289205702, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.3298836510450597e-06, + "loss": 0.0, + "num_tokens": 6703055.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 118.125, + "completions/mean_terminated_length": 118.125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.053554283920675516, + "epoch": 0.8520027155465038, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.078125, + "learning_rate": 1.3180935601702838e-06, + "loss": -0.0, + "num_tokens": 6707240.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 218.25, + "completions/mean_terminated_length": 218.25, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.047173636965453625, + "epoch": 0.8526816021724372, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.3063522763823655e-06, + "loss": 0.0, + "num_tokens": 6713002.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 164.0, + "completions/max_terminated_length": 164.0, + "completions/mean_length": 147.0, + "completions/mean_terminated_length": 147.0, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.09581214655190706, + "epoch": 0.8533604887983707, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.2946598656873e-06, + "loss": 0.0, + "num_tokens": 6717530.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 140.0, + "completions/max_terminated_length": 140.0, + "completions/mean_length": 126.75, + "completions/mean_terminated_length": 126.75, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.018327789497561753, + "epoch": 0.8540393754243042, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.2830163938163298e-06, + "loss": 0.0, + "num_tokens": 6722056.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 113.5, + "completions/mean_terminated_length": 113.5, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.03673034347593784, + "epoch": 0.8547182620502376, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.2714219262255777e-06, + "loss": 0.0, + "num_tokens": 6726796.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 243.5, + "completions/mean_terminated_length": 243.5, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "entropy": 0.10153366811573505, + "epoch": 0.8553971486761711, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.2598765280956793e-06, + "loss": 0.0, + "num_tokens": 6732488.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 194.0, + "completions/max_terminated_length": 194.0, + "completions/mean_length": 166.25, + "completions/mean_terminated_length": 166.25, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.09755927696824074, + "epoch": 0.8560760353021045, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.2483802643314224e-06, + "loss": 0.0, + "num_tokens": 6737282.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 155.0, + "completions/mean_terminated_length": 155.0, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.06965050008147955, + "epoch": 0.856754921928038, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "learning_rate": 1.2369331995613664e-06, + "loss": 0.0, + "num_tokens": 6741754.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 192.625, + "completions/mean_terminated_length": 192.625, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "entropy": 0.04192041279748082, + "epoch": 0.8574338085539714, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.2255353981374906e-06, + "loss": 0.0, + "num_tokens": 6747199.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 145.0, + "completions/max_terminated_length": 145.0, + "completions/mean_length": 121.125, + "completions/mean_terminated_length": 121.125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.09490544628351927, + "epoch": 0.858112695179905, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.515625, + "learning_rate": 1.214186924134838e-06, + "loss": -0.0, + "num_tokens": 6751296.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 192.375, + "completions/mean_terminated_length": 192.375, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.04287162306718528, + "epoch": 0.8587915818058385, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.2028878413511413e-06, + "loss": 0.0, + "num_tokens": 6756851.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 290.5, + "completions/mean_terminated_length": 290.5, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "entropy": 0.03735964884981513, + "epoch": 0.8594704684317719, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.1916382133064707e-06, + "loss": 0.0, + "num_tokens": 6764007.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 165.0, + "completions/max_terminated_length": 165.0, + "completions/mean_length": 143.0, + "completions/mean_terminated_length": 143.0, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.061605677008628845, + "epoch": 0.8601493550577054, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.180438103242877e-06, + "loss": 0.0, + "num_tokens": 6768343.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 217.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 190.125, + "completions/mean_terminated_length": 190.125, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.07917618751525879, + "epoch": 0.8608282416836388, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.1692875741240384e-06, + "loss": 0.0, + "num_tokens": 6773296.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 198.125, + "completions/mean_terminated_length": 198.125, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "entropy": 0.07407456485088915, + "epoch": 0.8615071283095723, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "learning_rate": 1.158186688634898e-06, + "loss": 0.0, + "num_tokens": 6778153.0, + "reward": 2.25, + "reward_std": 0.8864052295684814, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 1269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 190.0, + "completions/max_terminated_length": 190.0, + "completions/mean_length": 175.125, + "completions/mean_terminated_length": 175.125, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.0826310757547617, + "epoch": 0.8621860149355057, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.953125, + "learning_rate": 1.1471355091813251e-06, + "loss": 0.0, + "num_tokens": 6782714.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 202.0, + "completions/max_terminated_length": 202.0, + "completions/mean_length": 179.75, + "completions/mean_terminated_length": 179.75, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.04163091070950031, + "epoch": 0.8628649015614392, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984375, + "learning_rate": 1.1361340978897483e-06, + "loss": -0.0, + "num_tokens": 6787648.0, + "reward": 2.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 310.875, + "completions/mean_terminated_length": 310.875, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "entropy": 0.02888157172128558, + "epoch": 0.8635437881873728, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.125182516606823e-06, + "loss": 0.0, + "num_tokens": 6794759.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 186.75, + "completions/mean_terminated_length": 186.75, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.03162820101715624, + "epoch": 0.8642226748133062, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.1142808268990691e-06, + "loss": 0.0, + "num_tokens": 6800029.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 194.0, + "completions/max_terminated_length": 194.0, + "completions/mean_length": 151.375, + "completions/mean_terminated_length": 151.375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.05688911909237504, + "epoch": 0.8649015614392397, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.8125, + "learning_rate": 1.1034290900525279e-06, + "loss": -0.0, + "num_tokens": 6804680.0, + "reward": 2.0, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 104.0, + "completions/max_terminated_length": 104.0, + "completions/mean_length": 98.75, + "completions/mean_terminated_length": 98.75, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.044836345594376326, + "epoch": 0.8655804480651731, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.0926273670724296e-06, + "loss": 0.0, + "num_tokens": 6808614.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 188.25, + "completions/mean_terminated_length": 188.25, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.07390343258157372, + "epoch": 0.8662593346911066, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.828125, + "learning_rate": 1.0818757186828388e-06, + "loss": 0.0, + "num_tokens": 6813712.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 1276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 181.5, + "completions/mean_terminated_length": 181.5, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.048651386983692646, + "epoch": 0.86693822131704, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.875, + "learning_rate": 1.0711742053263107e-06, + "loss": 0.0, + "num_tokens": 6818452.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 414.5, + "completions/mean_terminated_length": 414.5, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "entropy": 0.01869669440202415, + "epoch": 0.8676171079429735, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.0605228871635586e-06, + "loss": 0.0, + "num_tokens": 6827392.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 277.5, + "completions/mean_terminated_length": 277.5, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "entropy": 0.03080414840951562, + "epoch": 0.868295994568907, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "learning_rate": 1.0499218240731157e-06, + "loss": -0.0, + "num_tokens": 6834132.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 154.0, + "completions/max_terminated_length": 154.0, + "completions/mean_length": 136.625, + "completions/mean_terminated_length": 136.625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.07403090968728065, + "epoch": 0.8689748811948405, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.96875, + "learning_rate": 1.039371075650998e-06, + "loss": -0.0, + "num_tokens": 6838425.0, + "reward": 2.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 145.0, + "completions/max_terminated_length": 145.0, + "completions/mean_length": 122.5, + "completions/mean_terminated_length": 122.5, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.05667120357975364, + "epoch": 0.869653767820774, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.0288707012103593e-06, + "loss": 0.0, + "num_tokens": 6842677.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 157.0, + "completions/max_terminated_length": 157.0, + "completions/mean_length": 150.625, + "completions/mean_terminated_length": 150.625, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "entropy": 0.07893756218254566, + "epoch": 0.8703326544467074, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9375, + "learning_rate": 1.0184207597811724e-06, + "loss": -0.0, + "num_tokens": 6847298.0, + "reward": 2.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 158.0, + "completions/max_terminated_length": 158.0, + "completions/mean_length": 129.625, + "completions/mean_terminated_length": 129.625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.036208047065883875, + "epoch": 0.8710115410726409, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.9375, + "learning_rate": 1.0080213101098891e-06, + "loss": -0.0, + "num_tokens": 6851495.0, + "reward": 2.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 58.5, + "completions/mean_terminated_length": 58.5, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "entropy": 0.018495872616767883, + "epoch": 0.8716904276985743, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.09375, + "learning_rate": 9.976724106591128e-07, + "loss": 0.0, + "num_tokens": 6855003.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 311.5, + "completions/mean_terminated_length": 311.5, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "entropy": 0.04359985748305917, + "epoch": 0.8723693143245078, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "learning_rate": 9.873741196072683e-07, + "loss": -0.0, + "num_tokens": 6862047.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 265.25, + "completions/mean_terminated_length": 265.25, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "entropy": 0.04085562261752784, + "epoch": 0.8730482009504412, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 9.771264948482695e-07, + "loss": 0.0, + "num_tokens": 6868785.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 178.375, + "completions/mean_terminated_length": 178.375, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.07690457534044981, + "epoch": 0.8737270875763747, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.921875, + "learning_rate": 9.669295939912106e-07, + "loss": 0.0, + "num_tokens": 6873588.0, + "reward": 2.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 159.0, + "completions/max_terminated_length": 159.0, + "completions/mean_length": 156.125, + "completions/mean_terminated_length": 156.125, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "entropy": 0.05118082510307431, + "epoch": 0.8744059742023083, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 9.567834743600202e-07, + "loss": 0.0, + "num_tokens": 6878277.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 166.0, + "completions/max_terminated_length": 166.0, + "completions/mean_length": 144.25, + "completions/mean_terminated_length": 144.25, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.07870891969650984, + "epoch": 0.8750848608282417, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 9.466881929931582e-07, + "loss": 0.0, + "num_tokens": 6882663.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 190.5, + "completions/mean_terminated_length": 190.5, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "entropy": 0.05558400508016348, + "epoch": 0.8757637474541752, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.125, + "learning_rate": 9.366438066432804e-07, + "loss": 0.0, + "num_tokens": 6887555.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 195.0, + "completions/max_terminated_length": 195.0, + "completions/mean_length": 177.125, + "completions/mean_terminated_length": 177.125, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "entropy": 0.10110961738973856, + "epoch": 0.8764426340801086, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.40625, + "learning_rate": 9.266503717769315e-07, + "loss": -0.0, + "num_tokens": 6892116.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 257.375, + "completions/mean_terminated_length": 257.375, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "entropy": 0.025766295846551657, + "epoch": 0.8771215207060421, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 9.167079445742188e-07, + "loss": 0.0, + "num_tokens": 6898431.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 142.0, + "completions/max_terminated_length": 142.0, + "completions/mean_length": 81.125, + "completions/mean_terminated_length": 81.125, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.043448422104120255, + "epoch": 0.8778004073319755, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 9.06816580928499e-07, + "loss": 0.0, + "num_tokens": 6902160.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 130.0, + "completions/max_terminated_length": 130.0, + "completions/mean_length": 127.75, + "completions/mean_terminated_length": 127.75, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.03499178518541157, + "epoch": 0.878479293957909, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 8.969763364460682e-07, + "loss": 0.0, + "num_tokens": 6906350.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 88.0, + "completions/max_terminated_length": 88.0, + "completions/mean_length": 72.5, + "completions/mean_terminated_length": 72.5, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.03780035534873605, + "epoch": 0.8791581805838425, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.25, + "learning_rate": 8.871872664458459e-07, + "loss": -0.0, + "num_tokens": 6910074.0, + "reward": 2.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 134.0, + "completions/max_terminated_length": 134.0, + "completions/mean_length": 93.875, + "completions/mean_terminated_length": 93.875, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.03398936497978866, + "epoch": 0.879837067209776, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 8.774494259590594e-07, + "loss": 0.0, + "num_tokens": 6913929.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 239.375, + "completions/mean_terminated_length": 239.375, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.031143828528001904, + "epoch": 0.8805159538357095, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984375, + "learning_rate": 8.677628697289408e-07, + "loss": -0.0, + "num_tokens": 6919980.0, + "reward": 2.8214285373687744, + "reward_std": 0.3642157018184662, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9464285373687744, + "rewards/fixed_code_pass_all_test_reward/std": 0.15152287483215332, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 112.0, + "completions/max_terminated_length": 112.0, + "completions/mean_length": 84.875, + "completions/mean_terminated_length": 84.875, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.034618568839505315, + "epoch": 0.8811948404616429, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 8.581276522104198e-07, + "loss": 0.0, + "num_tokens": 6923619.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 172.0, + "completions/max_terminated_length": 172.0, + "completions/mean_length": 150.25, + "completions/mean_terminated_length": 150.25, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.046522431541234255, + "epoch": 0.8818737270875764, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.890625, + "learning_rate": 8.485438275698154e-07, + "loss": 0.0, + "num_tokens": 6927901.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 158.0, + "completions/max_terminated_length": 158.0, + "completions/mean_length": 147.375, + "completions/mean_terminated_length": 147.375, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.0452120965346694, + "epoch": 0.8825526137135098, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.03125, + "learning_rate": 8.39011449684527e-07, + "loss": -0.0, + "num_tokens": 6932240.0, + "reward": 2.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 138.0, + "completions/max_terminated_length": 138.0, + "completions/mean_length": 117.75, + "completions/mean_terminated_length": 117.75, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.0458712843246758, + "epoch": 0.8832315003394433, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 8.295305721427371e-07, + "loss": 0.0, + "num_tokens": 6936238.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 107.0, + "completions/max_terminated_length": 107.0, + "completions/mean_length": 96.5, + "completions/mean_terminated_length": 96.5, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.02186664822511375, + "epoch": 0.8839103869653768, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 8.201012482431125e-07, + "loss": 0.0, + "num_tokens": 6940290.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 247.375, + "completions/mean_terminated_length": 247.375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "entropy": 0.07473768247291446, + "epoch": 0.8845892735913102, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.71875, + "learning_rate": 8.10723530994496e-07, + "loss": 0.0, + "num_tokens": 6945725.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 165.0, + "completions/max_terminated_length": 165.0, + "completions/mean_length": 130.375, + "completions/mean_terminated_length": 130.375, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.07881530933082104, + "epoch": 0.8852681602172437, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 8.01397473115616e-07, + "loss": 0.0, + "num_tokens": 6949904.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 179.0, + "completions/max_terminated_length": 179.0, + "completions/mean_length": 159.625, + "completions/mean_terminated_length": 159.625, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.05573853477835655, + "epoch": 0.8859470468431772, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.171875, + "learning_rate": 7.921231270347851e-07, + "loss": 0.0, + "num_tokens": 6954277.0, + "reward": 2.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 1305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 203.625, + "completions/mean_terminated_length": 203.625, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "entropy": 0.07590067246928811, + "epoch": 0.8866259334691107, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.546875, + "learning_rate": 7.82900544889612e-07, + "loss": -0.0, + "num_tokens": 6959426.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 202.375, + "completions/mean_terminated_length": 202.375, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "entropy": 0.05649718875065446, + "epoch": 0.8873048200950441, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.140625, + "learning_rate": 7.737297785266995e-07, + "loss": -0.0, + "num_tokens": 6964973.0, + "reward": 2.3499999046325684, + "reward_std": 0.47509393095970154, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7250000238418579, + "rewards/fixed_code_pass_all_test_reward/std": 0.14880476891994476, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 1307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 164.0, + "completions/max_terminated_length": 164.0, + "completions/mean_length": 145.375, + "completions/mean_terminated_length": 145.375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.024505326058715582, + "epoch": 0.8879837067209776, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 7.646108795013563e-07, + "loss": 0.0, + "num_tokens": 6969768.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 205.0, + "completions/max_terminated_length": 205.0, + "completions/mean_length": 201.875, + "completions/mean_terminated_length": 201.875, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "entropy": 0.025752554647624493, + "epoch": 0.8886625933469111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 7.555438990773134e-07, + "loss": 0.0, + "num_tokens": 6975335.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 173.0, + "completions/max_terminated_length": 173.0, + "completions/mean_length": 141.25, + "completions/mean_terminated_length": 141.25, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.0454097636975348, + "epoch": 0.8893414799728445, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "learning_rate": 7.46528888226431e-07, + "loss": 0.0, + "num_tokens": 6979681.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 161.0, + "completions/max_terminated_length": 161.0, + "completions/mean_length": 130.375, + "completions/mean_terminated_length": 130.375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.07535605737939477, + "epoch": 0.890020366598778, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.4375, + "learning_rate": 7.375658976284073e-07, + "loss": -0.0, + "num_tokens": 6983980.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 311.75, + "completions/mean_terminated_length": 311.75, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "entropy": 0.04017899348400533, + "epoch": 0.8906992532247114, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "learning_rate": 7.286549776704987e-07, + "loss": -0.0, + "num_tokens": 6991290.0, + "reward": 1.7857142686843872, + "reward_std": 0.3581618368625641, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7857142686843872, + "rewards/fixed_code_pass_all_test_reward/std": 0.3581618070602417, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 176.125, + "completions/mean_terminated_length": 176.125, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "entropy": 0.07340986281633377, + "epoch": 0.891378139850645, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "learning_rate": 7.197961784472396e-07, + "loss": -0.0, + "num_tokens": 6996083.0, + "reward": 2.0, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 143.0, + "completions/max_terminated_length": 143.0, + "completions/mean_length": 127.625, + "completions/mean_terminated_length": 127.625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.06486464524641633, + "epoch": 0.8920570264765784, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.390625, + "learning_rate": 7.109895497601571e-07, + "loss": 0.0, + "num_tokens": 7000312.0, + "reward": 2.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 1314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 242.5, + "completions/mean_terminated_length": 242.5, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 0.0960701210424304, + "epoch": 0.8927359131025119, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.828125, + "learning_rate": 7.022351411174866e-07, + "loss": -0.0, + "num_tokens": 7005964.0, + "reward": 2.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 135.0, + "completions/max_terminated_length": 135.0, + "completions/mean_length": 120.0, + "completions/mean_terminated_length": 120.0, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.05812152964062989, + "epoch": 0.8934147997284454, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 6.935330017339003e-07, + "loss": 0.0, + "num_tokens": 7010124.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 260.875, + "completions/mean_terminated_length": 260.875, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "entropy": 0.020657073939219117, + "epoch": 0.8940936863543788, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 6.848831805302314e-07, + "loss": 0.0, + "num_tokens": 7016587.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 153.0, + "completions/max_terminated_length": 153.0, + "completions/mean_length": 116.125, + "completions/mean_terminated_length": 116.125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.0774011854082346, + "epoch": 0.8947725729803123, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.890625, + "learning_rate": 6.762857261331901e-07, + "loss": 0.0, + "num_tokens": 7021092.0, + "reward": 2.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 1318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 319.25, + "completions/mean_terminated_length": 319.25, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.06596643687225878, + "epoch": 0.8954514596062457, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 6.677406868751013e-07, + "loss": 0.0, + "num_tokens": 7028726.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 168.0, + "completions/max_terminated_length": 168.0, + "completions/mean_length": 154.5, + "completions/mean_terminated_length": 154.5, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.042495643720030785, + "epoch": 0.8961303462321792, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.96875, + "learning_rate": 6.592481107936243e-07, + "loss": 0.0, + "num_tokens": 7033154.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 167.0, + "completions/max_terminated_length": 167.0, + "completions/mean_length": 160.875, + "completions/mean_terminated_length": 160.875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.05387642700225115, + "epoch": 0.8968092328581126, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 6.50808045631488e-07, + "loss": 0.0, + "num_tokens": 7037681.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 288.5, + "completions/mean_terminated_length": 288.5, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "entropy": 0.03018986596725881, + "epoch": 0.8974881194840462, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "learning_rate": 6.424205388362203e-07, + "loss": -0.0, + "num_tokens": 7044757.0, + "reward": 2.6666665077209473, + "reward_std": 0.4714045226573944, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 1322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 242.0, + "completions/mean_terminated_length": 242.0, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "entropy": 0.031667630886659026, + "epoch": 0.8981670061099797, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "learning_rate": 6.340856375598781e-07, + "loss": -0.0, + "num_tokens": 7051141.0, + "reward": 2.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 280.5, + "completions/mean_terminated_length": 280.5, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "entropy": 0.032366921193897724, + "epoch": 0.8988458927359131, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.375, + "learning_rate": 6.258033886587911e-07, + "loss": 0.0, + "num_tokens": 7058505.0, + "reward": 2.90625, + "reward_std": 0.2651650309562683, + "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, + "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 126.0, + "completions/max_terminated_length": 126.0, + "completions/mean_length": 113.875, + "completions/mean_terminated_length": 113.875, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.046867894008755684, + "epoch": 0.8995247793618466, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 6.175738386932917e-07, + "loss": 0.0, + "num_tokens": 7062512.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 163.0, + "completions/max_terminated_length": 163.0, + "completions/mean_length": 132.0, + "completions/mean_terminated_length": 132.0, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.06957761477679014, + "epoch": 0.90020366598778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 6.093970339274513e-07, + "loss": 0.0, + "num_tokens": 7066800.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 184.0, + "completions/mean_terminated_length": 184.0, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 0.06062587955966592, + "epoch": 0.9008825526137135, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 6.012730203288286e-07, + "loss": 0.0, + "num_tokens": 7071536.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 178.0, + "completions/max_terminated_length": 178.0, + "completions/mean_length": 160.25, + "completions/mean_terminated_length": 160.25, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.06407987233251333, + "epoch": 0.9015614392396469, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.390625, + "learning_rate": 5.932018435681985e-07, + "loss": 0.0, + "num_tokens": 7076170.0, + "reward": 2.125, + "reward_std": 0.9910312294960022, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 138.0, + "completions/mean_terminated_length": 138.0, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.08480401104316115, + "epoch": 0.9022403258655805, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3125, + "learning_rate": 5.851835490193136e-07, + "loss": 0.0, + "num_tokens": 7080770.0, + "reward": 2.0, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 182.0, + "completions/max_terminated_length": 182.0, + "completions/mean_length": 163.625, + "completions/mean_terminated_length": 163.625, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.042670343071222305, + "epoch": 0.902919212491514, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "learning_rate": 5.772181817586309e-07, + "loss": 0.0, + "num_tokens": 7085543.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 219.0, + "completions/max_terminated_length": 219.0, + "completions/mean_length": 180.875, + "completions/mean_terminated_length": 180.875, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.03924781363457441, + "epoch": 0.9035980991174474, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 5.693057865650676e-07, + "loss": 0.0, + "num_tokens": 7090550.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 124.0, + "completions/max_terminated_length": 124.0, + "completions/mean_length": 103.5, + "completions/mean_terminated_length": 103.5, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.04539029020816088, + "epoch": 0.9042769857433809, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 5.614464079197457e-07, + "loss": 0.0, + "num_tokens": 7094538.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 148.0, + "completions/max_terminated_length": 148.0, + "completions/mean_length": 144.0, + "completions/mean_terminated_length": 144.0, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.0610105381347239, + "epoch": 0.9049558723693143, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 5.536400900057493e-07, + "loss": 0.0, + "num_tokens": 7098786.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 145.375, + "completions/mean_terminated_length": 145.375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.04104882990941405, + "epoch": 0.9056347589952478, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 5.458868767078673e-07, + "loss": 0.0, + "num_tokens": 7103029.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 160.0, + "completions/max_terminated_length": 160.0, + "completions/mean_length": 147.0, + "completions/mean_terminated_length": 147.0, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.05672460934147239, + "epoch": 0.9063136456211812, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.25, + "learning_rate": 5.381868116123512e-07, + "loss": 0.0, + "num_tokens": 7107349.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 185.625, + "completions/mean_terminated_length": 185.625, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.0988643690943718, + "epoch": 0.9069925322471147, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.25, + "learning_rate": 5.305399380066656e-07, + "loss": 0.0, + "num_tokens": 7112402.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 196.25, + "completions/mean_terminated_length": 196.25, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.04725077305920422, + "epoch": 0.9076714188730483, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 5.229462988792566e-07, + "loss": 0.0, + "num_tokens": 7117836.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 133.0, + "completions/max_terminated_length": 133.0, + "completions/mean_length": 107.5, + "completions/mean_terminated_length": 107.5, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.04917873814702034, + "epoch": 0.9083503054989817, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.53125, + "learning_rate": 5.154059369192932e-07, + "loss": 0.0, + "num_tokens": 7121888.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 167.0, + "completions/max_terminated_length": 167.0, + "completions/mean_length": 138.875, + "completions/mean_terminated_length": 138.875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.028185414150357246, + "epoch": 0.9090291921249152, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 5.079188945164426e-07, + "loss": 0.0, + "num_tokens": 7126591.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 187.625, + "completions/mean_terminated_length": 187.625, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.0862139780074358, + "epoch": 0.9097080787508486, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.390625, + "learning_rate": 5.004852137606198e-07, + "loss": 0.0, + "num_tokens": 7131364.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 215.375, + "completions/mean_terminated_length": 215.375, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "entropy": 0.026261223945766687, + "epoch": 0.9103869653767821, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 4.931049364417628e-07, + "loss": 0.0, + "num_tokens": 7137103.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 217.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 190.625, + "completions/mean_terminated_length": 190.625, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "entropy": 0.0418059267103672, + "epoch": 0.9110658520027155, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "learning_rate": 4.857781040495857e-07, + "loss": -0.0, + "num_tokens": 7142012.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 191.5, + "completions/mean_terminated_length": 191.5, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.09205880155786872, + "epoch": 0.911744738628649, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.765625, + "learning_rate": 4.785047577733515e-07, + "loss": -0.0, + "num_tokens": 7146984.0, + "reward": 2.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 335.0, + "completions/mean_terminated_length": 335.0, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "entropy": 0.011861839797347784, + "epoch": 0.9124236252545825, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.265625, + "learning_rate": 4.7128493850164715e-07, + "loss": -0.0, + "num_tokens": 7154784.0, + "reward": 2.920454502105713, + "reward_std": 0.03214118629693985, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9204546213150024, + "rewards/fixed_code_pass_all_test_reward/std": 0.03214120864868164, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 226.625, + "completions/mean_terminated_length": 226.625, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "entropy": 0.04227684112265706, + "epoch": 0.9131025118805159, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 4.6411868682213923e-07, + "loss": 0.0, + "num_tokens": 7160629.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 131.0, + "completions/max_terminated_length": 131.0, + "completions/mean_length": 103.75, + "completions/mean_terminated_length": 103.75, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.06012857146561146, + "epoch": 0.9137813985064495, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 4.5700604302135633e-07, + "loss": 0.0, + "num_tokens": 7164603.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 157.0, + "completions/max_terminated_length": 157.0, + "completions/mean_length": 117.25, + "completions/mean_terminated_length": 117.25, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.049112992361187935, + "epoch": 0.9144602851323829, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 4.4994704708445804e-07, + "loss": 0.0, + "num_tokens": 7168949.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 318.25, + "completions/mean_terminated_length": 318.25, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.0391323184594512, + "epoch": 0.9151391717583164, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 4.42941738695013e-07, + "loss": 0.0, + "num_tokens": 7176591.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 241.5, + "completions/mean_terminated_length": 241.5, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "entropy": 0.0470014913007617, + "epoch": 0.9158180583842498, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 4.359901572347758e-07, + "loss": 0.0, + "num_tokens": 7182963.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 330.0, + "completions/mean_terminated_length": 330.0, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "entropy": 0.034591716481372714, + "epoch": 0.9164969450101833, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 4.290923417834625e-07, + "loss": 0.0, + "num_tokens": 7190667.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 189.5, + "completions/mean_terminated_length": 189.5, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.0638595069758594, + "epoch": 0.9171758316361168, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "learning_rate": 4.222483311185299e-07, + "loss": 0.0, + "num_tokens": 7195423.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 197.0, + "completions/max_terminated_length": 197.0, + "completions/mean_length": 171.875, + "completions/mean_terminated_length": 171.875, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.047411042265594006, + "epoch": 0.9178547182620502, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "learning_rate": 4.1545816371496685e-07, + "loss": 0.0, + "num_tokens": 7200046.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 244.5, + "completions/mean_terminated_length": 244.5, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "entropy": 0.03261782415211201, + "epoch": 0.9185336048879837, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 4.087218777450652e-07, + "loss": 0.0, + "num_tokens": 7205218.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 172.0, + "completions/max_terminated_length": 172.0, + "completions/mean_length": 138.625, + "completions/mean_terminated_length": 138.625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.07520277053117752, + "epoch": 0.9192124915139172, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.59375, + "learning_rate": 4.02039511078216e-07, + "loss": 0.0, + "num_tokens": 7209591.0, + "reward": 2.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 1354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 186.0, + "completions/max_terminated_length": 186.0, + "completions/mean_length": 166.875, + "completions/mean_terminated_length": 166.875, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "entropy": 0.046233424451202154, + "epoch": 0.9198913781398507, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.954111012806894e-07, + "loss": 0.0, + "num_tokens": 7214358.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 142.0, + "completions/max_terminated_length": 142.0, + "completions/mean_length": 113.0, + "completions/mean_terminated_length": 113.0, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.05395610770210624, + "epoch": 0.9205702647657841, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.8883668561542907e-07, + "loss": 0.0, + "num_tokens": 7218390.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 228.875, + "completions/mean_terminated_length": 228.875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.034961943863891065, + "epoch": 0.9212491513917176, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "learning_rate": 3.8231630104183514e-07, + "loss": 0.0, + "num_tokens": 7223485.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 143.0, + "completions/max_terminated_length": 143.0, + "completions/mean_length": 114.5, + "completions/mean_terminated_length": 114.5, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.045125515665858984, + "epoch": 0.921928038017651, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.7584998421556387e-07, + "loss": 0.0, + "num_tokens": 7227665.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 166.0, + "completions/max_terminated_length": 166.0, + "completions/mean_length": 134.625, + "completions/mean_terminated_length": 134.625, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.10880998056381941, + "epoch": 0.9226069246435845, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.6943777148831907e-07, + "loss": 0.0, + "num_tokens": 7231854.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 173.0, + "completions/max_terminated_length": 173.0, + "completions/mean_length": 168.125, + "completions/mean_terminated_length": 168.125, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.06575705483555794, + "epoch": 0.923285811269518, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.140625, + "learning_rate": 3.6307969890764907e-07, + "loss": 0.0, + "num_tokens": 7236687.0, + "reward": 2.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 1360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 149.0, + "completions/max_terminated_length": 149.0, + "completions/mean_length": 140.375, + "completions/mean_terminated_length": 140.375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.06530605629086494, + "epoch": 0.9239646978954514, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.567758022167378e-07, + "loss": 0.0, + "num_tokens": 7240930.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 218.0, + "completions/max_terminated_length": 218.0, + "completions/mean_length": 205.375, + "completions/mean_terminated_length": 205.375, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "entropy": 0.04267252469435334, + "epoch": 0.924643584521385, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.505261168542107e-07, + "loss": 0.0, + "num_tokens": 7246581.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 137.0, + "completions/max_terminated_length": 137.0, + "completions/mean_length": 109.625, + "completions/mean_terminated_length": 109.625, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.06701449351385236, + "epoch": 0.9253224711473184, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.296875, + "learning_rate": 3.443306779539335e-07, + "loss": 0.0, + "num_tokens": 7250562.0, + "reward": 2.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 1363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 170.0, + "completions/max_terminated_length": 170.0, + "completions/mean_length": 137.625, + "completions/mean_terminated_length": 137.625, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.053723922930657864, + "epoch": 0.9260013577732519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.381895203448182e-07, + "loss": 0.0, + "num_tokens": 7254815.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 185.0, + "completions/mean_terminated_length": 185.0, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.0972116100601852, + "epoch": 0.9266802443991853, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.046875, + "learning_rate": 3.321026785506165e-07, + "loss": 0.0, + "num_tokens": 7259479.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 194.0, + "completions/mean_terminated_length": 194.0, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "entropy": 0.04474888020195067, + "epoch": 0.9273591310251188, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.2607018678973646e-07, + "loss": 0.0, + "num_tokens": 7265047.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 154.0, + "completions/max_terminated_length": 154.0, + "completions/mean_length": 144.75, + "completions/mean_terminated_length": 144.75, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.07023880514316261, + "epoch": 0.9280380176510523, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.2009207897504945e-07, + "loss": 0.0, + "num_tokens": 7269637.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 322.125, + "completions/mean_terminated_length": 322.125, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "entropy": 0.05936228530481458, + "epoch": 0.9287169042769857, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.1416838871368925e-07, + "loss": 0.0, + "num_tokens": 7277262.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 155.0, + "completions/max_terminated_length": 155.0, + "completions/mean_length": 147.5, + "completions/mean_terminated_length": 147.5, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.08179679419845343, + "epoch": 0.9293957909029192, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.0829914930687767e-07, + "loss": 0.0, + "num_tokens": 7281554.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 191.0, + "completions/max_terminated_length": 191.0, + "completions/mean_length": 165.0, + "completions/mean_terminated_length": 165.0, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.07411248236894608, + "epoch": 0.9300746775288526, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.024843937497246e-07, + "loss": 0.0, + "num_tokens": 7285914.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 299.25, + "completions/mean_terminated_length": 299.25, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "entropy": 0.032543769804760814, + "epoch": 0.9307535641547862, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.9672415473105286e-07, + "loss": 0.0, + "num_tokens": 7292972.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 165.625, + "completions/mean_terminated_length": 165.625, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.02955157309770584, + "epoch": 0.9314324507807196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.9101846463320483e-07, + "loss": 0.0, + "num_tokens": 7298065.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 217.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 173.0, + "completions/mean_terminated_length": 173.0, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.08593884389847517, + "epoch": 0.9321113374066531, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3125, + "learning_rate": 2.8536735553186814e-07, + "loss": -0.0, + "num_tokens": 7302577.0, + "reward": 2.375, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, + "step": 1373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 236.125, + "completions/mean_terminated_length": 236.125, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "entropy": 0.03686584485694766, + "epoch": 0.9327902240325866, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.7977085919589253e-07, + "loss": 0.0, + "num_tokens": 7308554.0, + "reward": 1.75, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 392.125, + "completions/mean_terminated_length": 392.125, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "entropy": 0.04892592295072973, + "epoch": 0.93346911065852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.7422900708710896e-07, + "loss": 0.0, + "num_tokens": 7317035.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 227.75, + "completions/mean_terminated_length": 227.75, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "entropy": 0.01983888167887926, + "epoch": 0.9341479972844535, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.687418303601563e-07, + "loss": 0.0, + "num_tokens": 7323089.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 167.0, + "completions/max_terminated_length": 167.0, + "completions/mean_length": 143.0, + "completions/mean_terminated_length": 143.0, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.05106785940006375, + "epoch": 0.9348268839103869, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.633093598623049e-07, + "loss": 0.0, + "num_tokens": 7327449.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 238.0, + "completions/max_terminated_length": 238.0, + "completions/mean_length": 196.0, + "completions/mean_terminated_length": 196.0, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "entropy": 0.07594262389466166, + "epoch": 0.9355057705363204, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.5793162613328095e-07, + "loss": 0.0, + "num_tokens": 7332977.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 115.0, + "completions/max_terminated_length": 115.0, + "completions/mean_length": 94.75, + "completions/mean_terminated_length": 94.75, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.024748916970565915, + "epoch": 0.9361846571622539, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.5260865940510027e-07, + "loss": 0.0, + "num_tokens": 7336751.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 268.5, + "completions/mean_terminated_length": 268.5, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "entropy": 0.07646798528730869, + "epoch": 0.9368635437881874, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "learning_rate": 2.4734048960189385e-07, + "loss": 0.0, + "num_tokens": 7342579.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 139.0, + "completions/max_terminated_length": 139.0, + "completions/mean_length": 109.25, + "completions/mean_terminated_length": 109.25, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.03955674055032432, + "epoch": 0.9375424304141209, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.34375, + "learning_rate": 2.421271463397368e-07, + "loss": 0.0, + "num_tokens": 7346805.0, + "reward": 2.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 1381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 181.0, + "completions/max_terminated_length": 181.0, + "completions/mean_length": 150.5, + "completions/mean_terminated_length": 150.5, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.04381275549530983, + "epoch": 0.9382213170400543, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.369686589264919e-07, + "loss": 0.0, + "num_tokens": 7351377.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 152.75, + "completions/mean_terminated_length": 152.75, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.11116392724215984, + "epoch": 0.9389002036659878, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.28125, + "learning_rate": 2.3186505636163316e-07, + "loss": -0.0, + "num_tokens": 7356087.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 242.125, + "completions/mean_terminated_length": 242.125, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "entropy": 0.04105406999588013, + "epoch": 0.9395790902919212, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.2681636733609457e-07, + "loss": 0.0, + "num_tokens": 7361984.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 332.25, + "completions/mean_terminated_length": 332.25, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "entropy": 0.03562983754090965, + "epoch": 0.9402579769178547, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "learning_rate": 2.2182262023209612e-07, + "loss": -0.0, + "num_tokens": 7369890.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 205.0, + "completions/max_terminated_length": 205.0, + "completions/mean_length": 171.125, + "completions/mean_terminated_length": 171.125, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.04823680454865098, + "epoch": 0.9409368635437881, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.168838431229958e-07, + "loss": 0.0, + "num_tokens": 7374659.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 236.625, + "completions/mean_terminated_length": 236.625, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "entropy": 0.03606553701683879, + "epoch": 0.9416157501697217, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.1200006377312232e-07, + "loss": 0.0, + "num_tokens": 7381224.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 229.0, + "completions/max_terminated_length": 229.0, + "completions/mean_length": 212.375, + "completions/mean_terminated_length": 212.375, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.027940819272771478, + "epoch": 0.9422946367956552, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.071713096376271e-07, + "loss": 0.0, + "num_tokens": 7387115.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 183.0, + "completions/max_terminated_length": 183.0, + "completions/mean_length": 149.25, + "completions/mean_terminated_length": 149.25, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.09008469432592392, + "epoch": 0.9429735234215886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.0239760786232355e-07, + "loss": 0.0, + "num_tokens": 7391661.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 190.0, + "completions/max_terminated_length": 190.0, + "completions/mean_length": 139.5, + "completions/mean_terminated_length": 139.5, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.03246856899932027, + "epoch": 0.9436524100475221, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.109375, + "learning_rate": 1.9767898528353923e-07, + "loss": 0.0, + "num_tokens": 7396529.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 242.25, + "completions/mean_terminated_length": 242.25, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "entropy": 0.03269170830026269, + "epoch": 0.9443312966734555, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.930154684279628e-07, + "loss": 0.0, + "num_tokens": 7402747.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 225.0, + "completions/max_terminated_length": 225.0, + "completions/mean_length": 183.125, + "completions/mean_terminated_length": 183.125, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "entropy": 0.10386022925376892, + "epoch": 0.945010183299389, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "learning_rate": 1.8840708351249182e-07, + "loss": -0.0, + "num_tokens": 7407836.0, + "reward": 2.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 1392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.25, + "completions/mean_terminated_length": 270.25, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.030043622478842735, + "epoch": 0.9456890699253224, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "learning_rate": 1.8385385644409282e-07, + "loss": 0.0, + "num_tokens": 7414166.0, + "reward": 2.7708334922790527, + "reward_std": 0.2946277856826782, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7708333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.294627845287323, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 167.0, + "completions/max_terminated_length": 167.0, + "completions/mean_length": 147.125, + "completions/mean_terminated_length": 147.125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.06427620584145188, + "epoch": 0.9463679565512559, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.793558128196493e-07, + "loss": 0.0, + "num_tokens": 7418631.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 337.0, + "completions/mean_terminated_length": 337.0, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "entropy": 0.03572363778948784, + "epoch": 0.9470468431771895, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.7491297792581963e-07, + "loss": 0.0, + "num_tokens": 7426527.0, + "reward": 1.5714285373687744, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5714285969734192, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 163.625, + "completions/mean_terminated_length": 163.625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.06719027180224657, + "epoch": 0.9477257298031229, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.705253767388948e-07, + "loss": 0.0, + "num_tokens": 7431444.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 346.625, + "completions/mean_terminated_length": 346.625, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "entropy": 0.04964815126731992, + "epoch": 0.9484046164290564, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.661930339246609e-07, + "loss": 0.0, + "num_tokens": 7439361.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 278.375, + "completions/mean_terminated_length": 278.375, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "entropy": 0.05206569563597441, + "epoch": 0.9490835030549898, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.6191597383825473e-07, + "loss": 0.0, + "num_tokens": 7446452.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 166.0, + "completions/mean_terminated_length": 166.0, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "entropy": 0.03718523355200887, + "epoch": 0.9497623896809233, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.5769422052403172e-07, + "loss": 0.0, + "num_tokens": 7451196.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 161.0, + "completions/max_terminated_length": 161.0, + "completions/mean_length": 119.5, + "completions/mean_terminated_length": 119.5, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.06675209756940603, + "epoch": 0.9504412763068567, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.5352779771543037e-07, + "loss": 0.0, + "num_tokens": 7455328.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 210.0, + "completions/mean_terminated_length": 210.0, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.04172407649457455, + "epoch": 0.9511201629327902, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8046875, + "learning_rate": 1.494167288348347e-07, + "loss": -0.0, + "num_tokens": 7460832.0, + "reward": 2.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 1401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 205.0, + "completions/max_terminated_length": 205.0, + "completions/mean_length": 192.875, + "completions/mean_terminated_length": 192.875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.03628377616405487, + "epoch": 0.9517990495587237, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.4536103699344884e-07, + "loss": 0.0, + "num_tokens": 7466263.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 233.25, + "completions/mean_terminated_length": 233.25, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "entropy": 0.033826621947810054, + "epoch": 0.9524779361846571, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "learning_rate": 1.4136074499115914e-07, + "loss": -0.0, + "num_tokens": 7472449.0, + "reward": 2.0, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 155.0, + "completions/max_terminated_length": 155.0, + "completions/mean_length": 108.5, + "completions/mean_terminated_length": 108.5, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.046879804227501154, + "epoch": 0.9531568228105907, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.3741587531641566e-07, + "loss": 0.0, + "num_tokens": 7476541.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 139.0, + "completions/max_terminated_length": 139.0, + "completions/mean_length": 125.5, + "completions/mean_terminated_length": 125.5, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.050881276838481426, + "epoch": 0.9538357094365241, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.3352645014609756e-07, + "loss": 0.0, + "num_tokens": 7480777.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 145.0, + "completions/max_terminated_length": 145.0, + "completions/mean_length": 106.75, + "completions/mean_terminated_length": 106.75, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.04698456637561321, + "epoch": 0.9545145960624576, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.296924913453923e-07, + "loss": 0.0, + "num_tokens": 7484919.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 143.25, + "completions/mean_terminated_length": 143.25, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.035350932739675045, + "epoch": 0.955193482688391, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.609375, + "learning_rate": 1.259140204676712e-07, + "loss": 0.0, + "num_tokens": 7489881.0, + "reward": 2.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.625, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5175492167472839, + "step": 1407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 67.125, + "completions/mean_terminated_length": 67.125, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "entropy": 0.04363395366817713, + "epoch": 0.9558723693143245, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.3125, + "learning_rate": 1.2219105875437176e-07, + "loss": -0.0, + "num_tokens": 7493466.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, + "step": 1408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 273.0, + "completions/mean_terminated_length": 273.0, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "entropy": 0.032257709885016084, + "epoch": 0.956551255940258, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "learning_rate": 1.185236271348722e-07, + "loss": -0.0, + "num_tokens": 7500434.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 166.0, + "completions/max_terminated_length": 166.0, + "completions/mean_length": 154.375, + "completions/mean_terminated_length": 154.375, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.06615744018927217, + "epoch": 0.9572301425661914, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.234375, + "learning_rate": 1.1491174622637934e-07, + "loss": 0.0, + "num_tokens": 7504933.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.875, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 161.625, + "completions/mean_terminated_length": 161.625, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.06036748504266143, + "epoch": 0.957909029192125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.1135543633380764e-07, + "loss": 0.0, + "num_tokens": 7509322.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 234.0, + "completions/max_terminated_length": 234.0, + "completions/mean_length": 231.375, + "completions/mean_terminated_length": 231.375, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "entropy": 0.027164625702425838, + "epoch": 0.9585879158180584, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.0785471744967247e-07, + "loss": 0.0, + "num_tokens": 7515365.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 202.0, + "completions/max_terminated_length": 202.0, + "completions/mean_length": 183.75, + "completions/mean_terminated_length": 183.75, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "entropy": 0.0438750758767128, + "epoch": 0.9592668024439919, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.734375, + "learning_rate": 1.0440960925396925e-07, + "loss": 0.0, + "num_tokens": 7520043.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 274.125, + "completions/mean_terminated_length": 274.125, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "entropy": 0.09158069361001253, + "epoch": 0.9599456890699253, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.0102013111406905e-07, + "loss": 0.0, + "num_tokens": 7526316.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 123.875, + "completions/mean_terminated_length": 123.875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.05266101472079754, + "epoch": 0.9606245756958588, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.84375, + "learning_rate": 9.768630208460528e-08, + "loss": 0.0, + "num_tokens": 7530595.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 246.0, + "completions/max_terminated_length": 246.0, + "completions/mean_length": 197.75, + "completions/mean_terminated_length": 197.75, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.05456627393141389, + "epoch": 0.9613034623217923, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.859375, + "learning_rate": 9.440814090737049e-08, + "loss": 0.0, + "num_tokens": 7536369.0, + "reward": 1.821428656578064, + "reward_std": 0.10101527720689774, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8214285373687744, + "rewards/fixed_code_pass_all_test_reward/std": 0.10101524740457535, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.0, + "completions/max_terminated_length": 522.0, + "completions/mean_length": 465.25, + "completions/mean_terminated_length": 465.25, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "entropy": 0.020693805068731308, + "epoch": 0.9619823489477257, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.859375, + "learning_rate": 9.11856660112076e-08, + "loss": 0.0, + "num_tokens": 7545723.0, + "reward": 0.9285714626312256, + "reward_std": 0.2020305097103119, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9285714626312256, + "rewards/fixed_code_pass_all_test_reward/std": 0.2020305097103119, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 144.0, + "completions/max_terminated_length": 144.0, + "completions/mean_length": 111.875, + "completions/mean_terminated_length": 111.875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.07267671870067716, + "epoch": 0.9626612355736592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 8.80188955119099e-08, + "loss": 0.0, + "num_tokens": 7549778.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 189.0, + "completions/max_terminated_length": 189.0, + "completions/mean_length": 165.75, + "completions/mean_terminated_length": 165.75, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.05876992456614971, + "epoch": 0.9633401221995926, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.71875, + "learning_rate": 8.490784721211454e-08, + "loss": -0.0, + "num_tokens": 7554368.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 173.0, + "completions/max_terminated_length": 173.0, + "completions/mean_length": 140.875, + "completions/mean_terminated_length": 140.875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.055333992931991816, + "epoch": 0.9640190088255262, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 8.185253860120701e-08, + "loss": 0.0, + "num_tokens": 7558879.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 267.25, + "completions/mean_terminated_length": 267.25, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "entropy": 0.056854897644370794, + "epoch": 0.9646978954514596, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "learning_rate": 7.885298685522235e-08, + "loss": -0.0, + "num_tokens": 7566177.0, + "reward": 1.5178570747375488, + "reward_std": 0.07393556088209152, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5178571343421936, + "rewards/fixed_code_pass_all_test_reward/std": 0.0739356055855751, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 89.0, + "completions/max_terminated_length": 89.0, + "completions/mean_length": 75.375, + "completions/mean_terminated_length": 75.375, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "entropy": 0.06834522588178515, + "epoch": 0.9653767820773931, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 7.590920883674192e-08, + "loss": 0.0, + "num_tokens": 7570020.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 144.0, + "completions/max_terminated_length": 144.0, + "completions/mean_length": 129.125, + "completions/mean_terminated_length": 129.125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.05737362802028656, + "epoch": 0.9660556687033266, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 7.302122109481002e-08, + "loss": 0.0, + "num_tokens": 7574165.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 186.0, + "completions/max_terminated_length": 186.0, + "completions/mean_length": 149.875, + "completions/mean_terminated_length": 149.875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.039678098633885384, + "epoch": 0.96673455532926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 7.018903986483083e-08, + "loss": 0.0, + "num_tokens": 7578412.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 115.625, + "completions/mean_terminated_length": 115.625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.045340674463659525, + "epoch": 0.9674134419551935, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 6.741268106848164e-08, + "loss": 0.0, + "num_tokens": 7582609.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 114.0, + "completions/max_terminated_length": 114.0, + "completions/mean_length": 107.875, + "completions/mean_terminated_length": 107.875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.06596353463828564, + "epoch": 0.9680923285811269, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 6.469216031362302e-08, + "loss": 0.0, + "num_tokens": 7586576.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 151.0, + "completions/max_terminated_length": 151.0, + "completions/mean_length": 90.75, + "completions/mean_terminated_length": 90.75, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.04027901426889002, + "epoch": 0.9687712152070604, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 6.202749289420994e-08, + "loss": 0.0, + "num_tokens": 7590366.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 177.0, + "completions/max_terminated_length": 177.0, + "completions/mean_length": 160.125, + "completions/mean_terminated_length": 160.125, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.051211504731327295, + "epoch": 0.9694501018329938, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 5.9418693790209705e-08, + "loss": 0.0, + "num_tokens": 7595167.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 197.875, + "completions/mean_terminated_length": 197.875, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "entropy": 0.021212840685620904, + "epoch": 0.9701289884589274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 5.686577766751078e-08, + "loss": 0.0, + "num_tokens": 7601230.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.0, + "completions/max_terminated_length": 530.0, + "completions/mean_length": 399.75, + "completions/mean_terminated_length": 399.75, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "entropy": 0.0578672387637198, + "epoch": 0.9708078750848609, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "learning_rate": 5.4368758877845204e-08, + "loss": 0.0, + "num_tokens": 7609748.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 1430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 267.5, + "completions/mean_terminated_length": 267.5, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "entropy": 0.04076833697035909, + "epoch": 0.9714867617107943, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 5.192765145870748e-08, + "loss": 0.0, + "num_tokens": 7616600.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 149.0, + "completions/max_terminated_length": 149.0, + "completions/mean_length": 136.25, + "completions/mean_terminated_length": 136.25, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.05700752744451165, + "epoch": 0.9721656483367278, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 4.954246913327576e-08, + "loss": 0.0, + "num_tokens": 7620770.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 219.75, + "completions/mean_terminated_length": 219.75, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "entropy": 0.03856371785514057, + "epoch": 0.9728445349626612, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 4.721322531033301e-08, + "loss": 0.0, + "num_tokens": 7626480.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 151.0, + "completions/max_terminated_length": 151.0, + "completions/mean_length": 123.125, + "completions/mean_terminated_length": 123.125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.051664297003299, + "epoch": 0.9735234215885947, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 4.4939933084192646e-08, + "loss": 0.0, + "num_tokens": 7630817.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 80.0, + "completions/max_terminated_length": 80.0, + "completions/mean_length": 80.0, + "completions/mean_terminated_length": 80.0, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.018483178922906518, + "epoch": 0.9742023082145281, + "frac_reward_zero_std": 0.0, + "grad_norm": 12.9375, + "learning_rate": 4.2722605234625236e-08, + "loss": -0.0, + "num_tokens": 7634553.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 1435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 210.625, + "completions/mean_terminated_length": 210.625, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.03427370940335095, + "epoch": 0.9748811948404617, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 4.0561254226786365e-08, + "loss": 0.0, + "num_tokens": 7640390.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 244.625, + "completions/mean_terminated_length": 244.625, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "entropy": 0.029702861327677965, + "epoch": 0.9755600814663951, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "learning_rate": 3.845589221114554e-08, + "loss": 0.0, + "num_tokens": 7646963.0, + "reward": 1.9305555820465088, + "reward_std": 0.05750548839569092, + "rewards/fixed_code_pass_all_test_reward/mean": 0.930555522441864, + "rewards/fixed_code_pass_all_test_reward/std": 0.05750546231865883, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 250.375, + "completions/mean_terminated_length": 250.375, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "entropy": 0.021351289004087448, + "epoch": 0.9762389680923286, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.6406531023420735e-08, + "loss": 0.0, + "num_tokens": 7653374.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 163.5, + "completions/mean_terminated_length": 163.5, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.060278112068772316, + "epoch": 0.9769178547182621, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 3.4413182184507285e-08, + "loss": 0.0, + "num_tokens": 7658154.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 195.125, + "completions/mean_terminated_length": 195.125, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.10810685250908136, + "epoch": 0.9775967413441955, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "learning_rate": 3.24758569004191e-08, + "loss": 0.0, + "num_tokens": 7663355.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.5, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.5345224738121033, + "step": 1440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 189.0, + "completions/max_terminated_length": 189.0, + "completions/mean_length": 140.875, + "completions/mean_terminated_length": 140.875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.03728607762604952, + "epoch": 0.978275627970129, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "learning_rate": 3.0594566062219776e-08, + "loss": -0.0, + "num_tokens": 7668338.0, + "reward": 2.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 139.0, + "completions/max_terminated_length": 139.0, + "completions/mean_length": 133.375, + "completions/mean_terminated_length": 133.375, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.04085223888978362, + "epoch": 0.9789545145960624, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.296875, + "learning_rate": 2.8769320245966014e-08, + "loss": -0.0, + "num_tokens": 7672973.0, + "reward": 2.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.75, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 1442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 132.0, + "completions/max_terminated_length": 132.0, + "completions/mean_length": 113.75, + "completions/mean_terminated_length": 113.75, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.07143508875742555, + "epoch": 0.9796334012219959, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.7000129712643208e-08, + "loss": 0.0, + "num_tokens": 7677123.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 137.0, + "completions/max_terminated_length": 137.0, + "completions/mean_length": 117.25, + "completions/mean_terminated_length": 117.25, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.03794521884992719, + "epoch": 0.9803122878479293, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.528700440811438e-08, + "loss": 0.0, + "num_tokens": 7681637.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 190.0, + "completions/max_terminated_length": 190.0, + "completions/mean_length": 162.125, + "completions/mean_terminated_length": 162.125, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.04380248347297311, + "epoch": 0.9809911744738629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.3629953963058007e-08, + "loss": 0.0, + "num_tokens": 7686278.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 167.0, + "completions/max_terminated_length": 167.0, + "completions/mean_length": 137.25, + "completions/mean_terminated_length": 137.25, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.07558493688702583, + "epoch": 0.9816700610997964, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.25, + "learning_rate": 2.2028987692915836e-08, + "loss": -0.0, + "num_tokens": 7690592.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.125, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.3535533845424652, + "step": 1446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 176.0, + "completions/max_terminated_length": 176.0, + "completions/mean_length": 160.375, + "completions/mean_terminated_length": 160.375, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.05028433492407203, + "epoch": 0.9823489477257298, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.453125, + "learning_rate": 2.048411459784516e-08, + "loss": -0.0, + "num_tokens": 7695587.0, + "reward": 2.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.25, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.4629100561141968, + "step": 1447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 386.25, + "completions/mean_terminated_length": 386.25, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "entropy": 0.12325855065137148, + "epoch": 0.9830278343516633, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.8995343362658846e-08, + "loss": 0.0, + "num_tokens": 7704501.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 200.375, + "completions/mean_terminated_length": 200.375, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "entropy": 0.05211532860994339, + "epoch": 0.9837067209775967, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.7562682356786488e-08, + "loss": 0.0, + "num_tokens": 7709928.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 172.0, + "completions/max_terminated_length": 172.0, + "completions/mean_length": 147.5, + "completions/mean_terminated_length": 147.5, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.15142503194510937, + "epoch": 0.9843856076035302, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.618613963421889e-08, + "loss": 0.0, + "num_tokens": 7714228.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 233.625, + "completions/mean_terminated_length": 233.625, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "entropy": 0.10641594044864178, + "epoch": 0.9850644942294636, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.953125, + "learning_rate": 1.4865722933469218e-08, + "loss": 0.0, + "num_tokens": 7719873.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 186.625, + "completions/mean_terminated_length": 186.625, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "entropy": 0.0543354912661016, + "epoch": 0.9857433808553971, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8359375, + "learning_rate": 1.3601439677526363e-08, + "loss": -0.0, + "num_tokens": 7724758.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 109.0, + "completions/max_terminated_length": 109.0, + "completions/mean_length": 86.375, + "completions/mean_terminated_length": 86.375, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.048309564124792814, + "epoch": 0.9864222674813307, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.2393296973812751e-08, + "loss": 0.0, + "num_tokens": 7728489.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 198.125, + "completions/mean_terminated_length": 198.125, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.09527547471225262, + "epoch": 0.9871011541072641, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.40625, + "learning_rate": 1.1241301614147715e-08, + "loss": 0.0, + "num_tokens": 7733474.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 235.5, + "completions/mean_terminated_length": 235.5, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "entropy": 0.11967422068119049, + "epoch": 0.9877800407331976, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5, + "learning_rate": 1.0145460074703073e-08, + "loss": -0.0, + "num_tokens": 7738806.0, + "reward": 2.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 397.125, + "completions/mean_terminated_length": 397.125, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "entropy": 0.02805103559512645, + "epoch": 0.988458927359131, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 9.105778515974273e-09, + "loss": 0.0, + "num_tokens": 7747687.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 117.0, + "completions/max_terminated_length": 117.0, + "completions/mean_length": 115.625, + "completions/mean_terminated_length": 115.625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.031156501034274697, + "epoch": 0.9891378139850645, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 8.12226278274042e-09, + "loss": 0.0, + "num_tokens": 7752132.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 214.0, + "completions/max_terminated_length": 214.0, + "completions/mean_length": 197.0, + "completions/mean_terminated_length": 197.0, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "entropy": 0.06611996795982122, + "epoch": 0.9898167006109979, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 7.194918404033191e-09, + "loss": 0.0, + "num_tokens": 7757668.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 160.0, + "completions/max_terminated_length": 160.0, + "completions/mean_length": 144.75, + "completions/mean_terminated_length": 144.75, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.051881998777389526, + "epoch": 0.9904955872369314, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 6.323750593106859e-09, + "loss": 0.0, + "num_tokens": 7762074.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 163.875, + "completions/mean_terminated_length": 163.875, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.08997983951121569, + "epoch": 0.991174473862865, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4375, + "learning_rate": 5.508764247406096e-09, + "loss": -0.0, + "num_tokens": 7766897.0, + "reward": 2.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 236.75, + "completions/mean_terminated_length": 236.75, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "entropy": 0.0508587802760303, + "epoch": 0.9918533604887984, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.375, + "learning_rate": 4.749963948540437e-09, + "loss": 0.0, + "num_tokens": 7773279.0, + "reward": 2.8500001430511475, + "reward_std": 0.1414213627576828, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8499999642372131, + "rewards/fixed_code_pass_all_test_reward/std": 0.1414213478565216, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 169.75, + "completions/mean_terminated_length": 169.75, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "entropy": 0.10342225804924965, + "epoch": 0.9925322471147319, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.40625, + "learning_rate": 4.047353962259859e-09, + "loss": 0.0, + "num_tokens": 7777941.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 224.0, + "completions/max_terminated_length": 224.0, + "completions/mean_length": 165.75, + "completions/mean_terminated_length": 165.75, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.03919437597505748, + "epoch": 0.9932111337406653, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "learning_rate": 3.4009382384270206e-09, + "loss": -0.0, + "num_tokens": 7783147.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 139.0, + "completions/max_terminated_length": 139.0, + "completions/mean_length": 124.5, + "completions/mean_terminated_length": 124.5, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.05942561570554972, + "epoch": 0.9938900203665988, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.65625, + "learning_rate": 2.810720410998391e-09, + "loss": 0.0, + "num_tokens": 7787527.0, + "reward": 2.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 182.0, + "completions/max_terminated_length": 182.0, + "completions/mean_length": 161.625, + "completions/mean_terminated_length": 161.625, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "entropy": 0.05872875917702913, + "epoch": 0.9945689069925322, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.2767037980031548e-09, + "loss": 0.0, + "num_tokens": 7792116.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 171.0, + "completions/max_terminated_length": 171.0, + "completions/mean_length": 157.75, + "completions/mean_terminated_length": 157.75, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.06247218558564782, + "epoch": 0.9952477936184657, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.7988914015221182e-09, + "loss": 0.0, + "num_tokens": 7796626.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 206.0, + "completions/mean_terminated_length": 206.0, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "entropy": 0.0569450743496418, + "epoch": 0.9959266802443992, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.25, + "learning_rate": 1.3772859076754962e-09, + "loss": 0.0, + "num_tokens": 7802242.0, + "reward": 1.9375, + "reward_std": 0.08625822514295578, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, + "rewards/fixed_code_pass_all_test_reward/std": 0.08625820279121399, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 165.0, + "completions/max_terminated_length": 165.0, + "completions/mean_length": 162.625, + "completions/mean_terminated_length": 162.625, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.04403562285006046, + "epoch": 0.9966055668703326, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.0118896866018192e-09, + "loss": 0.0, + "num_tokens": 7806639.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 120.0, + "completions/max_terminated_length": 120.0, + "completions/mean_length": 112.75, + "completions/mean_terminated_length": 112.75, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.0495642744936049, + "epoch": 0.9972844534962662, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 7.027047924512698e-10, + "loss": 0.0, + "num_tokens": 7810885.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 180.75, + "completions/mean_terminated_length": 180.75, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.05805972497910261, + "epoch": 0.9979633401221996, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.40625, + "learning_rate": 4.497329633679215e-10, + "loss": -0.0, + "num_tokens": 7815819.0, + "reward": 2.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 234.0, + "completions/max_terminated_length": 234.0, + "completions/mean_length": 172.0, + "completions/mean_terminated_length": 172.0, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "entropy": 0.05528525682166219, + "epoch": 0.9986422267481331, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.529756214841861e-10, + "loss": 0.0, + "num_tokens": 7820563.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 136.0, + "completions/max_terminated_length": 136.0, + "completions/mean_length": 106.875, + "completions/mean_terminated_length": 106.875, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.053659217432141304, + "epoch": 0.9993211133740665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1.1243387291082208e-10, + "loss": 0.0, + "num_tokens": 7824490.0, + "reward": 3.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 158.0, + "completions/max_terminated_length": 158.0, + "completions/mean_length": 127.375, + "completions/mean_terminated_length": 127.375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.049180077854543924, + "epoch": 1.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 2.810850773249385e-11, + "loss": 0.0, + "num_tokens": 7828629.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/mean": 1.0, + "rewards/testcase_pass_groundtruth_and_kill_bug_reward/std": 0.0, + "step": 1473 + } + ], + "logging_steps": 1, + "max_steps": 1473, + "num_input_tokens_seen": 7828629, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}