{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2222222222222222, "eval_steps": 1000, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 246.3125, "epoch": 8.888888888888889e-05, "grad_norm": 60.320960998535156, "learning_rate": 2.5e-07, "loss": -10.229, "reward": 1.7395833730697632, "reward_std": 0.6432403922080994, "rewards/boxed_and_answer_tags_format_reward": 0.65625, "rewards/correctness_reward_func_math": 1.0833333432674408, "step": 1, "zero_std_ratio": 0.0 }, { "epoch": 0.00017777777777777779, "grad_norm": 71.0031509399414, "learning_rate": 5e-07, "loss": -9.5625, "step": 2 }, { "epoch": 0.0002666666666666667, "grad_norm": 61.95022964477539, "learning_rate": 7.5e-07, "loss": -16.2291, "step": 3 }, { "epoch": 0.00035555555555555557, "grad_norm": 67.81867980957031, "learning_rate": 1e-06, "loss": -11.0016, "step": 4 }, { "epoch": 0.00044444444444444447, "grad_norm": 57.108917236328125, "learning_rate": 1.25e-06, "loss": -6.1658, "step": 5 }, { "epoch": 0.0005333333333333334, "grad_norm": 72.85011291503906, "learning_rate": 1.5e-06, "loss": -4.0145, "step": 6 }, { "epoch": 0.0006222222222222223, "grad_norm": 59.103431701660156, "learning_rate": 1.7500000000000002e-06, "loss": -9.9488, "step": 7 }, { "epoch": 0.0007111111111111111, "grad_norm": 73.94007873535156, "learning_rate": 2e-06, "loss": -9.1809, "step": 8 }, { "epoch": 0.0008, "grad_norm": 62.28184509277344, "learning_rate": 2.25e-06, "loss": -16.4311, "step": 9 }, { "epoch": 0.0008888888888888889, "grad_norm": 68.46251678466797, "learning_rate": 2.5e-06, "loss": -11.402, "step": 10 }, { "epoch": 0.0009777777777777777, "grad_norm": 59.184749603271484, "learning_rate": 2.75e-06, "loss": -6.1384, "step": 11 }, { "epoch": 0.0010666666666666667, "grad_norm": 71.60365295410156, "learning_rate": 3e-06, "loss": -3.932, "step": 12 }, { "completion_length": 249.62500762939453, "epoch": 0.0011555555555555555, "grad_norm": 61.79197311401367, "learning_rate": 3e-06, "loss": -2.1748, "reward": 0.9791666865348816, "reward_std": 0.4510806053876877, "rewards/boxed_and_answer_tags_format_reward": 0.6458333134651184, "rewards/correctness_reward_func_math": 0.3333333283662796, "step": 13, "zero_std_ratio": 0.125 }, { "epoch": 0.0012444444444444445, "grad_norm": 55.82655334472656, "learning_rate": 3e-06, "loss": -3.8107, "step": 14 }, { "epoch": 0.0013333333333333333, "grad_norm": 58.3712158203125, "learning_rate": 3e-06, "loss": 4.085, "step": 15 }, { "epoch": 0.0014222222222222223, "grad_norm": 73.97306823730469, "learning_rate": 3e-06, "loss": -4.8915, "step": 16 }, { "epoch": 0.001511111111111111, "grad_norm": 51.51576232910156, "learning_rate": 3e-06, "loss": -0.161, "step": 17 }, { "epoch": 0.0016, "grad_norm": 72.59639739990234, "learning_rate": 3e-06, "loss": 3.0369, "step": 18 }, { "epoch": 0.0016888888888888889, "grad_norm": 48.86510467529297, "learning_rate": 3e-06, "loss": -2.3437, "step": 19 }, { "epoch": 0.0017777777777777779, "grad_norm": 55.3180046081543, "learning_rate": 3e-06, "loss": -4.3585, "step": 20 }, { "epoch": 0.0018666666666666666, "grad_norm": 54.75101089477539, "learning_rate": 3e-06, "loss": 3.9965, "step": 21 }, { "epoch": 0.0019555555555555554, "grad_norm": 75.27330017089844, "learning_rate": 3e-06, "loss": -4.9733, "step": 22 }, { "epoch": 0.0020444444444444447, "grad_norm": 51.991214752197266, "learning_rate": 3e-06, "loss": -0.3033, "step": 23 }, { "epoch": 0.0021333333333333334, "grad_norm": 54.72827911376953, "learning_rate": 3e-06, "loss": 2.5278, "step": 24 }, { "completion_length": 226.8125, "epoch": 0.0022222222222222222, "grad_norm": 58.25025939941406, "learning_rate": 3e-06, "loss": -6.7768, "reward": 1.6041666865348816, "reward_std": 0.6311438381671906, "rewards/boxed_and_answer_tags_format_reward": 0.6458333432674408, "rewards/correctness_reward_func_math": 0.9583333432674408, "step": 25, "zero_std_ratio": 0.0 }, { "epoch": 0.002311111111111111, "grad_norm": 68.62551879882812, "learning_rate": 3e-06, "loss": -4.0906, "step": 26 }, { "epoch": 0.0024, "grad_norm": 65.11053466796875, "learning_rate": 3e-06, "loss": -4.6172, "step": 27 }, { "epoch": 0.002488888888888889, "grad_norm": 76.28429412841797, "learning_rate": 3e-06, "loss": -7.9209, "step": 28 }, { "epoch": 0.002577777777777778, "grad_norm": 62.037696838378906, "learning_rate": 3e-06, "loss": -3.4414, "step": 29 }, { "epoch": 0.0026666666666666666, "grad_norm": 58.92220687866211, "learning_rate": 3e-06, "loss": -3.2836, "step": 30 }, { "epoch": 0.0027555555555555554, "grad_norm": 57.03800582885742, "learning_rate": 3e-06, "loss": -7.1747, "step": 31 }, { "epoch": 0.0028444444444444446, "grad_norm": 71.39422607421875, "learning_rate": 3e-06, "loss": -4.5251, "step": 32 }, { "epoch": 0.0029333333333333334, "grad_norm": 130.19813537597656, "learning_rate": 3e-06, "loss": -4.5744, "step": 33 }, { "epoch": 0.003022222222222222, "grad_norm": 76.09828186035156, "learning_rate": 3e-06, "loss": -7.9552, "step": 34 }, { "epoch": 0.003111111111111111, "grad_norm": 63.77288055419922, "learning_rate": 3e-06, "loss": -3.6391, "step": 35 }, { "epoch": 0.0032, "grad_norm": 58.53509521484375, "learning_rate": 3e-06, "loss": -3.9415, "step": 36 }, { "completion_length": 245.14583587646484, "epoch": 0.003288888888888889, "grad_norm": 53.06296920776367, "learning_rate": 3e-06, "loss": 7.1798, "reward": 0.9375000298023224, "reward_std": 0.3340114951133728, "rewards/boxed_and_answer_tags_format_reward": 0.6041666567325592, "rewards/correctness_reward_func_math": 0.3333333246409893, "step": 37, "zero_std_ratio": 0.25 }, { "epoch": 0.0033777777777777777, "grad_norm": 78.04679870605469, "learning_rate": 3e-06, "loss": 13.2393, "step": 38 }, { "epoch": 0.0034666666666666665, "grad_norm": 64.38521575927734, "learning_rate": 3e-06, "loss": 11.5406, "step": 39 }, { "epoch": 0.0035555555555555557, "grad_norm": 56.69493865966797, "learning_rate": 3e-06, "loss": 11.0537, "step": 40 }, { "epoch": 0.0036444444444444445, "grad_norm": 59.67893600463867, "learning_rate": 3e-06, "loss": 12.2084, "step": 41 }, { "epoch": 0.0037333333333333333, "grad_norm": 44.71684646606445, "learning_rate": 3e-06, "loss": 14.915, "step": 42 }, { "epoch": 0.003822222222222222, "grad_norm": 53.003570556640625, "learning_rate": 3e-06, "loss": 7.1581, "step": 43 }, { "epoch": 0.003911111111111111, "grad_norm": 86.505615234375, "learning_rate": 3e-06, "loss": 13.0403, "step": 44 }, { "epoch": 0.004, "grad_norm": 73.7258529663086, "learning_rate": 3e-06, "loss": 11.1962, "step": 45 }, { "epoch": 0.004088888888888889, "grad_norm": 92.57136535644531, "learning_rate": 3e-06, "loss": 10.6526, "step": 46 }, { "epoch": 0.004177777777777778, "grad_norm": 63.43205642700195, "learning_rate": 3e-06, "loss": 11.8212, "step": 47 }, { "epoch": 0.004266666666666667, "grad_norm": 44.73876953125, "learning_rate": 3e-06, "loss": 14.3455, "step": 48 }, { "completion_length": 236.37500762939453, "epoch": 0.004355555555555555, "grad_norm": 42.604164123535156, "learning_rate": 3e-06, "loss": -28.8562, "reward": 1.6145833730697632, "reward_std": 0.3440491110086441, "rewards/boxed_and_answer_tags_format_reward": 0.65625, "rewards/correctness_reward_func_math": 0.9583333432674408, "step": 49, "zero_std_ratio": 0.125 }, { "epoch": 0.0044444444444444444, "grad_norm": 55.280696868896484, "learning_rate": 3e-06, "loss": -29.8671, "step": 50 }, { "epoch": 0.004533333333333334, "grad_norm": 53.84416198730469, "learning_rate": 3e-06, "loss": -28.6148, "step": 51 }, { "epoch": 0.004622222222222222, "grad_norm": 48.8647575378418, "learning_rate": 3e-06, "loss": -28.0853, "step": 52 }, { "epoch": 0.004711111111111111, "grad_norm": 65.01343536376953, "learning_rate": 3e-06, "loss": -26.2356, "step": 53 }, { "epoch": 0.0048, "grad_norm": 64.81402587890625, "learning_rate": 3e-06, "loss": -30.8205, "step": 54 }, { "epoch": 0.004888888888888889, "grad_norm": 44.85778045654297, "learning_rate": 3e-06, "loss": -28.6691, "step": 55 }, { "epoch": 0.004977777777777778, "grad_norm": 45.61606216430664, "learning_rate": 3e-06, "loss": -30.0595, "step": 56 }, { "epoch": 0.005066666666666666, "grad_norm": 49.3116455078125, "learning_rate": 3e-06, "loss": -28.8315, "step": 57 }, { "epoch": 0.005155555555555556, "grad_norm": 45.42935562133789, "learning_rate": 3e-06, "loss": -28.1493, "step": 58 }, { "epoch": 0.005244444444444445, "grad_norm": 52.282257080078125, "learning_rate": 3e-06, "loss": -26.8023, "step": 59 }, { "epoch": 0.005333333333333333, "grad_norm": 61.042945861816406, "learning_rate": 3e-06, "loss": -30.9091, "step": 60 }, { "completion_length": 250.8541717529297, "epoch": 0.005422222222222222, "grad_norm": 56.66669464111328, "learning_rate": 3e-06, "loss": -4.2644, "reward": 1.0104166865348816, "reward_std": 0.3859569579362869, "rewards/boxed_and_answer_tags_format_reward": 0.5104166716337204, "rewards/correctness_reward_func_math": 0.4999999850988388, "step": 61, "zero_std_ratio": 0.5 }, { "epoch": 0.005511111111111111, "grad_norm": 46.742279052734375, "learning_rate": 3e-06, "loss": 1.921, "step": 62 }, { "epoch": 0.0056, "grad_norm": 64.74068450927734, "learning_rate": 3e-06, "loss": -1.0677, "step": 63 }, { "epoch": 0.005688888888888889, "grad_norm": 53.72319412231445, "learning_rate": 3e-06, "loss": 0.7498, "step": 64 }, { "epoch": 0.0057777777777777775, "grad_norm": 51.9224739074707, "learning_rate": 3e-06, "loss": -1.7073, "step": 65 }, { "epoch": 0.005866666666666667, "grad_norm": 49.95579528808594, "learning_rate": 3e-06, "loss": -4.7011, "step": 66 }, { "epoch": 0.005955555555555556, "grad_norm": 54.09262466430664, "learning_rate": 3e-06, "loss": -4.882, "step": 67 }, { "epoch": 0.006044444444444444, "grad_norm": 51.433746337890625, "learning_rate": 3e-06, "loss": 1.6496, "step": 68 }, { "epoch": 0.0061333333333333335, "grad_norm": 48.16537094116211, "learning_rate": 3e-06, "loss": -1.5035, "step": 69 }, { "epoch": 0.006222222222222222, "grad_norm": 55.34268569946289, "learning_rate": 3e-06, "loss": 0.0384, "step": 70 }, { "epoch": 0.006311111111111111, "grad_norm": 45.631813049316406, "learning_rate": 3e-06, "loss": -1.8713, "step": 71 }, { "epoch": 0.0064, "grad_norm": 48.471473693847656, "learning_rate": 3e-06, "loss": -4.9618, "step": 72 }, { "completion_length": 231.14583587646484, "epoch": 0.006488888888888889, "grad_norm": 91.5987777709961, "learning_rate": 3e-06, "loss": 59.5993, "reward": 1.1145833730697632, "reward_std": 0.4806128740310669, "rewards/boxed_and_answer_tags_format_reward": 0.6145833432674408, "rewards/correctness_reward_func_math": 0.5, "step": 73, "zero_std_ratio": 0.125 }, { "epoch": 0.006577777777777778, "grad_norm": 83.08058166503906, "learning_rate": 3e-06, "loss": 67.2965, "step": 74 }, { "epoch": 0.006666666666666667, "grad_norm": 65.38250732421875, "learning_rate": 3e-06, "loss": 60.7344, "step": 75 }, { "epoch": 0.0067555555555555554, "grad_norm": 66.78120422363281, "learning_rate": 3e-06, "loss": 63.4533, "step": 76 }, { "epoch": 0.006844444444444445, "grad_norm": 62.675838470458984, "learning_rate": 3e-06, "loss": 54.1729, "step": 77 }, { "epoch": 0.006933333333333333, "grad_norm": 63.28793716430664, "learning_rate": 3e-06, "loss": 61.2604, "step": 78 }, { "epoch": 0.007022222222222222, "grad_norm": 75.33735656738281, "learning_rate": 3e-06, "loss": 59.0054, "step": 79 }, { "epoch": 0.0071111111111111115, "grad_norm": 86.537109375, "learning_rate": 3e-06, "loss": 66.586, "step": 80 }, { "epoch": 0.0072, "grad_norm": 66.0783462524414, "learning_rate": 3e-06, "loss": 59.9151, "step": 81 }, { "epoch": 0.007288888888888889, "grad_norm": 66.10869598388672, "learning_rate": 3e-06, "loss": 62.0304, "step": 82 }, { "epoch": 0.007377777777777777, "grad_norm": 58.026912689208984, "learning_rate": 3e-06, "loss": 53.3198, "step": 83 }, { "epoch": 0.007466666666666667, "grad_norm": 59.65370559692383, "learning_rate": 3e-06, "loss": 60.237, "step": 84 }, { "completion_length": 246.70833587646484, "epoch": 0.007555555555555556, "grad_norm": 38.2843017578125, "learning_rate": 3e-06, "loss": -3.4492, "reward": 0.9270833730697632, "reward_std": 0.2587623968720436, "rewards/boxed_and_answer_tags_format_reward": 0.6354166567325592, "rewards/correctness_reward_func_math": 0.2916666679084301, "step": 85, "zero_std_ratio": 0.125 }, { "epoch": 0.007644444444444444, "grad_norm": 42.626834869384766, "learning_rate": 3e-06, "loss": -3.0457, "step": 86 }, { "epoch": 0.007733333333333333, "grad_norm": 31.817684173583984, "learning_rate": 3e-06, "loss": 0.2054, "step": 87 }, { "epoch": 0.007822222222222222, "grad_norm": 41.712833404541016, "learning_rate": 3e-06, "loss": 0.6522, "step": 88 }, { "epoch": 0.007911111111111112, "grad_norm": 33.385929107666016, "learning_rate": 3e-06, "loss": -2.3715, "step": 89 }, { "epoch": 0.008, "grad_norm": 43.1032829284668, "learning_rate": 3e-06, "loss": 1.5502, "step": 90 }, { "epoch": 0.008088888888888889, "grad_norm": 36.241458892822266, "learning_rate": 3e-06, "loss": -3.5684, "step": 91 }, { "epoch": 0.008177777777777779, "grad_norm": 41.06986618041992, "learning_rate": 3e-06, "loss": -3.2263, "step": 92 }, { "epoch": 0.008266666666666667, "grad_norm": 31.25284767150879, "learning_rate": 3e-06, "loss": -0.3487, "step": 93 }, { "epoch": 0.008355555555555555, "grad_norm": 36.958518981933594, "learning_rate": 3e-06, "loss": 0.1995, "step": 94 }, { "epoch": 0.008444444444444444, "grad_norm": 34.949676513671875, "learning_rate": 3e-06, "loss": -2.9378, "step": 95 }, { "epoch": 0.008533333333333334, "grad_norm": 36.523372650146484, "learning_rate": 3e-06, "loss": 0.9469, "step": 96 }, { "completion_length": 246.08333587646484, "epoch": 0.008622222222222222, "grad_norm": 59.17626953125, "learning_rate": 3e-06, "loss": 2.6896, "reward": 1.3333333730697632, "reward_std": 0.5695068836212158, "rewards/boxed_and_answer_tags_format_reward": 0.5833333432674408, "rewards/correctness_reward_func_math": 0.7500000149011612, "step": 97, "zero_std_ratio": 0.125 }, { "epoch": 0.00871111111111111, "grad_norm": 71.58135223388672, "learning_rate": 3e-06, "loss": 1.0704, "step": 98 }, { "epoch": 0.0088, "grad_norm": 65.36974334716797, "learning_rate": 3e-06, "loss": -2.7445, "step": 99 }, { "epoch": 0.008888888888888889, "grad_norm": 60.50218200683594, "learning_rate": 3e-06, "loss": 3.606, "step": 100 }, { "epoch": 0.008977777777777777, "grad_norm": 61.99585723876953, "learning_rate": 3e-06, "loss": -1.4435, "step": 101 }, { "epoch": 0.009066666666666667, "grad_norm": 106.92288970947266, "learning_rate": 3e-06, "loss": -3.972, "step": 102 }, { "epoch": 0.009155555555555556, "grad_norm": 58.85340118408203, "learning_rate": 3e-06, "loss": 1.5567, "step": 103 }, { "epoch": 0.009244444444444444, "grad_norm": 70.97467041015625, "learning_rate": 3e-06, "loss": -0.3592, "step": 104 }, { "epoch": 0.009333333333333334, "grad_norm": 62.310516357421875, "learning_rate": 3e-06, "loss": -4.0897, "step": 105 }, { "epoch": 0.009422222222222222, "grad_norm": 60.98678207397461, "learning_rate": 3e-06, "loss": 2.9803, "step": 106 }, { "epoch": 0.00951111111111111, "grad_norm": 60.23484420776367, "learning_rate": 3e-06, "loss": -2.2749, "step": 107 }, { "epoch": 0.0096, "grad_norm": 58.0914192199707, "learning_rate": 3e-06, "loss": -4.9462, "step": 108 }, { "completion_length": 249.06250762939453, "epoch": 0.00968888888888889, "grad_norm": 65.09230041503906, "learning_rate": 3e-06, "loss": -11.6324, "reward": 1.125, "reward_std": 0.5275504291057587, "rewards/boxed_and_answer_tags_format_reward": 0.5833333432674408, "rewards/correctness_reward_func_math": 0.5416666716337204, "step": 109, "zero_std_ratio": 0.0 }, { "epoch": 0.009777777777777778, "grad_norm": 58.50445556640625, "learning_rate": 3e-06, "loss": -12.2664, "step": 110 }, { "epoch": 0.009866666666666666, "grad_norm": 53.459251403808594, "learning_rate": 3e-06, "loss": -7.2192, "step": 111 }, { "epoch": 0.009955555555555556, "grad_norm": 60.34041213989258, "learning_rate": 3e-06, "loss": -6.9971, "step": 112 }, { "epoch": 0.010044444444444444, "grad_norm": 61.72711944580078, "learning_rate": 3e-06, "loss": -0.4686, "step": 113 }, { "epoch": 0.010133333333333333, "grad_norm": 96.1756591796875, "learning_rate": 3e-06, "loss": -7.4161, "step": 114 }, { "epoch": 0.010222222222222223, "grad_norm": 61.3508415222168, "learning_rate": 3e-06, "loss": -12.3763, "step": 115 }, { "epoch": 0.010311111111111111, "grad_norm": 55.424896240234375, "learning_rate": 3e-06, "loss": -12.8949, "step": 116 }, { "epoch": 0.0104, "grad_norm": 56.08291244506836, "learning_rate": 3e-06, "loss": -7.8472, "step": 117 }, { "epoch": 0.01048888888888889, "grad_norm": 73.18891906738281, "learning_rate": 3e-06, "loss": -8.0281, "step": 118 }, { "epoch": 0.010577777777777778, "grad_norm": 64.47604370117188, "learning_rate": 3e-06, "loss": -1.3444, "step": 119 }, { "epoch": 0.010666666666666666, "grad_norm": 96.0721664428711, "learning_rate": 3e-06, "loss": -8.5737, "step": 120 }, { "completion_length": 253.0, "epoch": 0.010755555555555556, "grad_norm": 60.78779983520508, "learning_rate": 3e-06, "loss": -1.605, "reward": 1.1145833730697632, "reward_std": 0.4272044152021408, "rewards/boxed_and_answer_tags_format_reward": 0.6145833432674408, "rewards/correctness_reward_func_math": 0.4999999850988388, "step": 121, "zero_std_ratio": 0.25 }, { "epoch": 0.010844444444444445, "grad_norm": 49.34260177612305, "learning_rate": 3e-06, "loss": -0.2147, "step": 122 }, { "epoch": 0.010933333333333333, "grad_norm": 53.38318634033203, "learning_rate": 3e-06, "loss": -7.1697, "step": 123 }, { "epoch": 0.011022222222222221, "grad_norm": 84.88465881347656, "learning_rate": 3e-06, "loss": 4.7029, "step": 124 }, { "epoch": 0.011111111111111112, "grad_norm": 50.966583251953125, "learning_rate": 3e-06, "loss": -5.2481, "step": 125 }, { "epoch": 0.0112, "grad_norm": 64.3619155883789, "learning_rate": 3e-06, "loss": -5.9545, "step": 126 }, { "epoch": 0.011288888888888888, "grad_norm": 60.359500885009766, "learning_rate": 3e-06, "loss": -1.8244, "step": 127 }, { "epoch": 0.011377777777777778, "grad_norm": 51.08177947998047, "learning_rate": 3e-06, "loss": -0.8292, "step": 128 }, { "epoch": 0.011466666666666667, "grad_norm": 53.191165924072266, "learning_rate": 3e-06, "loss": -7.7867, "step": 129 }, { "epoch": 0.011555555555555555, "grad_norm": 87.42491912841797, "learning_rate": 3e-06, "loss": 3.92, "step": 130 }, { "epoch": 0.011644444444444445, "grad_norm": 49.99729537963867, "learning_rate": 3e-06, "loss": -5.9859, "step": 131 }, { "epoch": 0.011733333333333333, "grad_norm": 45.14487075805664, "learning_rate": 3e-06, "loss": -6.6928, "step": 132 }, { "completion_length": 239.1666717529297, "epoch": 0.011822222222222222, "grad_norm": 68.43509674072266, "learning_rate": 3e-06, "loss": 45.5812, "reward": 1.1666666865348816, "reward_std": 0.47104020416736603, "rewards/boxed_and_answer_tags_format_reward": 0.625, "rewards/correctness_reward_func_math": 0.5416666679084301, "step": 133, "zero_std_ratio": 0.125 }, { "epoch": 0.011911111111111112, "grad_norm": 62.809059143066406, "learning_rate": 3e-06, "loss": 47.1017, "step": 134 }, { "epoch": 0.012, "grad_norm": 61.8614387512207, "learning_rate": 3e-06, "loss": 41.2505, "step": 135 }, { "epoch": 0.012088888888888889, "grad_norm": 65.46350860595703, "learning_rate": 3e-06, "loss": 44.191, "step": 136 }, { "epoch": 0.012177777777777777, "grad_norm": 59.1669807434082, "learning_rate": 3e-06, "loss": 36.6508, "step": 137 }, { "epoch": 0.012266666666666667, "grad_norm": 55.44610595703125, "learning_rate": 3e-06, "loss": 41.1041, "step": 138 }, { "epoch": 0.012355555555555555, "grad_norm": 61.210411071777344, "learning_rate": 3e-06, "loss": 44.6212, "step": 139 }, { "epoch": 0.012444444444444444, "grad_norm": 64.934326171875, "learning_rate": 3e-06, "loss": 46.5024, "step": 140 }, { "epoch": 0.012533333333333334, "grad_norm": 67.7354507446289, "learning_rate": 3e-06, "loss": 40.309, "step": 141 }, { "epoch": 0.012622222222222222, "grad_norm": 69.55413055419922, "learning_rate": 3e-06, "loss": 42.8161, "step": 142 }, { "epoch": 0.01271111111111111, "grad_norm": 59.040592193603516, "learning_rate": 3e-06, "loss": 35.3869, "step": 143 }, { "epoch": 0.0128, "grad_norm": 56.21048355102539, "learning_rate": 3e-06, "loss": 39.9197, "step": 144 }, { "completion_length": 237.37500762939453, "epoch": 0.012888888888888889, "grad_norm": 67.88895416259766, "learning_rate": 3e-06, "loss": -21.4223, "reward": 0.8958333730697632, "reward_std": 0.44294705986976624, "rewards/boxed_and_answer_tags_format_reward": 0.5625000149011612, "rewards/correctness_reward_func_math": 0.3333333358168602, "step": 145, "zero_std_ratio": 0.375 }, { "epoch": 0.012977777777777777, "grad_norm": 55.02178955078125, "learning_rate": 3e-06, "loss": -26.3881, "step": 146 }, { "epoch": 0.013066666666666667, "grad_norm": 103.78085327148438, "learning_rate": 3e-06, "loss": -21.7028, "step": 147 }, { "epoch": 0.013155555555555556, "grad_norm": 62.1268196105957, "learning_rate": 3e-06, "loss": -19.048, "step": 148 }, { "epoch": 0.013244444444444444, "grad_norm": 57.99726486206055, "learning_rate": 3e-06, "loss": -19.346, "step": 149 }, { "epoch": 0.013333333333333334, "grad_norm": 58.639549255371094, "learning_rate": 3e-06, "loss": -25.0216, "step": 150 }, { "epoch": 0.013422222222222223, "grad_norm": 75.58393859863281, "learning_rate": 3e-06, "loss": -21.7941, "step": 151 }, { "epoch": 0.013511111111111111, "grad_norm": 54.83882522583008, "learning_rate": 3e-06, "loss": -27.6056, "step": 152 }, { "epoch": 0.0136, "grad_norm": 70.61170196533203, "learning_rate": 3e-06, "loss": -21.99, "step": 153 }, { "epoch": 0.01368888888888889, "grad_norm": 68.1909408569336, "learning_rate": 3e-06, "loss": -20.2119, "step": 154 }, { "epoch": 0.013777777777777778, "grad_norm": 68.70491027832031, "learning_rate": 3e-06, "loss": -20.2249, "step": 155 }, { "epoch": 0.013866666666666666, "grad_norm": 55.29183578491211, "learning_rate": 3e-06, "loss": -25.9634, "step": 156 }, { "completion_length": 250.6666717529297, "epoch": 0.013955555555555556, "grad_norm": 70.28712463378906, "learning_rate": 3e-06, "loss": -13.869, "reward": 1.4583333730697632, "reward_std": 0.6823203265666962, "rewards/boxed_and_answer_tags_format_reward": 0.625, "rewards/correctness_reward_func_math": 0.8333333283662796, "step": 157, "zero_std_ratio": 0.125 }, { "epoch": 0.014044444444444444, "grad_norm": 60.210201263427734, "learning_rate": 3e-06, "loss": -24.2593, "step": 158 }, { "epoch": 0.014133333333333333, "grad_norm": 77.62222290039062, "learning_rate": 3e-06, "loss": -11.4696, "step": 159 }, { "epoch": 0.014222222222222223, "grad_norm": 70.80023193359375, "learning_rate": 3e-06, "loss": -25.8617, "step": 160 }, { "epoch": 0.014311111111111111, "grad_norm": 64.7750244140625, "learning_rate": 3e-06, "loss": -14.8635, "step": 161 }, { "epoch": 0.0144, "grad_norm": 77.83097076416016, "learning_rate": 3e-06, "loss": -7.727, "step": 162 }, { "epoch": 0.01448888888888889, "grad_norm": 79.27497100830078, "learning_rate": 3e-06, "loss": -14.3689, "step": 163 }, { "epoch": 0.014577777777777778, "grad_norm": 78.7293472290039, "learning_rate": 3e-06, "loss": -24.9423, "step": 164 }, { "epoch": 0.014666666666666666, "grad_norm": 70.24745178222656, "learning_rate": 3e-06, "loss": -12.4543, "step": 165 }, { "epoch": 0.014755555555555555, "grad_norm": 75.36212158203125, "learning_rate": 3e-06, "loss": -26.7206, "step": 166 }, { "epoch": 0.014844444444444445, "grad_norm": 65.05477142333984, "learning_rate": 3e-06, "loss": -16.0018, "step": 167 }, { "epoch": 0.014933333333333333, "grad_norm": 78.83174133300781, "learning_rate": 3e-06, "loss": -8.9618, "step": 168 }, { "completion_length": 236.2916717529297, "epoch": 0.015022222222222222, "grad_norm": 82.83967590332031, "learning_rate": 3e-06, "loss": -0.5024, "reward": 1.1145833730697632, "reward_std": 0.3740755543112755, "rewards/boxed_and_answer_tags_format_reward": 0.6145833134651184, "rewards/correctness_reward_func_math": 0.5, "step": 169, "zero_std_ratio": 0.375 }, { "epoch": 0.015111111111111112, "grad_norm": 51.84051513671875, "learning_rate": 3e-06, "loss": 6.7446, "step": 170 }, { "epoch": 0.0152, "grad_norm": 63.607723236083984, "learning_rate": 3e-06, "loss": 2.7771, "step": 171 }, { "epoch": 0.015288888888888888, "grad_norm": 52.88029479980469, "learning_rate": 3e-06, "loss": 8.1945, "step": 172 }, { "epoch": 0.015377777777777778, "grad_norm": 68.90487670898438, "learning_rate": 3e-06, "loss": 0.8609, "step": 173 }, { "epoch": 0.015466666666666667, "grad_norm": 57.66716766357422, "learning_rate": 3e-06, "loss": -0.5103, "step": 174 }, { "epoch": 0.015555555555555555, "grad_norm": 69.48858642578125, "learning_rate": 3e-06, "loss": -0.89, "step": 175 }, { "epoch": 0.015644444444444443, "grad_norm": 51.13008117675781, "learning_rate": 3e-06, "loss": 5.8779, "step": 176 }, { "epoch": 0.015733333333333332, "grad_norm": 61.48530578613281, "learning_rate": 3e-06, "loss": 2.0727, "step": 177 }, { "epoch": 0.015822222222222224, "grad_norm": 55.415924072265625, "learning_rate": 3e-06, "loss": 7.6559, "step": 178 }, { "epoch": 0.015911111111111112, "grad_norm": 65.15290069580078, "learning_rate": 3e-06, "loss": -0.6101, "step": 179 }, { "epoch": 0.016, "grad_norm": 52.03913879394531, "learning_rate": 3e-06, "loss": -1.3899, "step": 180 }, { "completion_length": 250.14583587646484, "epoch": 0.01608888888888889, "grad_norm": 63.963829040527344, "learning_rate": 3e-06, "loss": -8.994, "reward": 0.885416716337204, "reward_std": 0.3302172925323248, "rewards/boxed_and_answer_tags_format_reward": 0.59375, "rewards/correctness_reward_func_math": 0.2916666567325592, "step": 181, "zero_std_ratio": 0.5 }, { "epoch": 0.016177777777777777, "grad_norm": 48.50006866455078, "learning_rate": 3e-06, "loss": -0.9428, "step": 182 }, { "epoch": 0.016266666666666665, "grad_norm": 58.21607971191406, "learning_rate": 3e-06, "loss": -8.2051, "step": 183 }, { "epoch": 0.016355555555555557, "grad_norm": 76.80998992919922, "learning_rate": 3e-06, "loss": 1.6849, "step": 184 }, { "epoch": 0.016444444444444446, "grad_norm": 48.460941314697266, "learning_rate": 3e-06, "loss": -3.4021, "step": 185 }, { "epoch": 0.016533333333333334, "grad_norm": 55.28091049194336, "learning_rate": 3e-06, "loss": 0.4156, "step": 186 }, { "epoch": 0.016622222222222222, "grad_norm": 65.21077728271484, "learning_rate": 3e-06, "loss": -9.4278, "step": 187 }, { "epoch": 0.01671111111111111, "grad_norm": 50.71424865722656, "learning_rate": 3e-06, "loss": -2.0682, "step": 188 }, { "epoch": 0.0168, "grad_norm": 58.02372360229492, "learning_rate": 3e-06, "loss": -9.1068, "step": 189 }, { "epoch": 0.016888888888888887, "grad_norm": 61.12031555175781, "learning_rate": 3e-06, "loss": 0.679, "step": 190 }, { "epoch": 0.01697777777777778, "grad_norm": 51.7930908203125, "learning_rate": 3e-06, "loss": -4.3214, "step": 191 }, { "epoch": 0.017066666666666667, "grad_norm": 48.15507507324219, "learning_rate": 3e-06, "loss": -0.8545, "step": 192 }, { "completion_length": 248.1666717529297, "epoch": 0.017155555555555556, "grad_norm": 62.317527770996094, "learning_rate": 3e-06, "loss": 13.8287, "reward": 1.0729167461395264, "reward_std": 0.348264142870903, "rewards/boxed_and_answer_tags_format_reward": 0.5729166865348816, "rewards/correctness_reward_func_math": 0.5, "step": 193, "zero_std_ratio": 0.375 }, { "epoch": 0.017244444444444444, "grad_norm": 43.339691162109375, "learning_rate": 3e-06, "loss": 17.523, "step": 194 }, { "epoch": 0.017333333333333333, "grad_norm": 48.14270782470703, "learning_rate": 3e-06, "loss": 15.9481, "step": 195 }, { "epoch": 0.01742222222222222, "grad_norm": 43.32905960083008, "learning_rate": 3e-06, "loss": 14.7259, "step": 196 }, { "epoch": 0.017511111111111113, "grad_norm": 45.01740264892578, "learning_rate": 3e-06, "loss": 13.8658, "step": 197 }, { "epoch": 0.0176, "grad_norm": 43.2428092956543, "learning_rate": 3e-06, "loss": 20.4664, "step": 198 }, { "epoch": 0.01768888888888889, "grad_norm": 56.058616638183594, "learning_rate": 3e-06, "loss": 13.1828, "step": 199 }, { "epoch": 0.017777777777777778, "grad_norm": 46.968666076660156, "learning_rate": 3e-06, "loss": 16.8591, "step": 200 }, { "epoch": 0.017866666666666666, "grad_norm": 45.98298263549805, "learning_rate": 3e-06, "loss": 15.5154, "step": 201 }, { "epoch": 0.017955555555555554, "grad_norm": 43.91643142700195, "learning_rate": 3e-06, "loss": 14.2328, "step": 202 }, { "epoch": 0.018044444444444443, "grad_norm": 44.83538055419922, "learning_rate": 3e-06, "loss": 13.1929, "step": 203 }, { "epoch": 0.018133333333333335, "grad_norm": 42.83240509033203, "learning_rate": 3e-06, "loss": 19.8936, "step": 204 }, { "completion_length": 249.02083587646484, "epoch": 0.018222222222222223, "grad_norm": 119.68338775634766, "learning_rate": 3e-06, "loss": -4.5616, "reward": 0.8229166865348816, "reward_std": 0.28067073225975037, "rewards/boxed_and_answer_tags_format_reward": 0.65625, "rewards/correctness_reward_func_math": 0.1666666716337204, "step": 205, "zero_std_ratio": 0.5 }, { "epoch": 0.01831111111111111, "grad_norm": 40.65678405761719, "learning_rate": 3e-06, "loss": 3.7784, "step": 206 }, { "epoch": 0.0184, "grad_norm": 51.84949493408203, "learning_rate": 3e-06, "loss": 2.132, "step": 207 }, { "epoch": 0.018488888888888888, "grad_norm": 40.80442428588867, "learning_rate": 3e-06, "loss": -1.7568, "step": 208 }, { "epoch": 0.018577777777777776, "grad_norm": 51.88225555419922, "learning_rate": 3e-06, "loss": -2.808, "step": 209 }, { "epoch": 0.018666666666666668, "grad_norm": 57.230106353759766, "learning_rate": 3e-06, "loss": -1.6958, "step": 210 }, { "epoch": 0.018755555555555557, "grad_norm": 65.36343383789062, "learning_rate": 3e-06, "loss": -5.016, "step": 211 }, { "epoch": 0.018844444444444445, "grad_norm": 42.36751937866211, "learning_rate": 3e-06, "loss": 3.2604, "step": 212 }, { "epoch": 0.018933333333333333, "grad_norm": 54.347625732421875, "learning_rate": 3e-06, "loss": 1.5256, "step": 213 }, { "epoch": 0.01902222222222222, "grad_norm": 40.971683502197266, "learning_rate": 3e-06, "loss": -2.3704, "step": 214 }, { "epoch": 0.01911111111111111, "grad_norm": 51.366546630859375, "learning_rate": 3e-06, "loss": -3.7841, "step": 215 }, { "epoch": 0.0192, "grad_norm": 64.25231170654297, "learning_rate": 3e-06, "loss": -2.6423, "step": 216 }, { "completion_length": 250.4791717529297, "epoch": 0.01928888888888889, "grad_norm": 60.69169235229492, "learning_rate": 3e-06, "loss": 3.7848, "reward": 1.0833333730697632, "reward_std": 0.5039487332105637, "rewards/boxed_and_answer_tags_format_reward": 0.625, "rewards/correctness_reward_func_math": 0.4583333283662796, "step": 217, "zero_std_ratio": 0.25 }, { "epoch": 0.01937777777777778, "grad_norm": 65.37804412841797, "learning_rate": 3e-06, "loss": -5.4996, "step": 218 }, { "epoch": 0.019466666666666667, "grad_norm": 58.69138717651367, "learning_rate": 3e-06, "loss": 0.3025, "step": 219 }, { "epoch": 0.019555555555555555, "grad_norm": 72.17839813232422, "learning_rate": 3e-06, "loss": -0.8041, "step": 220 }, { "epoch": 0.019644444444444444, "grad_norm": 69.56704711914062, "learning_rate": 3e-06, "loss": 3.1412, "step": 221 }, { "epoch": 0.019733333333333332, "grad_norm": 64.57500457763672, "learning_rate": 3e-06, "loss": 6.7244, "step": 222 }, { "epoch": 0.019822222222222224, "grad_norm": 65.06715393066406, "learning_rate": 3e-06, "loss": 3.0694, "step": 223 }, { "epoch": 0.019911111111111112, "grad_norm": 72.74304962158203, "learning_rate": 3e-06, "loss": -5.928, "step": 224 }, { "epoch": 0.02, "grad_norm": 62.06201934814453, "learning_rate": 3e-06, "loss": -0.2234, "step": 225 }, { "epoch": 0.02008888888888889, "grad_norm": 74.25010681152344, "learning_rate": 3e-06, "loss": -1.5055, "step": 226 }, { "epoch": 0.020177777777777777, "grad_norm": 64.32748413085938, "learning_rate": 3e-06, "loss": 2.4819, "step": 227 }, { "epoch": 0.020266666666666665, "grad_norm": 64.75834655761719, "learning_rate": 3e-06, "loss": 5.504, "step": 228 }, { "completion_length": 252.9375, "epoch": 0.020355555555555557, "grad_norm": 86.62522888183594, "learning_rate": 3e-06, "loss": 14.2874, "reward": 1.1562500596046448, "reward_std": 0.5227071046829224, "rewards/boxed_and_answer_tags_format_reward": 0.7395833432674408, "rewards/correctness_reward_func_math": 0.4166666567325592, "step": 229, "zero_std_ratio": 0.375 }, { "epoch": 0.020444444444444446, "grad_norm": 61.35566329956055, "learning_rate": 3e-06, "loss": 17.7162, "step": 230 }, { "epoch": 0.020533333333333334, "grad_norm": 61.87510681152344, "learning_rate": 3e-06, "loss": 8.9931, "step": 231 }, { "epoch": 0.020622222222222222, "grad_norm": 57.673770904541016, "learning_rate": 3e-06, "loss": 14.8206, "step": 232 }, { "epoch": 0.02071111111111111, "grad_norm": 64.32942199707031, "learning_rate": 3e-06, "loss": 10.9396, "step": 233 }, { "epoch": 0.0208, "grad_norm": 66.23136138916016, "learning_rate": 3e-06, "loss": 15.6923, "step": 234 }, { "epoch": 0.020888888888888887, "grad_norm": 74.55809783935547, "learning_rate": 3e-06, "loss": 13.7889, "step": 235 }, { "epoch": 0.02097777777777778, "grad_norm": 60.680240631103516, "learning_rate": 3e-06, "loss": 17.2098, "step": 236 }, { "epoch": 0.021066666666666668, "grad_norm": 63.526371002197266, "learning_rate": 3e-06, "loss": 8.3714, "step": 237 }, { "epoch": 0.021155555555555556, "grad_norm": 60.387813568115234, "learning_rate": 3e-06, "loss": 13.9642, "step": 238 }, { "epoch": 0.021244444444444444, "grad_norm": 65.6108169555664, "learning_rate": 3e-06, "loss": 10.0042, "step": 239 }, { "epoch": 0.021333333333333333, "grad_norm": 65.62525177001953, "learning_rate": 3e-06, "loss": 14.6078, "step": 240 }, { "completion_length": 245.89584350585938, "epoch": 0.02142222222222222, "grad_norm": 99.76964569091797, "learning_rate": 3e-06, "loss": -5.9045, "reward": 1.2916666865348816, "reward_std": 0.23116151243448257, "rewards/boxed_and_answer_tags_format_reward": 0.7083333432674408, "rewards/correctness_reward_func_math": 0.5833333358168602, "step": 241, "zero_std_ratio": 0.375 }, { "epoch": 0.021511111111111113, "grad_norm": 34.929359436035156, "learning_rate": 3e-06, "loss": -5.1064, "step": 242 }, { "epoch": 0.0216, "grad_norm": 41.406982421875, "learning_rate": 3e-06, "loss": -7.9375, "step": 243 }, { "epoch": 0.02168888888888889, "grad_norm": 40.73991775512695, "learning_rate": 3e-06, "loss": -4.6122, "step": 244 }, { "epoch": 0.021777777777777778, "grad_norm": 32.28548812866211, "learning_rate": 3e-06, "loss": -3.2865, "step": 245 }, { "epoch": 0.021866666666666666, "grad_norm": 37.392860412597656, "learning_rate": 3e-06, "loss": -0.2002, "step": 246 }, { "epoch": 0.021955555555555555, "grad_norm": 105.95482635498047, "learning_rate": 3e-06, "loss": -5.7761, "step": 247 }, { "epoch": 0.022044444444444443, "grad_norm": 35.37491226196289, "learning_rate": 3e-06, "loss": -5.3203, "step": 248 }, { "epoch": 0.022133333333333335, "grad_norm": 37.672000885009766, "learning_rate": 3e-06, "loss": -8.3175, "step": 249 }, { "epoch": 0.022222222222222223, "grad_norm": 34.36002731323242, "learning_rate": 3e-06, "loss": -4.8629, "step": 250 }, { "epoch": 0.02231111111111111, "grad_norm": 35.60414123535156, "learning_rate": 3e-06, "loss": -3.8192, "step": 251 }, { "epoch": 0.0224, "grad_norm": 38.58955764770508, "learning_rate": 3e-06, "loss": -0.897, "step": 252 }, { "completion_length": 250.1875, "epoch": 0.022488888888888888, "grad_norm": 71.29794311523438, "learning_rate": 3e-06, "loss": 2.5796, "reward": 0.8645833432674408, "reward_std": 0.3201860636472702, "rewards/boxed_and_answer_tags_format_reward": 0.5729166716337204, "rewards/correctness_reward_func_math": 0.2916666641831398, "step": 253, "zero_std_ratio": 0.625 }, { "epoch": 0.022577777777777776, "grad_norm": 62.11003112792969, "learning_rate": 3e-06, "loss": 2.4232, "step": 254 }, { "epoch": 0.02266666666666667, "grad_norm": 57.18949508666992, "learning_rate": 3e-06, "loss": 5.9388, "step": 255 }, { "epoch": 0.022755555555555557, "grad_norm": 60.49555206298828, "learning_rate": 3e-06, "loss": 5.5698, "step": 256 }, { "epoch": 0.022844444444444445, "grad_norm": 134.1082305908203, "learning_rate": 3e-06, "loss": -5.3771, "step": 257 }, { "epoch": 0.022933333333333333, "grad_norm": 170.15768432617188, "learning_rate": 3e-06, "loss": 6.3811, "step": 258 }, { "epoch": 0.02302222222222222, "grad_norm": 70.64490509033203, "learning_rate": 3e-06, "loss": 1.7661, "step": 259 }, { "epoch": 0.02311111111111111, "grad_norm": 71.96417999267578, "learning_rate": 3e-06, "loss": 0.8909, "step": 260 }, { "epoch": 0.0232, "grad_norm": 58.19865417480469, "learning_rate": 3e-06, "loss": 5.1442, "step": 261 }, { "epoch": 0.02328888888888889, "grad_norm": 61.813690185546875, "learning_rate": 3e-06, "loss": 4.1458, "step": 262 }, { "epoch": 0.02337777777777778, "grad_norm": 63.21968460083008, "learning_rate": 3e-06, "loss": -6.5992, "step": 263 }, { "epoch": 0.023466666666666667, "grad_norm": 130.61351013183594, "learning_rate": 3e-06, "loss": 4.4745, "step": 264 }, { "completion_length": 254.58333587646484, "epoch": 0.023555555555555555, "grad_norm": 37.4125862121582, "learning_rate": 3e-06, "loss": 6.5013, "reward": 1.0000000298023224, "reward_std": 0.32049281150102615, "rewards/boxed_and_answer_tags_format_reward": 0.7083333134651184, "rewards/correctness_reward_func_math": 0.2916666679084301, "step": 265, "zero_std_ratio": 0.5 }, { "epoch": 0.023644444444444444, "grad_norm": 47.14967346191406, "learning_rate": 3e-06, "loss": 0.9939, "step": 266 }, { "epoch": 0.023733333333333332, "grad_norm": 52.939048767089844, "learning_rate": 3e-06, "loss": -0.9139, "step": 267 }, { "epoch": 0.023822222222222224, "grad_norm": 86.99070739746094, "learning_rate": 3e-06, "loss": 8.1755, "step": 268 }, { "epoch": 0.023911111111111112, "grad_norm": 39.69975280761719, "learning_rate": 3e-06, "loss": 4.7483, "step": 269 }, { "epoch": 0.024, "grad_norm": 59.52255630493164, "learning_rate": 3e-06, "loss": 1.1483, "step": 270 }, { "epoch": 0.02408888888888889, "grad_norm": 38.91862106323242, "learning_rate": 3e-06, "loss": 6.09, "step": 271 }, { "epoch": 0.024177777777777777, "grad_norm": 43.66323471069336, "learning_rate": 3e-06, "loss": 0.4765, "step": 272 }, { "epoch": 0.024266666666666666, "grad_norm": 44.54389572143555, "learning_rate": 3e-06, "loss": -1.5437, "step": 273 }, { "epoch": 0.024355555555555554, "grad_norm": 84.41556549072266, "learning_rate": 3e-06, "loss": 7.1028, "step": 274 }, { "epoch": 0.024444444444444446, "grad_norm": 38.220367431640625, "learning_rate": 3e-06, "loss": 4.0407, "step": 275 }, { "epoch": 0.024533333333333334, "grad_norm": 45.620452880859375, "learning_rate": 3e-06, "loss": 0.5047, "step": 276 }, { "completion_length": 243.18750762939453, "epoch": 0.024622222222222222, "grad_norm": 84.81961059570312, "learning_rate": 3e-06, "loss": 5.442, "reward": 1.3333333730697632, "reward_std": 0.4887756109237671, "rewards/boxed_and_answer_tags_format_reward": 0.625, "rewards/correctness_reward_func_math": 0.7083333432674408, "step": 277, "zero_std_ratio": 0.0 }, { "epoch": 0.02471111111111111, "grad_norm": 112.86151885986328, "learning_rate": 3e-06, "loss": -1.2528, "step": 278 }, { "epoch": 0.0248, "grad_norm": 76.52424621582031, "learning_rate": 3e-06, "loss": -4.5325, "step": 279 }, { "epoch": 0.024888888888888887, "grad_norm": 94.0294189453125, "learning_rate": 3e-06, "loss": -6.6167, "step": 280 }, { "epoch": 0.02497777777777778, "grad_norm": 78.60155487060547, "learning_rate": 3e-06, "loss": 0.2653, "step": 281 }, { "epoch": 0.025066666666666668, "grad_norm": 58.42827224731445, "learning_rate": 3e-06, "loss": -5.6591, "step": 282 }, { "epoch": 0.025155555555555556, "grad_norm": 81.04212188720703, "learning_rate": 3e-06, "loss": 4.2739, "step": 283 }, { "epoch": 0.025244444444444444, "grad_norm": 67.27478790283203, "learning_rate": 3e-06, "loss": -1.5776, "step": 284 }, { "epoch": 0.025333333333333333, "grad_norm": 114.41588592529297, "learning_rate": 3e-06, "loss": -5.4532, "step": 285 }, { "epoch": 0.02542222222222222, "grad_norm": 75.61115264892578, "learning_rate": 3e-06, "loss": -7.4401, "step": 286 }, { "epoch": 0.02551111111111111, "grad_norm": 236.67214965820312, "learning_rate": 3e-06, "loss": -0.6833, "step": 287 }, { "epoch": 0.0256, "grad_norm": 59.2407341003418, "learning_rate": 3e-06, "loss": -6.8172, "step": 288 }, { "completion_length": 250.20834350585938, "epoch": 0.02568888888888889, "grad_norm": 93.99871063232422, "learning_rate": 3e-06, "loss": -12.9867, "reward": 1.4687500596046448, "reward_std": 0.6822589337825775, "rewards/boxed_and_answer_tags_format_reward": 0.6770833432674408, "rewards/correctness_reward_func_math": 0.7916666865348816, "step": 289, "zero_std_ratio": 0.25 }, { "epoch": 0.025777777777777778, "grad_norm": 79.78562927246094, "learning_rate": 3e-06, "loss": -10.387, "step": 290 }, { "epoch": 0.025866666666666666, "grad_norm": 150.55654907226562, "learning_rate": 3e-06, "loss": -11.9684, "step": 291 }, { "epoch": 0.025955555555555555, "grad_norm": 86.15855407714844, "learning_rate": 3e-06, "loss": -13.3488, "step": 292 }, { "epoch": 0.026044444444444443, "grad_norm": 82.68080139160156, "learning_rate": 3e-06, "loss": -9.7978, "step": 293 }, { "epoch": 0.026133333333333335, "grad_norm": 73.47705841064453, "learning_rate": 3e-06, "loss": -15.2661, "step": 294 }, { "epoch": 0.026222222222222223, "grad_norm": 88.39766693115234, "learning_rate": 3e-06, "loss": -14.0408, "step": 295 }, { "epoch": 0.02631111111111111, "grad_norm": 81.03710174560547, "learning_rate": 3e-06, "loss": -11.6435, "step": 296 }, { "epoch": 0.0264, "grad_norm": 97.82394409179688, "learning_rate": 3e-06, "loss": -12.9819, "step": 297 }, { "epoch": 0.026488888888888888, "grad_norm": 91.29530334472656, "learning_rate": 3e-06, "loss": -14.6218, "step": 298 }, { "epoch": 0.026577777777777777, "grad_norm": 76.14654541015625, "learning_rate": 3e-06, "loss": -10.3319, "step": 299 }, { "epoch": 0.02666666666666667, "grad_norm": 76.4620590209961, "learning_rate": 3e-06, "loss": -16.649, "step": 300 }, { "completion_length": 245.87500762939453, "epoch": 0.026755555555555557, "grad_norm": 66.47940063476562, "learning_rate": 3e-06, "loss": -2.4273, "reward": 1.2916666865348816, "reward_std": 0.4701542556285858, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 0.5416666716337204, "step": 301, "zero_std_ratio": 0.5 }, { "epoch": 0.026844444444444445, "grad_norm": 57.04201126098633, "learning_rate": 3e-06, "loss": -2.7348, "step": 302 }, { "epoch": 0.026933333333333333, "grad_norm": 67.42317962646484, "learning_rate": 3e-06, "loss": -0.5419, "step": 303 }, { "epoch": 0.027022222222222222, "grad_norm": 68.1643295288086, "learning_rate": 3e-06, "loss": -4.7537, "step": 304 }, { "epoch": 0.02711111111111111, "grad_norm": 66.15480041503906, "learning_rate": 3e-06, "loss": -3.0219, "step": 305 }, { "epoch": 0.0272, "grad_norm": 72.49027252197266, "learning_rate": 3e-06, "loss": -0.992, "step": 306 }, { "epoch": 0.02728888888888889, "grad_norm": 63.84511947631836, "learning_rate": 3e-06, "loss": -2.9532, "step": 307 }, { "epoch": 0.02737777777777778, "grad_norm": 60.41191864013672, "learning_rate": 3e-06, "loss": -3.5425, "step": 308 }, { "epoch": 0.027466666666666667, "grad_norm": 75.88224029541016, "learning_rate": 3e-06, "loss": -0.626, "step": 309 }, { "epoch": 0.027555555555555555, "grad_norm": 60.12965774536133, "learning_rate": 3e-06, "loss": -4.971, "step": 310 }, { "epoch": 0.027644444444444444, "grad_norm": 67.24330139160156, "learning_rate": 3e-06, "loss": -3.5011, "step": 311 }, { "epoch": 0.027733333333333332, "grad_norm": 66.98039245605469, "learning_rate": 3e-06, "loss": -1.487, "step": 312 }, { "completion_length": 255.27083587646484, "epoch": 0.027822222222222224, "grad_norm": 36.04975891113281, "learning_rate": 3e-06, "loss": -0.5188, "reward": 1.5312500596046448, "reward_std": 0.1546149756759405, "rewards/boxed_and_answer_tags_format_reward": 0.6145833432674408, "rewards/correctness_reward_func_math": 0.9166666567325592, "step": 313, "zero_std_ratio": 0.75 }, { "epoch": 0.027911111111111112, "grad_norm": 35.71009063720703, "learning_rate": 3e-06, "loss": 2.2571, "step": 314 }, { "epoch": 0.028, "grad_norm": 43.758975982666016, "learning_rate": 3e-06, "loss": 1.5113, "step": 315 }, { "epoch": 0.02808888888888889, "grad_norm": 42.099124908447266, "learning_rate": 3e-06, "loss": -1.203, "step": 316 }, { "epoch": 0.028177777777777777, "grad_norm": 46.69057846069336, "learning_rate": 3e-06, "loss": 2.1907, "step": 317 }, { "epoch": 0.028266666666666666, "grad_norm": 43.00071334838867, "learning_rate": 3e-06, "loss": -0.0413, "step": 318 }, { "epoch": 0.028355555555555554, "grad_norm": 36.017799377441406, "learning_rate": 3e-06, "loss": -0.9995, "step": 319 }, { "epoch": 0.028444444444444446, "grad_norm": 35.86075973510742, "learning_rate": 3e-06, "loss": 1.8159, "step": 320 }, { "epoch": 0.028533333333333334, "grad_norm": 46.41409683227539, "learning_rate": 3e-06, "loss": 0.8693, "step": 321 }, { "epoch": 0.028622222222222223, "grad_norm": 42.182472229003906, "learning_rate": 3e-06, "loss": -1.9042, "step": 322 }, { "epoch": 0.02871111111111111, "grad_norm": 47.805999755859375, "learning_rate": 3e-06, "loss": 1.6417, "step": 323 }, { "epoch": 0.0288, "grad_norm": 45.03670883178711, "learning_rate": 3e-06, "loss": -1.09, "step": 324 }, { "completion_length": 252.64583587646484, "epoch": 0.028888888888888888, "grad_norm": 58.358917236328125, "learning_rate": 3e-06, "loss": -12.4368, "reward": 1.1666666865348816, "reward_std": 0.37967559695243835, "rewards/boxed_and_answer_tags_format_reward": 0.6666666567325592, "rewards/correctness_reward_func_math": 0.5, "step": 325, "zero_std_ratio": 0.625 }, { "epoch": 0.02897777777777778, "grad_norm": 70.42740631103516, "learning_rate": 3e-06, "loss": -6.5213, "step": 326 }, { "epoch": 0.029066666666666668, "grad_norm": 71.7884750366211, "learning_rate": 3e-06, "loss": -14.7372, "step": 327 }, { "epoch": 0.029155555555555556, "grad_norm": 64.89356231689453, "learning_rate": 3e-06, "loss": -2.411, "step": 328 }, { "epoch": 0.029244444444444444, "grad_norm": 63.557125091552734, "learning_rate": 3e-06, "loss": -6.2777, "step": 329 }, { "epoch": 0.029333333333333333, "grad_norm": 55.46377182006836, "learning_rate": 3e-06, "loss": -6.9502, "step": 330 }, { "epoch": 0.02942222222222222, "grad_norm": 67.63842010498047, "learning_rate": 3e-06, "loss": -13.2148, "step": 331 }, { "epoch": 0.02951111111111111, "grad_norm": 69.31304931640625, "learning_rate": 3e-06, "loss": -7.451, "step": 332 }, { "epoch": 0.0296, "grad_norm": 72.68626403808594, "learning_rate": 3e-06, "loss": -15.5911, "step": 333 }, { "epoch": 0.02968888888888889, "grad_norm": 67.20828247070312, "learning_rate": 3e-06, "loss": -3.4952, "step": 334 }, { "epoch": 0.029777777777777778, "grad_norm": 71.56851959228516, "learning_rate": 3e-06, "loss": -6.8577, "step": 335 }, { "epoch": 0.029866666666666666, "grad_norm": 55.80412292480469, "learning_rate": 3e-06, "loss": -7.9959, "step": 336 }, { "completion_length": 254.33333587646484, "epoch": 0.029955555555555555, "grad_norm": 83.18997955322266, "learning_rate": 3e-06, "loss": -4.4206, "reward": 1.3854166865348816, "reward_std": 0.6009446382522583, "rewards/boxed_and_answer_tags_format_reward": 0.6354166567325592, "rewards/correctness_reward_func_math": 0.75, "step": 337, "zero_std_ratio": 0.25 }, { "epoch": 0.030044444444444443, "grad_norm": 76.95658111572266, "learning_rate": 3e-06, "loss": 0.0507, "step": 338 }, { "epoch": 0.030133333333333335, "grad_norm": 99.8234634399414, "learning_rate": 3e-06, "loss": -4.7763, "step": 339 }, { "epoch": 0.030222222222222223, "grad_norm": 89.73624420166016, "learning_rate": 3e-06, "loss": -12.8145, "step": 340 }, { "epoch": 0.03031111111111111, "grad_norm": 105.25814819335938, "learning_rate": 3e-06, "loss": 1.3688, "step": 341 }, { "epoch": 0.0304, "grad_norm": 91.62116241455078, "learning_rate": 3e-06, "loss": -7.2119, "step": 342 }, { "epoch": 0.03048888888888889, "grad_norm": 89.00618743896484, "learning_rate": 3e-06, "loss": -5.8364, "step": 343 }, { "epoch": 0.030577777777777777, "grad_norm": 75.79231262207031, "learning_rate": 3e-06, "loss": -1.9053, "step": 344 }, { "epoch": 0.030666666666666665, "grad_norm": 109.15798950195312, "learning_rate": 3e-06, "loss": -7.4347, "step": 345 }, { "epoch": 0.030755555555555557, "grad_norm": 91.9997787475586, "learning_rate": 3e-06, "loss": -15.0307, "step": 346 }, { "epoch": 0.030844444444444445, "grad_norm": 113.604248046875, "learning_rate": 3e-06, "loss": -0.8063, "step": 347 }, { "epoch": 0.030933333333333334, "grad_norm": 90.35537719726562, "learning_rate": 3e-06, "loss": -9.993, "step": 348 }, { "completion_length": 241.06250762939453, "epoch": 0.031022222222222222, "grad_norm": 79.29890441894531, "learning_rate": 3e-06, "loss": -11.3458, "reward": 1.6979167461395264, "reward_std": 0.5608386099338531, "rewards/boxed_and_answer_tags_format_reward": 0.65625, "rewards/correctness_reward_func_math": 1.0416666567325592, "step": 349, "zero_std_ratio": 0.25 }, { "epoch": 0.03111111111111111, "grad_norm": 85.71048736572266, "learning_rate": 3e-06, "loss": -0.4666, "step": 350 }, { "epoch": 0.0312, "grad_norm": 74.05301666259766, "learning_rate": 3e-06, "loss": -17.7307, "step": 351 }, { "epoch": 0.03128888888888889, "grad_norm": 77.7562026977539, "learning_rate": 3e-06, "loss": -10.9744, "step": 352 }, { "epoch": 0.031377777777777775, "grad_norm": 81.5293197631836, "learning_rate": 3e-06, "loss": -7.753, "step": 353 }, { "epoch": 0.031466666666666664, "grad_norm": 74.96295166015625, "learning_rate": 3e-06, "loss": -4.9168, "step": 354 }, { "epoch": 0.03155555555555556, "grad_norm": 77.19646453857422, "learning_rate": 3e-06, "loss": -12.4852, "step": 355 }, { "epoch": 0.03164444444444445, "grad_norm": 88.24592590332031, "learning_rate": 3e-06, "loss": -2.2546, "step": 356 }, { "epoch": 0.031733333333333336, "grad_norm": 69.32099151611328, "learning_rate": 3e-06, "loss": -18.7936, "step": 357 }, { "epoch": 0.031822222222222224, "grad_norm": 74.59849548339844, "learning_rate": 3e-06, "loss": -12.5008, "step": 358 }, { "epoch": 0.03191111111111111, "grad_norm": 89.21590423583984, "learning_rate": 3e-06, "loss": -9.1562, "step": 359 }, { "epoch": 0.032, "grad_norm": 70.9638671875, "learning_rate": 3e-06, "loss": -7.0573, "step": 360 }, { "completion_length": 250.83334350585938, "epoch": 0.03208888888888889, "grad_norm": 72.94464111328125, "learning_rate": 3e-06, "loss": -14.781, "reward": 0.947916716337204, "reward_std": 0.45044803619384766, "rewards/boxed_and_answer_tags_format_reward": 0.6979166567325592, "rewards/correctness_reward_func_math": 0.2499999962747097, "step": 361, "zero_std_ratio": 0.375 }, { "epoch": 0.03217777777777778, "grad_norm": 91.14832305908203, "learning_rate": 3e-06, "loss": -15.6654, "step": 362 }, { "epoch": 0.032266666666666666, "grad_norm": 74.7421875, "learning_rate": 3e-06, "loss": -18.7276, "step": 363 }, { "epoch": 0.032355555555555554, "grad_norm": 76.62783813476562, "learning_rate": 3e-06, "loss": -20.5042, "step": 364 }, { "epoch": 0.03244444444444444, "grad_norm": 77.55496978759766, "learning_rate": 3e-06, "loss": -23.5561, "step": 365 }, { "epoch": 0.03253333333333333, "grad_norm": 87.73894500732422, "learning_rate": 3e-06, "loss": -13.7666, "step": 366 }, { "epoch": 0.03262222222222222, "grad_norm": 85.25169372558594, "learning_rate": 3e-06, "loss": -15.9124, "step": 367 }, { "epoch": 0.032711111111111114, "grad_norm": 82.08868408203125, "learning_rate": 3e-06, "loss": -17.5144, "step": 368 }, { "epoch": 0.0328, "grad_norm": 88.5888900756836, "learning_rate": 3e-06, "loss": -20.7047, "step": 369 }, { "epoch": 0.03288888888888889, "grad_norm": 87.28410339355469, "learning_rate": 3e-06, "loss": -22.9003, "step": 370 }, { "epoch": 0.03297777777777778, "grad_norm": 75.89826965332031, "learning_rate": 3e-06, "loss": -25.7767, "step": 371 }, { "epoch": 0.03306666666666667, "grad_norm": 79.7787094116211, "learning_rate": 3e-06, "loss": -16.1167, "step": 372 }, { "completion_length": 251.9791717529297, "epoch": 0.033155555555555556, "grad_norm": 122.57412719726562, "learning_rate": 3e-06, "loss": -7.3842, "reward": 1.1666666865348816, "reward_std": 0.7582502365112305, "rewards/boxed_and_answer_tags_format_reward": 0.5833333134651184, "rewards/correctness_reward_func_math": 0.5833333283662796, "step": 373, "zero_std_ratio": 0.125 }, { "epoch": 0.033244444444444445, "grad_norm": 103.27584838867188, "learning_rate": 3e-06, "loss": -2.3769, "step": 374 }, { "epoch": 0.03333333333333333, "grad_norm": 111.76622772216797, "learning_rate": 3e-06, "loss": 5.4646, "step": 375 }, { "epoch": 0.03342222222222222, "grad_norm": 110.36207580566406, "learning_rate": 3e-06, "loss": -9.271, "step": 376 }, { "epoch": 0.03351111111111111, "grad_norm": 102.65152740478516, "learning_rate": 3e-06, "loss": -0.8725, "step": 377 }, { "epoch": 0.0336, "grad_norm": 107.27348327636719, "learning_rate": 3e-06, "loss": -4.3279, "step": 378 }, { "epoch": 0.033688888888888886, "grad_norm": 118.1567153930664, "learning_rate": 3e-06, "loss": -8.1813, "step": 379 }, { "epoch": 0.033777777777777775, "grad_norm": 98.9560317993164, "learning_rate": 3e-06, "loss": -3.3073, "step": 380 }, { "epoch": 0.03386666666666667, "grad_norm": 119.64665222167969, "learning_rate": 3e-06, "loss": 4.1117, "step": 381 }, { "epoch": 0.03395555555555556, "grad_norm": 118.64970397949219, "learning_rate": 3e-06, "loss": -10.3727, "step": 382 }, { "epoch": 0.03404444444444445, "grad_norm": 117.53937530517578, "learning_rate": 3e-06, "loss": -3.6092, "step": 383 }, { "epoch": 0.034133333333333335, "grad_norm": 108.86544799804688, "learning_rate": 3e-06, "loss": -6.4896, "step": 384 }, { "completion_length": 242.27084350585938, "epoch": 0.03422222222222222, "grad_norm": 37.28844451904297, "learning_rate": 3e-06, "loss": 35.3752, "reward": 1.0208333432674408, "reward_std": 0.12909945845603943, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.3333333358168602, "step": 385, "zero_std_ratio": 0.875 }, { "epoch": 0.03431111111111111, "grad_norm": 34.82659149169922, "learning_rate": 3e-06, "loss": 32.1003, "step": 386 }, { "epoch": 0.0344, "grad_norm": 34.34743881225586, "learning_rate": 3e-06, "loss": 31.1165, "step": 387 }, { "epoch": 0.03448888888888889, "grad_norm": 44.72328186035156, "learning_rate": 3e-06, "loss": 32.7756, "step": 388 }, { "epoch": 0.03457777777777778, "grad_norm": 42.72700119018555, "learning_rate": 3e-06, "loss": 32.2398, "step": 389 }, { "epoch": 0.034666666666666665, "grad_norm": 47.69383239746094, "learning_rate": 3e-06, "loss": 33.0683, "step": 390 }, { "epoch": 0.03475555555555555, "grad_norm": 39.66519546508789, "learning_rate": 3e-06, "loss": 34.5053, "step": 391 }, { "epoch": 0.03484444444444444, "grad_norm": 39.71942138671875, "learning_rate": 3e-06, "loss": 31.0092, "step": 392 }, { "epoch": 0.03493333333333333, "grad_norm": 36.60993576049805, "learning_rate": 3e-06, "loss": 30.3034, "step": 393 }, { "epoch": 0.035022222222222225, "grad_norm": 47.912837982177734, "learning_rate": 3e-06, "loss": 31.4023, "step": 394 }, { "epoch": 0.035111111111111114, "grad_norm": 42.3475341796875, "learning_rate": 3e-06, "loss": 31.2326, "step": 395 }, { "epoch": 0.0352, "grad_norm": 40.417381286621094, "learning_rate": 3e-06, "loss": 31.5571, "step": 396 }, { "completion_length": 251.56250762939453, "epoch": 0.03528888888888889, "grad_norm": 41.37530517578125, "learning_rate": 3e-06, "loss": 9.2621, "reward": 1.6458333730697632, "reward_std": 0.1489431317895651, "rewards/boxed_and_answer_tags_format_reward": 0.7291666865348816, "rewards/correctness_reward_func_math": 0.9166666865348816, "step": 397, "zero_std_ratio": 0.75 }, { "epoch": 0.03537777777777778, "grad_norm": 42.853084564208984, "learning_rate": 3e-06, "loss": 8.8572, "step": 398 }, { "epoch": 0.03546666666666667, "grad_norm": 41.449344635009766, "learning_rate": 3e-06, "loss": 5.6158, "step": 399 }, { "epoch": 0.035555555555555556, "grad_norm": 58.31279754638672, "learning_rate": 3e-06, "loss": 7.968, "step": 400 }, { "epoch": 0.035644444444444444, "grad_norm": 48.664459228515625, "learning_rate": 3e-06, "loss": 11.1793, "step": 401 }, { "epoch": 0.03573333333333333, "grad_norm": 45.02378845214844, "learning_rate": 3e-06, "loss": 7.6242, "step": 402 }, { "epoch": 0.03582222222222222, "grad_norm": 43.53935241699219, "learning_rate": 3e-06, "loss": 8.0172, "step": 403 }, { "epoch": 0.03591111111111111, "grad_norm": 42.496604919433594, "learning_rate": 3e-06, "loss": 7.5088, "step": 404 }, { "epoch": 0.036, "grad_norm": 44.294986724853516, "learning_rate": 3e-06, "loss": 3.9932, "step": 405 }, { "epoch": 0.036088888888888886, "grad_norm": 73.07268524169922, "learning_rate": 3e-06, "loss": 6.5222, "step": 406 }, { "epoch": 0.03617777777777778, "grad_norm": 44.31553649902344, "learning_rate": 3e-06, "loss": 9.2936, "step": 407 }, { "epoch": 0.03626666666666667, "grad_norm": 48.42079162597656, "learning_rate": 3e-06, "loss": 5.8115, "step": 408 }, { "completion_length": 253.7291717529297, "epoch": 0.03635555555555556, "grad_norm": 55.15653991699219, "learning_rate": 3e-06, "loss": -4.6673, "reward": 0.90625, "reward_std": 0.15461495518684387, "rewards/boxed_and_answer_tags_format_reward": 0.7395833432674408, "rewards/correctness_reward_func_math": 0.1666666716337204, "step": 409, "zero_std_ratio": 0.75 }, { "epoch": 0.036444444444444446, "grad_norm": 54.93301773071289, "learning_rate": 3e-06, "loss": 1.7365, "step": 410 }, { "epoch": 0.036533333333333334, "grad_norm": 50.56829071044922, "learning_rate": 3e-06, "loss": 2.3972, "step": 411 }, { "epoch": 0.03662222222222222, "grad_norm": 50.894187927246094, "learning_rate": 3e-06, "loss": 0.7298, "step": 412 }, { "epoch": 0.03671111111111111, "grad_norm": 64.99378204345703, "learning_rate": 3e-06, "loss": 1.6822, "step": 413 }, { "epoch": 0.0368, "grad_norm": 53.45103454589844, "learning_rate": 3e-06, "loss": 2.1252, "step": 414 }, { "epoch": 0.03688888888888889, "grad_norm": 58.880393981933594, "learning_rate": 3e-06, "loss": -4.8912, "step": 415 }, { "epoch": 0.036977777777777776, "grad_norm": 52.00230407714844, "learning_rate": 3e-06, "loss": 0.9087, "step": 416 }, { "epoch": 0.037066666666666664, "grad_norm": 54.192508697509766, "learning_rate": 3e-06, "loss": 1.5202, "step": 417 }, { "epoch": 0.03715555555555555, "grad_norm": 52.10379409790039, "learning_rate": 3e-06, "loss": -0.0026, "step": 418 }, { "epoch": 0.03724444444444444, "grad_norm": 56.19913864135742, "learning_rate": 3e-06, "loss": 0.9843, "step": 419 }, { "epoch": 0.037333333333333336, "grad_norm": 51.573699951171875, "learning_rate": 3e-06, "loss": 0.9967, "step": 420 }, { "completion_length": 241.95833587646484, "epoch": 0.037422222222222225, "grad_norm": 65.98304748535156, "learning_rate": 3e-06, "loss": 1.8524, "reward": 1.0937500596046448, "reward_std": 0.3994170129299164, "rewards/boxed_and_answer_tags_format_reward": 0.59375, "rewards/correctness_reward_func_math": 0.5, "step": 421, "zero_std_ratio": 0.625 }, { "epoch": 0.03751111111111111, "grad_norm": 68.6063232421875, "learning_rate": 3e-06, "loss": -2.8346, "step": 422 }, { "epoch": 0.0376, "grad_norm": 61.650146484375, "learning_rate": 3e-06, "loss": -2.9892, "step": 423 }, { "epoch": 0.03768888888888889, "grad_norm": 66.45751953125, "learning_rate": 3e-06, "loss": -3.349, "step": 424 }, { "epoch": 0.03777777777777778, "grad_norm": 66.84425354003906, "learning_rate": 3e-06, "loss": 2.716, "step": 425 }, { "epoch": 0.037866666666666667, "grad_norm": 61.292354583740234, "learning_rate": 3e-06, "loss": 1.6447, "step": 426 }, { "epoch": 0.037955555555555555, "grad_norm": 66.02394104003906, "learning_rate": 3e-06, "loss": 1.1078, "step": 427 }, { "epoch": 0.03804444444444444, "grad_norm": 62.10107421875, "learning_rate": 3e-06, "loss": -3.9971, "step": 428 }, { "epoch": 0.03813333333333333, "grad_norm": 85.67212677001953, "learning_rate": 3e-06, "loss": -3.8248, "step": 429 }, { "epoch": 0.03822222222222222, "grad_norm": 60.61140060424805, "learning_rate": 3e-06, "loss": -4.433, "step": 430 }, { "epoch": 0.03831111111111111, "grad_norm": 62.81836700439453, "learning_rate": 3e-06, "loss": 1.751, "step": 431 }, { "epoch": 0.0384, "grad_norm": 60.05256652832031, "learning_rate": 3e-06, "loss": 0.8432, "step": 432 }, { "completion_length": 247.27083587646484, "epoch": 0.03848888888888889, "grad_norm": 83.67351531982422, "learning_rate": 3e-06, "loss": 6.9951, "reward": 1.3229166865348816, "reward_std": 0.3936077058315277, "rewards/boxed_and_answer_tags_format_reward": 0.7395833432674408, "rewards/correctness_reward_func_math": 0.5833333544433117, "step": 433, "zero_std_ratio": 0.5 }, { "epoch": 0.03857777777777778, "grad_norm": 71.3005599975586, "learning_rate": 3e-06, "loss": 4.71, "step": 434 }, { "epoch": 0.03866666666666667, "grad_norm": 81.9188003540039, "learning_rate": 3e-06, "loss": 8.7788, "step": 435 }, { "epoch": 0.03875555555555556, "grad_norm": 79.42195129394531, "learning_rate": 3e-06, "loss": 11.129, "step": 436 }, { "epoch": 0.038844444444444445, "grad_norm": 84.69261169433594, "learning_rate": 3e-06, "loss": 10.7206, "step": 437 }, { "epoch": 0.038933333333333334, "grad_norm": 74.52008819580078, "learning_rate": 3e-06, "loss": 12.7332, "step": 438 }, { "epoch": 0.03902222222222222, "grad_norm": 93.8567123413086, "learning_rate": 3e-06, "loss": 6.2331, "step": 439 }, { "epoch": 0.03911111111111111, "grad_norm": 79.21229553222656, "learning_rate": 3e-06, "loss": 3.7102, "step": 440 }, { "epoch": 0.0392, "grad_norm": 74.53849792480469, "learning_rate": 3e-06, "loss": 8.0091, "step": 441 }, { "epoch": 0.03928888888888889, "grad_norm": 79.04943084716797, "learning_rate": 3e-06, "loss": 9.4256, "step": 442 }, { "epoch": 0.039377777777777775, "grad_norm": 81.54142761230469, "learning_rate": 3e-06, "loss": 9.0032, "step": 443 }, { "epoch": 0.039466666666666664, "grad_norm": 83.44096374511719, "learning_rate": 3e-06, "loss": 11.701, "step": 444 }, { "completion_length": 249.2291717529297, "epoch": 0.03955555555555555, "grad_norm": 78.02682495117188, "learning_rate": 3e-06, "loss": 1.3882, "reward": 1.1458333730697632, "reward_std": 0.33968228101730347, "rewards/boxed_and_answer_tags_format_reward": 0.6041666865348816, "rewards/correctness_reward_func_math": 0.5416666679084301, "step": 445, "zero_std_ratio": 0.5 }, { "epoch": 0.03964444444444445, "grad_norm": 77.9029312133789, "learning_rate": 3e-06, "loss": -10.5754, "step": 446 }, { "epoch": 0.039733333333333336, "grad_norm": 78.25541687011719, "learning_rate": 3e-06, "loss": -4.1056, "step": 447 }, { "epoch": 0.039822222222222224, "grad_norm": 100.47134399414062, "learning_rate": 3e-06, "loss": -10.7897, "step": 448 }, { "epoch": 0.03991111111111111, "grad_norm": 74.78359985351562, "learning_rate": 3e-06, "loss": -3.3144, "step": 449 }, { "epoch": 0.04, "grad_norm": 96.55880737304688, "learning_rate": 3e-06, "loss": 2.8763, "step": 450 }, { "epoch": 0.04008888888888889, "grad_norm": 87.14449310302734, "learning_rate": 3e-06, "loss": 0.2308, "step": 451 }, { "epoch": 0.04017777777777778, "grad_norm": 97.77748107910156, "learning_rate": 3e-06, "loss": -11.2666, "step": 452 }, { "epoch": 0.040266666666666666, "grad_norm": 73.44164276123047, "learning_rate": 3e-06, "loss": -5.678, "step": 453 }, { "epoch": 0.040355555555555554, "grad_norm": 100.09737396240234, "learning_rate": 3e-06, "loss": -11.4908, "step": 454 }, { "epoch": 0.04044444444444444, "grad_norm": 74.52122497558594, "learning_rate": 3e-06, "loss": -4.6635, "step": 455 }, { "epoch": 0.04053333333333333, "grad_norm": 71.74639892578125, "learning_rate": 3e-06, "loss": 1.7931, "step": 456 }, { "completion_length": 252.52084350585938, "epoch": 0.04062222222222222, "grad_norm": 102.89392852783203, "learning_rate": 3e-06, "loss": 12.935, "reward": 1.0625000596046448, "reward_std": 0.43528568744659424, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.3750000111758709, "step": 457, "zero_std_ratio": 0.5 }, { "epoch": 0.040711111111111115, "grad_norm": 105.67884063720703, "learning_rate": 3e-06, "loss": 2.3682, "step": 458 }, { "epoch": 0.0408, "grad_norm": 90.33670806884766, "learning_rate": 3e-06, "loss": 14.3722, "step": 459 }, { "epoch": 0.04088888888888889, "grad_norm": 109.0367431640625, "learning_rate": 3e-06, "loss": 11.6758, "step": 460 }, { "epoch": 0.04097777777777778, "grad_norm": 108.55673217773438, "learning_rate": 3e-06, "loss": 13.7948, "step": 461 }, { "epoch": 0.04106666666666667, "grad_norm": 91.73406982421875, "learning_rate": 3e-06, "loss": 7.7516, "step": 462 }, { "epoch": 0.041155555555555556, "grad_norm": 94.59785461425781, "learning_rate": 3e-06, "loss": 11.5977, "step": 463 }, { "epoch": 0.041244444444444445, "grad_norm": 78.94522857666016, "learning_rate": 3e-06, "loss": 1.9425, "step": 464 }, { "epoch": 0.04133333333333333, "grad_norm": 93.91060638427734, "learning_rate": 3e-06, "loss": 12.3454, "step": 465 }, { "epoch": 0.04142222222222222, "grad_norm": 98.78986358642578, "learning_rate": 3e-06, "loss": 10.3247, "step": 466 }, { "epoch": 0.04151111111111111, "grad_norm": 111.63731384277344, "learning_rate": 3e-06, "loss": 12.2581, "step": 467 }, { "epoch": 0.0416, "grad_norm": 88.63348388671875, "learning_rate": 3e-06, "loss": 5.7115, "step": 468 }, { "completion_length": 235.58333587646484, "epoch": 0.041688888888888886, "grad_norm": 70.78716278076172, "learning_rate": 3e-06, "loss": -40.1334, "reward": 1.4687500596046448, "reward_std": 0.38577648997306824, "rewards/boxed_and_answer_tags_format_reward": 0.6770833432674408, "rewards/correctness_reward_func_math": 0.7916666865348816, "step": 469, "zero_std_ratio": 0.625 }, { "epoch": 0.041777777777777775, "grad_norm": 93.38511657714844, "learning_rate": 3e-06, "loss": -36.9438, "step": 470 }, { "epoch": 0.04186666666666667, "grad_norm": 84.67546081542969, "learning_rate": 3e-06, "loss": -43.221, "step": 471 }, { "epoch": 0.04195555555555556, "grad_norm": 99.22493743896484, "learning_rate": 3e-06, "loss": -43.1284, "step": 472 }, { "epoch": 0.04204444444444445, "grad_norm": 77.45098876953125, "learning_rate": 3e-06, "loss": -40.9152, "step": 473 }, { "epoch": 0.042133333333333335, "grad_norm": 70.95470428466797, "learning_rate": 3e-06, "loss": -32.8293, "step": 474 }, { "epoch": 0.042222222222222223, "grad_norm": 77.2640609741211, "learning_rate": 3e-06, "loss": -41.1009, "step": 475 }, { "epoch": 0.04231111111111111, "grad_norm": 84.97554779052734, "learning_rate": 3e-06, "loss": -37.7295, "step": 476 }, { "epoch": 0.0424, "grad_norm": 67.63621520996094, "learning_rate": 3e-06, "loss": -44.2423, "step": 477 }, { "epoch": 0.04248888888888889, "grad_norm": 99.13645935058594, "learning_rate": 3e-06, "loss": -45.4078, "step": 478 }, { "epoch": 0.04257777777777778, "grad_norm": 71.1223373413086, "learning_rate": 3e-06, "loss": -43.0538, "step": 479 }, { "epoch": 0.042666666666666665, "grad_norm": 81.17517852783203, "learning_rate": 3e-06, "loss": -34.5254, "step": 480 }, { "completion_length": 250.5625, "epoch": 0.042755555555555554, "grad_norm": 111.48869323730469, "learning_rate": 3e-06, "loss": 20.9161, "reward": 1.2291666865348816, "reward_std": 0.26686520874500275, "rewards/boxed_and_answer_tags_format_reward": 0.6041666865348816, "rewards/correctness_reward_func_math": 0.6250000223517418, "step": 481, "zero_std_ratio": 0.625 }, { "epoch": 0.04284444444444444, "grad_norm": 110.74134063720703, "learning_rate": 3e-06, "loss": 21.5354, "step": 482 }, { "epoch": 0.04293333333333333, "grad_norm": 121.27849578857422, "learning_rate": 3e-06, "loss": 22.1143, "step": 483 }, { "epoch": 0.043022222222222226, "grad_norm": 111.5245590209961, "learning_rate": 3e-06, "loss": 27.6408, "step": 484 }, { "epoch": 0.043111111111111114, "grad_norm": 100.0729751586914, "learning_rate": 3e-06, "loss": 24.0997, "step": 485 }, { "epoch": 0.0432, "grad_norm": 123.4867935180664, "learning_rate": 3e-06, "loss": 19.3324, "step": 486 }, { "epoch": 0.04328888888888889, "grad_norm": 107.24234008789062, "learning_rate": 3e-06, "loss": 18.5364, "step": 487 }, { "epoch": 0.04337777777777778, "grad_norm": 113.5108413696289, "learning_rate": 3e-06, "loss": 18.1962, "step": 488 }, { "epoch": 0.04346666666666667, "grad_norm": 136.6710968017578, "learning_rate": 3e-06, "loss": 17.9099, "step": 489 }, { "epoch": 0.043555555555555556, "grad_norm": 110.78118896484375, "learning_rate": 3e-06, "loss": 24.1538, "step": 490 }, { "epoch": 0.043644444444444444, "grad_norm": 87.0732192993164, "learning_rate": 3e-06, "loss": 20.6286, "step": 491 }, { "epoch": 0.04373333333333333, "grad_norm": 111.51911163330078, "learning_rate": 3e-06, "loss": 15.8689, "step": 492 }, { "completion_length": 232.1666717529297, "epoch": 0.04382222222222222, "grad_norm": 58.495811462402344, "learning_rate": 3e-06, "loss": 1.5791, "reward": 1.1145833730697632, "reward_std": 0.20219221711158752, "rewards/boxed_and_answer_tags_format_reward": 0.6145833432674408, "rewards/correctness_reward_func_math": 0.5, "step": 493, "zero_std_ratio": 0.75 }, { "epoch": 0.04391111111111111, "grad_norm": 66.24677276611328, "learning_rate": 3e-06, "loss": 8.869, "step": 494 }, { "epoch": 0.044, "grad_norm": 75.63920593261719, "learning_rate": 3e-06, "loss": 6.6083, "step": 495 }, { "epoch": 0.044088888888888886, "grad_norm": 41.957889556884766, "learning_rate": 3e-06, "loss": 6.373, "step": 496 }, { "epoch": 0.04417777777777778, "grad_norm": 56.327693939208984, "learning_rate": 3e-06, "loss": 8.5285, "step": 497 }, { "epoch": 0.04426666666666667, "grad_norm": 56.58005905151367, "learning_rate": 3e-06, "loss": 8.228, "step": 498 }, { "epoch": 0.04435555555555556, "grad_norm": 56.65522766113281, "learning_rate": 3e-06, "loss": 1.1127, "step": 499 }, { "epoch": 0.044444444444444446, "grad_norm": 52.785221099853516, "learning_rate": 3e-06, "loss": 8.324, "step": 500 }, { "epoch": 0.044533333333333334, "grad_norm": 52.37721252441406, "learning_rate": 3e-06, "loss": 5.6566, "step": 501 }, { "epoch": 0.04462222222222222, "grad_norm": 50.5732307434082, "learning_rate": 3e-06, "loss": 5.266, "step": 502 }, { "epoch": 0.04471111111111111, "grad_norm": 59.614261627197266, "learning_rate": 3e-06, "loss": 7.5254, "step": 503 }, { "epoch": 0.0448, "grad_norm": 57.78561782836914, "learning_rate": 3e-06, "loss": 6.7155, "step": 504 }, { "completion_length": 249.5416717529297, "epoch": 0.04488888888888889, "grad_norm": 75.29253387451172, "learning_rate": 3e-06, "loss": 1.5945, "reward": 1.1979166865348816, "reward_std": 0.3113893121480942, "rewards/boxed_and_answer_tags_format_reward": 0.65625, "rewards/correctness_reward_func_math": 0.5416666567325592, "step": 505, "zero_std_ratio": 0.5 }, { "epoch": 0.044977777777777776, "grad_norm": 71.35801696777344, "learning_rate": 3e-06, "loss": 6.5836, "step": 506 }, { "epoch": 0.045066666666666665, "grad_norm": 68.37297058105469, "learning_rate": 3e-06, "loss": 2.1054, "step": 507 }, { "epoch": 0.04515555555555555, "grad_norm": 67.4723892211914, "learning_rate": 3e-06, "loss": 2.3974, "step": 508 }, { "epoch": 0.04524444444444444, "grad_norm": 84.1152114868164, "learning_rate": 3e-06, "loss": 0.1293, "step": 509 }, { "epoch": 0.04533333333333334, "grad_norm": 95.73898315429688, "learning_rate": 3e-06, "loss": -3.5888, "step": 510 }, { "epoch": 0.045422222222222225, "grad_norm": 73.4489974975586, "learning_rate": 3e-06, "loss": 0.6248, "step": 511 }, { "epoch": 0.04551111111111111, "grad_norm": 67.3970947265625, "learning_rate": 3e-06, "loss": 5.0044, "step": 512 }, { "epoch": 0.0456, "grad_norm": 68.55184936523438, "learning_rate": 3e-06, "loss": 1.048, "step": 513 }, { "epoch": 0.04568888888888889, "grad_norm": 72.8236312866211, "learning_rate": 3e-06, "loss": 0.9001, "step": 514 }, { "epoch": 0.04577777777777778, "grad_norm": 72.06463623046875, "learning_rate": 3e-06, "loss": -1.2144, "step": 515 }, { "epoch": 0.04586666666666667, "grad_norm": 87.04244995117188, "learning_rate": 3e-06, "loss": -5.3325, "step": 516 }, { "completion_length": 235.27083587646484, "epoch": 0.045955555555555555, "grad_norm": 76.81670379638672, "learning_rate": 3e-06, "loss": -18.2768, "reward": 1.2395833730697632, "reward_std": 0.4973409175872803, "rewards/boxed_and_answer_tags_format_reward": 0.6145833432674408, "rewards/correctness_reward_func_math": 0.625, "step": 517, "zero_std_ratio": 0.5 }, { "epoch": 0.04604444444444444, "grad_norm": 129.93211364746094, "learning_rate": 3e-06, "loss": -20.3578, "step": 518 }, { "epoch": 0.04613333333333333, "grad_norm": 87.40878295898438, "learning_rate": 3e-06, "loss": -19.4647, "step": 519 }, { "epoch": 0.04622222222222222, "grad_norm": 88.27845764160156, "learning_rate": 3e-06, "loss": -23.9652, "step": 520 }, { "epoch": 0.04631111111111111, "grad_norm": 74.01776123046875, "learning_rate": 3e-06, "loss": -23.0054, "step": 521 }, { "epoch": 0.0464, "grad_norm": 86.09662628173828, "learning_rate": 3e-06, "loss": -16.2859, "step": 522 }, { "epoch": 0.04648888888888889, "grad_norm": 86.07221221923828, "learning_rate": 3e-06, "loss": -19.6548, "step": 523 }, { "epoch": 0.04657777777777778, "grad_norm": 81.51930236816406, "learning_rate": 3e-06, "loss": -22.0114, "step": 524 }, { "epoch": 0.04666666666666667, "grad_norm": 85.83316802978516, "learning_rate": 3e-06, "loss": -21.3756, "step": 525 }, { "epoch": 0.04675555555555556, "grad_norm": 80.6872787475586, "learning_rate": 3e-06, "loss": -25.6275, "step": 526 }, { "epoch": 0.046844444444444445, "grad_norm": 82.79136657714844, "learning_rate": 3e-06, "loss": -24.846, "step": 527 }, { "epoch": 0.046933333333333334, "grad_norm": 91.39631652832031, "learning_rate": 3e-06, "loss": -17.8527, "step": 528 }, { "completion_length": 251.77084350585938, "epoch": 0.04702222222222222, "grad_norm": 58.933895111083984, "learning_rate": 3e-06, "loss": 1.5821, "reward": 1.7083333730697632, "reward_std": 0.29204893112182617, "rewards/boxed_and_answer_tags_format_reward": 0.7083333432674408, "rewards/correctness_reward_func_math": 1.0, "step": 529, "zero_std_ratio": 0.625 }, { "epoch": 0.04711111111111111, "grad_norm": 70.82673645019531, "learning_rate": 3e-06, "loss": 4.666, "step": 530 }, { "epoch": 0.0472, "grad_norm": 62.45388412475586, "learning_rate": 3e-06, "loss": -0.8319, "step": 531 }, { "epoch": 0.04728888888888889, "grad_norm": 68.45257568359375, "learning_rate": 3e-06, "loss": -4.2921, "step": 532 }, { "epoch": 0.047377777777777776, "grad_norm": 63.96629333496094, "learning_rate": 3e-06, "loss": 7.1791, "step": 533 }, { "epoch": 0.047466666666666664, "grad_norm": 71.47022247314453, "learning_rate": 3e-06, "loss": -0.0559, "step": 534 }, { "epoch": 0.04755555555555555, "grad_norm": 69.21123504638672, "learning_rate": 3e-06, "loss": 1.1193, "step": 535 }, { "epoch": 0.04764444444444445, "grad_norm": 54.05726623535156, "learning_rate": 3e-06, "loss": 3.7663, "step": 536 }, { "epoch": 0.047733333333333336, "grad_norm": 57.079166412353516, "learning_rate": 3e-06, "loss": -1.8682, "step": 537 }, { "epoch": 0.047822222222222224, "grad_norm": 89.69593811035156, "learning_rate": 3e-06, "loss": -5.0232, "step": 538 }, { "epoch": 0.04791111111111111, "grad_norm": 68.15176391601562, "learning_rate": 3e-06, "loss": 6.4281, "step": 539 }, { "epoch": 0.048, "grad_norm": 71.53436279296875, "learning_rate": 3e-06, "loss": -0.8678, "step": 540 }, { "completion_length": 248.4791717529297, "epoch": 0.04808888888888889, "grad_norm": 111.93199157714844, "learning_rate": 3e-06, "loss": -3.1821, "reward": 1.3125, "reward_std": 0.7091469466686249, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.625, "step": 541, "zero_std_ratio": 0.25 }, { "epoch": 0.04817777777777778, "grad_norm": 104.72472381591797, "learning_rate": 3e-06, "loss": -9.114, "step": 542 }, { "epoch": 0.048266666666666666, "grad_norm": 137.42185974121094, "learning_rate": 3e-06, "loss": -12.9595, "step": 543 }, { "epoch": 0.048355555555555554, "grad_norm": 115.6964340209961, "learning_rate": 3e-06, "loss": -16.5078, "step": 544 }, { "epoch": 0.04844444444444444, "grad_norm": 107.43921661376953, "learning_rate": 3e-06, "loss": -12.0856, "step": 545 }, { "epoch": 0.04853333333333333, "grad_norm": 139.91366577148438, "learning_rate": 3e-06, "loss": -8.5451, "step": 546 }, { "epoch": 0.04862222222222222, "grad_norm": 107.25052642822266, "learning_rate": 3e-06, "loss": -4.7342, "step": 547 }, { "epoch": 0.04871111111111111, "grad_norm": 104.6925048828125, "learning_rate": 3e-06, "loss": -10.6582, "step": 548 }, { "epoch": 0.0488, "grad_norm": 108.13795471191406, "learning_rate": 3e-06, "loss": -14.883, "step": 549 }, { "epoch": 0.04888888888888889, "grad_norm": 108.62395477294922, "learning_rate": 3e-06, "loss": -19.0199, "step": 550 }, { "epoch": 0.04897777777777778, "grad_norm": 103.06570434570312, "learning_rate": 3e-06, "loss": -14.306, "step": 551 }, { "epoch": 0.04906666666666667, "grad_norm": 125.94219970703125, "learning_rate": 3e-06, "loss": -10.2489, "step": 552 }, { "completion_length": 221.43750762939453, "epoch": 0.049155555555555557, "grad_norm": 90.84999084472656, "learning_rate": 3e-06, "loss": 12.0477, "reward": 2.0104166865348816, "reward_std": 0.406316339969635, "rewards/boxed_and_answer_tags_format_reward": 0.6770833432674408, "rewards/correctness_reward_func_math": 1.3333333730697632, "step": 553, "zero_std_ratio": 0.5 }, { "epoch": 0.049244444444444445, "grad_norm": 81.99716186523438, "learning_rate": 3e-06, "loss": 11.4046, "step": 554 }, { "epoch": 0.04933333333333333, "grad_norm": 89.69168090820312, "learning_rate": 3e-06, "loss": 15.5262, "step": 555 }, { "epoch": 0.04942222222222222, "grad_norm": 93.98981475830078, "learning_rate": 3e-06, "loss": 15.7367, "step": 556 }, { "epoch": 0.04951111111111111, "grad_norm": 86.68983459472656, "learning_rate": 3e-06, "loss": 5.511, "step": 557 }, { "epoch": 0.0496, "grad_norm": 78.73108673095703, "learning_rate": 3e-06, "loss": 13.5853, "step": 558 }, { "epoch": 0.04968888888888889, "grad_norm": 88.60321044921875, "learning_rate": 3e-06, "loss": 10.8131, "step": 559 }, { "epoch": 0.049777777777777775, "grad_norm": 78.82019805908203, "learning_rate": 3e-06, "loss": 10.9253, "step": 560 }, { "epoch": 0.04986666666666666, "grad_norm": 91.8647232055664, "learning_rate": 3e-06, "loss": 14.6415, "step": 561 }, { "epoch": 0.04995555555555556, "grad_norm": 106.29496765136719, "learning_rate": 3e-06, "loss": 14.4603, "step": 562 }, { "epoch": 0.05004444444444445, "grad_norm": 72.984130859375, "learning_rate": 3e-06, "loss": 4.1028, "step": 563 }, { "epoch": 0.050133333333333335, "grad_norm": 88.46710205078125, "learning_rate": 3e-06, "loss": 11.4185, "step": 564 }, { "completion_length": 238.89584350585938, "epoch": 0.050222222222222224, "grad_norm": 91.9970703125, "learning_rate": 3e-06, "loss": -6.0724, "reward": 1.0729166865348816, "reward_std": 0.5935818552970886, "rewards/boxed_and_answer_tags_format_reward": 0.6145833432674408, "rewards/correctness_reward_func_math": 0.4583333283662796, "step": 565, "zero_std_ratio": 0.375 }, { "epoch": 0.05031111111111111, "grad_norm": 101.35919952392578, "learning_rate": 3e-06, "loss": -6.1114, "step": 566 }, { "epoch": 0.0504, "grad_norm": 100.29364776611328, "learning_rate": 3e-06, "loss": 3.7467, "step": 567 }, { "epoch": 0.05048888888888889, "grad_norm": 93.71849822998047, "learning_rate": 3e-06, "loss": 7.819, "step": 568 }, { "epoch": 0.05057777777777778, "grad_norm": 84.14008331298828, "learning_rate": 3e-06, "loss": -3.9591, "step": 569 }, { "epoch": 0.050666666666666665, "grad_norm": 91.14092254638672, "learning_rate": 3e-06, "loss": -7.111, "step": 570 }, { "epoch": 0.050755555555555554, "grad_norm": 93.15682220458984, "learning_rate": 3e-06, "loss": -7.5642, "step": 571 }, { "epoch": 0.05084444444444444, "grad_norm": 113.89299011230469, "learning_rate": 3e-06, "loss": -6.5357, "step": 572 }, { "epoch": 0.05093333333333333, "grad_norm": 91.64227294921875, "learning_rate": 3e-06, "loss": 2.2068, "step": 573 }, { "epoch": 0.05102222222222222, "grad_norm": 108.78038024902344, "learning_rate": 3e-06, "loss": 5.8992, "step": 574 }, { "epoch": 0.051111111111111114, "grad_norm": 83.73683166503906, "learning_rate": 3e-06, "loss": -5.9239, "step": 575 }, { "epoch": 0.0512, "grad_norm": 143.7006378173828, "learning_rate": 3e-06, "loss": -8.4902, "step": 576 }, { "completion_length": 243.7291717529297, "epoch": 0.05128888888888889, "grad_norm": 82.69302368164062, "learning_rate": 3e-06, "loss": 26.9064, "reward": 1.4583333730697632, "reward_std": 0.4701542556285858, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 0.708333358168602, "step": 577, "zero_std_ratio": 0.5 }, { "epoch": 0.05137777777777778, "grad_norm": 77.71623992919922, "learning_rate": 3e-06, "loss": 24.0495, "step": 578 }, { "epoch": 0.05146666666666667, "grad_norm": 80.28130340576172, "learning_rate": 3e-06, "loss": 19.1044, "step": 579 }, { "epoch": 0.051555555555555556, "grad_norm": 86.22237396240234, "learning_rate": 3e-06, "loss": 22.9667, "step": 580 }, { "epoch": 0.051644444444444444, "grad_norm": 94.3071060180664, "learning_rate": 3e-06, "loss": 17.6854, "step": 581 }, { "epoch": 0.05173333333333333, "grad_norm": 86.01050567626953, "learning_rate": 3e-06, "loss": 28.4794, "step": 582 }, { "epoch": 0.05182222222222222, "grad_norm": 87.72801971435547, "learning_rate": 3e-06, "loss": 25.3194, "step": 583 }, { "epoch": 0.05191111111111111, "grad_norm": 74.66322326660156, "learning_rate": 3e-06, "loss": 22.9833, "step": 584 }, { "epoch": 0.052, "grad_norm": 90.72804260253906, "learning_rate": 3e-06, "loss": 17.4641, "step": 585 }, { "epoch": 0.052088888888888886, "grad_norm": 87.39917755126953, "learning_rate": 3e-06, "loss": 21.6816, "step": 586 }, { "epoch": 0.052177777777777774, "grad_norm": 85.64997100830078, "learning_rate": 3e-06, "loss": 15.9135, "step": 587 }, { "epoch": 0.05226666666666667, "grad_norm": 92.35039520263672, "learning_rate": 3e-06, "loss": 26.4856, "step": 588 }, { "completion_length": 248.3541717529297, "epoch": 0.05235555555555556, "grad_norm": 93.8902359008789, "learning_rate": 3e-06, "loss": 14.9976, "reward": 1.3541666865348816, "reward_std": 0.5839263796806335, "rewards/boxed_and_answer_tags_format_reward": 0.7291666865348816, "rewards/correctness_reward_func_math": 0.6250000149011612, "step": 589, "zero_std_ratio": 0.375 }, { "epoch": 0.052444444444444446, "grad_norm": 98.34622192382812, "learning_rate": 3e-06, "loss": 12.6228, "step": 590 }, { "epoch": 0.052533333333333335, "grad_norm": 108.93590545654297, "learning_rate": 3e-06, "loss": 9.3556, "step": 591 }, { "epoch": 0.05262222222222222, "grad_norm": 114.49545288085938, "learning_rate": 3e-06, "loss": 23.8665, "step": 592 }, { "epoch": 0.05271111111111111, "grad_norm": 102.37223052978516, "learning_rate": 3e-06, "loss": 7.3813, "step": 593 }, { "epoch": 0.0528, "grad_norm": 98.17306518554688, "learning_rate": 3e-06, "loss": 13.3691, "step": 594 }, { "epoch": 0.05288888888888889, "grad_norm": 95.84387969970703, "learning_rate": 3e-06, "loss": 13.8857, "step": 595 }, { "epoch": 0.052977777777777776, "grad_norm": 101.44741821289062, "learning_rate": 3e-06, "loss": 11.0637, "step": 596 }, { "epoch": 0.053066666666666665, "grad_norm": 96.87228393554688, "learning_rate": 3e-06, "loss": 7.1168, "step": 597 }, { "epoch": 0.05315555555555555, "grad_norm": 129.4097442626953, "learning_rate": 3e-06, "loss": 22.4722, "step": 598 }, { "epoch": 0.05324444444444444, "grad_norm": 100.4094467163086, "learning_rate": 3e-06, "loss": 4.9249, "step": 599 }, { "epoch": 0.05333333333333334, "grad_norm": 97.2879638671875, "learning_rate": 3e-06, "loss": 10.7864, "step": 600 }, { "completion_length": 249.31250762939453, "epoch": 0.053422222222222225, "grad_norm": 81.29678344726562, "learning_rate": 3e-06, "loss": -1.4361, "reward": 1.6458333730697632, "reward_std": 0.43528565764427185, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.9583333432674408, "step": 601, "zero_std_ratio": 0.5 }, { "epoch": 0.05351111111111111, "grad_norm": 71.18397521972656, "learning_rate": 3e-06, "loss": -9.7138, "step": 602 }, { "epoch": 0.0536, "grad_norm": 79.09825897216797, "learning_rate": 3e-06, "loss": -7.5626, "step": 603 }, { "epoch": 0.05368888888888889, "grad_norm": 88.4588394165039, "learning_rate": 3e-06, "loss": -8.3433, "step": 604 }, { "epoch": 0.05377777777777778, "grad_norm": 75.85662841796875, "learning_rate": 3e-06, "loss": -3.126, "step": 605 }, { "epoch": 0.05386666666666667, "grad_norm": 76.75032806396484, "learning_rate": 3e-06, "loss": -4.1707, "step": 606 }, { "epoch": 0.053955555555555555, "grad_norm": 96.06957244873047, "learning_rate": 3e-06, "loss": -2.3824, "step": 607 }, { "epoch": 0.054044444444444444, "grad_norm": 108.4106674194336, "learning_rate": 3e-06, "loss": -10.8022, "step": 608 }, { "epoch": 0.05413333333333333, "grad_norm": 82.68360900878906, "learning_rate": 3e-06, "loss": -9.1987, "step": 609 }, { "epoch": 0.05422222222222222, "grad_norm": 92.35367584228516, "learning_rate": 3e-06, "loss": -9.379, "step": 610 }, { "epoch": 0.05431111111111111, "grad_norm": 78.61454010009766, "learning_rate": 3e-06, "loss": -4.4421, "step": 611 }, { "epoch": 0.0544, "grad_norm": 83.68685150146484, "learning_rate": 3e-06, "loss": -5.8651, "step": 612 }, { "completion_length": 252.8541717529297, "epoch": 0.05448888888888889, "grad_norm": 156.99725341796875, "learning_rate": 3e-06, "loss": -38.1088, "reward": 1.2395833730697632, "reward_std": 0.3936076909303665, "rewards/boxed_and_answer_tags_format_reward": 0.7395833432674408, "rewards/correctness_reward_func_math": 0.5, "step": 613, "zero_std_ratio": 0.5 }, { "epoch": 0.05457777777777778, "grad_norm": 82.39892578125, "learning_rate": 3e-06, "loss": -24.9067, "step": 614 }, { "epoch": 0.05466666666666667, "grad_norm": 76.3405532836914, "learning_rate": 3e-06, "loss": -21.2904, "step": 615 }, { "epoch": 0.05475555555555556, "grad_norm": 114.32887268066406, "learning_rate": 3e-06, "loss": -41.3273, "step": 616 }, { "epoch": 0.054844444444444446, "grad_norm": 90.10194396972656, "learning_rate": 3e-06, "loss": -19.8759, "step": 617 }, { "epoch": 0.054933333333333334, "grad_norm": 101.73013305664062, "learning_rate": 3e-06, "loss": -31.9803, "step": 618 }, { "epoch": 0.05502222222222222, "grad_norm": 107.18656921386719, "learning_rate": 3e-06, "loss": -38.8443, "step": 619 }, { "epoch": 0.05511111111111111, "grad_norm": 85.78763580322266, "learning_rate": 3e-06, "loss": -26.4852, "step": 620 }, { "epoch": 0.0552, "grad_norm": 78.22523498535156, "learning_rate": 3e-06, "loss": -22.9265, "step": 621 }, { "epoch": 0.05528888888888889, "grad_norm": 120.62594604492188, "learning_rate": 3e-06, "loss": -44.6351, "step": 622 }, { "epoch": 0.055377777777777776, "grad_norm": 93.32075500488281, "learning_rate": 3e-06, "loss": -21.4564, "step": 623 }, { "epoch": 0.055466666666666664, "grad_norm": 97.2727279663086, "learning_rate": 3e-06, "loss": -34.0292, "step": 624 }, { "completion_length": 247.5, "epoch": 0.05555555555555555, "grad_norm": 98.01384735107422, "learning_rate": 3e-06, "loss": -25.8639, "reward": 1.9895833730697632, "reward_std": 0.6326004266738892, "rewards/boxed_and_answer_tags_format_reward": 0.7395833432674408, "rewards/correctness_reward_func_math": 1.25, "step": 625, "zero_std_ratio": 0.25 }, { "epoch": 0.05564444444444445, "grad_norm": 110.7737045288086, "learning_rate": 3e-06, "loss": -26.4317, "step": 626 }, { "epoch": 0.055733333333333336, "grad_norm": 100.14824676513672, "learning_rate": 3e-06, "loss": -17.6826, "step": 627 }, { "epoch": 0.055822222222222224, "grad_norm": 95.32125854492188, "learning_rate": 3e-06, "loss": -13.7046, "step": 628 }, { "epoch": 0.05591111111111111, "grad_norm": 153.66207885742188, "learning_rate": 3e-06, "loss": -33.0736, "step": 629 }, { "epoch": 0.056, "grad_norm": 98.00647735595703, "learning_rate": 3e-06, "loss": -23.4488, "step": 630 }, { "epoch": 0.05608888888888889, "grad_norm": 101.93690490722656, "learning_rate": 3e-06, "loss": -27.788, "step": 631 }, { "epoch": 0.05617777777777778, "grad_norm": 109.6976089477539, "learning_rate": 3e-06, "loss": -27.8496, "step": 632 }, { "epoch": 0.056266666666666666, "grad_norm": 94.93986511230469, "learning_rate": 3e-06, "loss": -18.741, "step": 633 }, { "epoch": 0.056355555555555555, "grad_norm": 99.8827133178711, "learning_rate": 3e-06, "loss": -15.4788, "step": 634 }, { "epoch": 0.05644444444444444, "grad_norm": 157.86849975585938, "learning_rate": 3e-06, "loss": -35.8463, "step": 635 }, { "epoch": 0.05653333333333333, "grad_norm": 109.21646118164062, "learning_rate": 3e-06, "loss": -25.4238, "step": 636 }, { "completion_length": 247.5416717529297, "epoch": 0.05662222222222222, "grad_norm": 107.59577941894531, "learning_rate": 3e-06, "loss": 2.5406, "reward": 0.71875, "reward_std": 0.11004260182380676, "rewards/boxed_and_answer_tags_format_reward": 0.6770833432674408, "rewards/correctness_reward_func_math": 0.0416666679084301, "step": 637, "zero_std_ratio": 0.875 }, { "epoch": 0.05671111111111111, "grad_norm": 92.30776977539062, "learning_rate": 3e-06, "loss": 3.1641, "step": 638 }, { "epoch": 0.0568, "grad_norm": 41.486106872558594, "learning_rate": 3e-06, "loss": 4.5319, "step": 639 }, { "epoch": 0.05688888888888889, "grad_norm": 50.142147064208984, "learning_rate": 3e-06, "loss": 0.4783, "step": 640 }, { "epoch": 0.05697777777777778, "grad_norm": 41.67461395263672, "learning_rate": 3e-06, "loss": 0.8438, "step": 641 }, { "epoch": 0.05706666666666667, "grad_norm": 51.555999755859375, "learning_rate": 3e-06, "loss": 0.3726, "step": 642 }, { "epoch": 0.05715555555555556, "grad_norm": 56.56801223754883, "learning_rate": 3e-06, "loss": 2.3059, "step": 643 }, { "epoch": 0.057244444444444445, "grad_norm": 57.52075958251953, "learning_rate": 3e-06, "loss": 2.0942, "step": 644 }, { "epoch": 0.05733333333333333, "grad_norm": 51.82474136352539, "learning_rate": 3e-06, "loss": 3.4151, "step": 645 }, { "epoch": 0.05742222222222222, "grad_norm": 45.45164108276367, "learning_rate": 3e-06, "loss": -0.6204, "step": 646 }, { "epoch": 0.05751111111111111, "grad_norm": 43.34312057495117, "learning_rate": 3e-06, "loss": -0.6824, "step": 647 }, { "epoch": 0.0576, "grad_norm": 88.9341812133789, "learning_rate": 3e-06, "loss": -0.345, "step": 648 }, { "completion_length": 246.81250762939453, "epoch": 0.05768888888888889, "grad_norm": 91.70360565185547, "learning_rate": 3e-06, "loss": -21.4258, "reward": 1.8541666865348816, "reward_std": 0.3332235887646675, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 1.1666666567325592, "step": 649, "zero_std_ratio": 0.625 }, { "epoch": 0.057777777777777775, "grad_norm": 117.84858703613281, "learning_rate": 3e-06, "loss": -24.4443, "step": 650 }, { "epoch": 0.057866666666666663, "grad_norm": 94.53849792480469, "learning_rate": 3e-06, "loss": -21.691, "step": 651 }, { "epoch": 0.05795555555555556, "grad_norm": 97.39710998535156, "learning_rate": 3e-06, "loss": -24.1024, "step": 652 }, { "epoch": 0.05804444444444445, "grad_norm": 90.82528686523438, "learning_rate": 3e-06, "loss": -23.0164, "step": 653 }, { "epoch": 0.058133333333333335, "grad_norm": 77.49068450927734, "learning_rate": 3e-06, "loss": -22.6643, "step": 654 }, { "epoch": 0.058222222222222224, "grad_norm": 87.06861114501953, "learning_rate": 3e-06, "loss": -22.1806, "step": 655 }, { "epoch": 0.05831111111111111, "grad_norm": 116.24286651611328, "learning_rate": 3e-06, "loss": -25.0616, "step": 656 }, { "epoch": 0.0584, "grad_norm": 80.90653228759766, "learning_rate": 3e-06, "loss": -23.1949, "step": 657 }, { "epoch": 0.05848888888888889, "grad_norm": 122.82756042480469, "learning_rate": 3e-06, "loss": -25.4261, "step": 658 }, { "epoch": 0.05857777777777778, "grad_norm": 87.51302337646484, "learning_rate": 3e-06, "loss": -24.0548, "step": 659 }, { "epoch": 0.058666666666666666, "grad_norm": 85.40345764160156, "learning_rate": 3e-06, "loss": -23.4326, "step": 660 }, { "completion_length": 250.9375, "epoch": 0.058755555555555554, "grad_norm": 40.29450607299805, "learning_rate": 3e-06, "loss": 9.1824, "reward": 0.8541666865348816, "reward_std": 0.10206206887960434, "rewards/boxed_and_answer_tags_format_reward": 0.5625, "rewards/correctness_reward_func_math": 0.2916666567325592, "step": 661, "zero_std_ratio": 0.875 }, { "epoch": 0.05884444444444444, "grad_norm": 53.10469436645508, "learning_rate": 3e-06, "loss": 5.2732, "step": 662 }, { "epoch": 0.05893333333333333, "grad_norm": 53.52404022216797, "learning_rate": 3e-06, "loss": 9.7305, "step": 663 }, { "epoch": 0.05902222222222222, "grad_norm": 43.00156021118164, "learning_rate": 3e-06, "loss": 10.5135, "step": 664 }, { "epoch": 0.059111111111111114, "grad_norm": 51.94622802734375, "learning_rate": 3e-06, "loss": 8.7434, "step": 665 }, { "epoch": 0.0592, "grad_norm": 45.08756637573242, "learning_rate": 3e-06, "loss": 10.9107, "step": 666 }, { "epoch": 0.05928888888888889, "grad_norm": 38.80097961425781, "learning_rate": 3e-06, "loss": 9.0445, "step": 667 }, { "epoch": 0.05937777777777778, "grad_norm": 50.45252227783203, "learning_rate": 3e-06, "loss": 4.2955, "step": 668 }, { "epoch": 0.05946666666666667, "grad_norm": 42.282501220703125, "learning_rate": 3e-06, "loss": 8.7618, "step": 669 }, { "epoch": 0.059555555555555556, "grad_norm": 40.22513961791992, "learning_rate": 3e-06, "loss": 9.5891, "step": 670 }, { "epoch": 0.059644444444444444, "grad_norm": 55.79698181152344, "learning_rate": 3e-06, "loss": 7.7516, "step": 671 }, { "epoch": 0.05973333333333333, "grad_norm": 41.462181091308594, "learning_rate": 3e-06, "loss": 10.17, "step": 672 }, { "completion_length": 238.4375, "epoch": 0.05982222222222222, "grad_norm": 95.73463439941406, "learning_rate": 3e-06, "loss": 14.012, "reward": 1.4583333730697632, "reward_std": 0.3410547822713852, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 0.7083333134651184, "step": 673, "zero_std_ratio": 0.625 }, { "epoch": 0.05991111111111111, "grad_norm": 92.55806732177734, "learning_rate": 3e-06, "loss": 7.0593, "step": 674 }, { "epoch": 0.06, "grad_norm": 90.97936248779297, "learning_rate": 3e-06, "loss": 6.1863, "step": 675 }, { "epoch": 0.060088888888888886, "grad_norm": 82.84268951416016, "learning_rate": 3e-06, "loss": -3.7791, "step": 676 }, { "epoch": 0.060177777777777774, "grad_norm": 93.55880737304688, "learning_rate": 3e-06, "loss": 1.1706, "step": 677 }, { "epoch": 0.06026666666666667, "grad_norm": 143.54983520507812, "learning_rate": 3e-06, "loss": 5.2435, "step": 678 }, { "epoch": 0.06035555555555556, "grad_norm": 103.67829132080078, "learning_rate": 3e-06, "loss": 12.2804, "step": 679 }, { "epoch": 0.060444444444444446, "grad_norm": 94.47793579101562, "learning_rate": 3e-06, "loss": 5.8252, "step": 680 }, { "epoch": 0.060533333333333335, "grad_norm": 87.38697814941406, "learning_rate": 3e-06, "loss": 4.5993, "step": 681 }, { "epoch": 0.06062222222222222, "grad_norm": 79.12389373779297, "learning_rate": 3e-06, "loss": -4.8813, "step": 682 }, { "epoch": 0.06071111111111111, "grad_norm": 97.0263900756836, "learning_rate": 3e-06, "loss": -0.474, "step": 683 }, { "epoch": 0.0608, "grad_norm": 183.30641174316406, "learning_rate": 3e-06, "loss": 2.9729, "step": 684 }, { "completion_length": 255.4166717529297, "epoch": 0.06088888888888889, "grad_norm": 119.54466247558594, "learning_rate": 3e-06, "loss": -13.0228, "reward": 1.4583333730697632, "reward_std": 0.505022794008255, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 0.708333358168602, "step": 685, "zero_std_ratio": 0.5 }, { "epoch": 0.06097777777777778, "grad_norm": 121.260986328125, "learning_rate": 3e-06, "loss": -7.3608, "step": 686 }, { "epoch": 0.061066666666666665, "grad_norm": 107.9063720703125, "learning_rate": 3e-06, "loss": -2.885, "step": 687 }, { "epoch": 0.06115555555555555, "grad_norm": 122.71879577636719, "learning_rate": 3e-06, "loss": -9.4339, "step": 688 }, { "epoch": 0.06124444444444444, "grad_norm": 98.11092376708984, "learning_rate": 3e-06, "loss": -7.9372, "step": 689 }, { "epoch": 0.06133333333333333, "grad_norm": 106.66928100585938, "learning_rate": 3e-06, "loss": 2.2675, "step": 690 }, { "epoch": 0.061422222222222225, "grad_norm": 117.10845947265625, "learning_rate": 3e-06, "loss": -13.9389, "step": 691 }, { "epoch": 0.061511111111111114, "grad_norm": 123.24708557128906, "learning_rate": 3e-06, "loss": -9.064, "step": 692 }, { "epoch": 0.0616, "grad_norm": 108.9686050415039, "learning_rate": 3e-06, "loss": -4.3708, "step": 693 }, { "epoch": 0.06168888888888889, "grad_norm": 120.98512268066406, "learning_rate": 3e-06, "loss": -10.9314, "step": 694 }, { "epoch": 0.06177777777777778, "grad_norm": 96.25732421875, "learning_rate": 3e-06, "loss": -9.4401, "step": 695 }, { "epoch": 0.06186666666666667, "grad_norm": 107.97279357910156, "learning_rate": 3e-06, "loss": 0.1328, "step": 696 }, { "completion_length": 242.4166717529297, "epoch": 0.061955555555555555, "grad_norm": 77.24815368652344, "learning_rate": 3e-06, "loss": 21.7633, "reward": 1.6875, "reward_std": 0.3680921494960785, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 1.0, "step": 697, "zero_std_ratio": 0.625 }, { "epoch": 0.062044444444444444, "grad_norm": 90.61959075927734, "learning_rate": 3e-06, "loss": 21.3004, "step": 698 }, { "epoch": 0.06213333333333333, "grad_norm": 79.43978881835938, "learning_rate": 3e-06, "loss": 22.885, "step": 699 }, { "epoch": 0.06222222222222222, "grad_norm": 88.06346130371094, "learning_rate": 3e-06, "loss": 16.6794, "step": 700 }, { "epoch": 0.06231111111111111, "grad_norm": 215.39535522460938, "learning_rate": 3e-06, "loss": 18.8777, "step": 701 }, { "epoch": 0.0624, "grad_norm": 92.26004791259766, "learning_rate": 3e-06, "loss": 11.0055, "step": 702 }, { "epoch": 0.062488888888888885, "grad_norm": 77.40907287597656, "learning_rate": 3e-06, "loss": 20.7747, "step": 703 }, { "epoch": 0.06257777777777777, "grad_norm": 100.23749542236328, "learning_rate": 3e-06, "loss": 20.3858, "step": 704 }, { "epoch": 0.06266666666666666, "grad_norm": 75.75386047363281, "learning_rate": 3e-06, "loss": 22.1331, "step": 705 }, { "epoch": 0.06275555555555555, "grad_norm": 88.5040054321289, "learning_rate": 3e-06, "loss": 15.6211, "step": 706 }, { "epoch": 0.06284444444444444, "grad_norm": 83.19754028320312, "learning_rate": 3e-06, "loss": 17.9444, "step": 707 }, { "epoch": 0.06293333333333333, "grad_norm": 88.55027770996094, "learning_rate": 3e-06, "loss": 9.7909, "step": 708 }, { "completion_length": 253.8541717529297, "epoch": 0.06302222222222222, "grad_norm": 106.40866088867188, "learning_rate": 3e-06, "loss": -1.6061, "reward": 1.2708333730697632, "reward_std": 0.38547582924366, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.5833333432674408, "step": 709, "zero_std_ratio": 0.25 }, { "epoch": 0.06311111111111112, "grad_norm": 123.38434600830078, "learning_rate": 3e-06, "loss": -9.3301, "step": 710 }, { "epoch": 0.0632, "grad_norm": 101.67454528808594, "learning_rate": 3e-06, "loss": -7.1273, "step": 711 }, { "epoch": 0.0632888888888889, "grad_norm": 112.82793426513672, "learning_rate": 3e-06, "loss": -9.8195, "step": 712 }, { "epoch": 0.06337777777777778, "grad_norm": 108.99236297607422, "learning_rate": 3e-06, "loss": -2.3741, "step": 713 }, { "epoch": 0.06346666666666667, "grad_norm": 106.9615478515625, "learning_rate": 3e-06, "loss": -7.792, "step": 714 }, { "epoch": 0.06355555555555556, "grad_norm": 94.91867065429688, "learning_rate": 3e-06, "loss": -3.0819, "step": 715 }, { "epoch": 0.06364444444444445, "grad_norm": 114.82243347167969, "learning_rate": 3e-06, "loss": -11.4017, "step": 716 }, { "epoch": 0.06373333333333334, "grad_norm": 104.33937072753906, "learning_rate": 3e-06, "loss": -9.7339, "step": 717 }, { "epoch": 0.06382222222222222, "grad_norm": 136.31576538085938, "learning_rate": 3e-06, "loss": -12.1719, "step": 718 }, { "epoch": 0.06391111111111111, "grad_norm": 99.78816223144531, "learning_rate": 3e-06, "loss": -4.7219, "step": 719 }, { "epoch": 0.064, "grad_norm": 120.37998962402344, "learning_rate": 3e-06, "loss": -10.5969, "step": 720 }, { "completion_length": 244.0416717529297, "epoch": 0.06408888888888889, "grad_norm": 101.32681274414062, "learning_rate": 3e-06, "loss": 2.1527, "reward": 1.0, "reward_std": 0.4417443424463272, "rewards/boxed_and_answer_tags_format_reward": 0.6666666865348816, "rewards/correctness_reward_func_math": 0.3333333246409893, "step": 721, "zero_std_ratio": 0.375 }, { "epoch": 0.06417777777777778, "grad_norm": 113.38105773925781, "learning_rate": 3e-06, "loss": 11.181, "step": 722 }, { "epoch": 0.06426666666666667, "grad_norm": 114.85003662109375, "learning_rate": 3e-06, "loss": -2.1413, "step": 723 }, { "epoch": 0.06435555555555555, "grad_norm": 133.62515258789062, "learning_rate": 3e-06, "loss": 9.3709, "step": 724 }, { "epoch": 0.06444444444444444, "grad_norm": 113.68856048583984, "learning_rate": 3e-06, "loss": -3.0918, "step": 725 }, { "epoch": 0.06453333333333333, "grad_norm": 161.93836975097656, "learning_rate": 3e-06, "loss": -1.9418, "step": 726 }, { "epoch": 0.06462222222222222, "grad_norm": 100.46546173095703, "learning_rate": 3e-06, "loss": 0.5341, "step": 727 }, { "epoch": 0.06471111111111111, "grad_norm": 141.3654022216797, "learning_rate": 3e-06, "loss": 9.9107, "step": 728 }, { "epoch": 0.0648, "grad_norm": 131.8665771484375, "learning_rate": 3e-06, "loss": -3.674, "step": 729 }, { "epoch": 0.06488888888888888, "grad_norm": 128.03195190429688, "learning_rate": 3e-06, "loss": 6.8735, "step": 730 }, { "epoch": 0.06497777777777777, "grad_norm": 117.97486114501953, "learning_rate": 3e-06, "loss": -5.1859, "step": 731 }, { "epoch": 0.06506666666666666, "grad_norm": 158.5392303466797, "learning_rate": 3e-06, "loss": -4.222, "step": 732 }, { "completion_length": 249.08334350585938, "epoch": 0.06515555555555555, "grad_norm": 122.15642547607422, "learning_rate": 3e-06, "loss": 5.173, "reward": 1.1770833730697632, "reward_std": 0.2915456295013428, "rewards/boxed_and_answer_tags_format_reward": 0.6354166567325592, "rewards/correctness_reward_func_math": 0.5416666567325592, "step": 733, "zero_std_ratio": 0.625 }, { "epoch": 0.06524444444444444, "grad_norm": 122.5665054321289, "learning_rate": 3e-06, "loss": -10.2692, "step": 734 }, { "epoch": 0.06533333333333333, "grad_norm": 95.55500030517578, "learning_rate": 3e-06, "loss": -4.3816, "step": 735 }, { "epoch": 0.06542222222222223, "grad_norm": 111.13971710205078, "learning_rate": 3e-06, "loss": -4.5502, "step": 736 }, { "epoch": 0.06551111111111112, "grad_norm": 98.50959014892578, "learning_rate": 3e-06, "loss": -5.0919, "step": 737 }, { "epoch": 0.0656, "grad_norm": 82.98762512207031, "learning_rate": 3e-06, "loss": -7.9156, "step": 738 }, { "epoch": 0.0656888888888889, "grad_norm": 114.31904602050781, "learning_rate": 3e-06, "loss": 4.2146, "step": 739 }, { "epoch": 0.06577777777777778, "grad_norm": 109.786376953125, "learning_rate": 3e-06, "loss": -11.7776, "step": 740 }, { "epoch": 0.06586666666666667, "grad_norm": 95.00721740722656, "learning_rate": 3e-06, "loss": -5.7331, "step": 741 }, { "epoch": 0.06595555555555556, "grad_norm": 87.01516723632812, "learning_rate": 3e-06, "loss": -5.0034, "step": 742 }, { "epoch": 0.06604444444444445, "grad_norm": 101.2181167602539, "learning_rate": 3e-06, "loss": -6.7069, "step": 743 }, { "epoch": 0.06613333333333334, "grad_norm": 79.3046875, "learning_rate": 3e-06, "loss": -8.6759, "step": 744 }, { "completion_length": 252.93750762939453, "epoch": 0.06622222222222222, "grad_norm": 101.21440887451172, "learning_rate": 3e-06, "loss": 2.4358, "reward": 1.354166716337204, "reward_std": 0.4736091196537018, "rewards/boxed_and_answer_tags_format_reward": 0.6458333432674408, "rewards/correctness_reward_func_math": 0.7083333358168602, "step": 745, "zero_std_ratio": 0.375 }, { "epoch": 0.06631111111111111, "grad_norm": 133.58985900878906, "learning_rate": 3e-06, "loss": 0.2243, "step": 746 }, { "epoch": 0.0664, "grad_norm": 137.97776794433594, "learning_rate": 3e-06, "loss": 3.7561, "step": 747 }, { "epoch": 0.06648888888888889, "grad_norm": 97.12255859375, "learning_rate": 3e-06, "loss": 2.8029, "step": 748 }, { "epoch": 0.06657777777777778, "grad_norm": 133.5359344482422, "learning_rate": 3e-06, "loss": 5.9185, "step": 749 }, { "epoch": 0.06666666666666667, "grad_norm": 165.00294494628906, "learning_rate": 3e-06, "loss": 5.6118, "step": 750 }, { "epoch": 0.06675555555555555, "grad_norm": 147.9979705810547, "learning_rate": 3e-06, "loss": 1.9697, "step": 751 }, { "epoch": 0.06684444444444444, "grad_norm": 119.22462463378906, "learning_rate": 3e-06, "loss": -0.882, "step": 752 }, { "epoch": 0.06693333333333333, "grad_norm": 144.49305725097656, "learning_rate": 3e-06, "loss": 2.487, "step": 753 }, { "epoch": 0.06702222222222222, "grad_norm": 96.12986755371094, "learning_rate": 3e-06, "loss": 1.0482, "step": 754 }, { "epoch": 0.06711111111111111, "grad_norm": 117.53173065185547, "learning_rate": 3e-06, "loss": 4.1922, "step": 755 }, { "epoch": 0.0672, "grad_norm": 99.70003509521484, "learning_rate": 3e-06, "loss": 4.014, "step": 756 }, { "completion_length": 252.9166717529297, "epoch": 0.06728888888888888, "grad_norm": 107.6641616821289, "learning_rate": 3e-06, "loss": -6.2275, "reward": 1.7812500596046448, "reward_std": 0.38577648997306824, "rewards/boxed_and_answer_tags_format_reward": 0.7395833432674408, "rewards/correctness_reward_func_math": 1.0416666567325592, "step": 757, "zero_std_ratio": 0.5 }, { "epoch": 0.06737777777777777, "grad_norm": 87.34847259521484, "learning_rate": 3e-06, "loss": -3.2976, "step": 758 }, { "epoch": 0.06746666666666666, "grad_norm": 114.33875274658203, "learning_rate": 3e-06, "loss": -4.307, "step": 759 }, { "epoch": 0.06755555555555555, "grad_norm": 106.53048706054688, "learning_rate": 3e-06, "loss": -1.13, "step": 760 }, { "epoch": 0.06764444444444444, "grad_norm": 120.27633666992188, "learning_rate": 3e-06, "loss": -3.2187, "step": 761 }, { "epoch": 0.06773333333333334, "grad_norm": 96.025390625, "learning_rate": 3e-06, "loss": 0.3329, "step": 762 }, { "epoch": 0.06782222222222223, "grad_norm": 115.00994110107422, "learning_rate": 3e-06, "loss": -7.6432, "step": 763 }, { "epoch": 0.06791111111111112, "grad_norm": 87.76792907714844, "learning_rate": 3e-06, "loss": -4.6143, "step": 764 }, { "epoch": 0.068, "grad_norm": 113.94709777832031, "learning_rate": 3e-06, "loss": -5.7334, "step": 765 }, { "epoch": 0.0680888888888889, "grad_norm": 111.59996032714844, "learning_rate": 3e-06, "loss": -2.659, "step": 766 }, { "epoch": 0.06817777777777778, "grad_norm": 129.63861083984375, "learning_rate": 3e-06, "loss": -5.0582, "step": 767 }, { "epoch": 0.06826666666666667, "grad_norm": 93.2347412109375, "learning_rate": 3e-06, "loss": -1.0268, "step": 768 }, { "completion_length": 251.52083587646484, "epoch": 0.06835555555555556, "grad_norm": 184.61167907714844, "learning_rate": 3e-06, "loss": 5.0221, "reward": 1.4166666865348816, "reward_std": 0.6262910515069962, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 0.6666666567325592, "step": 769, "zero_std_ratio": 0.375 }, { "epoch": 0.06844444444444445, "grad_norm": 137.45404052734375, "learning_rate": 3e-06, "loss": 7.8168, "step": 770 }, { "epoch": 0.06853333333333333, "grad_norm": 202.94410705566406, "learning_rate": 3e-06, "loss": 9.1514, "step": 771 }, { "epoch": 0.06862222222222222, "grad_norm": 161.6815948486328, "learning_rate": 3e-06, "loss": 8.282, "step": 772 }, { "epoch": 0.06871111111111111, "grad_norm": 123.37694549560547, "learning_rate": 3e-06, "loss": 3.5652, "step": 773 }, { "epoch": 0.0688, "grad_norm": 146.2295379638672, "learning_rate": 3e-06, "loss": 9.5204, "step": 774 }, { "epoch": 0.06888888888888889, "grad_norm": 145.88613891601562, "learning_rate": 3e-06, "loss": 4.6675, "step": 775 }, { "epoch": 0.06897777777777778, "grad_norm": 135.99313354492188, "learning_rate": 3e-06, "loss": 6.1798, "step": 776 }, { "epoch": 0.06906666666666667, "grad_norm": 134.66729736328125, "learning_rate": 3e-06, "loss": 6.9994, "step": 777 }, { "epoch": 0.06915555555555555, "grad_norm": 173.39735412597656, "learning_rate": 3e-06, "loss": 7.3314, "step": 778 }, { "epoch": 0.06924444444444444, "grad_norm": 136.74331665039062, "learning_rate": 3e-06, "loss": 1.5412, "step": 779 }, { "epoch": 0.06933333333333333, "grad_norm": 142.06529235839844, "learning_rate": 3e-06, "loss": 7.5, "step": 780 }, { "completion_length": 236.14583587646484, "epoch": 0.06942222222222222, "grad_norm": 92.94062805175781, "learning_rate": 3e-06, "loss": 9.4998, "reward": 0.8854166865348816, "reward_std": 0.3922351598739624, "rewards/boxed_and_answer_tags_format_reward": 0.71875, "rewards/correctness_reward_func_math": 0.1666666679084301, "step": 781, "zero_std_ratio": 0.375 }, { "epoch": 0.0695111111111111, "grad_norm": 84.2685546875, "learning_rate": 3e-06, "loss": 20.3539, "step": 782 }, { "epoch": 0.0696, "grad_norm": 79.0595932006836, "learning_rate": 3e-06, "loss": 16.2649, "step": 783 }, { "epoch": 0.06968888888888888, "grad_norm": 151.0120086669922, "learning_rate": 3e-06, "loss": 13.9466, "step": 784 }, { "epoch": 0.06977777777777777, "grad_norm": 84.83601379394531, "learning_rate": 3e-06, "loss": 13.7201, "step": 785 }, { "epoch": 0.06986666666666666, "grad_norm": 149.62045288085938, "learning_rate": 3e-06, "loss": 13.032, "step": 786 }, { "epoch": 0.06995555555555556, "grad_norm": 95.56779479980469, "learning_rate": 3e-06, "loss": 8.1661, "step": 787 }, { "epoch": 0.07004444444444445, "grad_norm": 89.89787292480469, "learning_rate": 3e-06, "loss": 19.6015, "step": 788 }, { "epoch": 0.07013333333333334, "grad_norm": 78.87327575683594, "learning_rate": 3e-06, "loss": 15.0965, "step": 789 }, { "epoch": 0.07022222222222223, "grad_norm": 88.94290161132812, "learning_rate": 3e-06, "loss": 12.3248, "step": 790 }, { "epoch": 0.07031111111111112, "grad_norm": 84.62553405761719, "learning_rate": 3e-06, "loss": 12.571, "step": 791 }, { "epoch": 0.0704, "grad_norm": 102.37629699707031, "learning_rate": 3e-06, "loss": 11.5955, "step": 792 }, { "completion_length": 245.95834350585938, "epoch": 0.07048888888888889, "grad_norm": 87.68032836914062, "learning_rate": 3e-06, "loss": -2.896, "reward": 0.979166716337204, "reward_std": 0.3332235962152481, "rewards/boxed_and_answer_tags_format_reward": 0.5625, "rewards/correctness_reward_func_math": 0.416666679084301, "step": 793, "zero_std_ratio": 0.625 }, { "epoch": 0.07057777777777778, "grad_norm": 102.35977935791016, "learning_rate": 3e-06, "loss": -14.1397, "step": 794 }, { "epoch": 0.07066666666666667, "grad_norm": 121.52274322509766, "learning_rate": 3e-06, "loss": -20.6379, "step": 795 }, { "epoch": 0.07075555555555556, "grad_norm": 123.77271270751953, "learning_rate": 3e-06, "loss": -15.954, "step": 796 }, { "epoch": 0.07084444444444445, "grad_norm": 115.74909210205078, "learning_rate": 3e-06, "loss": -19.257, "step": 797 }, { "epoch": 0.07093333333333333, "grad_norm": 171.18011474609375, "learning_rate": 3e-06, "loss": -15.1334, "step": 798 }, { "epoch": 0.07102222222222222, "grad_norm": 92.46512603759766, "learning_rate": 3e-06, "loss": -3.7242, "step": 799 }, { "epoch": 0.07111111111111111, "grad_norm": 104.15264129638672, "learning_rate": 3e-06, "loss": -14.9942, "step": 800 }, { "epoch": 0.0712, "grad_norm": 120.8930892944336, "learning_rate": 3e-06, "loss": -22.0188, "step": 801 }, { "epoch": 0.07128888888888889, "grad_norm": 112.13275146484375, "learning_rate": 3e-06, "loss": -17.6709, "step": 802 }, { "epoch": 0.07137777777777778, "grad_norm": 136.23388671875, "learning_rate": 3e-06, "loss": -20.6627, "step": 803 }, { "epoch": 0.07146666666666666, "grad_norm": 124.1773681640625, "learning_rate": 3e-06, "loss": -17.0962, "step": 804 }, { "completion_length": 232.89583587646484, "epoch": 0.07155555555555555, "grad_norm": 91.88973236083984, "learning_rate": 3e-06, "loss": -7.8492, "reward": 1.7291666865348816, "reward_std": 0.47015421837568283, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 1.0416666567325592, "step": 805, "zero_std_ratio": 0.5 }, { "epoch": 0.07164444444444444, "grad_norm": 107.12287139892578, "learning_rate": 3e-06, "loss": -23.4332, "step": 806 }, { "epoch": 0.07173333333333333, "grad_norm": 104.5981674194336, "learning_rate": 3e-06, "loss": -11.8278, "step": 807 }, { "epoch": 0.07182222222222222, "grad_norm": 102.6692886352539, "learning_rate": 3e-06, "loss": -20.165, "step": 808 }, { "epoch": 0.0719111111111111, "grad_norm": 88.46080017089844, "learning_rate": 3e-06, "loss": -10.8484, "step": 809 }, { "epoch": 0.072, "grad_norm": 124.10685729980469, "learning_rate": 3e-06, "loss": -20.237, "step": 810 }, { "epoch": 0.07208888888888888, "grad_norm": 95.68196868896484, "learning_rate": 3e-06, "loss": -9.2521, "step": 811 }, { "epoch": 0.07217777777777777, "grad_norm": 95.62832641601562, "learning_rate": 3e-06, "loss": -25.6333, "step": 812 }, { "epoch": 0.07226666666666667, "grad_norm": 104.2110366821289, "learning_rate": 3e-06, "loss": -14.2533, "step": 813 }, { "epoch": 0.07235555555555556, "grad_norm": 114.39372253417969, "learning_rate": 3e-06, "loss": -22.2888, "step": 814 }, { "epoch": 0.07244444444444445, "grad_norm": 112.07288360595703, "learning_rate": 3e-06, "loss": -12.7028, "step": 815 }, { "epoch": 0.07253333333333334, "grad_norm": 138.86337280273438, "learning_rate": 3e-06, "loss": -23.0688, "step": 816 }, { "completion_length": 249.3541717529297, "epoch": 0.07262222222222223, "grad_norm": 106.32572937011719, "learning_rate": 3e-06, "loss": 1.5611, "reward": 1.291666716337204, "reward_std": 0.37455084919929504, "rewards/boxed_and_answer_tags_format_reward": 0.6666666865348816, "rewards/correctness_reward_func_math": 0.625, "step": 817, "zero_std_ratio": 0.5 }, { "epoch": 0.07271111111111112, "grad_norm": 119.1928482055664, "learning_rate": 3e-06, "loss": 6.5715, "step": 818 }, { "epoch": 0.0728, "grad_norm": 89.53767395019531, "learning_rate": 3e-06, "loss": 9.9694, "step": 819 }, { "epoch": 0.07288888888888889, "grad_norm": 97.2778091430664, "learning_rate": 3e-06, "loss": 8.1343, "step": 820 }, { "epoch": 0.07297777777777778, "grad_norm": 99.85116577148438, "learning_rate": 3e-06, "loss": 3.7739, "step": 821 }, { "epoch": 0.07306666666666667, "grad_norm": 107.6137924194336, "learning_rate": 3e-06, "loss": -0.6852, "step": 822 }, { "epoch": 0.07315555555555556, "grad_norm": 113.21214294433594, "learning_rate": 3e-06, "loss": 1.024, "step": 823 }, { "epoch": 0.07324444444444445, "grad_norm": 130.27040100097656, "learning_rate": 3e-06, "loss": 4.4904, "step": 824 }, { "epoch": 0.07333333333333333, "grad_norm": 87.94723510742188, "learning_rate": 3e-06, "loss": 7.7117, "step": 825 }, { "epoch": 0.07342222222222222, "grad_norm": 102.87310791015625, "learning_rate": 3e-06, "loss": 5.3932, "step": 826 }, { "epoch": 0.07351111111111111, "grad_norm": 97.53043365478516, "learning_rate": 3e-06, "loss": 1.1637, "step": 827 }, { "epoch": 0.0736, "grad_norm": 126.4931640625, "learning_rate": 3e-06, "loss": -3.7025, "step": 828 }, { "completion_length": 240.75, "epoch": 0.07368888888888889, "grad_norm": 88.22557067871094, "learning_rate": 3e-06, "loss": 22.5183, "reward": 1.1354166865348816, "reward_std": 0.35377833247184753, "rewards/boxed_and_answer_tags_format_reward": 0.59375, "rewards/correctness_reward_func_math": 0.5416666567325592, "step": 829, "zero_std_ratio": 0.625 }, { "epoch": 0.07377777777777778, "grad_norm": 86.47975158691406, "learning_rate": 3e-06, "loss": 24.5943, "step": 830 }, { "epoch": 0.07386666666666666, "grad_norm": 93.10693359375, "learning_rate": 3e-06, "loss": 27.2039, "step": 831 }, { "epoch": 0.07395555555555555, "grad_norm": 96.74742889404297, "learning_rate": 3e-06, "loss": 32.9558, "step": 832 }, { "epoch": 0.07404444444444444, "grad_norm": 97.9085693359375, "learning_rate": 3e-06, "loss": 25.7304, "step": 833 }, { "epoch": 0.07413333333333333, "grad_norm": 95.80497741699219, "learning_rate": 3e-06, "loss": 28.8338, "step": 834 }, { "epoch": 0.07422222222222222, "grad_norm": 82.9104995727539, "learning_rate": 3e-06, "loss": 20.9569, "step": 835 }, { "epoch": 0.0743111111111111, "grad_norm": 332.6025695800781, "learning_rate": 3e-06, "loss": 22.7693, "step": 836 }, { "epoch": 0.0744, "grad_norm": 97.00851440429688, "learning_rate": 3e-06, "loss": 24.6048, "step": 837 }, { "epoch": 0.07448888888888888, "grad_norm": 90.94817352294922, "learning_rate": 3e-06, "loss": 30.2657, "step": 838 }, { "epoch": 0.07457777777777778, "grad_norm": 91.87737274169922, "learning_rate": 3e-06, "loss": 23.8124, "step": 839 }, { "epoch": 0.07466666666666667, "grad_norm": 100.71826934814453, "learning_rate": 3e-06, "loss": 26.5346, "step": 840 }, { "completion_length": 244.3125, "epoch": 0.07475555555555556, "grad_norm": 454.38482666015625, "learning_rate": 3e-06, "loss": -16.9865, "reward": 1.2812500596046448, "reward_std": 0.3782803416252136, "rewards/boxed_and_answer_tags_format_reward": 0.6145833432674408, "rewards/correctness_reward_func_math": 0.6666666492819786, "step": 841, "zero_std_ratio": 0.625 }, { "epoch": 0.07484444444444445, "grad_norm": 102.08020782470703, "learning_rate": 3e-06, "loss": -18.5601, "step": 842 }, { "epoch": 0.07493333333333334, "grad_norm": 101.7997055053711, "learning_rate": 3e-06, "loss": -23.9473, "step": 843 }, { "epoch": 0.07502222222222223, "grad_norm": 100.2668685913086, "learning_rate": 3e-06, "loss": -26.2402, "step": 844 }, { "epoch": 0.07511111111111111, "grad_norm": 119.95198059082031, "learning_rate": 3e-06, "loss": -22.5011, "step": 845 }, { "epoch": 0.0752, "grad_norm": 101.83861541748047, "learning_rate": 3e-06, "loss": -14.7265, "step": 846 }, { "epoch": 0.07528888888888889, "grad_norm": 106.50312042236328, "learning_rate": 3e-06, "loss": -18.8046, "step": 847 }, { "epoch": 0.07537777777777778, "grad_norm": 114.58135986328125, "learning_rate": 3e-06, "loss": -20.037, "step": 848 }, { "epoch": 0.07546666666666667, "grad_norm": 121.03673553466797, "learning_rate": 3e-06, "loss": -25.817, "step": 849 }, { "epoch": 0.07555555555555556, "grad_norm": 97.15817260742188, "learning_rate": 3e-06, "loss": -28.7689, "step": 850 }, { "epoch": 0.07564444444444444, "grad_norm": 101.58541107177734, "learning_rate": 3e-06, "loss": -24.6109, "step": 851 }, { "epoch": 0.07573333333333333, "grad_norm": 100.84017181396484, "learning_rate": 3e-06, "loss": -16.8195, "step": 852 }, { "completion_length": 247.4166717529297, "epoch": 0.07582222222222222, "grad_norm": 60.974788665771484, "learning_rate": 3e-06, "loss": 1.297, "reward": 1.2708333432674408, "reward_std": 0.12909945845603943, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.5833333358168602, "step": 853, "zero_std_ratio": 0.875 }, { "epoch": 0.07591111111111111, "grad_norm": 58.498374938964844, "learning_rate": 3e-06, "loss": -4.5823, "step": 854 }, { "epoch": 0.076, "grad_norm": 51.417320251464844, "learning_rate": 3e-06, "loss": -2.8582, "step": 855 }, { "epoch": 0.07608888888888889, "grad_norm": 52.74491882324219, "learning_rate": 3e-06, "loss": -3.5933, "step": 856 }, { "epoch": 0.07617777777777777, "grad_norm": 54.02571487426758, "learning_rate": 3e-06, "loss": -0.1777, "step": 857 }, { "epoch": 0.07626666666666666, "grad_norm": 44.29707717895508, "learning_rate": 3e-06, "loss": -0.0581, "step": 858 }, { "epoch": 0.07635555555555555, "grad_norm": 63.4464111328125, "learning_rate": 3e-06, "loss": 0.981, "step": 859 }, { "epoch": 0.07644444444444444, "grad_norm": 58.98569869995117, "learning_rate": 3e-06, "loss": -5.3316, "step": 860 }, { "epoch": 0.07653333333333333, "grad_norm": 54.73743438720703, "learning_rate": 3e-06, "loss": -3.799, "step": 861 }, { "epoch": 0.07662222222222222, "grad_norm": 55.15388107299805, "learning_rate": 3e-06, "loss": -4.4757, "step": 862 }, { "epoch": 0.0767111111111111, "grad_norm": 61.510887145996094, "learning_rate": 3e-06, "loss": -0.6125, "step": 863 }, { "epoch": 0.0768, "grad_norm": 46.19833755493164, "learning_rate": 3e-06, "loss": -0.7855, "step": 864 }, { "completion_length": 251.3125, "epoch": 0.0768888888888889, "grad_norm": 111.38273620605469, "learning_rate": 3e-06, "loss": 14.4732, "reward": 1.5208333730697632, "reward_std": 0.4937378317117691, "rewards/boxed_and_answer_tags_format_reward": 0.7291666567325592, "rewards/correctness_reward_func_math": 0.7916666567325592, "step": 865, "zero_std_ratio": 0.375 }, { "epoch": 0.07697777777777778, "grad_norm": 112.39920043945312, "learning_rate": 3e-06, "loss": 3.7967, "step": 866 }, { "epoch": 0.07706666666666667, "grad_norm": 106.71125793457031, "learning_rate": 3e-06, "loss": 1.1063, "step": 867 }, { "epoch": 0.07715555555555556, "grad_norm": 129.4515838623047, "learning_rate": 3e-06, "loss": -2.7262, "step": 868 }, { "epoch": 0.07724444444444445, "grad_norm": 109.67815399169922, "learning_rate": 3e-06, "loss": 0.1256, "step": 869 }, { "epoch": 0.07733333333333334, "grad_norm": 116.57035827636719, "learning_rate": 3e-06, "loss": -2.946, "step": 870 }, { "epoch": 0.07742222222222223, "grad_norm": 122.09374237060547, "learning_rate": 3e-06, "loss": 13.0142, "step": 871 }, { "epoch": 0.07751111111111111, "grad_norm": 108.0525894165039, "learning_rate": 3e-06, "loss": 2.4968, "step": 872 }, { "epoch": 0.0776, "grad_norm": 108.2818374633789, "learning_rate": 3e-06, "loss": -0.8286, "step": 873 }, { "epoch": 0.07768888888888889, "grad_norm": 139.2396697998047, "learning_rate": 3e-06, "loss": -5.0471, "step": 874 }, { "epoch": 0.07777777777777778, "grad_norm": 114.9443588256836, "learning_rate": 3e-06, "loss": -1.3382, "step": 875 }, { "epoch": 0.07786666666666667, "grad_norm": 128.51272583007812, "learning_rate": 3e-06, "loss": -4.8307, "step": 876 }, { "completion_length": 252.50000762939453, "epoch": 0.07795555555555556, "grad_norm": 126.4562759399414, "learning_rate": 3e-06, "loss": 0.751, "reward": 1.0729166865348816, "reward_std": 0.22548970580101013, "rewards/boxed_and_answer_tags_format_reward": 0.6145833432674408, "rewards/correctness_reward_func_math": 0.4583333283662796, "step": 877, "zero_std_ratio": 0.75 }, { "epoch": 0.07804444444444444, "grad_norm": 82.09607696533203, "learning_rate": 3e-06, "loss": 0.195, "step": 878 }, { "epoch": 0.07813333333333333, "grad_norm": 74.75113677978516, "learning_rate": 3e-06, "loss": -1.5817, "step": 879 }, { "epoch": 0.07822222222222222, "grad_norm": 115.73063659667969, "learning_rate": 3e-06, "loss": -8.3706, "step": 880 }, { "epoch": 0.07831111111111111, "grad_norm": 70.39916229248047, "learning_rate": 3e-06, "loss": -3.2191, "step": 881 }, { "epoch": 0.0784, "grad_norm": 103.28494262695312, "learning_rate": 3e-06, "loss": 1.0954, "step": 882 }, { "epoch": 0.07848888888888889, "grad_norm": 100.23104858398438, "learning_rate": 3e-06, "loss": 0.0428, "step": 883 }, { "epoch": 0.07857777777777777, "grad_norm": 90.33434295654297, "learning_rate": 3e-06, "loss": -0.8422, "step": 884 }, { "epoch": 0.07866666666666666, "grad_norm": 99.41636657714844, "learning_rate": 3e-06, "loss": -2.558, "step": 885 }, { "epoch": 0.07875555555555555, "grad_norm": 99.72211456298828, "learning_rate": 3e-06, "loss": -8.9057, "step": 886 }, { "epoch": 0.07884444444444444, "grad_norm": 73.4344711303711, "learning_rate": 3e-06, "loss": -4.09, "step": 887 }, { "epoch": 0.07893333333333333, "grad_norm": 98.25971221923828, "learning_rate": 3e-06, "loss": -0.4152, "step": 888 }, { "completion_length": 249.2916717529297, "epoch": 0.07902222222222222, "grad_norm": 165.06918334960938, "learning_rate": 3e-06, "loss": 25.8684, "reward": 1.4270833730697632, "reward_std": 0.5354157984256744, "rewards/boxed_and_answer_tags_format_reward": 0.6770833432674408, "rewards/correctness_reward_func_math": 0.75, "step": 889, "zero_std_ratio": 0.375 }, { "epoch": 0.0791111111111111, "grad_norm": 132.98858642578125, "learning_rate": 3e-06, "loss": 19.4902, "step": 890 }, { "epoch": 0.0792, "grad_norm": 122.3958969116211, "learning_rate": 3e-06, "loss": 32.6103, "step": 891 }, { "epoch": 0.0792888888888889, "grad_norm": 118.38285064697266, "learning_rate": 3e-06, "loss": 32.6835, "step": 892 }, { "epoch": 0.07937777777777778, "grad_norm": 126.4738540649414, "learning_rate": 3e-06, "loss": 30.5181, "step": 893 }, { "epoch": 0.07946666666666667, "grad_norm": 128.20831298828125, "learning_rate": 3e-06, "loss": 33.4332, "step": 894 }, { "epoch": 0.07955555555555556, "grad_norm": 152.9354248046875, "learning_rate": 3e-06, "loss": 23.7035, "step": 895 }, { "epoch": 0.07964444444444445, "grad_norm": 130.27053833007812, "learning_rate": 3e-06, "loss": 16.7548, "step": 896 }, { "epoch": 0.07973333333333334, "grad_norm": 127.17219543457031, "learning_rate": 3e-06, "loss": 30.7848, "step": 897 }, { "epoch": 0.07982222222222222, "grad_norm": 118.670654296875, "learning_rate": 3e-06, "loss": 30.7772, "step": 898 }, { "epoch": 0.07991111111111111, "grad_norm": 120.19160461425781, "learning_rate": 3e-06, "loss": 27.401, "step": 899 }, { "epoch": 0.08, "grad_norm": 137.2371063232422, "learning_rate": 3e-06, "loss": 30.9334, "step": 900 }, { "completion_length": 255.89583587646484, "epoch": 0.08008888888888889, "grad_norm": 88.54483032226562, "learning_rate": 3e-06, "loss": 3.6467, "reward": 1.3437500596046448, "reward_std": 0.30770808458328247, "rewards/boxed_and_answer_tags_format_reward": 0.6770833432674408, "rewards/correctness_reward_func_math": 0.6666666567325592, "step": 901, "zero_std_ratio": 0.625 }, { "epoch": 0.08017777777777778, "grad_norm": 87.48735046386719, "learning_rate": 3e-06, "loss": 1.8615, "step": 902 }, { "epoch": 0.08026666666666667, "grad_norm": 86.97764587402344, "learning_rate": 3e-06, "loss": 2.8386, "step": 903 }, { "epoch": 0.08035555555555556, "grad_norm": 105.64205932617188, "learning_rate": 3e-06, "loss": -3.9129, "step": 904 }, { "epoch": 0.08044444444444444, "grad_norm": 80.34683227539062, "learning_rate": 3e-06, "loss": 1.4293, "step": 905 }, { "epoch": 0.08053333333333333, "grad_norm": 91.16341400146484, "learning_rate": 3e-06, "loss": 2.803, "step": 906 }, { "epoch": 0.08062222222222222, "grad_norm": 96.49407196044922, "learning_rate": 3e-06, "loss": 2.4431, "step": 907 }, { "epoch": 0.08071111111111111, "grad_norm": 84.40055084228516, "learning_rate": 3e-06, "loss": 0.5478, "step": 908 }, { "epoch": 0.0808, "grad_norm": 78.79622650146484, "learning_rate": 3e-06, "loss": 1.4422, "step": 909 }, { "epoch": 0.08088888888888889, "grad_norm": 128.47531127929688, "learning_rate": 3e-06, "loss": -5.1302, "step": 910 }, { "epoch": 0.08097777777777777, "grad_norm": 79.19956970214844, "learning_rate": 3e-06, "loss": -0.2354, "step": 911 }, { "epoch": 0.08106666666666666, "grad_norm": 107.92975616455078, "learning_rate": 3e-06, "loss": 1.3494, "step": 912 }, { "completion_length": 236.7916717529297, "epoch": 0.08115555555555555, "grad_norm": 72.56483459472656, "learning_rate": 3e-06, "loss": -4.7678, "reward": 1.0, "reward_std": 0.23116151988506317, "rewards/boxed_and_answer_tags_format_reward": 0.625, "rewards/correctness_reward_func_math": 0.375, "step": 913, "zero_std_ratio": 0.75 }, { "epoch": 0.08124444444444444, "grad_norm": 84.60346984863281, "learning_rate": 3e-06, "loss": -1.6215, "step": 914 }, { "epoch": 0.08133333333333333, "grad_norm": 80.78106689453125, "learning_rate": 3e-06, "loss": 3.2587, "step": 915 }, { "epoch": 0.08142222222222223, "grad_norm": 71.9332275390625, "learning_rate": 3e-06, "loss": -4.7685, "step": 916 }, { "epoch": 0.08151111111111112, "grad_norm": 98.66748046875, "learning_rate": 3e-06, "loss": -6.589, "step": 917 }, { "epoch": 0.0816, "grad_norm": 135.69175720214844, "learning_rate": 3e-06, "loss": -7.5017, "step": 918 }, { "epoch": 0.0816888888888889, "grad_norm": 73.60437774658203, "learning_rate": 3e-06, "loss": -5.1495, "step": 919 }, { "epoch": 0.08177777777777778, "grad_norm": 90.75928497314453, "learning_rate": 3e-06, "loss": -2.384, "step": 920 }, { "epoch": 0.08186666666666667, "grad_norm": 82.05548095703125, "learning_rate": 3e-06, "loss": 2.8112, "step": 921 }, { "epoch": 0.08195555555555556, "grad_norm": 77.72819519042969, "learning_rate": 3e-06, "loss": -5.4085, "step": 922 }, { "epoch": 0.08204444444444445, "grad_norm": 100.81270599365234, "learning_rate": 3e-06, "loss": -7.311, "step": 923 }, { "epoch": 0.08213333333333334, "grad_norm": 86.93998718261719, "learning_rate": 3e-06, "loss": -7.627, "step": 924 }, { "completion_length": 244.2916717529297, "epoch": 0.08222222222222222, "grad_norm": 124.40040588378906, "learning_rate": 3e-06, "loss": 2.5607, "reward": 1.7708333730697632, "reward_std": 0.5440726578235626, "rewards/boxed_and_answer_tags_format_reward": 0.7291666567325592, "rewards/correctness_reward_func_math": 1.0416666567325592, "step": 925, "zero_std_ratio": 0.375 }, { "epoch": 0.08231111111111111, "grad_norm": 100.88472747802734, "learning_rate": 3e-06, "loss": -11.1251, "step": 926 }, { "epoch": 0.0824, "grad_norm": 139.0868377685547, "learning_rate": 3e-06, "loss": -4.6789, "step": 927 }, { "epoch": 0.08248888888888889, "grad_norm": 105.37358093261719, "learning_rate": 3e-06, "loss": -1.7812, "step": 928 }, { "epoch": 0.08257777777777778, "grad_norm": 122.72453308105469, "learning_rate": 3e-06, "loss": 5.9917, "step": 929 }, { "epoch": 0.08266666666666667, "grad_norm": 125.92015075683594, "learning_rate": 3e-06, "loss": -2.2776, "step": 930 }, { "epoch": 0.08275555555555555, "grad_norm": 122.41661834716797, "learning_rate": 3e-06, "loss": 0.8417, "step": 931 }, { "epoch": 0.08284444444444444, "grad_norm": 117.53387451171875, "learning_rate": 3e-06, "loss": -13.1484, "step": 932 }, { "epoch": 0.08293333333333333, "grad_norm": 126.8305892944336, "learning_rate": 3e-06, "loss": -7.6824, "step": 933 }, { "epoch": 0.08302222222222222, "grad_norm": 111.0191421508789, "learning_rate": 3e-06, "loss": -3.9524, "step": 934 }, { "epoch": 0.08311111111111111, "grad_norm": 131.84397888183594, "learning_rate": 3e-06, "loss": 2.9848, "step": 935 }, { "epoch": 0.0832, "grad_norm": 124.30160522460938, "learning_rate": 3e-06, "loss": -5.5558, "step": 936 }, { "completion_length": 240.9791717529297, "epoch": 0.08328888888888888, "grad_norm": 137.87579345703125, "learning_rate": 3e-06, "loss": -18.5164, "reward": 1.1354166865348816, "reward_std": 0.4608011841773987, "rewards/boxed_and_answer_tags_format_reward": 0.6770833432674408, "rewards/correctness_reward_func_math": 0.4583333432674408, "step": 937, "zero_std_ratio": 0.375 }, { "epoch": 0.08337777777777777, "grad_norm": 132.598388671875, "learning_rate": 3e-06, "loss": -26.5032, "step": 938 }, { "epoch": 0.08346666666666666, "grad_norm": 147.24671936035156, "learning_rate": 3e-06, "loss": -20.2196, "step": 939 }, { "epoch": 0.08355555555555555, "grad_norm": 140.57591247558594, "learning_rate": 3e-06, "loss": -19.0462, "step": 940 }, { "epoch": 0.08364444444444444, "grad_norm": 124.26339721679688, "learning_rate": 3e-06, "loss": -22.6895, "step": 941 }, { "epoch": 0.08373333333333334, "grad_norm": 126.45221710205078, "learning_rate": 3e-06, "loss": -24.6262, "step": 942 }, { "epoch": 0.08382222222222223, "grad_norm": 135.6764373779297, "learning_rate": 3e-06, "loss": -20.8866, "step": 943 }, { "epoch": 0.08391111111111112, "grad_norm": 139.4601287841797, "learning_rate": 3e-06, "loss": -28.928, "step": 944 }, { "epoch": 0.084, "grad_norm": 173.5882568359375, "learning_rate": 3e-06, "loss": -23.599, "step": 945 }, { "epoch": 0.0840888888888889, "grad_norm": 131.79933166503906, "learning_rate": 3e-06, "loss": -22.4616, "step": 946 }, { "epoch": 0.08417777777777778, "grad_norm": 128.2574920654297, "learning_rate": 3e-06, "loss": -26.6084, "step": 947 }, { "epoch": 0.08426666666666667, "grad_norm": 133.56024169921875, "learning_rate": 3e-06, "loss": -29.0544, "step": 948 }, { "completion_length": 254.3541717529297, "epoch": 0.08435555555555556, "grad_norm": 113.98381805419922, "learning_rate": 3e-06, "loss": 2.0708, "reward": 1.291666716337204, "reward_std": 0.4701542258262634, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 0.541666679084301, "step": 949, "zero_std_ratio": 0.5 }, { "epoch": 0.08444444444444445, "grad_norm": 158.377197265625, "learning_rate": 3e-06, "loss": 0.0861, "step": 950 }, { "epoch": 0.08453333333333334, "grad_norm": 152.6723175048828, "learning_rate": 3e-06, "loss": 8.1982, "step": 951 }, { "epoch": 0.08462222222222222, "grad_norm": 122.41393280029297, "learning_rate": 3e-06, "loss": -2.2863, "step": 952 }, { "epoch": 0.08471111111111111, "grad_norm": 137.0810089111328, "learning_rate": 3e-06, "loss": -1.879, "step": 953 }, { "epoch": 0.0848, "grad_norm": 122.0219497680664, "learning_rate": 3e-06, "loss": -4.6058, "step": 954 }, { "epoch": 0.08488888888888889, "grad_norm": 122.55841064453125, "learning_rate": 3e-06, "loss": 0.6988, "step": 955 }, { "epoch": 0.08497777777777778, "grad_norm": 144.296875, "learning_rate": 3e-06, "loss": -2.2706, "step": 956 }, { "epoch": 0.08506666666666667, "grad_norm": 130.82684326171875, "learning_rate": 3e-06, "loss": 6.122, "step": 957 }, { "epoch": 0.08515555555555555, "grad_norm": 121.61994934082031, "learning_rate": 3e-06, "loss": -4.2647, "step": 958 }, { "epoch": 0.08524444444444444, "grad_norm": 124.73589324951172, "learning_rate": 3e-06, "loss": -4.8135, "step": 959 }, { "epoch": 0.08533333333333333, "grad_norm": 109.87874603271484, "learning_rate": 3e-06, "loss": -7.1554, "step": 960 }, { "completion_length": 244.50000762939453, "epoch": 0.08542222222222222, "grad_norm": 74.7403793334961, "learning_rate": 3e-06, "loss": 9.6961, "reward": 2.1666667461395264, "reward_std": 0.20412414520978928, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 1.4166666269302368, "step": 961, "zero_std_ratio": 0.75 }, { "epoch": 0.08551111111111111, "grad_norm": 68.04969024658203, "learning_rate": 3e-06, "loss": 12.6362, "step": 962 }, { "epoch": 0.0856, "grad_norm": 69.76187133789062, "learning_rate": 3e-06, "loss": 13.3129, "step": 963 }, { "epoch": 0.08568888888888888, "grad_norm": 81.5944595336914, "learning_rate": 3e-06, "loss": 9.7628, "step": 964 }, { "epoch": 0.08577777777777777, "grad_norm": 83.20171356201172, "learning_rate": 3e-06, "loss": 8.2983, "step": 965 }, { "epoch": 0.08586666666666666, "grad_norm": 78.74623107910156, "learning_rate": 3e-06, "loss": 3.5912, "step": 966 }, { "epoch": 0.08595555555555555, "grad_norm": 65.22360229492188, "learning_rate": 3e-06, "loss": 8.6179, "step": 967 }, { "epoch": 0.08604444444444445, "grad_norm": 67.84490966796875, "learning_rate": 3e-06, "loss": 10.9563, "step": 968 }, { "epoch": 0.08613333333333334, "grad_norm": 66.93883514404297, "learning_rate": 3e-06, "loss": 11.5826, "step": 969 }, { "epoch": 0.08622222222222223, "grad_norm": 75.27574157714844, "learning_rate": 3e-06, "loss": 8.5011, "step": 970 }, { "epoch": 0.08631111111111112, "grad_norm": 68.24022674560547, "learning_rate": 3e-06, "loss": 6.8511, "step": 971 }, { "epoch": 0.0864, "grad_norm": 70.8918685913086, "learning_rate": 3e-06, "loss": 2.4563, "step": 972 }, { "completion_length": 251.0416717529297, "epoch": 0.08648888888888889, "grad_norm": 130.6646270751953, "learning_rate": 3e-06, "loss": -5.8085, "reward": 1.4479166865348816, "reward_std": 0.5305383503437042, "rewards/boxed_and_answer_tags_format_reward": 0.7395833432674408, "rewards/correctness_reward_func_math": 0.708333358168602, "step": 973, "zero_std_ratio": 0.375 }, { "epoch": 0.08657777777777778, "grad_norm": 134.2749786376953, "learning_rate": 3e-06, "loss": 1.4252, "step": 974 }, { "epoch": 0.08666666666666667, "grad_norm": 137.93409729003906, "learning_rate": 3e-06, "loss": -1.3707, "step": 975 }, { "epoch": 0.08675555555555556, "grad_norm": 121.03262329101562, "learning_rate": 3e-06, "loss": 1.4106, "step": 976 }, { "epoch": 0.08684444444444445, "grad_norm": 132.82774353027344, "learning_rate": 3e-06, "loss": -4.6112, "step": 977 }, { "epoch": 0.08693333333333333, "grad_norm": 163.79421997070312, "learning_rate": 3e-06, "loss": -9.213, "step": 978 }, { "epoch": 0.08702222222222222, "grad_norm": 135.431396484375, "learning_rate": 3e-06, "loss": -9.618, "step": 979 }, { "epoch": 0.08711111111111111, "grad_norm": 130.899658203125, "learning_rate": 3e-06, "loss": -2.0931, "step": 980 }, { "epoch": 0.0872, "grad_norm": 133.3518524169922, "learning_rate": 3e-06, "loss": -5.8637, "step": 981 }, { "epoch": 0.08728888888888889, "grad_norm": 144.94068908691406, "learning_rate": 3e-06, "loss": -2.6211, "step": 982 }, { "epoch": 0.08737777777777778, "grad_norm": 141.72738647460938, "learning_rate": 3e-06, "loss": -9.573, "step": 983 }, { "epoch": 0.08746666666666666, "grad_norm": 148.67123413085938, "learning_rate": 3e-06, "loss": -12.8677, "step": 984 }, { "completion_length": 255.89583587646484, "epoch": 0.08755555555555555, "grad_norm": 148.83518981933594, "learning_rate": 3e-06, "loss": -2.2086, "reward": 1.3958333432674408, "reward_std": 0.48216672986745834, "rewards/boxed_and_answer_tags_format_reward": 0.7291666865348816, "rewards/correctness_reward_func_math": 0.6666666865348816, "step": 985, "zero_std_ratio": 0.375 }, { "epoch": 0.08764444444444444, "grad_norm": 141.38470458984375, "learning_rate": 3e-06, "loss": -5.2298, "step": 986 }, { "epoch": 0.08773333333333333, "grad_norm": 114.79426574707031, "learning_rate": 3e-06, "loss": 0.6894, "step": 987 }, { "epoch": 0.08782222222222222, "grad_norm": 137.67416381835938, "learning_rate": 3e-06, "loss": 0.8801, "step": 988 }, { "epoch": 0.0879111111111111, "grad_norm": 138.06517028808594, "learning_rate": 3e-06, "loss": -1.6148, "step": 989 }, { "epoch": 0.088, "grad_norm": 153.39608764648438, "learning_rate": 3e-06, "loss": 0.5819, "step": 990 }, { "epoch": 0.08808888888888888, "grad_norm": 159.18431091308594, "learning_rate": 3e-06, "loss": -3.8668, "step": 991 }, { "epoch": 0.08817777777777777, "grad_norm": 141.29696655273438, "learning_rate": 3e-06, "loss": -7.4773, "step": 992 }, { "epoch": 0.08826666666666666, "grad_norm": 129.83058166503906, "learning_rate": 3e-06, "loss": -1.6063, "step": 993 }, { "epoch": 0.08835555555555556, "grad_norm": 133.49261474609375, "learning_rate": 3e-06, "loss": -0.5547, "step": 994 }, { "epoch": 0.08844444444444445, "grad_norm": 125.11674499511719, "learning_rate": 3e-06, "loss": -3.5547, "step": 995 }, { "epoch": 0.08853333333333334, "grad_norm": 113.9607925415039, "learning_rate": 3e-06, "loss": -1.1078, "step": 996 }, { "completion_length": 254.1041717529297, "epoch": 0.08862222222222223, "grad_norm": 112.0015869140625, "learning_rate": 3e-06, "loss": 0.6874, "reward": 1.6666666865348816, "reward_std": 0.3557328134775162, "rewards/boxed_and_answer_tags_format_reward": 0.7083333432674408, "rewards/correctness_reward_func_math": 0.9583333432674408, "step": 997, "zero_std_ratio": 0.375 }, { "epoch": 0.08871111111111112, "grad_norm": 117.1604232788086, "learning_rate": 3e-06, "loss": 10.6509, "step": 998 }, { "epoch": 0.0888, "grad_norm": 97.7497787475586, "learning_rate": 3e-06, "loss": 1.431, "step": 999 }, { "epoch": 0.08888888888888889, "grad_norm": 99.68553161621094, "learning_rate": 3e-06, "loss": 9.1808, "step": 1000 }, { "epoch": 0.08897777777777778, "grad_norm": 127.76787567138672, "learning_rate": 3e-06, "loss": 7.5762, "step": 1001 }, { "epoch": 0.08906666666666667, "grad_norm": 93.00830841064453, "learning_rate": 3e-06, "loss": 5.1825, "step": 1002 }, { "epoch": 0.08915555555555556, "grad_norm": 100.87223815917969, "learning_rate": 3e-06, "loss": -0.2862, "step": 1003 }, { "epoch": 0.08924444444444445, "grad_norm": 125.37133026123047, "learning_rate": 3e-06, "loss": 9.2038, "step": 1004 }, { "epoch": 0.08933333333333333, "grad_norm": 107.84559631347656, "learning_rate": 3e-06, "loss": -0.0948, "step": 1005 }, { "epoch": 0.08942222222222222, "grad_norm": 126.59029388427734, "learning_rate": 3e-06, "loss": 7.2332, "step": 1006 }, { "epoch": 0.08951111111111111, "grad_norm": 120.74652099609375, "learning_rate": 3e-06, "loss": 6.3094, "step": 1007 }, { "epoch": 0.0896, "grad_norm": 94.37996673583984, "learning_rate": 3e-06, "loss": 3.469, "step": 1008 }, { "completion_length": 242.25000762939453, "epoch": 0.08968888888888889, "grad_norm": 61.877044677734375, "learning_rate": 3e-06, "loss": 0.3003, "reward": 1.0104166865348816, "reward_std": 0.11467799544334412, "rewards/boxed_and_answer_tags_format_reward": 0.6770833432674408, "rewards/correctness_reward_func_math": 0.3333333358168602, "step": 1009, "zero_std_ratio": 0.875 }, { "epoch": 0.08977777777777778, "grad_norm": 48.69044494628906, "learning_rate": 3e-06, "loss": -1.93, "step": 1010 }, { "epoch": 0.08986666666666666, "grad_norm": 62.56666946411133, "learning_rate": 3e-06, "loss": 0.7216, "step": 1011 }, { "epoch": 0.08995555555555555, "grad_norm": 54.1674690246582, "learning_rate": 3e-06, "loss": -1.7886, "step": 1012 }, { "epoch": 0.09004444444444444, "grad_norm": 52.60224533081055, "learning_rate": 3e-06, "loss": 0.5879, "step": 1013 }, { "epoch": 0.09013333333333333, "grad_norm": 45.58321762084961, "learning_rate": 3e-06, "loss": 0.5204, "step": 1014 }, { "epoch": 0.09022222222222222, "grad_norm": 57.8793830871582, "learning_rate": 3e-06, "loss": -0.4941, "step": 1015 }, { "epoch": 0.0903111111111111, "grad_norm": 51.80791091918945, "learning_rate": 3e-06, "loss": -2.7426, "step": 1016 }, { "epoch": 0.0904, "grad_norm": 56.86159896850586, "learning_rate": 3e-06, "loss": -0.3923, "step": 1017 }, { "epoch": 0.09048888888888888, "grad_norm": 48.4435920715332, "learning_rate": 3e-06, "loss": -3.0717, "step": 1018 }, { "epoch": 0.09057777777777777, "grad_norm": 52.369598388671875, "learning_rate": 3e-06, "loss": -0.6683, "step": 1019 }, { "epoch": 0.09066666666666667, "grad_norm": 45.13036346435547, "learning_rate": 3e-06, "loss": -0.8826, "step": 1020 }, { "completion_length": 251.9166717529297, "epoch": 0.09075555555555556, "grad_norm": 0.0, "learning_rate": 3e-06, "loss": 0.0, "reward": 1.5, "reward_std": 0.0, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 0.75, "step": 1021, "zero_std_ratio": 1.0 }, { "epoch": 0.09084444444444445, "grad_norm": 0.0, "learning_rate": 3e-06, "loss": 0.0, "step": 1022 }, { "epoch": 0.09093333333333334, "grad_norm": 0.0, "learning_rate": 3e-06, "loss": 0.0, "step": 1023 }, { "epoch": 0.09102222222222223, "grad_norm": 0.0, "learning_rate": 3e-06, "loss": 0.0, "step": 1024 }, { "epoch": 0.09111111111111111, "grad_norm": 0.0, "learning_rate": 3e-06, "loss": 0.0, "step": 1025 }, { "epoch": 0.0912, "grad_norm": 0.0, "learning_rate": 3e-06, "loss": 0.0, "step": 1026 }, { "epoch": 0.09128888888888889, "grad_norm": 0.0, "learning_rate": 3e-06, "loss": 0.0, "step": 1027 }, { "epoch": 0.09137777777777778, "grad_norm": 0.0, "learning_rate": 3e-06, "loss": 0.0, "step": 1028 }, { "epoch": 0.09146666666666667, "grad_norm": 0.0, "learning_rate": 3e-06, "loss": 0.0, "step": 1029 }, { "epoch": 0.09155555555555556, "grad_norm": 0.0, "learning_rate": 3e-06, "loss": 0.0, "step": 1030 }, { "epoch": 0.09164444444444445, "grad_norm": 0.0, "learning_rate": 3e-06, "loss": 0.0, "step": 1031 }, { "epoch": 0.09173333333333333, "grad_norm": 0.0, "learning_rate": 3e-06, "loss": 0.0, "step": 1032 }, { "completion_length": 252.3541717529297, "epoch": 0.09182222222222222, "grad_norm": 126.83654022216797, "learning_rate": 3e-06, "loss": -8.3335, "reward": 1.625, "reward_std": 0.3410547971725464, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 0.875, "step": 1033, "zero_std_ratio": 0.625 }, { "epoch": 0.09191111111111111, "grad_norm": 102.25955963134766, "learning_rate": 3e-06, "loss": -6.4906, "step": 1034 }, { "epoch": 0.092, "grad_norm": 128.81582641601562, "learning_rate": 3e-06, "loss": -6.558, "step": 1035 }, { "epoch": 0.09208888888888889, "grad_norm": 112.67058563232422, "learning_rate": 3e-06, "loss": -9.3061, "step": 1036 }, { "epoch": 0.09217777777777778, "grad_norm": 108.50650024414062, "learning_rate": 3e-06, "loss": -6.0381, "step": 1037 }, { "epoch": 0.09226666666666666, "grad_norm": 108.7009506225586, "learning_rate": 3e-06, "loss": 0.0853, "step": 1038 }, { "epoch": 0.09235555555555555, "grad_norm": 114.24646759033203, "learning_rate": 3e-06, "loss": -9.0476, "step": 1039 }, { "epoch": 0.09244444444444444, "grad_norm": 99.69547271728516, "learning_rate": 3e-06, "loss": -8.1766, "step": 1040 }, { "epoch": 0.09253333333333333, "grad_norm": 142.9137725830078, "learning_rate": 3e-06, "loss": -7.6658, "step": 1041 }, { "epoch": 0.09262222222222222, "grad_norm": 113.05297088623047, "learning_rate": 3e-06, "loss": -10.693, "step": 1042 }, { "epoch": 0.0927111111111111, "grad_norm": 114.48544311523438, "learning_rate": 3e-06, "loss": -8.4665, "step": 1043 }, { "epoch": 0.0928, "grad_norm": 131.56544494628906, "learning_rate": 3e-06, "loss": -1.0749, "step": 1044 }, { "completion_length": 251.3125, "epoch": 0.09288888888888888, "grad_norm": 162.5680389404297, "learning_rate": 3e-06, "loss": 2.2852, "reward": 1.4479166865348816, "reward_std": 0.5932036638259888, "rewards/boxed_and_answer_tags_format_reward": 0.7395833432674408, "rewards/correctness_reward_func_math": 0.708333358168602, "step": 1045, "zero_std_ratio": 0.375 }, { "epoch": 0.09297777777777778, "grad_norm": 183.73635864257812, "learning_rate": 3e-06, "loss": 4.3862, "step": 1046 }, { "epoch": 0.09306666666666667, "grad_norm": 149.2581329345703, "learning_rate": 3e-06, "loss": 3.2506, "step": 1047 }, { "epoch": 0.09315555555555556, "grad_norm": 146.53892517089844, "learning_rate": 3e-06, "loss": 6.7977, "step": 1048 }, { "epoch": 0.09324444444444445, "grad_norm": 187.605224609375, "learning_rate": 3e-06, "loss": -4.9329, "step": 1049 }, { "epoch": 0.09333333333333334, "grad_norm": 135.69638061523438, "learning_rate": 3e-06, "loss": 1.364, "step": 1050 }, { "epoch": 0.09342222222222223, "grad_norm": 155.4630126953125, "learning_rate": 3e-06, "loss": -0.1606, "step": 1051 }, { "epoch": 0.09351111111111111, "grad_norm": 137.24606323242188, "learning_rate": 3e-06, "loss": 2.4667, "step": 1052 }, { "epoch": 0.0936, "grad_norm": 145.3031768798828, "learning_rate": 3e-06, "loss": 0.3328, "step": 1053 }, { "epoch": 0.09368888888888889, "grad_norm": 138.85008239746094, "learning_rate": 3e-06, "loss": 3.6961, "step": 1054 }, { "epoch": 0.09377777777777778, "grad_norm": 151.01002502441406, "learning_rate": 3e-06, "loss": -8.223, "step": 1055 }, { "epoch": 0.09386666666666667, "grad_norm": 130.21697998046875, "learning_rate": 3e-06, "loss": -1.2589, "step": 1056 }, { "completion_length": 255.4791717529297, "epoch": 0.09395555555555556, "grad_norm": 129.92930603027344, "learning_rate": 3e-06, "loss": -4.7404, "reward": 1.0, "reward_std": 0.43528565764427185, "rewards/boxed_and_answer_tags_format_reward": 0.625, "rewards/correctness_reward_func_math": 0.375, "step": 1057, "zero_std_ratio": 0.5 }, { "epoch": 0.09404444444444444, "grad_norm": 134.7594451904297, "learning_rate": 3e-06, "loss": -7.4851, "step": 1058 }, { "epoch": 0.09413333333333333, "grad_norm": 145.62059020996094, "learning_rate": 3e-06, "loss": -11.0884, "step": 1059 }, { "epoch": 0.09422222222222222, "grad_norm": 116.93612670898438, "learning_rate": 3e-06, "loss": 6.683, "step": 1060 }, { "epoch": 0.09431111111111111, "grad_norm": 126.82006072998047, "learning_rate": 3e-06, "loss": -7.7905, "step": 1061 }, { "epoch": 0.0944, "grad_norm": 252.49966430664062, "learning_rate": 3e-06, "loss": 4.6391, "step": 1062 }, { "epoch": 0.09448888888888889, "grad_norm": 129.0404052734375, "learning_rate": 3e-06, "loss": -6.2658, "step": 1063 }, { "epoch": 0.09457777777777777, "grad_norm": 140.08370971679688, "learning_rate": 3e-06, "loss": -8.4286, "step": 1064 }, { "epoch": 0.09466666666666666, "grad_norm": 114.8161392211914, "learning_rate": 3e-06, "loss": -12.9169, "step": 1065 }, { "epoch": 0.09475555555555555, "grad_norm": 112.29281616210938, "learning_rate": 3e-06, "loss": 4.7452, "step": 1066 }, { "epoch": 0.09484444444444444, "grad_norm": 140.55029296875, "learning_rate": 3e-06, "loss": -9.5303, "step": 1067 }, { "epoch": 0.09493333333333333, "grad_norm": 147.0571746826172, "learning_rate": 3e-06, "loss": 3.0677, "step": 1068 }, { "completion_length": 245.0416717529297, "epoch": 0.09502222222222222, "grad_norm": 101.13983154296875, "learning_rate": 3e-06, "loss": 7.3268, "reward": 0.7916666865348816, "reward_std": 0.27350127696990967, "rewards/boxed_and_answer_tags_format_reward": 0.4999999850988388, "rewards/correctness_reward_func_math": 0.2916666679084301, "step": 1069, "zero_std_ratio": 0.625 }, { "epoch": 0.0951111111111111, "grad_norm": 121.06708526611328, "learning_rate": 3e-06, "loss": 4.489, "step": 1070 }, { "epoch": 0.0952, "grad_norm": 127.9291763305664, "learning_rate": 3e-06, "loss": 1.7732, "step": 1071 }, { "epoch": 0.0952888888888889, "grad_norm": 120.67790222167969, "learning_rate": 3e-06, "loss": 7.2385, "step": 1072 }, { "epoch": 0.09537777777777778, "grad_norm": 98.45962524414062, "learning_rate": 3e-06, "loss": -9.8857, "step": 1073 }, { "epoch": 0.09546666666666667, "grad_norm": 110.4314193725586, "learning_rate": 3e-06, "loss": -1.6899, "step": 1074 }, { "epoch": 0.09555555555555556, "grad_norm": 101.835693359375, "learning_rate": 3e-06, "loss": 5.3228, "step": 1075 }, { "epoch": 0.09564444444444445, "grad_norm": 119.22904205322266, "learning_rate": 3e-06, "loss": 3.0508, "step": 1076 }, { "epoch": 0.09573333333333334, "grad_norm": 126.35284423828125, "learning_rate": 3e-06, "loss": -1.1565, "step": 1077 }, { "epoch": 0.09582222222222223, "grad_norm": 129.94705200195312, "learning_rate": 3e-06, "loss": 4.3463, "step": 1078 }, { "epoch": 0.09591111111111111, "grad_norm": 95.31863403320312, "learning_rate": 3e-06, "loss": -12.0709, "step": 1079 }, { "epoch": 0.096, "grad_norm": 111.38770294189453, "learning_rate": 3e-06, "loss": -4.804, "step": 1080 }, { "completion_length": 227.33333587646484, "epoch": 0.09608888888888889, "grad_norm": 93.04568481445312, "learning_rate": 3e-06, "loss": -1.8293, "reward": 1.2604166865348816, "reward_std": 0.4915197938680649, "rewards/boxed_and_answer_tags_format_reward": 0.6770833432674408, "rewards/correctness_reward_func_math": 0.5833333134651184, "step": 1081, "zero_std_ratio": 0.5 }, { "epoch": 0.09617777777777778, "grad_norm": 186.89431762695312, "learning_rate": 3e-06, "loss": -1.6852, "step": 1082 }, { "epoch": 0.09626666666666667, "grad_norm": 132.6072998046875, "learning_rate": 3e-06, "loss": -19.4138, "step": 1083 }, { "epoch": 0.09635555555555556, "grad_norm": 128.269287109375, "learning_rate": 3e-06, "loss": -7.5699, "step": 1084 }, { "epoch": 0.09644444444444444, "grad_norm": 104.99845123291016, "learning_rate": 3e-06, "loss": -14.2839, "step": 1085 }, { "epoch": 0.09653333333333333, "grad_norm": 127.01644897460938, "learning_rate": 3e-06, "loss": -6.2089, "step": 1086 }, { "epoch": 0.09662222222222222, "grad_norm": 99.6088638305664, "learning_rate": 3e-06, "loss": -3.0049, "step": 1087 }, { "epoch": 0.09671111111111111, "grad_norm": 120.30072784423828, "learning_rate": 3e-06, "loss": -3.4027, "step": 1088 }, { "epoch": 0.0968, "grad_norm": 121.11226654052734, "learning_rate": 3e-06, "loss": -20.6903, "step": 1089 }, { "epoch": 0.09688888888888889, "grad_norm": 101.49308013916016, "learning_rate": 3e-06, "loss": -8.6379, "step": 1090 }, { "epoch": 0.09697777777777777, "grad_norm": 117.81327819824219, "learning_rate": 3e-06, "loss": -15.1799, "step": 1091 }, { "epoch": 0.09706666666666666, "grad_norm": 114.92808532714844, "learning_rate": 3e-06, "loss": -8.7164, "step": 1092 }, { "completion_length": 245.68750762939453, "epoch": 0.09715555555555555, "grad_norm": 137.37049865722656, "learning_rate": 3e-06, "loss": -17.5632, "reward": 1.2604166865348816, "reward_std": 0.3936076760292053, "rewards/boxed_and_answer_tags_format_reward": 0.6770833432674408, "rewards/correctness_reward_func_math": 0.5833333134651184, "step": 1093, "zero_std_ratio": 0.5 }, { "epoch": 0.09724444444444444, "grad_norm": 104.14005279541016, "learning_rate": 3e-06, "loss": -13.8548, "step": 1094 }, { "epoch": 0.09733333333333333, "grad_norm": 111.48066711425781, "learning_rate": 3e-06, "loss": -14.4865, "step": 1095 }, { "epoch": 0.09742222222222222, "grad_norm": 115.86460876464844, "learning_rate": 3e-06, "loss": -12.7074, "step": 1096 }, { "epoch": 0.09751111111111112, "grad_norm": 108.50313568115234, "learning_rate": 3e-06, "loss": -17.9489, "step": 1097 }, { "epoch": 0.0976, "grad_norm": 113.4880599975586, "learning_rate": 3e-06, "loss": -16.1087, "step": 1098 }, { "epoch": 0.0976888888888889, "grad_norm": 109.29180145263672, "learning_rate": 3e-06, "loss": -19.2428, "step": 1099 }, { "epoch": 0.09777777777777778, "grad_norm": 107.1020278930664, "learning_rate": 3e-06, "loss": -15.2564, "step": 1100 }, { "epoch": 0.09786666666666667, "grad_norm": 131.5577392578125, "learning_rate": 3e-06, "loss": -15.6807, "step": 1101 }, { "epoch": 0.09795555555555556, "grad_norm": 121.74998474121094, "learning_rate": 3e-06, "loss": -15.0192, "step": 1102 }, { "epoch": 0.09804444444444445, "grad_norm": 107.87700653076172, "learning_rate": 3e-06, "loss": -19.6246, "step": 1103 }, { "epoch": 0.09813333333333334, "grad_norm": 132.6728515625, "learning_rate": 3e-06, "loss": -17.4119, "step": 1104 }, { "completion_length": 247.375, "epoch": 0.09822222222222222, "grad_norm": 194.66749572753906, "learning_rate": 3e-06, "loss": -59.2499, "reward": 1.7083333730697632, "reward_std": 0.6184598803520203, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 0.9583333432674408, "step": 1105, "zero_std_ratio": 0.375 }, { "epoch": 0.09831111111111111, "grad_norm": 179.941162109375, "learning_rate": 3e-06, "loss": -46.7423, "step": 1106 }, { "epoch": 0.0984, "grad_norm": 232.29762268066406, "learning_rate": 3e-06, "loss": -54.0588, "step": 1107 }, { "epoch": 0.09848888888888889, "grad_norm": 208.61793518066406, "learning_rate": 3e-06, "loss": -72.7937, "step": 1108 }, { "epoch": 0.09857777777777778, "grad_norm": 180.18431091308594, "learning_rate": 3e-06, "loss": -37.6563, "step": 1109 }, { "epoch": 0.09866666666666667, "grad_norm": 191.83653259277344, "learning_rate": 3e-06, "loss": -56.5117, "step": 1110 }, { "epoch": 0.09875555555555555, "grad_norm": 192.1591796875, "learning_rate": 3e-06, "loss": -63.0639, "step": 1111 }, { "epoch": 0.09884444444444444, "grad_norm": 183.53610229492188, "learning_rate": 3e-06, "loss": -50.2258, "step": 1112 }, { "epoch": 0.09893333333333333, "grad_norm": 233.22872924804688, "learning_rate": 3e-06, "loss": -58.1688, "step": 1113 }, { "epoch": 0.09902222222222222, "grad_norm": 219.78233337402344, "learning_rate": 3e-06, "loss": -78.4412, "step": 1114 }, { "epoch": 0.09911111111111111, "grad_norm": 189.9258270263672, "learning_rate": 3e-06, "loss": -42.5197, "step": 1115 }, { "epoch": 0.0992, "grad_norm": 204.02183532714844, "learning_rate": 3e-06, "loss": -61.9828, "step": 1116 }, { "completion_length": 248.9166717529297, "epoch": 0.09928888888888888, "grad_norm": 114.85681915283203, "learning_rate": 3e-06, "loss": 1.5984, "reward": 1.2395833730697632, "reward_std": 0.2296396717429161, "rewards/boxed_and_answer_tags_format_reward": 0.7395833432674408, "rewards/correctness_reward_func_math": 0.4999999850988388, "step": 1117, "zero_std_ratio": 0.625 }, { "epoch": 0.09937777777777777, "grad_norm": 113.66478729248047, "learning_rate": 3e-06, "loss": -2.2296, "step": 1118 }, { "epoch": 0.09946666666666666, "grad_norm": 124.4161148071289, "learning_rate": 3e-06, "loss": 4.382, "step": 1119 }, { "epoch": 0.09955555555555555, "grad_norm": 131.39085388183594, "learning_rate": 3e-06, "loss": 0.5734, "step": 1120 }, { "epoch": 0.09964444444444444, "grad_norm": 119.46894073486328, "learning_rate": 3e-06, "loss": 5.0378, "step": 1121 }, { "epoch": 0.09973333333333333, "grad_norm": 113.26507568359375, "learning_rate": 3e-06, "loss": 1.4215, "step": 1122 }, { "epoch": 0.09982222222222223, "grad_norm": 120.69562530517578, "learning_rate": 3e-06, "loss": 0.0203, "step": 1123 }, { "epoch": 0.09991111111111112, "grad_norm": 128.0107421875, "learning_rate": 3e-06, "loss": -3.7284, "step": 1124 }, { "epoch": 0.1, "grad_norm": 139.31997680664062, "learning_rate": 3e-06, "loss": 2.9527, "step": 1125 }, { "epoch": 0.1000888888888889, "grad_norm": 111.49156188964844, "learning_rate": 3e-06, "loss": -1.217, "step": 1126 }, { "epoch": 0.10017777777777778, "grad_norm": 107.92985534667969, "learning_rate": 3e-06, "loss": 3.7084, "step": 1127 }, { "epoch": 0.10026666666666667, "grad_norm": 112.88748168945312, "learning_rate": 3e-06, "loss": -0.5577, "step": 1128 }, { "completion_length": 245.18750762939453, "epoch": 0.10035555555555556, "grad_norm": 124.6043930053711, "learning_rate": 3e-06, "loss": -20.6745, "reward": 1.4583333432674408, "reward_std": 0.5722163170576096, "rewards/boxed_and_answer_tags_format_reward": 0.625, "rewards/correctness_reward_func_math": 0.8333333283662796, "step": 1129, "zero_std_ratio": 0.375 }, { "epoch": 0.10044444444444445, "grad_norm": 118.25672149658203, "learning_rate": 3e-06, "loss": -32.0798, "step": 1130 }, { "epoch": 0.10053333333333334, "grad_norm": 189.2974853515625, "learning_rate": 3e-06, "loss": -23.7609, "step": 1131 }, { "epoch": 0.10062222222222222, "grad_norm": 139.94485473632812, "learning_rate": 3e-06, "loss": -30.0811, "step": 1132 }, { "epoch": 0.10071111111111111, "grad_norm": 132.92324829101562, "learning_rate": 3e-06, "loss": -29.1304, "step": 1133 }, { "epoch": 0.1008, "grad_norm": 129.6322021484375, "learning_rate": 3e-06, "loss": -25.8999, "step": 1134 }, { "epoch": 0.10088888888888889, "grad_norm": 122.53899383544922, "learning_rate": 3e-06, "loss": -22.266, "step": 1135 }, { "epoch": 0.10097777777777778, "grad_norm": 111.6375961303711, "learning_rate": 3e-06, "loss": -32.9394, "step": 1136 }, { "epoch": 0.10106666666666667, "grad_norm": 162.63771057128906, "learning_rate": 3e-06, "loss": -25.9897, "step": 1137 }, { "epoch": 0.10115555555555555, "grad_norm": 154.28424072265625, "learning_rate": 3e-06, "loss": -32.7633, "step": 1138 }, { "epoch": 0.10124444444444444, "grad_norm": 132.94351196289062, "learning_rate": 3e-06, "loss": -31.2033, "step": 1139 }, { "epoch": 0.10133333333333333, "grad_norm": 131.9947052001953, "learning_rate": 3e-06, "loss": -27.4069, "step": 1140 }, { "completion_length": 248.625, "epoch": 0.10142222222222222, "grad_norm": 113.37858581542969, "learning_rate": 3e-06, "loss": -14.7283, "reward": 0.9479166865348816, "reward_std": 0.2296396642923355, "rewards/boxed_and_answer_tags_format_reward": 0.6145833432674408, "rewards/correctness_reward_func_math": 0.3333333246409893, "step": 1141, "zero_std_ratio": 0.625 }, { "epoch": 0.10151111111111111, "grad_norm": 112.18434143066406, "learning_rate": 3e-06, "loss": -11.645, "step": 1142 }, { "epoch": 0.1016, "grad_norm": 105.41796112060547, "learning_rate": 3e-06, "loss": -15.9935, "step": 1143 }, { "epoch": 0.10168888888888888, "grad_norm": 91.3786849975586, "learning_rate": 3e-06, "loss": -16.6043, "step": 1144 }, { "epoch": 0.10177777777777777, "grad_norm": 104.3455810546875, "learning_rate": 3e-06, "loss": -3.4885, "step": 1145 }, { "epoch": 0.10186666666666666, "grad_norm": 87.41930389404297, "learning_rate": 3e-06, "loss": -20.634, "step": 1146 }, { "epoch": 0.10195555555555555, "grad_norm": 90.50940704345703, "learning_rate": 3e-06, "loss": -16.7866, "step": 1147 }, { "epoch": 0.10204444444444444, "grad_norm": 117.73979949951172, "learning_rate": 3e-06, "loss": -13.9565, "step": 1148 }, { "epoch": 0.10213333333333334, "grad_norm": 117.4783706665039, "learning_rate": 3e-06, "loss": -18.3371, "step": 1149 }, { "epoch": 0.10222222222222223, "grad_norm": 90.1675033569336, "learning_rate": 3e-06, "loss": -19.5285, "step": 1150 }, { "epoch": 0.10231111111111112, "grad_norm": 114.65315246582031, "learning_rate": 3e-06, "loss": -6.9923, "step": 1151 }, { "epoch": 0.1024, "grad_norm": 236.66262817382812, "learning_rate": 3e-06, "loss": -23.6847, "step": 1152 }, { "completion_length": 254.0625, "epoch": 0.1024888888888889, "grad_norm": 112.95475769042969, "learning_rate": 3e-06, "loss": -27.7434, "reward": 0.96875, "reward_std": 0.3734789788722992, "rewards/boxed_and_answer_tags_format_reward": 0.71875, "rewards/correctness_reward_func_math": 0.25, "step": 1153, "zero_std_ratio": 0.5 }, { "epoch": 0.10257777777777778, "grad_norm": 144.55364990234375, "learning_rate": 3e-06, "loss": -22.4297, "step": 1154 }, { "epoch": 0.10266666666666667, "grad_norm": 128.1786346435547, "learning_rate": 3e-06, "loss": -34.1587, "step": 1155 }, { "epoch": 0.10275555555555556, "grad_norm": 125.85458374023438, "learning_rate": 3e-06, "loss": -30.6015, "step": 1156 }, { "epoch": 0.10284444444444445, "grad_norm": 142.524658203125, "learning_rate": 3e-06, "loss": -31.4122, "step": 1157 }, { "epoch": 0.10293333333333334, "grad_norm": 111.42877197265625, "learning_rate": 3e-06, "loss": -26.1967, "step": 1158 }, { "epoch": 0.10302222222222222, "grad_norm": 123.39324951171875, "learning_rate": 3e-06, "loss": -29.3019, "step": 1159 }, { "epoch": 0.10311111111111111, "grad_norm": 151.65802001953125, "learning_rate": 3e-06, "loss": -24.3001, "step": 1160 }, { "epoch": 0.1032, "grad_norm": 127.43438720703125, "learning_rate": 3e-06, "loss": -36.1734, "step": 1161 }, { "epoch": 0.10328888888888889, "grad_norm": 123.67347717285156, "learning_rate": 3e-06, "loss": -33.6541, "step": 1162 }, { "epoch": 0.10337777777777778, "grad_norm": 147.1012420654297, "learning_rate": 3e-06, "loss": -33.954, "step": 1163 }, { "epoch": 0.10346666666666667, "grad_norm": 125.47201538085938, "learning_rate": 3e-06, "loss": -28.7879, "step": 1164 }, { "completion_length": 236.64583587646484, "epoch": 0.10355555555555555, "grad_norm": 348.24346923828125, "learning_rate": 3e-06, "loss": -9.8131, "reward": 1.4375000596046448, "reward_std": 0.4971916079521179, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.75, "step": 1165, "zero_std_ratio": 0.5 }, { "epoch": 0.10364444444444444, "grad_norm": 189.84584045410156, "learning_rate": 3e-06, "loss": -5.3377, "step": 1166 }, { "epoch": 0.10373333333333333, "grad_norm": 192.2544403076172, "learning_rate": 3e-06, "loss": -1.7722, "step": 1167 }, { "epoch": 0.10382222222222222, "grad_norm": 180.6472625732422, "learning_rate": 3e-06, "loss": -6.7487, "step": 1168 }, { "epoch": 0.1039111111111111, "grad_norm": 197.0133819580078, "learning_rate": 3e-06, "loss": -2.4648, "step": 1169 }, { "epoch": 0.104, "grad_norm": 256.5722351074219, "learning_rate": 3e-06, "loss": -17.2997, "step": 1170 }, { "epoch": 0.10408888888888888, "grad_norm": 179.86228942871094, "learning_rate": 3e-06, "loss": -12.5209, "step": 1171 }, { "epoch": 0.10417777777777777, "grad_norm": 195.3128204345703, "learning_rate": 3e-06, "loss": -9.6999, "step": 1172 }, { "epoch": 0.10426666666666666, "grad_norm": 204.9373321533203, "learning_rate": 3e-06, "loss": -6.9052, "step": 1173 }, { "epoch": 0.10435555555555555, "grad_norm": 192.8905792236328, "learning_rate": 3e-06, "loss": -10.4987, "step": 1174 }, { "epoch": 0.10444444444444445, "grad_norm": 181.99449157714844, "learning_rate": 3e-06, "loss": -7.0688, "step": 1175 }, { "epoch": 0.10453333333333334, "grad_norm": 189.1867218017578, "learning_rate": 3e-06, "loss": -22.4777, "step": 1176 }, { "completion_length": 250.33333587646484, "epoch": 0.10462222222222223, "grad_norm": 158.67459106445312, "learning_rate": 3e-06, "loss": -8.4502, "reward": 0.9583333432674408, "reward_std": 0.5451789498329163, "rewards/boxed_and_answer_tags_format_reward": 0.5, "rewards/correctness_reward_func_math": 0.4583333358168602, "step": 1177, "zero_std_ratio": 0.375 }, { "epoch": 0.10471111111111112, "grad_norm": 158.85533142089844, "learning_rate": 3e-06, "loss": -13.1446, "step": 1178 }, { "epoch": 0.1048, "grad_norm": 155.9822235107422, "learning_rate": 3e-06, "loss": -6.7899, "step": 1179 }, { "epoch": 0.10488888888888889, "grad_norm": 150.00985717773438, "learning_rate": 3e-06, "loss": 5.268, "step": 1180 }, { "epoch": 0.10497777777777778, "grad_norm": 140.22618103027344, "learning_rate": 3e-06, "loss": -3.0269, "step": 1181 }, { "epoch": 0.10506666666666667, "grad_norm": 130.9547119140625, "learning_rate": 3e-06, "loss": -6.5307, "step": 1182 }, { "epoch": 0.10515555555555556, "grad_norm": 157.96466064453125, "learning_rate": 3e-06, "loss": -10.1506, "step": 1183 }, { "epoch": 0.10524444444444445, "grad_norm": 162.57582092285156, "learning_rate": 3e-06, "loss": -14.3085, "step": 1184 }, { "epoch": 0.10533333333333333, "grad_norm": 163.5466766357422, "learning_rate": 3e-06, "loss": -9.5339, "step": 1185 }, { "epoch": 0.10542222222222222, "grad_norm": 126.0348129272461, "learning_rate": 3e-06, "loss": 2.7285, "step": 1186 }, { "epoch": 0.10551111111111111, "grad_norm": 152.0486297607422, "learning_rate": 3e-06, "loss": -5.0055, "step": 1187 }, { "epoch": 0.1056, "grad_norm": 135.672607421875, "learning_rate": 3e-06, "loss": -7.9305, "step": 1188 }, { "completion_length": 242.43750762939453, "epoch": 0.10568888888888889, "grad_norm": 102.83517456054688, "learning_rate": 3e-06, "loss": -9.6079, "reward": 1.5833333730697632, "reward_std": 0.3332235962152481, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 0.8333333134651184, "step": 1189, "zero_std_ratio": 0.625 }, { "epoch": 0.10577777777777778, "grad_norm": 104.57038116455078, "learning_rate": 3e-06, "loss": 3.4859, "step": 1190 }, { "epoch": 0.10586666666666666, "grad_norm": 130.5141143798828, "learning_rate": 3e-06, "loss": 9.6096, "step": 1191 }, { "epoch": 0.10595555555555555, "grad_norm": 121.0637435913086, "learning_rate": 3e-06, "loss": -7.0112, "step": 1192 }, { "epoch": 0.10604444444444444, "grad_norm": 116.66060638427734, "learning_rate": 3e-06, "loss": -0.6318, "step": 1193 }, { "epoch": 0.10613333333333333, "grad_norm": 93.47602844238281, "learning_rate": 3e-06, "loss": -7.121, "step": 1194 }, { "epoch": 0.10622222222222222, "grad_norm": 97.91071319580078, "learning_rate": 3e-06, "loss": -11.2048, "step": 1195 }, { "epoch": 0.1063111111111111, "grad_norm": 99.79684448242188, "learning_rate": 3e-06, "loss": 2.4369, "step": 1196 }, { "epoch": 0.1064, "grad_norm": 103.4743423461914, "learning_rate": 3e-06, "loss": 7.3113, "step": 1197 }, { "epoch": 0.10648888888888888, "grad_norm": 115.71762084960938, "learning_rate": 3e-06, "loss": -8.8378, "step": 1198 }, { "epoch": 0.10657777777777777, "grad_norm": 116.43769073486328, "learning_rate": 3e-06, "loss": -2.6688, "step": 1199 }, { "epoch": 0.10666666666666667, "grad_norm": 93.20524597167969, "learning_rate": 3e-06, "loss": -9.0718, "step": 1200 }, { "completion_length": 242.77083587646484, "epoch": 0.10675555555555556, "grad_norm": 156.0432586669922, "learning_rate": 3e-06, "loss": 6.0861, "reward": 1.6770833730697632, "reward_std": 0.5419133305549622, "rewards/boxed_and_answer_tags_format_reward": 0.6770833432674408, "rewards/correctness_reward_func_math": 1.0, "step": 1201, "zero_std_ratio": 0.375 }, { "epoch": 0.10684444444444445, "grad_norm": 173.78958129882812, "learning_rate": 3e-06, "loss": 9.0247, "step": 1202 }, { "epoch": 0.10693333333333334, "grad_norm": 152.82534790039062, "learning_rate": 3e-06, "loss": 5.6834, "step": 1203 }, { "epoch": 0.10702222222222223, "grad_norm": 151.876953125, "learning_rate": 3e-06, "loss": 6.9018, "step": 1204 }, { "epoch": 0.10711111111111112, "grad_norm": 165.88400268554688, "learning_rate": 3e-06, "loss": 9.1234, "step": 1205 }, { "epoch": 0.1072, "grad_norm": 167.48348999023438, "learning_rate": 3e-06, "loss": 9.4483, "step": 1206 }, { "epoch": 0.10728888888888889, "grad_norm": 149.90536499023438, "learning_rate": 3e-06, "loss": 2.7725, "step": 1207 }, { "epoch": 0.10737777777777778, "grad_norm": 169.23411560058594, "learning_rate": 3e-06, "loss": 6.9296, "step": 1208 }, { "epoch": 0.10746666666666667, "grad_norm": 233.46914672851562, "learning_rate": 3e-06, "loss": 3.5024, "step": 1209 }, { "epoch": 0.10755555555555556, "grad_norm": 147.3250732421875, "learning_rate": 3e-06, "loss": 3.7723, "step": 1210 }, { "epoch": 0.10764444444444445, "grad_norm": 155.88914489746094, "learning_rate": 3e-06, "loss": 7.0329, "step": 1211 }, { "epoch": 0.10773333333333333, "grad_norm": 153.82785034179688, "learning_rate": 3e-06, "loss": 7.0486, "step": 1212 }, { "completion_length": 247.6666717529297, "epoch": 0.10782222222222222, "grad_norm": 158.41856384277344, "learning_rate": 3e-06, "loss": 18.2887, "reward": 1.614583432674408, "reward_std": 0.3665703386068344, "rewards/boxed_and_answer_tags_format_reward": 0.7395833432674408, "rewards/correctness_reward_func_math": 0.8750000149011612, "step": 1213, "zero_std_ratio": 0.5 }, { "epoch": 0.10791111111111111, "grad_norm": 163.51866149902344, "learning_rate": 3e-06, "loss": 5.6461, "step": 1214 }, { "epoch": 0.108, "grad_norm": 131.4403839111328, "learning_rate": 3e-06, "loss": 3.3667, "step": 1215 }, { "epoch": 0.10808888888888889, "grad_norm": 136.77757263183594, "learning_rate": 3e-06, "loss": 4.406, "step": 1216 }, { "epoch": 0.10817777777777778, "grad_norm": 113.4407958984375, "learning_rate": 3e-06, "loss": 12.3995, "step": 1217 }, { "epoch": 0.10826666666666666, "grad_norm": 136.1168212890625, "learning_rate": 3e-06, "loss": 4.6776, "step": 1218 }, { "epoch": 0.10835555555555555, "grad_norm": 144.9372100830078, "learning_rate": 3e-06, "loss": 17.5724, "step": 1219 }, { "epoch": 0.10844444444444444, "grad_norm": 125.29820251464844, "learning_rate": 3e-06, "loss": 4.1424, "step": 1220 }, { "epoch": 0.10853333333333333, "grad_norm": 138.5077667236328, "learning_rate": 3e-06, "loss": 1.3947, "step": 1221 }, { "epoch": 0.10862222222222222, "grad_norm": 128.62693786621094, "learning_rate": 3e-06, "loss": 2.4864, "step": 1222 }, { "epoch": 0.1087111111111111, "grad_norm": 107.87577056884766, "learning_rate": 3e-06, "loss": 10.4239, "step": 1223 }, { "epoch": 0.1088, "grad_norm": 127.07205963134766, "learning_rate": 3e-06, "loss": 1.3109, "step": 1224 }, { "completion_length": 238.89584350585938, "epoch": 0.10888888888888888, "grad_norm": 154.35325622558594, "learning_rate": 3e-06, "loss": 8.6596, "reward": 1.5208333730697632, "reward_std": 0.3680921420454979, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.8333333432674408, "step": 1225, "zero_std_ratio": 0.625 }, { "epoch": 0.10897777777777778, "grad_norm": 165.58445739746094, "learning_rate": 3e-06, "loss": 1.8278, "step": 1226 }, { "epoch": 0.10906666666666667, "grad_norm": 360.30029296875, "learning_rate": 3e-06, "loss": -9.1779, "step": 1227 }, { "epoch": 0.10915555555555556, "grad_norm": 152.518310546875, "learning_rate": 3e-06, "loss": -9.97, "step": 1228 }, { "epoch": 0.10924444444444445, "grad_norm": 146.67100524902344, "learning_rate": 3e-06, "loss": -6.3997, "step": 1229 }, { "epoch": 0.10933333333333334, "grad_norm": 138.9771728515625, "learning_rate": 3e-06, "loss": -6.0985, "step": 1230 }, { "epoch": 0.10942222222222223, "grad_norm": 159.66302490234375, "learning_rate": 3e-06, "loss": 5.5207, "step": 1231 }, { "epoch": 0.10951111111111111, "grad_norm": 138.22695922851562, "learning_rate": 3e-06, "loss": -0.4886, "step": 1232 }, { "epoch": 0.1096, "grad_norm": 160.3970489501953, "learning_rate": 3e-06, "loss": -12.1071, "step": 1233 }, { "epoch": 0.10968888888888889, "grad_norm": 141.63226318359375, "learning_rate": 3e-06, "loss": -13.6489, "step": 1234 }, { "epoch": 0.10977777777777778, "grad_norm": 146.45748901367188, "learning_rate": 3e-06, "loss": -10.0014, "step": 1235 }, { "epoch": 0.10986666666666667, "grad_norm": 140.94992065429688, "learning_rate": 3e-06, "loss": -9.6508, "step": 1236 }, { "completion_length": 255.4166717529297, "epoch": 0.10995555555555556, "grad_norm": 160.60601806640625, "learning_rate": 3e-06, "loss": 1.8826, "reward": 0.8020833432674408, "reward_std": 0.3381742835044861, "rewards/boxed_and_answer_tags_format_reward": 0.6354166865348816, "rewards/correctness_reward_func_math": 0.1666666679084301, "step": 1237, "zero_std_ratio": 0.625 }, { "epoch": 0.11004444444444444, "grad_norm": 186.42630004882812, "learning_rate": 3e-06, "loss": -4.6822, "step": 1238 }, { "epoch": 0.11013333333333333, "grad_norm": 141.29820251464844, "learning_rate": 3e-06, "loss": 11.1707, "step": 1239 }, { "epoch": 0.11022222222222222, "grad_norm": 144.1931610107422, "learning_rate": 3e-06, "loss": 3.3338, "step": 1240 }, { "epoch": 0.11031111111111111, "grad_norm": 135.27630615234375, "learning_rate": 3e-06, "loss": 4.9367, "step": 1241 }, { "epoch": 0.1104, "grad_norm": 175.80433654785156, "learning_rate": 3e-06, "loss": -13.1628, "step": 1242 }, { "epoch": 0.11048888888888889, "grad_norm": 165.0531463623047, "learning_rate": 3e-06, "loss": -0.3721, "step": 1243 }, { "epoch": 0.11057777777777777, "grad_norm": 264.06695556640625, "learning_rate": 3e-06, "loss": -7.2717, "step": 1244 }, { "epoch": 0.11066666666666666, "grad_norm": 145.43423461914062, "learning_rate": 3e-06, "loss": 8.5606, "step": 1245 }, { "epoch": 0.11075555555555555, "grad_norm": 145.75946044921875, "learning_rate": 3e-06, "loss": 0.9458, "step": 1246 }, { "epoch": 0.11084444444444444, "grad_norm": 125.21862030029297, "learning_rate": 3e-06, "loss": 3.1516, "step": 1247 }, { "epoch": 0.11093333333333333, "grad_norm": 177.2153778076172, "learning_rate": 3e-06, "loss": -17.2454, "step": 1248 }, { "completion_length": 251.6041717529297, "epoch": 0.11102222222222222, "grad_norm": 371.7901306152344, "learning_rate": 3e-06, "loss": -30.6046, "reward": 1.9895833730697632, "reward_std": 0.8478911817073822, "rewards/boxed_and_answer_tags_format_reward": 0.7395833432674408, "rewards/correctness_reward_func_math": 1.25, "step": 1249, "zero_std_ratio": 0.125 }, { "epoch": 0.1111111111111111, "grad_norm": 241.84666442871094, "learning_rate": 3e-06, "loss": -47.9038, "step": 1250 }, { "epoch": 0.1112, "grad_norm": 187.25973510742188, "learning_rate": 3e-06, "loss": -31.5847, "step": 1251 }, { "epoch": 0.1112888888888889, "grad_norm": 220.72537231445312, "learning_rate": 3e-06, "loss": -37.9464, "step": 1252 }, { "epoch": 0.11137777777777778, "grad_norm": 215.938720703125, "learning_rate": 3e-06, "loss": -36.917, "step": 1253 }, { "epoch": 0.11146666666666667, "grad_norm": 255.27755737304688, "learning_rate": 3e-06, "loss": -36.4102, "step": 1254 }, { "epoch": 0.11155555555555556, "grad_norm": 252.8043212890625, "learning_rate": 3e-06, "loss": -32.6218, "step": 1255 }, { "epoch": 0.11164444444444445, "grad_norm": 225.40321350097656, "learning_rate": 3e-06, "loss": -51.6723, "step": 1256 }, { "epoch": 0.11173333333333334, "grad_norm": 208.3738250732422, "learning_rate": 3e-06, "loss": -35.6313, "step": 1257 }, { "epoch": 0.11182222222222223, "grad_norm": 215.90704345703125, "learning_rate": 3e-06, "loss": -41.5126, "step": 1258 }, { "epoch": 0.11191111111111111, "grad_norm": 242.8232879638672, "learning_rate": 3e-06, "loss": -42.0952, "step": 1259 }, { "epoch": 0.112, "grad_norm": 220.46678161621094, "learning_rate": 3e-06, "loss": -40.7929, "step": 1260 }, { "completion_length": 252.2291717529297, "epoch": 0.11208888888888889, "grad_norm": 112.288330078125, "learning_rate": 3e-06, "loss": -28.9692, "reward": 0.8333333730697632, "reward_std": 0.23116151988506317, "rewards/boxed_and_answer_tags_format_reward": 0.625, "rewards/correctness_reward_func_math": 0.2083333283662796, "step": 1261, "zero_std_ratio": 0.75 }, { "epoch": 0.11217777777777778, "grad_norm": 111.31488800048828, "learning_rate": 3e-06, "loss": -22.9427, "step": 1262 }, { "epoch": 0.11226666666666667, "grad_norm": 130.2353057861328, "learning_rate": 3e-06, "loss": -30.2762, "step": 1263 }, { "epoch": 0.11235555555555556, "grad_norm": 154.38973999023438, "learning_rate": 3e-06, "loss": -39.3815, "step": 1264 }, { "epoch": 0.11244444444444444, "grad_norm": 109.89620971679688, "learning_rate": 3e-06, "loss": -28.5723, "step": 1265 }, { "epoch": 0.11253333333333333, "grad_norm": 104.00005340576172, "learning_rate": 3e-06, "loss": -25.6052, "step": 1266 }, { "epoch": 0.11262222222222222, "grad_norm": 121.78623962402344, "learning_rate": 3e-06, "loss": -30.0983, "step": 1267 }, { "epoch": 0.11271111111111111, "grad_norm": 119.55603790283203, "learning_rate": 3e-06, "loss": -23.8716, "step": 1268 }, { "epoch": 0.1128, "grad_norm": 124.7007827758789, "learning_rate": 3e-06, "loss": -31.7822, "step": 1269 }, { "epoch": 0.11288888888888889, "grad_norm": 133.42088317871094, "learning_rate": 3e-06, "loss": -42.2768, "step": 1270 }, { "epoch": 0.11297777777777777, "grad_norm": 128.7488555908203, "learning_rate": 3e-06, "loss": -30.9053, "step": 1271 }, { "epoch": 0.11306666666666666, "grad_norm": 128.32147216796875, "learning_rate": 3e-06, "loss": -27.7986, "step": 1272 }, { "completion_length": 251.02083587646484, "epoch": 0.11315555555555555, "grad_norm": 276.54681396484375, "learning_rate": 3e-06, "loss": -8.3431, "reward": 1.7916666865348816, "reward_std": 0.4701542258262634, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 1.0416666567325592, "step": 1273, "zero_std_ratio": 0.5 }, { "epoch": 0.11324444444444444, "grad_norm": 234.2393341064453, "learning_rate": 3e-06, "loss": -13.7623, "step": 1274 }, { "epoch": 0.11333333333333333, "grad_norm": 236.026611328125, "learning_rate": 3e-06, "loss": -4.7751, "step": 1275 }, { "epoch": 0.11342222222222222, "grad_norm": 245.18170166015625, "learning_rate": 3e-06, "loss": -16.0847, "step": 1276 }, { "epoch": 0.1135111111111111, "grad_norm": 314.53057861328125, "learning_rate": 3e-06, "loss": -8.6135, "step": 1277 }, { "epoch": 0.1136, "grad_norm": 207.09188842773438, "learning_rate": 3e-06, "loss": -13.115, "step": 1278 }, { "epoch": 0.1136888888888889, "grad_norm": 252.8507080078125, "learning_rate": 3e-06, "loss": -12.3574, "step": 1279 }, { "epoch": 0.11377777777777778, "grad_norm": 241.2433319091797, "learning_rate": 3e-06, "loss": -18.4114, "step": 1280 }, { "epoch": 0.11386666666666667, "grad_norm": 217.98683166503906, "learning_rate": 3e-06, "loss": -7.0416, "step": 1281 }, { "epoch": 0.11395555555555556, "grad_norm": 252.15773010253906, "learning_rate": 3e-06, "loss": -19.5222, "step": 1282 }, { "epoch": 0.11404444444444445, "grad_norm": 227.17205810546875, "learning_rate": 3e-06, "loss": -13.2521, "step": 1283 }, { "epoch": 0.11413333333333334, "grad_norm": 235.24327087402344, "learning_rate": 3e-06, "loss": -17.9427, "step": 1284 }, { "completion_length": 253.25, "epoch": 0.11422222222222222, "grad_norm": 206.74746704101562, "learning_rate": 3e-06, "loss": 0.218, "reward": 1.7187500596046448, "reward_std": 0.5723656415939331, "rewards/boxed_and_answer_tags_format_reward": 0.6770833432674408, "rewards/correctness_reward_func_math": 1.0416666567325592, "step": 1285, "zero_std_ratio": 0.375 }, { "epoch": 0.11431111111111111, "grad_norm": 207.45079040527344, "learning_rate": 3e-06, "loss": 23.7089, "step": 1286 }, { "epoch": 0.1144, "grad_norm": 201.48416137695312, "learning_rate": 3e-06, "loss": 16.6222, "step": 1287 }, { "epoch": 0.11448888888888889, "grad_norm": 240.6068115234375, "learning_rate": 3e-06, "loss": 2.8229, "step": 1288 }, { "epoch": 0.11457777777777778, "grad_norm": 267.2186279296875, "learning_rate": 3e-06, "loss": -6.3361, "step": 1289 }, { "epoch": 0.11466666666666667, "grad_norm": 254.57681274414062, "learning_rate": 3e-06, "loss": 2.6077, "step": 1290 }, { "epoch": 0.11475555555555556, "grad_norm": 215.68649291992188, "learning_rate": 3e-06, "loss": -2.7734, "step": 1291 }, { "epoch": 0.11484444444444444, "grad_norm": 227.70590209960938, "learning_rate": 3e-06, "loss": 18.5952, "step": 1292 }, { "epoch": 0.11493333333333333, "grad_norm": 215.9313201904297, "learning_rate": 3e-06, "loss": 13.7655, "step": 1293 }, { "epoch": 0.11502222222222222, "grad_norm": 251.1554718017578, "learning_rate": 3e-06, "loss": -1.3305, "step": 1294 }, { "epoch": 0.11511111111111111, "grad_norm": 274.58538818359375, "learning_rate": 3e-06, "loss": -8.8694, "step": 1295 }, { "epoch": 0.1152, "grad_norm": 238.6010284423828, "learning_rate": 3e-06, "loss": -1.8695, "step": 1296 }, { "completion_length": 252.1666717529297, "epoch": 0.11528888888888889, "grad_norm": 220.25823974609375, "learning_rate": 3e-06, "loss": -7.0592, "reward": 1.0625, "reward_std": 0.39512956142425537, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.375, "step": 1297, "zero_std_ratio": 0.625 }, { "epoch": 0.11537777777777777, "grad_norm": 190.90309143066406, "learning_rate": 3e-06, "loss": -21.8086, "step": 1298 }, { "epoch": 0.11546666666666666, "grad_norm": 167.63180541992188, "learning_rate": 3e-06, "loss": -10.4386, "step": 1299 }, { "epoch": 0.11555555555555555, "grad_norm": 196.72048950195312, "learning_rate": 3e-06, "loss": -10.7261, "step": 1300 }, { "epoch": 0.11564444444444444, "grad_norm": 243.5116424560547, "learning_rate": 3e-06, "loss": -21.7371, "step": 1301 }, { "epoch": 0.11573333333333333, "grad_norm": 163.6327667236328, "learning_rate": 3e-06, "loss": -21.7888, "step": 1302 }, { "epoch": 0.11582222222222222, "grad_norm": 217.04978942871094, "learning_rate": 3e-06, "loss": -7.9678, "step": 1303 }, { "epoch": 0.11591111111111112, "grad_norm": 182.5911865234375, "learning_rate": 3e-06, "loss": -22.9758, "step": 1304 }, { "epoch": 0.116, "grad_norm": 167.71888732910156, "learning_rate": 3e-06, "loss": -12.1331, "step": 1305 }, { "epoch": 0.1160888888888889, "grad_norm": 168.7008819580078, "learning_rate": 3e-06, "loss": -13.2942, "step": 1306 }, { "epoch": 0.11617777777777778, "grad_norm": 210.5468292236328, "learning_rate": 3e-06, "loss": -22.5768, "step": 1307 }, { "epoch": 0.11626666666666667, "grad_norm": 179.2998046875, "learning_rate": 3e-06, "loss": -23.8159, "step": 1308 }, { "completion_length": 254.5416717529297, "epoch": 0.11635555555555556, "grad_norm": 219.9465789794922, "learning_rate": 3e-06, "loss": 7.1814, "reward": 1.2916667461395264, "reward_std": 0.48936043679714203, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 0.5416666716337204, "step": 1309, "zero_std_ratio": 0.5 }, { "epoch": 0.11644444444444445, "grad_norm": 224.42604064941406, "learning_rate": 3e-06, "loss": 8.354, "step": 1310 }, { "epoch": 0.11653333333333334, "grad_norm": 262.8431091308594, "learning_rate": 3e-06, "loss": 5.5188, "step": 1311 }, { "epoch": 0.11662222222222222, "grad_norm": 446.6980285644531, "learning_rate": 3e-06, "loss": 7.2822, "step": 1312 }, { "epoch": 0.11671111111111111, "grad_norm": 200.77650451660156, "learning_rate": 3e-06, "loss": -0.5339, "step": 1313 }, { "epoch": 0.1168, "grad_norm": 230.9583740234375, "learning_rate": 3e-06, "loss": -3.6372, "step": 1314 }, { "epoch": 0.11688888888888889, "grad_norm": 257.7409362792969, "learning_rate": 3e-06, "loss": 5.1944, "step": 1315 }, { "epoch": 0.11697777777777778, "grad_norm": 215.5637664794922, "learning_rate": 3e-06, "loss": 6.6842, "step": 1316 }, { "epoch": 0.11706666666666667, "grad_norm": 228.1314697265625, "learning_rate": 3e-06, "loss": 3.1597, "step": 1317 }, { "epoch": 0.11715555555555555, "grad_norm": 227.79237365722656, "learning_rate": 3e-06, "loss": 6.0615, "step": 1318 }, { "epoch": 0.11724444444444444, "grad_norm": 364.67889404296875, "learning_rate": 3e-06, "loss": -4.7295, "step": 1319 }, { "epoch": 0.11733333333333333, "grad_norm": 435.36578369140625, "learning_rate": 3e-06, "loss": -7.3331, "step": 1320 }, { "completion_length": 254.12500762939453, "epoch": 0.11742222222222222, "grad_norm": 181.09559631347656, "learning_rate": 3e-06, "loss": 9.7639, "reward": 1.3437500596046448, "reward_std": 0.3936076909303665, "rewards/boxed_and_answer_tags_format_reward": 0.6770833432674408, "rewards/correctness_reward_func_math": 0.6666666865348816, "step": 1321, "zero_std_ratio": 0.5 }, { "epoch": 0.11751111111111111, "grad_norm": 184.9253387451172, "learning_rate": 3e-06, "loss": -7.8109, "step": 1322 }, { "epoch": 0.1176, "grad_norm": 178.65553283691406, "learning_rate": 3e-06, "loss": -0.5036, "step": 1323 }, { "epoch": 0.11768888888888888, "grad_norm": 172.2812042236328, "learning_rate": 3e-06, "loss": -5.0317, "step": 1324 }, { "epoch": 0.11777777777777777, "grad_norm": 189.1818084716797, "learning_rate": 3e-06, "loss": -5.0896, "step": 1325 }, { "epoch": 0.11786666666666666, "grad_norm": 195.15562438964844, "learning_rate": 3e-06, "loss": -6.7484, "step": 1326 }, { "epoch": 0.11795555555555555, "grad_norm": 180.81617736816406, "learning_rate": 3e-06, "loss": 6.0087, "step": 1327 }, { "epoch": 0.11804444444444444, "grad_norm": 181.2151336669922, "learning_rate": 3e-06, "loss": -12.1462, "step": 1328 }, { "epoch": 0.11813333333333334, "grad_norm": 232.097900390625, "learning_rate": 3e-06, "loss": -6.1373, "step": 1329 }, { "epoch": 0.11822222222222223, "grad_norm": 180.55462646484375, "learning_rate": 3e-06, "loss": -10.2139, "step": 1330 }, { "epoch": 0.11831111111111112, "grad_norm": 192.82818603515625, "learning_rate": 3e-06, "loss": -11.1656, "step": 1331 }, { "epoch": 0.1184, "grad_norm": 190.95399475097656, "learning_rate": 3e-06, "loss": -13.1171, "step": 1332 }, { "completion_length": 254.81250762939453, "epoch": 0.1184888888888889, "grad_norm": 65.9253921508789, "learning_rate": 3e-06, "loss": 2.6383, "reward": 0.9791666865348816, "reward_std": 0.10206206887960434, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.2916666679084301, "step": 1333, "zero_std_ratio": 0.875 }, { "epoch": 0.11857777777777778, "grad_norm": 66.89152526855469, "learning_rate": 3e-06, "loss": 0.0904, "step": 1334 }, { "epoch": 0.11866666666666667, "grad_norm": 52.20970916748047, "learning_rate": 3e-06, "loss": -2.1923, "step": 1335 }, { "epoch": 0.11875555555555556, "grad_norm": 62.94733810424805, "learning_rate": 3e-06, "loss": -0.1938, "step": 1336 }, { "epoch": 0.11884444444444445, "grad_norm": 74.16914367675781, "learning_rate": 3e-06, "loss": -0.0211, "step": 1337 }, { "epoch": 0.11893333333333334, "grad_norm": 60.86524963378906, "learning_rate": 3e-06, "loss": 1.9824, "step": 1338 }, { "epoch": 0.11902222222222222, "grad_norm": 68.02703857421875, "learning_rate": 3e-06, "loss": 1.8138, "step": 1339 }, { "epoch": 0.11911111111111111, "grad_norm": 65.81590270996094, "learning_rate": 3e-06, "loss": -0.8801, "step": 1340 }, { "epoch": 0.1192, "grad_norm": 49.24789810180664, "learning_rate": 3e-06, "loss": -2.9454, "step": 1341 }, { "epoch": 0.11928888888888889, "grad_norm": 66.03591918945312, "learning_rate": 3e-06, "loss": -2.1792, "step": 1342 }, { "epoch": 0.11937777777777778, "grad_norm": 68.27337646484375, "learning_rate": 3e-06, "loss": -1.5692, "step": 1343 }, { "epoch": 0.11946666666666667, "grad_norm": 76.02365112304688, "learning_rate": 3e-06, "loss": 1.3993, "step": 1344 }, { "completion_length": 242.02084350585938, "epoch": 0.11955555555555555, "grad_norm": 159.37770080566406, "learning_rate": 3e-06, "loss": -16.5166, "reward": 1.3125, "reward_std": 0.43528567254543304, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.625, "step": 1345, "zero_std_ratio": 0.5 }, { "epoch": 0.11964444444444444, "grad_norm": 156.10609436035156, "learning_rate": 3e-06, "loss": -19.2254, "step": 1346 }, { "epoch": 0.11973333333333333, "grad_norm": 159.9058074951172, "learning_rate": 3e-06, "loss": -21.4448, "step": 1347 }, { "epoch": 0.11982222222222222, "grad_norm": 217.29722595214844, "learning_rate": 3e-06, "loss": -22.2701, "step": 1348 }, { "epoch": 0.11991111111111111, "grad_norm": 220.8108673095703, "learning_rate": 3e-06, "loss": -9.9609, "step": 1349 }, { "epoch": 0.12, "grad_norm": 255.4961395263672, "learning_rate": 3e-06, "loss": -10.7417, "step": 1350 }, { "epoch": 0.12008888888888888, "grad_norm": 159.0548858642578, "learning_rate": 3e-06, "loss": -18.3206, "step": 1351 }, { "epoch": 0.12017777777777777, "grad_norm": 151.7304229736328, "learning_rate": 3e-06, "loss": -21.7915, "step": 1352 }, { "epoch": 0.12026666666666666, "grad_norm": 162.3264923095703, "learning_rate": 3e-06, "loss": -24.2052, "step": 1353 }, { "epoch": 0.12035555555555555, "grad_norm": 188.41310119628906, "learning_rate": 3e-06, "loss": -26.0542, "step": 1354 }, { "epoch": 0.12044444444444445, "grad_norm": 182.1976776123047, "learning_rate": 3e-06, "loss": -14.6291, "step": 1355 }, { "epoch": 0.12053333333333334, "grad_norm": 227.4834747314453, "learning_rate": 3e-06, "loss": -16.78, "step": 1356 }, { "completion_length": 236.93750762939453, "epoch": 0.12062222222222223, "grad_norm": 244.82566833496094, "learning_rate": 3e-06, "loss": -29.5738, "reward": 2.125, "reward_std": 0.39512956142425537, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 1.375, "step": 1357, "zero_std_ratio": 0.625 }, { "epoch": 0.12071111111111112, "grad_norm": 560.8680419921875, "learning_rate": 3e-06, "loss": -35.2668, "step": 1358 }, { "epoch": 0.1208, "grad_norm": 303.6029968261719, "learning_rate": 3e-06, "loss": -21.3551, "step": 1359 }, { "epoch": 0.12088888888888889, "grad_norm": 247.47055053710938, "learning_rate": 3e-06, "loss": -30.3557, "step": 1360 }, { "epoch": 0.12097777777777778, "grad_norm": 311.3307189941406, "learning_rate": 3e-06, "loss": -31.387, "step": 1361 }, { "epoch": 0.12106666666666667, "grad_norm": 296.8590087890625, "learning_rate": 3e-06, "loss": -33.6326, "step": 1362 }, { "epoch": 0.12115555555555556, "grad_norm": 253.0756378173828, "learning_rate": 3e-06, "loss": -31.4888, "step": 1363 }, { "epoch": 0.12124444444444445, "grad_norm": 293.0926513671875, "learning_rate": 3e-06, "loss": -38.9989, "step": 1364 }, { "epoch": 0.12133333333333333, "grad_norm": 318.564208984375, "learning_rate": 3e-06, "loss": -25.8663, "step": 1365 }, { "epoch": 0.12142222222222222, "grad_norm": 262.5808410644531, "learning_rate": 3e-06, "loss": -33.3059, "step": 1366 }, { "epoch": 0.12151111111111111, "grad_norm": 297.9688720703125, "learning_rate": 3e-06, "loss": -36.1247, "step": 1367 }, { "epoch": 0.1216, "grad_norm": 319.11968994140625, "learning_rate": 3e-06, "loss": -37.8042, "step": 1368 }, { "completion_length": 241.52083587646484, "epoch": 0.12168888888888889, "grad_norm": 149.28683471679688, "learning_rate": 3e-06, "loss": -5.2185, "reward": 0.979166716337204, "reward_std": 0.306186206638813, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.2916666641831398, "step": 1369, "zero_std_ratio": 0.625 }, { "epoch": 0.12177777777777778, "grad_norm": 234.47120666503906, "learning_rate": 3e-06, "loss": -0.5601, "step": 1370 }, { "epoch": 0.12186666666666666, "grad_norm": 168.30697631835938, "learning_rate": 3e-06, "loss": -6.9666, "step": 1371 }, { "epoch": 0.12195555555555555, "grad_norm": 304.0892333984375, "learning_rate": 3e-06, "loss": -11.5806, "step": 1372 }, { "epoch": 0.12204444444444444, "grad_norm": 210.6204071044922, "learning_rate": 3e-06, "loss": 3.7489, "step": 1373 }, { "epoch": 0.12213333333333333, "grad_norm": 227.05795288085938, "learning_rate": 3e-06, "loss": -19.6276, "step": 1374 }, { "epoch": 0.12222222222222222, "grad_norm": 148.8232879638672, "learning_rate": 3e-06, "loss": -8.3126, "step": 1375 }, { "epoch": 0.1223111111111111, "grad_norm": 216.28646850585938, "learning_rate": 3e-06, "loss": -4.662, "step": 1376 }, { "epoch": 0.1224, "grad_norm": 196.22518920898438, "learning_rate": 3e-06, "loss": -10.8248, "step": 1377 }, { "epoch": 0.12248888888888888, "grad_norm": 238.0521697998047, "learning_rate": 3e-06, "loss": -16.9448, "step": 1378 }, { "epoch": 0.12257777777777777, "grad_norm": 195.62860107421875, "learning_rate": 3e-06, "loss": -0.2301, "step": 1379 }, { "epoch": 0.12266666666666666, "grad_norm": 201.47740173339844, "learning_rate": 3e-06, "loss": -26.1931, "step": 1380 }, { "completion_length": 223.375, "epoch": 0.12275555555555556, "grad_norm": 340.7679443359375, "learning_rate": 3e-06, "loss": -35.7739, "reward": 1.5000000596046448, "reward_std": 0.4779854714870453, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 0.75, "step": 1381, "zero_std_ratio": 0.5 }, { "epoch": 0.12284444444444445, "grad_norm": 357.4112548828125, "learning_rate": 3e-06, "loss": -25.5447, "step": 1382 }, { "epoch": 0.12293333333333334, "grad_norm": 355.23419189453125, "learning_rate": 3e-06, "loss": -29.5073, "step": 1383 }, { "epoch": 0.12302222222222223, "grad_norm": 362.6241455078125, "learning_rate": 3e-06, "loss": -15.9052, "step": 1384 }, { "epoch": 0.12311111111111112, "grad_norm": 388.32904052734375, "learning_rate": 3e-06, "loss": -16.7547, "step": 1385 }, { "epoch": 0.1232, "grad_norm": 398.5157775878906, "learning_rate": 3e-06, "loss": -32.0836, "step": 1386 }, { "epoch": 0.12328888888888889, "grad_norm": 256.07763671875, "learning_rate": 3e-06, "loss": -36.567, "step": 1387 }, { "epoch": 0.12337777777777778, "grad_norm": 273.55108642578125, "learning_rate": 3e-06, "loss": -32.0325, "step": 1388 }, { "epoch": 0.12346666666666667, "grad_norm": 298.35675048828125, "learning_rate": 3e-06, "loss": -33.9138, "step": 1389 }, { "epoch": 0.12355555555555556, "grad_norm": 319.2604064941406, "learning_rate": 3e-06, "loss": -22.5196, "step": 1390 }, { "epoch": 0.12364444444444445, "grad_norm": 321.7362976074219, "learning_rate": 3e-06, "loss": -22.5907, "step": 1391 }, { "epoch": 0.12373333333333333, "grad_norm": 289.5376281738281, "learning_rate": 3e-06, "loss": -39.4107, "step": 1392 }, { "completion_length": 239.6041717529297, "epoch": 0.12382222222222222, "grad_norm": 189.32923889160156, "learning_rate": 3e-06, "loss": 33.0773, "reward": 1.3541666865348816, "reward_std": 0.23899273574352264, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.6666666567325592, "step": 1393, "zero_std_ratio": 0.75 }, { "epoch": 0.12391111111111111, "grad_norm": 280.063720703125, "learning_rate": 3e-06, "loss": 28.1537, "step": 1394 }, { "epoch": 0.124, "grad_norm": 239.7598114013672, "learning_rate": 3e-06, "loss": 30.4262, "step": 1395 }, { "epoch": 0.12408888888888889, "grad_norm": 230.4679718017578, "learning_rate": 3e-06, "loss": 26.9258, "step": 1396 }, { "epoch": 0.12417777777777778, "grad_norm": 241.6261444091797, "learning_rate": 3e-06, "loss": 39.8305, "step": 1397 }, { "epoch": 0.12426666666666666, "grad_norm": 305.56097412109375, "learning_rate": 3e-06, "loss": 29.9909, "step": 1398 }, { "epoch": 0.12435555555555555, "grad_norm": 190.80581665039062, "learning_rate": 3e-06, "loss": 28.9829, "step": 1399 }, { "epoch": 0.12444444444444444, "grad_norm": 272.18365478515625, "learning_rate": 3e-06, "loss": 21.7391, "step": 1400 }, { "epoch": 0.12453333333333333, "grad_norm": 236.96322631835938, "learning_rate": 3e-06, "loss": 24.1388, "step": 1401 }, { "epoch": 0.12462222222222222, "grad_norm": 257.829833984375, "learning_rate": 3e-06, "loss": 19.7315, "step": 1402 }, { "epoch": 0.1247111111111111, "grad_norm": 237.62989807128906, "learning_rate": 3e-06, "loss": 32.9331, "step": 1403 }, { "epoch": 0.1248, "grad_norm": 299.9111633300781, "learning_rate": 3e-06, "loss": 20.7263, "step": 1404 }, { "completion_length": 253.62500762939453, "epoch": 0.12488888888888888, "grad_norm": 351.87091064453125, "learning_rate": 3e-06, "loss": 1.3314, "reward": 1.5416666865348816, "reward_std": 0.48936042189598083, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 0.7916666716337204, "step": 1405, "zero_std_ratio": 0.5 }, { "epoch": 0.12497777777777777, "grad_norm": 318.410888671875, "learning_rate": 3e-06, "loss": -6.872, "step": 1406 }, { "epoch": 0.12506666666666666, "grad_norm": 296.7760925292969, "learning_rate": 3e-06, "loss": 7.1205, "step": 1407 }, { "epoch": 0.12515555555555555, "grad_norm": 264.91400146484375, "learning_rate": 3e-06, "loss": 18.3195, "step": 1408 }, { "epoch": 0.12524444444444444, "grad_norm": 309.8560485839844, "learning_rate": 3e-06, "loss": -9.7835, "step": 1409 }, { "epoch": 0.12533333333333332, "grad_norm": 343.082763671875, "learning_rate": 3e-06, "loss": 8.5194, "step": 1410 }, { "epoch": 0.1254222222222222, "grad_norm": 349.1229248046875, "learning_rate": 3e-06, "loss": -2.322, "step": 1411 }, { "epoch": 0.1255111111111111, "grad_norm": 343.0054626464844, "learning_rate": 3e-06, "loss": -10.4852, "step": 1412 }, { "epoch": 0.1256, "grad_norm": 293.6500549316406, "learning_rate": 3e-06, "loss": 3.7367, "step": 1413 }, { "epoch": 0.12568888888888888, "grad_norm": 244.24459838867188, "learning_rate": 3e-06, "loss": 12.8548, "step": 1414 }, { "epoch": 0.12577777777777777, "grad_norm": 308.9840393066406, "learning_rate": 3e-06, "loss": -14.9646, "step": 1415 }, { "epoch": 0.12586666666666665, "grad_norm": 302.1766052246094, "learning_rate": 3e-06, "loss": 4.8047, "step": 1416 }, { "completion_length": 249.6666717529297, "epoch": 0.12595555555555554, "grad_norm": 189.9821319580078, "learning_rate": 3e-06, "loss": -21.868, "reward": 1.4583333730697632, "reward_std": 0.4701542258262634, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 0.7083333134651184, "step": 1417, "zero_std_ratio": 0.5 }, { "epoch": 0.12604444444444443, "grad_norm": 230.71632385253906, "learning_rate": 3e-06, "loss": -6.3819, "step": 1418 }, { "epoch": 0.12613333333333332, "grad_norm": 245.25421142578125, "learning_rate": 3e-06, "loss": 10.5483, "step": 1419 }, { "epoch": 0.12622222222222224, "grad_norm": 153.20816040039062, "learning_rate": 3e-06, "loss": -8.6118, "step": 1420 }, { "epoch": 0.12631111111111112, "grad_norm": 167.58921813964844, "learning_rate": 3e-06, "loss": -8.8097, "step": 1421 }, { "epoch": 0.1264, "grad_norm": 190.3168182373047, "learning_rate": 3e-06, "loss": -14.9707, "step": 1422 }, { "epoch": 0.1264888888888889, "grad_norm": 191.2883758544922, "learning_rate": 3e-06, "loss": -23.6776, "step": 1423 }, { "epoch": 0.1265777777777778, "grad_norm": 223.39498901367188, "learning_rate": 3e-06, "loss": -8.999, "step": 1424 }, { "epoch": 0.12666666666666668, "grad_norm": 228.12818908691406, "learning_rate": 3e-06, "loss": 7.6836, "step": 1425 }, { "epoch": 0.12675555555555557, "grad_norm": 165.7634735107422, "learning_rate": 3e-06, "loss": -12.3289, "step": 1426 }, { "epoch": 0.12684444444444445, "grad_norm": 169.15396118164062, "learning_rate": 3e-06, "loss": -13.59, "step": 1427 }, { "epoch": 0.12693333333333334, "grad_norm": 204.25228881835938, "learning_rate": 3e-06, "loss": -19.9151, "step": 1428 }, { "completion_length": 228.02083587646484, "epoch": 0.12702222222222223, "grad_norm": 167.8898468017578, "learning_rate": 3e-06, "loss": 7.6334, "reward": 1.2083333432674408, "reward_std": 0.3602609783411026, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 0.4583333358168602, "step": 1429, "zero_std_ratio": 0.625 }, { "epoch": 0.12711111111111112, "grad_norm": 224.77040100097656, "learning_rate": 3e-06, "loss": -1.7483, "step": 1430 }, { "epoch": 0.1272, "grad_norm": 220.69076538085938, "learning_rate": 3e-06, "loss": 14.763, "step": 1431 }, { "epoch": 0.1272888888888889, "grad_norm": 150.462646484375, "learning_rate": 3e-06, "loss": 17.6324, "step": 1432 }, { "epoch": 0.12737777777777778, "grad_norm": 203.99217224121094, "learning_rate": 3e-06, "loss": 17.7821, "step": 1433 }, { "epoch": 0.12746666666666667, "grad_norm": 188.85665893554688, "learning_rate": 3e-06, "loss": 13.1598, "step": 1434 }, { "epoch": 0.12755555555555556, "grad_norm": 155.11329650878906, "learning_rate": 3e-06, "loss": 6.2584, "step": 1435 }, { "epoch": 0.12764444444444445, "grad_norm": 205.92800903320312, "learning_rate": 3e-06, "loss": -3.0899, "step": 1436 }, { "epoch": 0.12773333333333334, "grad_norm": 201.03298950195312, "learning_rate": 3e-06, "loss": 12.9818, "step": 1437 }, { "epoch": 0.12782222222222223, "grad_norm": 167.153076171875, "learning_rate": 3e-06, "loss": 15.1051, "step": 1438 }, { "epoch": 0.12791111111111111, "grad_norm": 223.22909545898438, "learning_rate": 3e-06, "loss": 14.2815, "step": 1439 }, { "epoch": 0.128, "grad_norm": 173.97694396972656, "learning_rate": 3e-06, "loss": 9.4034, "step": 1440 }, { "completion_length": 239.5416717529297, "epoch": 0.1280888888888889, "grad_norm": 299.9499816894531, "learning_rate": 3e-06, "loss": -27.7606, "reward": 1.5208333730697632, "reward_std": 0.3680921643972397, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.8333333432674408, "step": 1441, "zero_std_ratio": 0.625 }, { "epoch": 0.12817777777777778, "grad_norm": 236.49612426757812, "learning_rate": 3e-06, "loss": -27.7132, "step": 1442 }, { "epoch": 0.12826666666666667, "grad_norm": 186.70510864257812, "learning_rate": 3e-06, "loss": -28.2186, "step": 1443 }, { "epoch": 0.12835555555555556, "grad_norm": 294.79656982421875, "learning_rate": 3e-06, "loss": -18.5453, "step": 1444 }, { "epoch": 0.12844444444444444, "grad_norm": 257.8788146972656, "learning_rate": 3e-06, "loss": -26.3342, "step": 1445 }, { "epoch": 0.12853333333333333, "grad_norm": 280.1625061035156, "learning_rate": 3e-06, "loss": -37.5225, "step": 1446 }, { "epoch": 0.12862222222222222, "grad_norm": 257.7731018066406, "learning_rate": 3e-06, "loss": -31.1507, "step": 1447 }, { "epoch": 0.1287111111111111, "grad_norm": 221.82879638671875, "learning_rate": 3e-06, "loss": -30.7869, "step": 1448 }, { "epoch": 0.1288, "grad_norm": 227.20188903808594, "learning_rate": 3e-06, "loss": -31.8637, "step": 1449 }, { "epoch": 0.1288888888888889, "grad_norm": 319.30633544921875, "learning_rate": 3e-06, "loss": -24.0296, "step": 1450 }, { "epoch": 0.12897777777777777, "grad_norm": 206.82269287109375, "learning_rate": 3e-06, "loss": -28.7864, "step": 1451 }, { "epoch": 0.12906666666666666, "grad_norm": 196.76171875, "learning_rate": 3e-06, "loss": -42.0569, "step": 1452 }, { "completion_length": 248.1666717529297, "epoch": 0.12915555555555555, "grad_norm": 210.2032928466797, "learning_rate": 3e-06, "loss": -15.8435, "reward": 1.5000000596046448, "reward_std": 0.3680921941995621, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 0.75, "step": 1453, "zero_std_ratio": 0.625 }, { "epoch": 0.12924444444444444, "grad_norm": 209.77688598632812, "learning_rate": 3e-06, "loss": -14.2805, "step": 1454 }, { "epoch": 0.12933333333333333, "grad_norm": 662.65966796875, "learning_rate": 3e-06, "loss": -15.5908, "step": 1455 }, { "epoch": 0.12942222222222222, "grad_norm": 176.72958374023438, "learning_rate": 3e-06, "loss": -15.1466, "step": 1456 }, { "epoch": 0.1295111111111111, "grad_norm": 224.30841064453125, "learning_rate": 3e-06, "loss": -23.5471, "step": 1457 }, { "epoch": 0.1296, "grad_norm": 187.04263305664062, "learning_rate": 3e-06, "loss": -15.5373, "step": 1458 }, { "epoch": 0.12968888888888888, "grad_norm": 216.18629455566406, "learning_rate": 3e-06, "loss": -19.5843, "step": 1459 }, { "epoch": 0.12977777777777777, "grad_norm": 204.7811279296875, "learning_rate": 3e-06, "loss": -17.6737, "step": 1460 }, { "epoch": 0.12986666666666666, "grad_norm": 357.2877197265625, "learning_rate": 3e-06, "loss": -19.0236, "step": 1461 }, { "epoch": 0.12995555555555555, "grad_norm": 173.1217803955078, "learning_rate": 3e-06, "loss": -18.2209, "step": 1462 }, { "epoch": 0.13004444444444443, "grad_norm": 202.11126708984375, "learning_rate": 3e-06, "loss": -26.1672, "step": 1463 }, { "epoch": 0.13013333333333332, "grad_norm": 215.072265625, "learning_rate": 3e-06, "loss": -18.211, "step": 1464 }, { "completion_length": 248.45833587646484, "epoch": 0.1302222222222222, "grad_norm": 189.24551391601562, "learning_rate": 3e-06, "loss": -26.8632, "reward": 1.1041666865348816, "reward_std": 0.23899272084236145, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.4166666716337204, "step": 1465, "zero_std_ratio": 0.75 }, { "epoch": 0.1303111111111111, "grad_norm": 150.01397705078125, "learning_rate": 3e-06, "loss": -36.2487, "step": 1466 }, { "epoch": 0.1304, "grad_norm": 239.17184448242188, "learning_rate": 3e-06, "loss": -32.3168, "step": 1467 }, { "epoch": 0.13048888888888888, "grad_norm": 214.16525268554688, "learning_rate": 3e-06, "loss": -36.6605, "step": 1468 }, { "epoch": 0.13057777777777776, "grad_norm": 200.94650268554688, "learning_rate": 3e-06, "loss": -23.324, "step": 1469 }, { "epoch": 0.13066666666666665, "grad_norm": 202.95838928222656, "learning_rate": 3e-06, "loss": -34.8067, "step": 1470 }, { "epoch": 0.13075555555555557, "grad_norm": 147.8043670654297, "learning_rate": 3e-06, "loss": -29.223, "step": 1471 }, { "epoch": 0.13084444444444446, "grad_norm": 137.1348114013672, "learning_rate": 3e-06, "loss": -38.384, "step": 1472 }, { "epoch": 0.13093333333333335, "grad_norm": 198.6035614013672, "learning_rate": 3e-06, "loss": -35.8388, "step": 1473 }, { "epoch": 0.13102222222222223, "grad_norm": 225.8579864501953, "learning_rate": 3e-06, "loss": -39.2579, "step": 1474 }, { "epoch": 0.13111111111111112, "grad_norm": 193.4779052734375, "learning_rate": 3e-06, "loss": -25.8384, "step": 1475 }, { "epoch": 0.1312, "grad_norm": 188.15464782714844, "learning_rate": 3e-06, "loss": -37.6788, "step": 1476 }, { "completion_length": 236.9375, "epoch": 0.1312888888888889, "grad_norm": 433.47833251953125, "learning_rate": 3e-06, "loss": -44.9082, "reward": 1.9166667461395264, "reward_std": 0.6611596345901489, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 1.1666666865348816, "step": 1477, "zero_std_ratio": 0.375 }, { "epoch": 0.1313777777777778, "grad_norm": 327.1938171386719, "learning_rate": 3e-06, "loss": -49.2004, "step": 1478 }, { "epoch": 0.13146666666666668, "grad_norm": 295.2423095703125, "learning_rate": 3e-06, "loss": -47.3065, "step": 1479 }, { "epoch": 0.13155555555555556, "grad_norm": 288.9835510253906, "learning_rate": 3e-06, "loss": -35.3959, "step": 1480 }, { "epoch": 0.13164444444444445, "grad_norm": 310.02056884765625, "learning_rate": 3e-06, "loss": -36.4699, "step": 1481 }, { "epoch": 0.13173333333333334, "grad_norm": 288.0977478027344, "learning_rate": 3e-06, "loss": -36.6703, "step": 1482 }, { "epoch": 0.13182222222222223, "grad_norm": 266.6463317871094, "learning_rate": 3e-06, "loss": -47.7283, "step": 1483 }, { "epoch": 0.13191111111111112, "grad_norm": 405.67071533203125, "learning_rate": 3e-06, "loss": -51.2972, "step": 1484 }, { "epoch": 0.132, "grad_norm": 401.3482360839844, "learning_rate": 3e-06, "loss": -52.0379, "step": 1485 }, { "epoch": 0.1320888888888889, "grad_norm": 280.8495788574219, "learning_rate": 3e-06, "loss": -39.6706, "step": 1486 }, { "epoch": 0.13217777777777778, "grad_norm": 329.5269775390625, "learning_rate": 3e-06, "loss": -41.6526, "step": 1487 }, { "epoch": 0.13226666666666667, "grad_norm": 335.015380859375, "learning_rate": 3e-06, "loss": -42.414, "step": 1488 }, { "completion_length": 244.70833587646484, "epoch": 0.13235555555555556, "grad_norm": 365.8779602050781, "learning_rate": 3e-06, "loss": 0.0487, "reward": 1.1875, "reward_std": 0.23116151988506317, "rewards/boxed_and_answer_tags_format_reward": 0.5625, "rewards/correctness_reward_func_math": 0.625, "step": 1489, "zero_std_ratio": 0.75 }, { "epoch": 0.13244444444444445, "grad_norm": 270.31494140625, "learning_rate": 3e-06, "loss": -3.1951, "step": 1490 }, { "epoch": 0.13253333333333334, "grad_norm": 350.0372619628906, "learning_rate": 3e-06, "loss": 2.4081, "step": 1491 }, { "epoch": 0.13262222222222222, "grad_norm": 297.7118835449219, "learning_rate": 3e-06, "loss": 1.5965, "step": 1492 }, { "epoch": 0.1327111111111111, "grad_norm": 251.00436401367188, "learning_rate": 3e-06, "loss": -8.6016, "step": 1493 }, { "epoch": 0.1328, "grad_norm": 364.3514709472656, "learning_rate": 3e-06, "loss": 5.7272, "step": 1494 }, { "epoch": 0.1328888888888889, "grad_norm": 327.8075256347656, "learning_rate": 3e-06, "loss": -6.1829, "step": 1495 }, { "epoch": 0.13297777777777778, "grad_norm": 816.3570556640625, "learning_rate": 3e-06, "loss": -10.451, "step": 1496 }, { "epoch": 0.13306666666666667, "grad_norm": 319.318115234375, "learning_rate": 3e-06, "loss": -7.2984, "step": 1497 }, { "epoch": 0.13315555555555555, "grad_norm": 306.3028259277344, "learning_rate": 3e-06, "loss": -7.2124, "step": 1498 }, { "epoch": 0.13324444444444444, "grad_norm": 194.4175567626953, "learning_rate": 3e-06, "loss": -14.7213, "step": 1499 }, { "epoch": 0.13333333333333333, "grad_norm": 289.50408935546875, "learning_rate": 3e-06, "loss": -8.9605, "step": 1500 }, { "completion_length": 245.8541717529297, "epoch": 0.13342222222222222, "grad_norm": 177.177734375, "learning_rate": 3e-06, "loss": -19.9228, "reward": 1.0000000596046448, "reward_std": 0.23116152733564377, "rewards/boxed_and_answer_tags_format_reward": 0.625, "rewards/correctness_reward_func_math": 0.375, "step": 1501, "zero_std_ratio": 0.75 }, { "epoch": 0.1335111111111111, "grad_norm": 256.49334716796875, "learning_rate": 3e-06, "loss": -12.3773, "step": 1502 }, { "epoch": 0.1336, "grad_norm": 200.69879150390625, "learning_rate": 3e-06, "loss": -12.0433, "step": 1503 }, { "epoch": 0.13368888888888888, "grad_norm": 195.25538635253906, "learning_rate": 3e-06, "loss": -5.3936, "step": 1504 }, { "epoch": 0.13377777777777777, "grad_norm": 179.9781036376953, "learning_rate": 3e-06, "loss": -11.4454, "step": 1505 }, { "epoch": 0.13386666666666666, "grad_norm": 255.14865112304688, "learning_rate": 3e-06, "loss": -14.0628, "step": 1506 }, { "epoch": 0.13395555555555555, "grad_norm": 185.94732666015625, "learning_rate": 3e-06, "loss": -21.484, "step": 1507 }, { "epoch": 0.13404444444444444, "grad_norm": 233.72573852539062, "learning_rate": 3e-06, "loss": -15.3775, "step": 1508 }, { "epoch": 0.13413333333333333, "grad_norm": 329.163818359375, "learning_rate": 3e-06, "loss": -14.9938, "step": 1509 }, { "epoch": 0.13422222222222221, "grad_norm": 199.61465454101562, "learning_rate": 3e-06, "loss": -7.8012, "step": 1510 }, { "epoch": 0.1343111111111111, "grad_norm": 168.71255493164062, "learning_rate": 3e-06, "loss": -14.2907, "step": 1511 }, { "epoch": 0.1344, "grad_norm": 246.87896728515625, "learning_rate": 3e-06, "loss": -15.4667, "step": 1512 }, { "completion_length": 240.18750762939453, "epoch": 0.13448888888888888, "grad_norm": 273.19427490234375, "learning_rate": 3e-06, "loss": -30.8428, "reward": 1.5625, "reward_std": 0.3410547822713852, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.875, "step": 1513, "zero_std_ratio": 0.625 }, { "epoch": 0.13457777777777777, "grad_norm": 198.8998565673828, "learning_rate": 3e-06, "loss": -36.7035, "step": 1514 }, { "epoch": 0.13466666666666666, "grad_norm": 266.1399230957031, "learning_rate": 3e-06, "loss": -29.4233, "step": 1515 }, { "epoch": 0.13475555555555555, "grad_norm": 270.2858581542969, "learning_rate": 3e-06, "loss": -36.6868, "step": 1516 }, { "epoch": 0.13484444444444443, "grad_norm": 190.39768981933594, "learning_rate": 3e-06, "loss": -35.0192, "step": 1517 }, { "epoch": 0.13493333333333332, "grad_norm": 319.56494140625, "learning_rate": 3e-06, "loss": -35.0648, "step": 1518 }, { "epoch": 0.1350222222222222, "grad_norm": 224.65713500976562, "learning_rate": 3e-06, "loss": -35.3229, "step": 1519 }, { "epoch": 0.1351111111111111, "grad_norm": 188.5618438720703, "learning_rate": 3e-06, "loss": -41.7633, "step": 1520 }, { "epoch": 0.1352, "grad_norm": 334.20281982421875, "learning_rate": 3e-06, "loss": -34.6749, "step": 1521 }, { "epoch": 0.13528888888888888, "grad_norm": 232.7653350830078, "learning_rate": 3e-06, "loss": -42.387, "step": 1522 }, { "epoch": 0.1353777777777778, "grad_norm": 179.99102783203125, "learning_rate": 3e-06, "loss": -40.5993, "step": 1523 }, { "epoch": 0.13546666666666668, "grad_norm": 266.2838134765625, "learning_rate": 3e-06, "loss": -43.1418, "step": 1524 }, { "completion_length": 251.06250762939453, "epoch": 0.13555555555555557, "grad_norm": 379.1654052734375, "learning_rate": 3e-06, "loss": 5.5695, "reward": 1.0520833730697632, "reward_std": 0.5148759335279465, "rewards/boxed_and_answer_tags_format_reward": 0.6770833432674408, "rewards/correctness_reward_func_math": 0.375, "step": 1525, "zero_std_ratio": 0.375 }, { "epoch": 0.13564444444444446, "grad_norm": 382.55987548828125, "learning_rate": 3e-06, "loss": -2.5248, "step": 1526 }, { "epoch": 0.13573333333333334, "grad_norm": 535.1154174804688, "learning_rate": 3e-06, "loss": 2.5235, "step": 1527 }, { "epoch": 0.13582222222222223, "grad_norm": 382.97515869140625, "learning_rate": 3e-06, "loss": 25.0744, "step": 1528 }, { "epoch": 0.13591111111111112, "grad_norm": 417.135009765625, "learning_rate": 3e-06, "loss": -7.6026, "step": 1529 }, { "epoch": 0.136, "grad_norm": 452.19580078125, "learning_rate": 3e-06, "loss": -7.3517, "step": 1530 }, { "epoch": 0.1360888888888889, "grad_norm": 404.5818176269531, "learning_rate": 3e-06, "loss": 4.4363, "step": 1531 }, { "epoch": 0.1361777777777778, "grad_norm": 378.475341796875, "learning_rate": 3e-06, "loss": -7.4401, "step": 1532 }, { "epoch": 0.13626666666666667, "grad_norm": 446.7852478027344, "learning_rate": 3e-06, "loss": -2.1065, "step": 1533 }, { "epoch": 0.13635555555555556, "grad_norm": 313.30340576171875, "learning_rate": 3e-06, "loss": 20.5916, "step": 1534 }, { "epoch": 0.13644444444444445, "grad_norm": 385.8117370605469, "learning_rate": 3e-06, "loss": -12.1391, "step": 1535 }, { "epoch": 0.13653333333333334, "grad_norm": 381.8157653808594, "learning_rate": 3e-06, "loss": -10.1047, "step": 1536 }, { "completion_length": 251.9166717529297, "epoch": 0.13662222222222223, "grad_norm": 151.30177307128906, "learning_rate": 3e-06, "loss": 7.8283, "reward": 1.5, "reward_std": 0.1369306445121765, "rewards/boxed_and_answer_tags_format_reward": 0.625, "rewards/correctness_reward_func_math": 0.875, "step": 1537, "zero_std_ratio": 0.875 }, { "epoch": 0.13671111111111112, "grad_norm": 152.4805908203125, "learning_rate": 3e-06, "loss": 2.9257, "step": 1538 }, { "epoch": 0.1368, "grad_norm": 143.77023315429688, "learning_rate": 3e-06, "loss": 1.5204, "step": 1539 }, { "epoch": 0.1368888888888889, "grad_norm": 134.1708526611328, "learning_rate": 3e-06, "loss": 2.3719, "step": 1540 }, { "epoch": 0.13697777777777778, "grad_norm": 152.87826538085938, "learning_rate": 3e-06, "loss": 4.1701, "step": 1541 }, { "epoch": 0.13706666666666667, "grad_norm": 140.14395141601562, "learning_rate": 3e-06, "loss": 5.3651, "step": 1542 }, { "epoch": 0.13715555555555556, "grad_norm": 126.84275817871094, "learning_rate": 3e-06, "loss": 7.1642, "step": 1543 }, { "epoch": 0.13724444444444445, "grad_norm": 168.67564392089844, "learning_rate": 3e-06, "loss": -0.2961, "step": 1544 }, { "epoch": 0.13733333333333334, "grad_norm": 163.78794860839844, "learning_rate": 3e-06, "loss": -0.8864, "step": 1545 }, { "epoch": 0.13742222222222222, "grad_norm": 125.58057403564453, "learning_rate": 3e-06, "loss": -0.0556, "step": 1546 }, { "epoch": 0.1375111111111111, "grad_norm": 173.51966857910156, "learning_rate": 3e-06, "loss": 3.8643, "step": 1547 }, { "epoch": 0.1376, "grad_norm": 120.23555755615234, "learning_rate": 3e-06, "loss": 1.9312, "step": 1548 }, { "completion_length": 254.1041717529297, "epoch": 0.1376888888888889, "grad_norm": 355.8426208496094, "learning_rate": 3e-06, "loss": -6.5178, "reward": 1.1250000596046448, "reward_std": 0.23116152733564377, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 0.375, "step": 1549, "zero_std_ratio": 0.75 }, { "epoch": 0.13777777777777778, "grad_norm": 321.2608947753906, "learning_rate": 3e-06, "loss": -6.46, "step": 1550 }, { "epoch": 0.13786666666666667, "grad_norm": 220.72544860839844, "learning_rate": 3e-06, "loss": -3.6261, "step": 1551 }, { "epoch": 0.13795555555555555, "grad_norm": 475.98944091796875, "learning_rate": 3e-06, "loss": -20.6756, "step": 1552 }, { "epoch": 0.13804444444444444, "grad_norm": 345.5748291015625, "learning_rate": 3e-06, "loss": -3.3905, "step": 1553 }, { "epoch": 0.13813333333333333, "grad_norm": 285.5007629394531, "learning_rate": 3e-06, "loss": -2.0237, "step": 1554 }, { "epoch": 0.13822222222222222, "grad_norm": 395.1439514160156, "learning_rate": 3e-06, "loss": -8.0344, "step": 1555 }, { "epoch": 0.1383111111111111, "grad_norm": 300.19091796875, "learning_rate": 3e-06, "loss": -9.601, "step": 1556 }, { "epoch": 0.1384, "grad_norm": 285.38763427734375, "learning_rate": 3e-06, "loss": -7.3983, "step": 1557 }, { "epoch": 0.13848888888888888, "grad_norm": 463.34136962890625, "learning_rate": 3e-06, "loss": -25.5177, "step": 1558 }, { "epoch": 0.13857777777777777, "grad_norm": 298.12005615234375, "learning_rate": 3e-06, "loss": -4.867, "step": 1559 }, { "epoch": 0.13866666666666666, "grad_norm": 274.00836181640625, "learning_rate": 3e-06, "loss": -3.8962, "step": 1560 }, { "completion_length": 242.33334350585938, "epoch": 0.13875555555555555, "grad_norm": 310.84564208984375, "learning_rate": 3e-06, "loss": -16.4582, "reward": 1.541666716337204, "reward_std": 0.3061862289905548, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 0.7916666492819786, "step": 1561, "zero_std_ratio": 0.625 }, { "epoch": 0.13884444444444444, "grad_norm": 398.2083435058594, "learning_rate": 3e-06, "loss": -20.4413, "step": 1562 }, { "epoch": 0.13893333333333333, "grad_norm": 378.9441833496094, "learning_rate": 3e-06, "loss": -16.1798, "step": 1563 }, { "epoch": 0.1390222222222222, "grad_norm": 350.91192626953125, "learning_rate": 3e-06, "loss": -20.2111, "step": 1564 }, { "epoch": 0.1391111111111111, "grad_norm": 365.54754638671875, "learning_rate": 3e-06, "loss": -17.1017, "step": 1565 }, { "epoch": 0.1392, "grad_norm": 551.0444946289062, "learning_rate": 3e-06, "loss": -29.4716, "step": 1566 }, { "epoch": 0.13928888888888888, "grad_norm": 320.9019775390625, "learning_rate": 3e-06, "loss": -20.3973, "step": 1567 }, { "epoch": 0.13937777777777777, "grad_norm": 456.51190185546875, "learning_rate": 3e-06, "loss": -24.394, "step": 1568 }, { "epoch": 0.13946666666666666, "grad_norm": 406.370361328125, "learning_rate": 3e-06, "loss": -20.4216, "step": 1569 }, { "epoch": 0.13955555555555554, "grad_norm": 356.31982421875, "learning_rate": 3e-06, "loss": -24.4687, "step": 1570 }, { "epoch": 0.13964444444444443, "grad_norm": 342.86468505859375, "learning_rate": 3e-06, "loss": -23.7611, "step": 1571 }, { "epoch": 0.13973333333333332, "grad_norm": 536.0755615234375, "learning_rate": 3e-06, "loss": -33.8363, "step": 1572 }, { "completion_length": 254.18750762939453, "epoch": 0.1398222222222222, "grad_norm": 186.11480712890625, "learning_rate": 3e-06, "loss": 4.595, "reward": 0.9791666865348816, "reward_std": 0.23899271339178085, "rewards/boxed_and_answer_tags_format_reward": 0.5625, "rewards/correctness_reward_func_math": 0.4166666567325592, "step": 1573, "zero_std_ratio": 0.75 }, { "epoch": 0.13991111111111112, "grad_norm": 166.36386108398438, "learning_rate": 3e-06, "loss": 6.7414, "step": 1574 }, { "epoch": 0.14, "grad_norm": 213.84388732910156, "learning_rate": 3e-06, "loss": 4.949, "step": 1575 }, { "epoch": 0.1400888888888889, "grad_norm": 196.5020294189453, "learning_rate": 3e-06, "loss": 8.9449, "step": 1576 }, { "epoch": 0.1401777777777778, "grad_norm": 216.48086547851562, "learning_rate": 3e-06, "loss": 7.0276, "step": 1577 }, { "epoch": 0.14026666666666668, "grad_norm": 188.523681640625, "learning_rate": 3e-06, "loss": 5.1123, "step": 1578 }, { "epoch": 0.14035555555555557, "grad_norm": 223.4918975830078, "learning_rate": 3e-06, "loss": 2.6615, "step": 1579 }, { "epoch": 0.14044444444444446, "grad_norm": 218.71112060546875, "learning_rate": 3e-06, "loss": 3.1185, "step": 1580 }, { "epoch": 0.14053333333333334, "grad_norm": 223.2500762939453, "learning_rate": 3e-06, "loss": 1.1539, "step": 1581 }, { "epoch": 0.14062222222222223, "grad_norm": 176.69094848632812, "learning_rate": 3e-06, "loss": 5.1857, "step": 1582 }, { "epoch": 0.14071111111111112, "grad_norm": 243.36929321289062, "learning_rate": 3e-06, "loss": 4.2964, "step": 1583 }, { "epoch": 0.1408, "grad_norm": 269.2211608886719, "learning_rate": 3e-06, "loss": 1.8612, "step": 1584 }, { "completion_length": 245.9166717529297, "epoch": 0.1408888888888889, "grad_norm": 542.5291137695312, "learning_rate": 3e-06, "loss": 14.2794, "reward": 0.9791666865348816, "reward_std": 0.5643851011991501, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.2916666716337204, "step": 1585, "zero_std_ratio": 0.375 }, { "epoch": 0.14097777777777779, "grad_norm": 575.147705078125, "learning_rate": 3e-06, "loss": 17.5223, "step": 1586 }, { "epoch": 0.14106666666666667, "grad_norm": 405.6719970703125, "learning_rate": 3e-06, "loss": 9.7247, "step": 1587 }, { "epoch": 0.14115555555555556, "grad_norm": 528.659912109375, "learning_rate": 3e-06, "loss": 35.2638, "step": 1588 }, { "epoch": 0.14124444444444445, "grad_norm": 514.4658203125, "learning_rate": 3e-06, "loss": 14.2266, "step": 1589 }, { "epoch": 0.14133333333333334, "grad_norm": 535.8659057617188, "learning_rate": 3e-06, "loss": 13.3064, "step": 1590 }, { "epoch": 0.14142222222222223, "grad_norm": 538.135009765625, "learning_rate": 3e-06, "loss": 7.7669, "step": 1591 }, { "epoch": 0.14151111111111112, "grad_norm": 547.5822143554688, "learning_rate": 3e-06, "loss": 13.0588, "step": 1592 }, { "epoch": 0.1416, "grad_norm": 415.9985046386719, "learning_rate": 3e-06, "loss": 4.3967, "step": 1593 }, { "epoch": 0.1416888888888889, "grad_norm": 553.2412719726562, "learning_rate": 3e-06, "loss": 26.3526, "step": 1594 }, { "epoch": 0.14177777777777778, "grad_norm": 512.5444946289062, "learning_rate": 3e-06, "loss": 5.7301, "step": 1595 }, { "epoch": 0.14186666666666667, "grad_norm": 580.6036987304688, "learning_rate": 3e-06, "loss": 4.7651, "step": 1596 }, { "completion_length": 235.7916717529297, "epoch": 0.14195555555555556, "grad_norm": 347.6015319824219, "learning_rate": 3e-06, "loss": -18.3809, "reward": 1.5416666865348816, "reward_std": 0.3332235887646675, "rewards/boxed_and_answer_tags_format_reward": 0.625, "rewards/correctness_reward_func_math": 0.9166666567325592, "step": 1597, "zero_std_ratio": 0.625 }, { "epoch": 0.14204444444444445, "grad_norm": 317.97137451171875, "learning_rate": 3e-06, "loss": 2.7823, "step": 1598 }, { "epoch": 0.14213333333333333, "grad_norm": 202.9899444580078, "learning_rate": 3e-06, "loss": -5.0647, "step": 1599 }, { "epoch": 0.14222222222222222, "grad_norm": 786.5781860351562, "learning_rate": 3e-06, "loss": -4.8509, "step": 1600 }, { "epoch": 0.1423111111111111, "grad_norm": 290.1077575683594, "learning_rate": 3e-06, "loss": -8.7908, "step": 1601 }, { "epoch": 0.1424, "grad_norm": 198.83493041992188, "learning_rate": 3e-06, "loss": 8.0596, "step": 1602 }, { "epoch": 0.1424888888888889, "grad_norm": 270.3849792480469, "learning_rate": 3e-06, "loss": -21.899, "step": 1603 }, { "epoch": 0.14257777777777778, "grad_norm": 1309.112060546875, "learning_rate": 3e-06, "loss": -1.111, "step": 1604 }, { "epoch": 0.14266666666666666, "grad_norm": 212.12266540527344, "learning_rate": 3e-06, "loss": -7.2685, "step": 1605 }, { "epoch": 0.14275555555555555, "grad_norm": 242.00680541992188, "learning_rate": 3e-06, "loss": -7.1693, "step": 1606 }, { "epoch": 0.14284444444444444, "grad_norm": 271.86090087890625, "learning_rate": 3e-06, "loss": -10.3562, "step": 1607 }, { "epoch": 0.14293333333333333, "grad_norm": 233.82144165039062, "learning_rate": 3e-06, "loss": 4.7331, "step": 1608 }, { "completion_length": 254.1666717529297, "epoch": 0.14302222222222222, "grad_norm": 183.18118286132812, "learning_rate": 3e-06, "loss": -27.8021, "reward": 1.8958333730697632, "reward_std": 0.10206206887960434, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 1.2083333134651184, "step": 1609, "zero_std_ratio": 0.875 }, { "epoch": 0.1431111111111111, "grad_norm": 393.4090270996094, "learning_rate": 3e-06, "loss": -29.9682, "step": 1610 }, { "epoch": 0.1432, "grad_norm": 232.8114776611328, "learning_rate": 3e-06, "loss": -21.6474, "step": 1611 }, { "epoch": 0.14328888888888888, "grad_norm": 304.4367370605469, "learning_rate": 3e-06, "loss": -21.4802, "step": 1612 }, { "epoch": 0.14337777777777777, "grad_norm": 162.92181396484375, "learning_rate": 3e-06, "loss": -28.1092, "step": 1613 }, { "epoch": 0.14346666666666666, "grad_norm": 209.14356994628906, "learning_rate": 3e-06, "loss": -19.8933, "step": 1614 }, { "epoch": 0.14355555555555555, "grad_norm": 697.1129760742188, "learning_rate": 3e-06, "loss": -27.563, "step": 1615 }, { "epoch": 0.14364444444444444, "grad_norm": 311.41851806640625, "learning_rate": 3e-06, "loss": -33.0986, "step": 1616 }, { "epoch": 0.14373333333333332, "grad_norm": 257.729248046875, "learning_rate": 3e-06, "loss": -26.3909, "step": 1617 }, { "epoch": 0.1438222222222222, "grad_norm": 258.4046936035156, "learning_rate": 3e-06, "loss": -26.2488, "step": 1618 }, { "epoch": 0.1439111111111111, "grad_norm": 130.1263885498047, "learning_rate": 3e-06, "loss": -30.1948, "step": 1619 }, { "epoch": 0.144, "grad_norm": 182.67807006835938, "learning_rate": 3e-06, "loss": -23.4364, "step": 1620 }, { "completion_length": 249.4375, "epoch": 0.14408888888888888, "grad_norm": 354.3665771484375, "learning_rate": 3e-06, "loss": 8.1684, "reward": 1.3229166865348816, "reward_std": 0.15461497008800507, "rewards/boxed_and_answer_tags_format_reward": 0.7395833432674408, "rewards/correctness_reward_func_math": 0.5833333432674408, "step": 1621, "zero_std_ratio": 0.75 }, { "epoch": 0.14417777777777777, "grad_norm": 424.4937438964844, "learning_rate": 3e-06, "loss": -0.1374, "step": 1622 }, { "epoch": 0.14426666666666665, "grad_norm": 441.7712097167969, "learning_rate": 3e-06, "loss": -5.8721, "step": 1623 }, { "epoch": 0.14435555555555554, "grad_norm": 474.1778259277344, "learning_rate": 3e-06, "loss": -4.5006, "step": 1624 }, { "epoch": 0.14444444444444443, "grad_norm": 464.2291564941406, "learning_rate": 3e-06, "loss": -9.1126, "step": 1625 }, { "epoch": 0.14453333333333335, "grad_norm": 375.41595458984375, "learning_rate": 3e-06, "loss": -1.7981, "step": 1626 }, { "epoch": 0.14462222222222224, "grad_norm": 591.2286376953125, "learning_rate": 3e-06, "loss": -2.7549, "step": 1627 }, { "epoch": 0.14471111111111112, "grad_norm": 269.577880859375, "learning_rate": 3e-06, "loss": -15.3618, "step": 1628 }, { "epoch": 0.1448, "grad_norm": 366.1959533691406, "learning_rate": 3e-06, "loss": -17.4499, "step": 1629 }, { "epoch": 0.1448888888888889, "grad_norm": 248.89236450195312, "learning_rate": 3e-06, "loss": -18.4987, "step": 1630 }, { "epoch": 0.1449777777777778, "grad_norm": 173.4352569580078, "learning_rate": 3e-06, "loss": -16.0213, "step": 1631 }, { "epoch": 0.14506666666666668, "grad_norm": 558.5076904296875, "learning_rate": 3e-06, "loss": -18.2749, "step": 1632 }, { "completion_length": 251.20833587646484, "epoch": 0.14515555555555557, "grad_norm": 273.303955078125, "learning_rate": 3e-06, "loss": -13.4676, "reward": 1.6145833730697632, "reward_std": 0.33129163831472397, "rewards/boxed_and_answer_tags_format_reward": 0.6979166567325592, "rewards/correctness_reward_func_math": 0.9166666865348816, "step": 1633, "zero_std_ratio": 0.625 }, { "epoch": 0.14524444444444445, "grad_norm": 275.0503845214844, "learning_rate": 3e-06, "loss": -3.2551, "step": 1634 }, { "epoch": 0.14533333333333334, "grad_norm": 399.2369384765625, "learning_rate": 3e-06, "loss": -5.5698, "step": 1635 }, { "epoch": 0.14542222222222223, "grad_norm": 328.1588439941406, "learning_rate": 3e-06, "loss": -18.8269, "step": 1636 }, { "epoch": 0.14551111111111112, "grad_norm": 239.95028686523438, "learning_rate": 3e-06, "loss": -6.0394, "step": 1637 }, { "epoch": 0.1456, "grad_norm": 290.9996643066406, "learning_rate": 3e-06, "loss": -3.1009, "step": 1638 }, { "epoch": 0.1456888888888889, "grad_norm": 334.3653869628906, "learning_rate": 3e-06, "loss": -17.4325, "step": 1639 }, { "epoch": 0.14577777777777778, "grad_norm": 301.37139892578125, "learning_rate": 3e-06, "loss": -7.3761, "step": 1640 }, { "epoch": 0.14586666666666667, "grad_norm": 593.1119995117188, "learning_rate": 3e-06, "loss": -12.5855, "step": 1641 }, { "epoch": 0.14595555555555556, "grad_norm": 394.4963073730469, "learning_rate": 3e-06, "loss": -21.8609, "step": 1642 }, { "epoch": 0.14604444444444445, "grad_norm": 239.55348205566406, "learning_rate": 3e-06, "loss": -9.8962, "step": 1643 }, { "epoch": 0.14613333333333334, "grad_norm": 279.3672180175781, "learning_rate": 3e-06, "loss": -8.5061, "step": 1644 }, { "completion_length": 241.9166717529297, "epoch": 0.14622222222222223, "grad_norm": 477.223388671875, "learning_rate": 3e-06, "loss": -12.8053, "reward": 1.1875000596046448, "reward_std": 0.3332235887646675, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.4999999850988388, "step": 1645, "zero_std_ratio": 0.625 }, { "epoch": 0.14631111111111111, "grad_norm": 255.73138427734375, "learning_rate": 3e-06, "loss": 6.7089, "step": 1646 }, { "epoch": 0.1464, "grad_norm": 414.9920959472656, "learning_rate": 3e-06, "loss": -17.6565, "step": 1647 }, { "epoch": 0.1464888888888889, "grad_norm": 362.6864318847656, "learning_rate": 3e-06, "loss": -5.9471, "step": 1648 }, { "epoch": 0.14657777777777778, "grad_norm": 257.5548400878906, "learning_rate": 3e-06, "loss": 7.0843, "step": 1649 }, { "epoch": 0.14666666666666667, "grad_norm": 407.9867248535156, "learning_rate": 3e-06, "loss": -8.5254, "step": 1650 }, { "epoch": 0.14675555555555556, "grad_norm": 326.8368225097656, "learning_rate": 3e-06, "loss": -14.7068, "step": 1651 }, { "epoch": 0.14684444444444444, "grad_norm": 222.26805114746094, "learning_rate": 3e-06, "loss": 4.1678, "step": 1652 }, { "epoch": 0.14693333333333333, "grad_norm": 361.6373596191406, "learning_rate": 3e-06, "loss": -22.3731, "step": 1653 }, { "epoch": 0.14702222222222222, "grad_norm": 331.17803955078125, "learning_rate": 3e-06, "loss": -9.9171, "step": 1654 }, { "epoch": 0.1471111111111111, "grad_norm": 365.830078125, "learning_rate": 3e-06, "loss": 1.295, "step": 1655 }, { "epoch": 0.1472, "grad_norm": 400.97723388671875, "learning_rate": 3e-06, "loss": -15.896, "step": 1656 }, { "completion_length": 246.3541717529297, "epoch": 0.14728888888888889, "grad_norm": 407.52398681640625, "learning_rate": 3e-06, "loss": -95.2234, "reward": 1.4583333730697632, "reward_std": 0.6184598803520203, "rewards/boxed_and_answer_tags_format_reward": 0.5, "rewards/correctness_reward_func_math": 0.9583333432674408, "step": 1657, "zero_std_ratio": 0.375 }, { "epoch": 0.14737777777777777, "grad_norm": 386.23016357421875, "learning_rate": 3e-06, "loss": -94.5723, "step": 1658 }, { "epoch": 0.14746666666666666, "grad_norm": 337.7243347167969, "learning_rate": 3e-06, "loss": -78.7432, "step": 1659 }, { "epoch": 0.14755555555555555, "grad_norm": 391.17547607421875, "learning_rate": 3e-06, "loss": -95.9404, "step": 1660 }, { "epoch": 0.14764444444444444, "grad_norm": 462.5135498046875, "learning_rate": 3e-06, "loss": -69.7866, "step": 1661 }, { "epoch": 0.14773333333333333, "grad_norm": 326.1936340332031, "learning_rate": 3e-06, "loss": -70.0077, "step": 1662 }, { "epoch": 0.14782222222222222, "grad_norm": 417.0813903808594, "learning_rate": 3e-06, "loss": -104.9765, "step": 1663 }, { "epoch": 0.1479111111111111, "grad_norm": 407.79150390625, "learning_rate": 3e-06, "loss": -99.052, "step": 1664 }, { "epoch": 0.148, "grad_norm": 459.3847961425781, "learning_rate": 3e-06, "loss": -89.9644, "step": 1665 }, { "epoch": 0.14808888888888888, "grad_norm": 449.5730285644531, "learning_rate": 3e-06, "loss": -106.9796, "step": 1666 }, { "epoch": 0.14817777777777777, "grad_norm": 431.5627746582031, "learning_rate": 3e-06, "loss": -78.6433, "step": 1667 }, { "epoch": 0.14826666666666666, "grad_norm": 356.53759765625, "learning_rate": 3e-06, "loss": -78.1133, "step": 1668 }, { "completion_length": 250.12500762939453, "epoch": 0.14835555555555555, "grad_norm": 798.4279174804688, "learning_rate": 3e-06, "loss": 13.9371, "reward": 1.2500000596046448, "reward_std": 0.778884083032608, "rewards/boxed_and_answer_tags_format_reward": 0.625, "rewards/correctness_reward_func_math": 0.625, "step": 1669, "zero_std_ratio": 0.25 }, { "epoch": 0.14844444444444443, "grad_norm": 674.0550537109375, "learning_rate": 3e-06, "loss": -39.024, "step": 1670 }, { "epoch": 0.14853333333333332, "grad_norm": 686.0006103515625, "learning_rate": 3e-06, "loss": -26.081, "step": 1671 }, { "epoch": 0.1486222222222222, "grad_norm": 854.5956420898438, "learning_rate": 3e-06, "loss": -6.9918, "step": 1672 }, { "epoch": 0.1487111111111111, "grad_norm": 588.1673583984375, "learning_rate": 3e-06, "loss": -41.4908, "step": 1673 }, { "epoch": 0.1488, "grad_norm": 802.58544921875, "learning_rate": 3e-06, "loss": -9.7048, "step": 1674 }, { "epoch": 0.14888888888888888, "grad_norm": 842.8211669921875, "learning_rate": 3e-06, "loss": 3.388, "step": 1675 }, { "epoch": 0.14897777777777776, "grad_norm": 603.185791015625, "learning_rate": 3e-06, "loss": -51.6478, "step": 1676 }, { "epoch": 0.14906666666666665, "grad_norm": 667.0884399414062, "learning_rate": 3e-06, "loss": -34.1086, "step": 1677 }, { "epoch": 0.14915555555555557, "grad_norm": 702.9710693359375, "learning_rate": 3e-06, "loss": -14.8077, "step": 1678 }, { "epoch": 0.14924444444444446, "grad_norm": 887.6446533203125, "learning_rate": 3e-06, "loss": -53.0034, "step": 1679 }, { "epoch": 0.14933333333333335, "grad_norm": 738.0032348632812, "learning_rate": 3e-06, "loss": -19.4303, "step": 1680 }, { "completion_length": 252.8541717529297, "epoch": 0.14942222222222223, "grad_norm": 563.926513671875, "learning_rate": 3e-06, "loss": 4.8877, "reward": 1.0625, "reward_std": 0.43528565764427185, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.375, "step": 1681, "zero_std_ratio": 0.5 }, { "epoch": 0.14951111111111112, "grad_norm": 571.0492553710938, "learning_rate": 3e-06, "loss": -12.3935, "step": 1682 }, { "epoch": 0.1496, "grad_norm": 365.8373107910156, "learning_rate": 3e-06, "loss": -5.9182, "step": 1683 }, { "epoch": 0.1496888888888889, "grad_norm": 700.759033203125, "learning_rate": 3e-06, "loss": 14.5414, "step": 1684 }, { "epoch": 0.1497777777777778, "grad_norm": 600.5524291992188, "learning_rate": 3e-06, "loss": 4.631, "step": 1685 }, { "epoch": 0.14986666666666668, "grad_norm": 505.4430847167969, "learning_rate": 3e-06, "loss": -27.7184, "step": 1686 }, { "epoch": 0.14995555555555556, "grad_norm": 464.21759033203125, "learning_rate": 3e-06, "loss": -0.9671, "step": 1687 }, { "epoch": 0.15004444444444445, "grad_norm": 500.7622985839844, "learning_rate": 3e-06, "loss": -18.8001, "step": 1688 }, { "epoch": 0.15013333333333334, "grad_norm": 369.3395080566406, "learning_rate": 3e-06, "loss": -9.8684, "step": 1689 }, { "epoch": 0.15022222222222223, "grad_norm": 881.677001953125, "learning_rate": 3e-06, "loss": 6.5321, "step": 1690 }, { "epoch": 0.15031111111111112, "grad_norm": 586.9358520507812, "learning_rate": 3e-06, "loss": -3.3023, "step": 1691 }, { "epoch": 0.1504, "grad_norm": 406.6219787597656, "learning_rate": 3e-06, "loss": -33.3837, "step": 1692 }, { "completion_length": 250.125, "epoch": 0.1504888888888889, "grad_norm": 382.4415283203125, "learning_rate": 3e-06, "loss": 19.5614, "reward": 1.3750000596046448, "reward_std": 0.3602609932422638, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 0.625, "step": 1693, "zero_std_ratio": 0.625 }, { "epoch": 0.15057777777777778, "grad_norm": 349.4650573730469, "learning_rate": 3e-06, "loss": 22.1681, "step": 1694 }, { "epoch": 0.15066666666666667, "grad_norm": 306.9416809082031, "learning_rate": 3e-06, "loss": 34.961, "step": 1695 }, { "epoch": 0.15075555555555556, "grad_norm": 353.93121337890625, "learning_rate": 3e-06, "loss": 41.0782, "step": 1696 }, { "epoch": 0.15084444444444445, "grad_norm": 213.22996520996094, "learning_rate": 3e-06, "loss": 26.4404, "step": 1697 }, { "epoch": 0.15093333333333334, "grad_norm": 356.7481994628906, "learning_rate": 3e-06, "loss": 16.8735, "step": 1698 }, { "epoch": 0.15102222222222222, "grad_norm": 394.1246643066406, "learning_rate": 3e-06, "loss": 15.6199, "step": 1699 }, { "epoch": 0.1511111111111111, "grad_norm": 358.08831787109375, "learning_rate": 3e-06, "loss": 19.0218, "step": 1700 }, { "epoch": 0.1512, "grad_norm": 290.66485595703125, "learning_rate": 3e-06, "loss": 30.4685, "step": 1701 }, { "epoch": 0.1512888888888889, "grad_norm": 427.0489501953125, "learning_rate": 3e-06, "loss": 35.7429, "step": 1702 }, { "epoch": 0.15137777777777778, "grad_norm": 218.4366912841797, "learning_rate": 3e-06, "loss": 22.0305, "step": 1703 }, { "epoch": 0.15146666666666667, "grad_norm": 375.6819152832031, "learning_rate": 3e-06, "loss": 14.9698, "step": 1704 }, { "completion_length": 232.77083587646484, "epoch": 0.15155555555555555, "grad_norm": 213.29344177246094, "learning_rate": 3e-06, "loss": 5.0413, "reward": 1.4791666865348816, "reward_std": 0.10206206887960434, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.7916666567325592, "step": 1705, "zero_std_ratio": 0.875 }, { "epoch": 0.15164444444444444, "grad_norm": 168.3350067138672, "learning_rate": 3e-06, "loss": -3.6939, "step": 1706 }, { "epoch": 0.15173333333333333, "grad_norm": 159.5379638671875, "learning_rate": 3e-06, "loss": 1.1446, "step": 1707 }, { "epoch": 0.15182222222222222, "grad_norm": 144.53854370117188, "learning_rate": 3e-06, "loss": 3.3238, "step": 1708 }, { "epoch": 0.1519111111111111, "grad_norm": 174.06390380859375, "learning_rate": 3e-06, "loss": -0.1332, "step": 1709 }, { "epoch": 0.152, "grad_norm": 124.43144989013672, "learning_rate": 3e-06, "loss": -4.2987, "step": 1710 }, { "epoch": 0.15208888888888888, "grad_norm": 242.69232177734375, "learning_rate": 3e-06, "loss": 1.5975, "step": 1711 }, { "epoch": 0.15217777777777777, "grad_norm": 172.77381896972656, "learning_rate": 3e-06, "loss": -5.9669, "step": 1712 }, { "epoch": 0.15226666666666666, "grad_norm": 139.244873046875, "learning_rate": 3e-06, "loss": -1.5727, "step": 1713 }, { "epoch": 0.15235555555555555, "grad_norm": 133.6866455078125, "learning_rate": 3e-06, "loss": -0.2409, "step": 1714 }, { "epoch": 0.15244444444444444, "grad_norm": 171.1123809814453, "learning_rate": 3e-06, "loss": -3.8625, "step": 1715 }, { "epoch": 0.15253333333333333, "grad_norm": 110.9591064453125, "learning_rate": 3e-06, "loss": -6.3714, "step": 1716 }, { "completion_length": 254.12500762939453, "epoch": 0.15262222222222221, "grad_norm": 464.7311096191406, "learning_rate": 3e-06, "loss": -16.6423, "reward": 1.2916666865348816, "reward_std": 0.48936043679714203, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 0.5416666567325592, "step": 1717, "zero_std_ratio": 0.5 }, { "epoch": 0.1527111111111111, "grad_norm": 429.9415588378906, "learning_rate": 3e-06, "loss": -21.4782, "step": 1718 }, { "epoch": 0.1528, "grad_norm": 415.4131774902344, "learning_rate": 3e-06, "loss": -38.5279, "step": 1719 }, { "epoch": 0.15288888888888888, "grad_norm": 589.5310668945312, "learning_rate": 3e-06, "loss": -19.3757, "step": 1720 }, { "epoch": 0.15297777777777777, "grad_norm": 492.2614440917969, "learning_rate": 3e-06, "loss": -28.3813, "step": 1721 }, { "epoch": 0.15306666666666666, "grad_norm": 381.6370849609375, "learning_rate": 3e-06, "loss": -23.7667, "step": 1722 }, { "epoch": 0.15315555555555554, "grad_norm": 448.4298095703125, "learning_rate": 3e-06, "loss": -20.063, "step": 1723 }, { "epoch": 0.15324444444444443, "grad_norm": 351.5326232910156, "learning_rate": 3e-06, "loss": -25.6929, "step": 1724 }, { "epoch": 0.15333333333333332, "grad_norm": 951.7434692382812, "learning_rate": 3e-06, "loss": -43.7247, "step": 1725 }, { "epoch": 0.1534222222222222, "grad_norm": 457.6904602050781, "learning_rate": 3e-06, "loss": -27.8251, "step": 1726 }, { "epoch": 0.1535111111111111, "grad_norm": 433.86907958984375, "learning_rate": 3e-06, "loss": -37.9928, "step": 1727 }, { "epoch": 0.1536, "grad_norm": 506.2419128417969, "learning_rate": 3e-06, "loss": -31.6119, "step": 1728 }, { "completion_length": 250.25000762939453, "epoch": 0.1536888888888889, "grad_norm": 638.6153564453125, "learning_rate": 3e-06, "loss": -60.7029, "reward": 1.4895833730697632, "reward_std": 0.7616997957229614, "rewards/boxed_and_answer_tags_format_reward": 0.7395833432674408, "rewards/correctness_reward_func_math": 0.75, "step": 1729, "zero_std_ratio": 0.125 }, { "epoch": 0.1537777777777778, "grad_norm": 645.9075927734375, "learning_rate": 3e-06, "loss": -100.7913, "step": 1730 }, { "epoch": 0.15386666666666668, "grad_norm": 769.6129760742188, "learning_rate": 3e-06, "loss": -41.8559, "step": 1731 }, { "epoch": 0.15395555555555557, "grad_norm": 594.3479614257812, "learning_rate": 3e-06, "loss": -85.2107, "step": 1732 }, { "epoch": 0.15404444444444446, "grad_norm": 513.4801635742188, "learning_rate": 3e-06, "loss": -78.7403, "step": 1733 }, { "epoch": 0.15413333333333334, "grad_norm": 596.7926635742188, "learning_rate": 3e-06, "loss": -38.3655, "step": 1734 }, { "epoch": 0.15422222222222223, "grad_norm": 762.2822875976562, "learning_rate": 3e-06, "loss": -68.4507, "step": 1735 }, { "epoch": 0.15431111111111112, "grad_norm": 507.63958740234375, "learning_rate": 3e-06, "loss": -108.803, "step": 1736 }, { "epoch": 0.1544, "grad_norm": 657.9226684570312, "learning_rate": 3e-06, "loss": -56.9832, "step": 1737 }, { "epoch": 0.1544888888888889, "grad_norm": 604.657958984375, "learning_rate": 3e-06, "loss": -96.6592, "step": 1738 }, { "epoch": 0.1545777777777778, "grad_norm": 491.42047119140625, "learning_rate": 3e-06, "loss": -87.8488, "step": 1739 }, { "epoch": 0.15466666666666667, "grad_norm": 542.5538940429688, "learning_rate": 3e-06, "loss": -52.5061, "step": 1740 }, { "completion_length": 241.3125, "epoch": 0.15475555555555556, "grad_norm": 257.529541015625, "learning_rate": 3e-06, "loss": -24.5069, "reward": 1.1250000596046448, "reward_std": 0.23116152733564377, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 0.375, "step": 1741, "zero_std_ratio": 0.75 }, { "epoch": 0.15484444444444445, "grad_norm": 272.9305725097656, "learning_rate": 3e-06, "loss": -31.0682, "step": 1742 }, { "epoch": 0.15493333333333334, "grad_norm": 241.25071716308594, "learning_rate": 3e-06, "loss": -32.0895, "step": 1743 }, { "epoch": 0.15502222222222223, "grad_norm": 208.89321899414062, "learning_rate": 3e-06, "loss": -28.8566, "step": 1744 }, { "epoch": 0.15511111111111112, "grad_norm": 186.21788024902344, "learning_rate": 3e-06, "loss": -25.5258, "step": 1745 }, { "epoch": 0.1552, "grad_norm": 208.84288024902344, "learning_rate": 3e-06, "loss": -36.6331, "step": 1746 }, { "epoch": 0.1552888888888889, "grad_norm": 273.28900146484375, "learning_rate": 3e-06, "loss": -26.8255, "step": 1747 }, { "epoch": 0.15537777777777778, "grad_norm": 255.3370361328125, "learning_rate": 3e-06, "loss": -35.3528, "step": 1748 }, { "epoch": 0.15546666666666667, "grad_norm": 265.7087097167969, "learning_rate": 3e-06, "loss": -37.0957, "step": 1749 }, { "epoch": 0.15555555555555556, "grad_norm": 174.1486358642578, "learning_rate": 3e-06, "loss": -31.8935, "step": 1750 }, { "epoch": 0.15564444444444445, "grad_norm": 208.65518188476562, "learning_rate": 3e-06, "loss": -29.0371, "step": 1751 }, { "epoch": 0.15573333333333333, "grad_norm": 172.7078857421875, "learning_rate": 3e-06, "loss": -39.3675, "step": 1752 }, { "completion_length": 252.39583587646484, "epoch": 0.15582222222222222, "grad_norm": 451.1358947753906, "learning_rate": 3e-06, "loss": -66.2963, "reward": 1.3541667461395264, "reward_std": 0.6070848852396011, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.6666666716337204, "step": 1753, "zero_std_ratio": 0.375 }, { "epoch": 0.1559111111111111, "grad_norm": 447.5511169433594, "learning_rate": 3e-06, "loss": -65.7659, "step": 1754 }, { "epoch": 0.156, "grad_norm": 481.50335693359375, "learning_rate": 3e-06, "loss": -67.9006, "step": 1755 }, { "epoch": 0.1560888888888889, "grad_norm": 418.3782043457031, "learning_rate": 3e-06, "loss": -59.6285, "step": 1756 }, { "epoch": 0.15617777777777778, "grad_norm": 487.20574951171875, "learning_rate": 3e-06, "loss": -82.0834, "step": 1757 }, { "epoch": 0.15626666666666666, "grad_norm": 454.06463623046875, "learning_rate": 3e-06, "loss": -57.3851, "step": 1758 }, { "epoch": 0.15635555555555555, "grad_norm": 408.6988830566406, "learning_rate": 3e-06, "loss": -69.432, "step": 1759 }, { "epoch": 0.15644444444444444, "grad_norm": 399.9183349609375, "learning_rate": 3e-06, "loss": -71.3301, "step": 1760 }, { "epoch": 0.15653333333333333, "grad_norm": 577.1817626953125, "learning_rate": 3e-06, "loss": -74.2775, "step": 1761 }, { "epoch": 0.15662222222222222, "grad_norm": 413.5326843261719, "learning_rate": 3e-06, "loss": -63.2282, "step": 1762 }, { "epoch": 0.1567111111111111, "grad_norm": 498.8305358886719, "learning_rate": 3e-06, "loss": -88.1602, "step": 1763 }, { "epoch": 0.1568, "grad_norm": 530.10595703125, "learning_rate": 3e-06, "loss": -64.1951, "step": 1764 }, { "completion_length": 247.00000762939453, "epoch": 0.15688888888888888, "grad_norm": 433.1192321777344, "learning_rate": 3e-06, "loss": -27.1999, "reward": 1.6458333730697632, "reward_std": 0.3602609783411026, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.9583333432674408, "step": 1765, "zero_std_ratio": 0.625 }, { "epoch": 0.15697777777777777, "grad_norm": 350.172119140625, "learning_rate": 3e-06, "loss": -32.7846, "step": 1766 }, { "epoch": 0.15706666666666666, "grad_norm": 443.9689636230469, "learning_rate": 3e-06, "loss": -25.8718, "step": 1767 }, { "epoch": 0.15715555555555555, "grad_norm": 491.6800537109375, "learning_rate": 3e-06, "loss": -21.288, "step": 1768 }, { "epoch": 0.15724444444444444, "grad_norm": 490.1741943359375, "learning_rate": 3e-06, "loss": -30.8959, "step": 1769 }, { "epoch": 0.15733333333333333, "grad_norm": 403.3340759277344, "learning_rate": 3e-06, "loss": -17.7178, "step": 1770 }, { "epoch": 0.1574222222222222, "grad_norm": 434.2870178222656, "learning_rate": 3e-06, "loss": -28.6675, "step": 1771 }, { "epoch": 0.1575111111111111, "grad_norm": 390.14208984375, "learning_rate": 3e-06, "loss": -35.2631, "step": 1772 }, { "epoch": 0.1576, "grad_norm": 545.4449462890625, "learning_rate": 3e-06, "loss": -27.2021, "step": 1773 }, { "epoch": 0.15768888888888888, "grad_norm": 409.76416015625, "learning_rate": 3e-06, "loss": -25.2074, "step": 1774 }, { "epoch": 0.15777777777777777, "grad_norm": 1407.31787109375, "learning_rate": 3e-06, "loss": -32.1991, "step": 1775 }, { "epoch": 0.15786666666666666, "grad_norm": 409.153076171875, "learning_rate": 3e-06, "loss": -21.3904, "step": 1776 }, { "completion_length": 255.6666717529297, "epoch": 0.15795555555555554, "grad_norm": 250.8477020263672, "learning_rate": 3e-06, "loss": -20.1606, "reward": 1.0625000596046448, "reward_std": 0.23116152733564377, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.3750000111758709, "step": 1777, "zero_std_ratio": 0.75 }, { "epoch": 0.15804444444444443, "grad_norm": 295.9480895996094, "learning_rate": 3e-06, "loss": -4.3906, "step": 1778 }, { "epoch": 0.15813333333333332, "grad_norm": 264.98590087890625, "learning_rate": 3e-06, "loss": 1.1149, "step": 1779 }, { "epoch": 0.1582222222222222, "grad_norm": 282.2425537109375, "learning_rate": 3e-06, "loss": -13.9569, "step": 1780 }, { "epoch": 0.15831111111111112, "grad_norm": 269.65191650390625, "learning_rate": 3e-06, "loss": -12.0395, "step": 1781 }, { "epoch": 0.1584, "grad_norm": 262.14825439453125, "learning_rate": 3e-06, "loss": -17.0425, "step": 1782 }, { "epoch": 0.1584888888888889, "grad_norm": 226.9910888671875, "learning_rate": 3e-06, "loss": -22.3367, "step": 1783 }, { "epoch": 0.1585777777777778, "grad_norm": 268.9870300292969, "learning_rate": 3e-06, "loss": -7.3806, "step": 1784 }, { "epoch": 0.15866666666666668, "grad_norm": 252.59866333007812, "learning_rate": 3e-06, "loss": -2.0587, "step": 1785 }, { "epoch": 0.15875555555555557, "grad_norm": 285.8102111816406, "learning_rate": 3e-06, "loss": -17.0219, "step": 1786 }, { "epoch": 0.15884444444444445, "grad_norm": 281.6475830078125, "learning_rate": 3e-06, "loss": -16.6806, "step": 1787 }, { "epoch": 0.15893333333333334, "grad_norm": 218.4907989501953, "learning_rate": 3e-06, "loss": -22.657, "step": 1788 }, { "completion_length": 252.0625, "epoch": 0.15902222222222223, "grad_norm": 365.0342102050781, "learning_rate": 3e-06, "loss": 4.7637, "reward": 1.0416666865348816, "reward_std": 0.23899271339178085, "rewards/boxed_and_answer_tags_format_reward": 0.625, "rewards/correctness_reward_func_math": 0.4166666679084301, "step": 1789, "zero_std_ratio": 0.75 }, { "epoch": 0.15911111111111112, "grad_norm": 306.0011291503906, "learning_rate": 3e-06, "loss": -12.1321, "step": 1790 }, { "epoch": 0.1592, "grad_norm": 385.5239562988281, "learning_rate": 3e-06, "loss": -11.9155, "step": 1791 }, { "epoch": 0.1592888888888889, "grad_norm": 502.8174743652344, "learning_rate": 3e-06, "loss": -9.7161, "step": 1792 }, { "epoch": 0.15937777777777778, "grad_norm": 286.40557861328125, "learning_rate": 3e-06, "loss": -0.1257, "step": 1793 }, { "epoch": 0.15946666666666667, "grad_norm": 250.90745544433594, "learning_rate": 3e-06, "loss": -5.5968, "step": 1794 }, { "epoch": 0.15955555555555556, "grad_norm": 337.0714416503906, "learning_rate": 3e-06, "loss": 1.3961, "step": 1795 }, { "epoch": 0.15964444444444445, "grad_norm": 279.7541198730469, "learning_rate": 3e-06, "loss": -15.7637, "step": 1796 }, { "epoch": 0.15973333333333334, "grad_norm": 293.27703857421875, "learning_rate": 3e-06, "loss": -16.9533, "step": 1797 }, { "epoch": 0.15982222222222223, "grad_norm": 398.4286193847656, "learning_rate": 3e-06, "loss": -14.4282, "step": 1798 }, { "epoch": 0.15991111111111111, "grad_norm": 314.15338134765625, "learning_rate": 3e-06, "loss": -2.8244, "step": 1799 }, { "epoch": 0.16, "grad_norm": 302.6134338378906, "learning_rate": 3e-06, "loss": -11.5238, "step": 1800 }, { "completion_length": 251.93750762939453, "epoch": 0.1600888888888889, "grad_norm": 598.98095703125, "learning_rate": 3e-06, "loss": -25.8615, "reward": 1.5, "reward_std": 0.20412415266036987, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 0.75, "step": 1801, "zero_std_ratio": 0.75 }, { "epoch": 0.16017777777777778, "grad_norm": 618.0178833007812, "learning_rate": 3e-06, "loss": -21.1122, "step": 1802 }, { "epoch": 0.16026666666666667, "grad_norm": 541.5601806640625, "learning_rate": 3e-06, "loss": -11.0087, "step": 1803 }, { "epoch": 0.16035555555555556, "grad_norm": 470.309814453125, "learning_rate": 3e-06, "loss": -22.2437, "step": 1804 }, { "epoch": 0.16044444444444445, "grad_norm": 413.4715270996094, "learning_rate": 3e-06, "loss": -33.0321, "step": 1805 }, { "epoch": 0.16053333333333333, "grad_norm": 376.2085266113281, "learning_rate": 3e-06, "loss": -25.3322, "step": 1806 }, { "epoch": 0.16062222222222222, "grad_norm": 322.1478576660156, "learning_rate": 3e-06, "loss": -37.8067, "step": 1807 }, { "epoch": 0.1607111111111111, "grad_norm": 272.68951416015625, "learning_rate": 3e-06, "loss": -34.8264, "step": 1808 }, { "epoch": 0.1608, "grad_norm": 365.9797058105469, "learning_rate": 3e-06, "loss": -24.8187, "step": 1809 }, { "epoch": 0.1608888888888889, "grad_norm": 314.506591796875, "learning_rate": 3e-06, "loss": -32.5667, "step": 1810 }, { "epoch": 0.16097777777777778, "grad_norm": 410.4781494140625, "learning_rate": 3e-06, "loss": -43.7721, "step": 1811 }, { "epoch": 0.16106666666666666, "grad_norm": 323.0709228515625, "learning_rate": 3e-06, "loss": -32.8399, "step": 1812 }, { "completion_length": 253.14583587646484, "epoch": 0.16115555555555555, "grad_norm": 490.2536315917969, "learning_rate": 3e-06, "loss": -24.4492, "reward": 1.4375000596046448, "reward_std": 0.3680921420454979, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.7500000149011612, "step": 1813, "zero_std_ratio": 0.625 }, { "epoch": 0.16124444444444444, "grad_norm": 376.43231201171875, "learning_rate": 3e-06, "loss": -22.8531, "step": 1814 }, { "epoch": 0.16133333333333333, "grad_norm": 370.8981018066406, "learning_rate": 3e-06, "loss": -6.1589, "step": 1815 }, { "epoch": 0.16142222222222222, "grad_norm": 350.6385498046875, "learning_rate": 3e-06, "loss": -17.2146, "step": 1816 }, { "epoch": 0.1615111111111111, "grad_norm": 413.9906311035156, "learning_rate": 3e-06, "loss": -13.1823, "step": 1817 }, { "epoch": 0.1616, "grad_norm": 511.4176940917969, "learning_rate": 3e-06, "loss": -23.6257, "step": 1818 }, { "epoch": 0.16168888888888888, "grad_norm": 425.5303039550781, "learning_rate": 3e-06, "loss": -29.0499, "step": 1819 }, { "epoch": 0.16177777777777777, "grad_norm": 329.032958984375, "learning_rate": 3e-06, "loss": -28.0294, "step": 1820 }, { "epoch": 0.16186666666666666, "grad_norm": 482.425537109375, "learning_rate": 3e-06, "loss": -8.7538, "step": 1821 }, { "epoch": 0.16195555555555555, "grad_norm": 422.88494873046875, "learning_rate": 3e-06, "loss": -22.7747, "step": 1822 }, { "epoch": 0.16204444444444444, "grad_norm": 394.9844055175781, "learning_rate": 3e-06, "loss": -18.7768, "step": 1823 }, { "epoch": 0.16213333333333332, "grad_norm": 449.5504455566406, "learning_rate": 3e-06, "loss": -25.7761, "step": 1824 }, { "completion_length": 240.8541717529297, "epoch": 0.1622222222222222, "grad_norm": 474.4821472167969, "learning_rate": 3e-06, "loss": 5.9184, "reward": 1.5416666865348816, "reward_std": 0.3602609857916832, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 0.7916666865348816, "step": 1825, "zero_std_ratio": 0.625 }, { "epoch": 0.1623111111111111, "grad_norm": 394.9033203125, "learning_rate": 3e-06, "loss": -8.6183, "step": 1826 }, { "epoch": 0.1624, "grad_norm": 585.3305053710938, "learning_rate": 3e-06, "loss": 17.2736, "step": 1827 }, { "epoch": 0.16248888888888888, "grad_norm": 411.5712585449219, "learning_rate": 3e-06, "loss": 2.0057, "step": 1828 }, { "epoch": 0.16257777777777777, "grad_norm": 326.8497619628906, "learning_rate": 3e-06, "loss": 5.208, "step": 1829 }, { "epoch": 0.16266666666666665, "grad_norm": 621.4788208007812, "learning_rate": 3e-06, "loss": -6.1221, "step": 1830 }, { "epoch": 0.16275555555555554, "grad_norm": 628.4518432617188, "learning_rate": 3e-06, "loss": 3.4633, "step": 1831 }, { "epoch": 0.16284444444444446, "grad_norm": 389.485595703125, "learning_rate": 3e-06, "loss": -14.53, "step": 1832 }, { "epoch": 0.16293333333333335, "grad_norm": 760.7333374023438, "learning_rate": 3e-06, "loss": 11.3702, "step": 1833 }, { "epoch": 0.16302222222222224, "grad_norm": 327.4570617675781, "learning_rate": 3e-06, "loss": -4.0738, "step": 1834 }, { "epoch": 0.16311111111111112, "grad_norm": 325.6021728515625, "learning_rate": 3e-06, "loss": -0.9882, "step": 1835 }, { "epoch": 0.1632, "grad_norm": 434.9088439941406, "learning_rate": 3e-06, "loss": -14.5048, "step": 1836 }, { "completion_length": 252.3541717529297, "epoch": 0.1632888888888889, "grad_norm": 364.32318115234375, "learning_rate": 3e-06, "loss": 5.0981, "reward": 1.520833432674408, "reward_std": 0.20412413775920868, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.8333333544433117, "step": 1837, "zero_std_ratio": 0.75 }, { "epoch": 0.1633777777777778, "grad_norm": 327.6516418457031, "learning_rate": 3e-06, "loss": 13.1354, "step": 1838 }, { "epoch": 0.16346666666666668, "grad_norm": 327.8387145996094, "learning_rate": 3e-06, "loss": 7.7064, "step": 1839 }, { "epoch": 0.16355555555555557, "grad_norm": 483.0592346191406, "learning_rate": 3e-06, "loss": 9.8312, "step": 1840 }, { "epoch": 0.16364444444444445, "grad_norm": 409.86724853515625, "learning_rate": 3e-06, "loss": 13.0568, "step": 1841 }, { "epoch": 0.16373333333333334, "grad_norm": 282.73626708984375, "learning_rate": 3e-06, "loss": 3.5377, "step": 1842 }, { "epoch": 0.16382222222222223, "grad_norm": 262.7396240234375, "learning_rate": 3e-06, "loss": -0.2709, "step": 1843 }, { "epoch": 0.16391111111111112, "grad_norm": 375.62359619140625, "learning_rate": 3e-06, "loss": 8.4106, "step": 1844 }, { "epoch": 0.164, "grad_norm": 295.78814697265625, "learning_rate": 3e-06, "loss": 2.2675, "step": 1845 }, { "epoch": 0.1640888888888889, "grad_norm": 555.635009765625, "learning_rate": 3e-06, "loss": 1.7638, "step": 1846 }, { "epoch": 0.16417777777777778, "grad_norm": 362.09722900390625, "learning_rate": 3e-06, "loss": 2.102, "step": 1847 }, { "epoch": 0.16426666666666667, "grad_norm": 349.70440673828125, "learning_rate": 3e-06, "loss": -2.454, "step": 1848 }, { "completion_length": 249.08333587646484, "epoch": 0.16435555555555556, "grad_norm": 437.3484802246094, "learning_rate": 3e-06, "loss": -29.0477, "reward": 1.4166666865348816, "reward_std": 0.4701542556285858, "rewards/boxed_and_answer_tags_format_reward": 0.625, "rewards/correctness_reward_func_math": 0.7916666567325592, "step": 1849, "zero_std_ratio": 0.5 }, { "epoch": 0.16444444444444445, "grad_norm": 638.8463745117188, "learning_rate": 3e-06, "loss": -47.733, "step": 1850 }, { "epoch": 0.16453333333333334, "grad_norm": 460.33624267578125, "learning_rate": 3e-06, "loss": -34.4307, "step": 1851 }, { "epoch": 0.16462222222222223, "grad_norm": 491.926025390625, "learning_rate": 3e-06, "loss": -52.2731, "step": 1852 }, { "epoch": 0.1647111111111111, "grad_norm": 607.3854370117188, "learning_rate": 3e-06, "loss": -47.2157, "step": 1853 }, { "epoch": 0.1648, "grad_norm": 512.1332397460938, "learning_rate": 3e-06, "loss": -50.0155, "step": 1854 }, { "epoch": 0.1648888888888889, "grad_norm": 403.7186279296875, "learning_rate": 3e-06, "loss": -33.8899, "step": 1855 }, { "epoch": 0.16497777777777778, "grad_norm": 423.3454284667969, "learning_rate": 3e-06, "loss": -51.1046, "step": 1856 }, { "epoch": 0.16506666666666667, "grad_norm": 615.0731811523438, "learning_rate": 3e-06, "loss": -36.3091, "step": 1857 }, { "epoch": 0.16515555555555556, "grad_norm": 557.7341918945312, "learning_rate": 3e-06, "loss": -58.0938, "step": 1858 }, { "epoch": 0.16524444444444444, "grad_norm": 575.3082885742188, "learning_rate": 3e-06, "loss": -56.4051, "step": 1859 }, { "epoch": 0.16533333333333333, "grad_norm": 530.2061157226562, "learning_rate": 3e-06, "loss": -57.1385, "step": 1860 }, { "completion_length": 229.4791717529297, "epoch": 0.16542222222222222, "grad_norm": 290.0047912597656, "learning_rate": 3e-06, "loss": -26.4189, "reward": 1.4375000596046448, "reward_std": 0.3332236111164093, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.75, "step": 1861, "zero_std_ratio": 0.625 }, { "epoch": 0.1655111111111111, "grad_norm": 343.6731262207031, "learning_rate": 3e-06, "loss": -30.5266, "step": 1862 }, { "epoch": 0.1656, "grad_norm": 321.04400634765625, "learning_rate": 3e-06, "loss": -26.9367, "step": 1863 }, { "epoch": 0.16568888888888889, "grad_norm": 423.7695007324219, "learning_rate": 3e-06, "loss": -20.3331, "step": 1864 }, { "epoch": 0.16577777777777777, "grad_norm": 301.9991760253906, "learning_rate": 3e-06, "loss": -22.0128, "step": 1865 }, { "epoch": 0.16586666666666666, "grad_norm": 266.446533203125, "learning_rate": 3e-06, "loss": -13.6451, "step": 1866 }, { "epoch": 0.16595555555555555, "grad_norm": 289.8598937988281, "learning_rate": 3e-06, "loss": -29.9844, "step": 1867 }, { "epoch": 0.16604444444444444, "grad_norm": 474.86016845703125, "learning_rate": 3e-06, "loss": -32.2196, "step": 1868 }, { "epoch": 0.16613333333333333, "grad_norm": 314.1653137207031, "learning_rate": 3e-06, "loss": -29.7642, "step": 1869 }, { "epoch": 0.16622222222222222, "grad_norm": 362.4693603515625, "learning_rate": 3e-06, "loss": -24.4635, "step": 1870 }, { "epoch": 0.1663111111111111, "grad_norm": 312.52569580078125, "learning_rate": 3e-06, "loss": -24.009, "step": 1871 }, { "epoch": 0.1664, "grad_norm": 259.9459228515625, "learning_rate": 3e-06, "loss": -18.4052, "step": 1872 }, { "completion_length": 250.8541717529297, "epoch": 0.16648888888888888, "grad_norm": 374.1226501464844, "learning_rate": 3e-06, "loss": -4.5437, "reward": 1.6666667461395264, "reward_std": 0.20412414520978928, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 0.9166666865348816, "step": 1873, "zero_std_ratio": 0.75 }, { "epoch": 0.16657777777777777, "grad_norm": 501.4902038574219, "learning_rate": 3e-06, "loss": -16.285, "step": 1874 }, { "epoch": 0.16666666666666666, "grad_norm": 511.9213562011719, "learning_rate": 3e-06, "loss": -5.965, "step": 1875 }, { "epoch": 0.16675555555555555, "grad_norm": 377.8996276855469, "learning_rate": 3e-06, "loss": -11.742, "step": 1876 }, { "epoch": 0.16684444444444443, "grad_norm": 275.3067321777344, "learning_rate": 3e-06, "loss": -2.8953, "step": 1877 }, { "epoch": 0.16693333333333332, "grad_norm": 311.0989990234375, "learning_rate": 3e-06, "loss": 3.1096, "step": 1878 }, { "epoch": 0.1670222222222222, "grad_norm": 408.4767761230469, "learning_rate": 3e-06, "loss": -5.1197, "step": 1879 }, { "epoch": 0.1671111111111111, "grad_norm": 411.1174621582031, "learning_rate": 3e-06, "loss": -20.4177, "step": 1880 }, { "epoch": 0.1672, "grad_norm": 432.32159423828125, "learning_rate": 3e-06, "loss": -14.0135, "step": 1881 }, { "epoch": 0.16728888888888888, "grad_norm": 637.0897216796875, "learning_rate": 3e-06, "loss": -17.1271, "step": 1882 }, { "epoch": 0.16737777777777776, "grad_norm": 277.6168212890625, "learning_rate": 3e-06, "loss": -7.087, "step": 1883 }, { "epoch": 0.16746666666666668, "grad_norm": 364.540283203125, "learning_rate": 3e-06, "loss": 0.8844, "step": 1884 }, { "completion_length": 236.375, "epoch": 0.16755555555555557, "grad_norm": 518.0326538085938, "learning_rate": 3e-06, "loss": 7.3195, "reward": 1.5208333730697632, "reward_std": 0.3332235887646675, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.8333333283662796, "step": 1885, "zero_std_ratio": 0.625 }, { "epoch": 0.16764444444444446, "grad_norm": 477.8092346191406, "learning_rate": 3e-06, "loss": 9.9487, "step": 1886 }, { "epoch": 0.16773333333333335, "grad_norm": 573.36083984375, "learning_rate": 3e-06, "loss": -5.14, "step": 1887 }, { "epoch": 0.16782222222222223, "grad_norm": 552.39599609375, "learning_rate": 3e-06, "loss": 8.7078, "step": 1888 }, { "epoch": 0.16791111111111112, "grad_norm": 585.54296875, "learning_rate": 3e-06, "loss": 1.6238, "step": 1889 }, { "epoch": 0.168, "grad_norm": 431.30364990234375, "learning_rate": 3e-06, "loss": 2.3952, "step": 1890 }, { "epoch": 0.1680888888888889, "grad_norm": 878.414306640625, "learning_rate": 3e-06, "loss": 3.0619, "step": 1891 }, { "epoch": 0.1681777777777778, "grad_norm": 494.9808044433594, "learning_rate": 3e-06, "loss": 5.923, "step": 1892 }, { "epoch": 0.16826666666666668, "grad_norm": 434.52093505859375, "learning_rate": 3e-06, "loss": -13.3846, "step": 1893 }, { "epoch": 0.16835555555555556, "grad_norm": 355.6122131347656, "learning_rate": 3e-06, "loss": 1.8179, "step": 1894 }, { "epoch": 0.16844444444444445, "grad_norm": 613.9844970703125, "learning_rate": 3e-06, "loss": -2.4103, "step": 1895 }, { "epoch": 0.16853333333333334, "grad_norm": 337.2001953125, "learning_rate": 3e-06, "loss": -8.9448, "step": 1896 }, { "completion_length": 253.52084350585938, "epoch": 0.16862222222222223, "grad_norm": 439.8132629394531, "learning_rate": 3e-06, "loss": -50.7881, "reward": 1.291666716337204, "reward_std": 0.5373477265238762, "rewards/boxed_and_answer_tags_format_reward": 0.625, "rewards/correctness_reward_func_math": 0.6666666716337204, "step": 1897, "zero_std_ratio": 0.375 }, { "epoch": 0.16871111111111112, "grad_norm": 414.8184509277344, "learning_rate": 3e-06, "loss": -54.9974, "step": 1898 }, { "epoch": 0.1688, "grad_norm": 449.9412536621094, "learning_rate": 3e-06, "loss": -85.2961, "step": 1899 }, { "epoch": 0.1688888888888889, "grad_norm": 588.9105224609375, "learning_rate": 3e-06, "loss": -72.8215, "step": 1900 }, { "epoch": 0.16897777777777778, "grad_norm": 511.78790283203125, "learning_rate": 3e-06, "loss": -88.0678, "step": 1901 }, { "epoch": 0.16906666666666667, "grad_norm": 514.567138671875, "learning_rate": 3e-06, "loss": -90.2521, "step": 1902 }, { "epoch": 0.16915555555555556, "grad_norm": 394.65826416015625, "learning_rate": 3e-06, "loss": -56.8307, "step": 1903 }, { "epoch": 0.16924444444444445, "grad_norm": 463.7818603515625, "learning_rate": 3e-06, "loss": -62.5249, "step": 1904 }, { "epoch": 0.16933333333333334, "grad_norm": 579.9658813476562, "learning_rate": 3e-06, "loss": -90.807, "step": 1905 }, { "epoch": 0.16942222222222222, "grad_norm": 578.1275024414062, "learning_rate": 3e-06, "loss": -82.3221, "step": 1906 }, { "epoch": 0.1695111111111111, "grad_norm": 405.3597412109375, "learning_rate": 3e-06, "loss": -100.0349, "step": 1907 }, { "epoch": 0.1696, "grad_norm": 545.5985107421875, "learning_rate": 3e-06, "loss": -101.4231, "step": 1908 }, { "completion_length": 233.64584350585938, "epoch": 0.1696888888888889, "grad_norm": 459.9978942871094, "learning_rate": 3e-06, "loss": -49.3001, "reward": 1.7916667461395264, "reward_std": 0.4701542258262634, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 1.0416666865348816, "step": 1909, "zero_std_ratio": 0.5 }, { "epoch": 0.16977777777777778, "grad_norm": 383.19635009765625, "learning_rate": 3e-06, "loss": -29.5879, "step": 1910 }, { "epoch": 0.16986666666666667, "grad_norm": 451.4841003417969, "learning_rate": 3e-06, "loss": -53.6264, "step": 1911 }, { "epoch": 0.16995555555555555, "grad_norm": 429.31640625, "learning_rate": 3e-06, "loss": -41.3605, "step": 1912 }, { "epoch": 0.17004444444444444, "grad_norm": 476.54168701171875, "learning_rate": 3e-06, "loss": -33.034, "step": 1913 }, { "epoch": 0.17013333333333333, "grad_norm": 411.1586608886719, "learning_rate": 3e-06, "loss": -33.1554, "step": 1914 }, { "epoch": 0.17022222222222222, "grad_norm": 504.1165771484375, "learning_rate": 3e-06, "loss": -54.2275, "step": 1915 }, { "epoch": 0.1703111111111111, "grad_norm": 430.4921569824219, "learning_rate": 3e-06, "loss": -36.929, "step": 1916 }, { "epoch": 0.1704, "grad_norm": 488.67071533203125, "learning_rate": 3e-06, "loss": -59.2601, "step": 1917 }, { "epoch": 0.17048888888888888, "grad_norm": 381.2705078125, "learning_rate": 3e-06, "loss": -46.7398, "step": 1918 }, { "epoch": 0.17057777777777777, "grad_norm": 439.42071533203125, "learning_rate": 3e-06, "loss": -40.6781, "step": 1919 }, { "epoch": 0.17066666666666666, "grad_norm": 468.06365966796875, "learning_rate": 3e-06, "loss": -39.7694, "step": 1920 }, { "completion_length": 248.89583587646484, "epoch": 0.17075555555555555, "grad_norm": 389.05523681640625, "learning_rate": 3e-06, "loss": -15.4944, "reward": 1.2291667461395264, "reward_std": 0.23116152733564377, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.5416666716337204, "step": 1921, "zero_std_ratio": 0.75 }, { "epoch": 0.17084444444444444, "grad_norm": 437.8026428222656, "learning_rate": 3e-06, "loss": -1.488, "step": 1922 }, { "epoch": 0.17093333333333333, "grad_norm": 251.8334197998047, "learning_rate": 3e-06, "loss": -4.1204, "step": 1923 }, { "epoch": 0.17102222222222221, "grad_norm": 547.3317260742188, "learning_rate": 3e-06, "loss": -28.7346, "step": 1924 }, { "epoch": 0.1711111111111111, "grad_norm": 289.30096435546875, "learning_rate": 3e-06, "loss": -16.5796, "step": 1925 }, { "epoch": 0.1712, "grad_norm": 335.68682861328125, "learning_rate": 3e-06, "loss": -26.3341, "step": 1926 }, { "epoch": 0.17128888888888888, "grad_norm": 396.2536926269531, "learning_rate": 3e-06, "loss": -18.4726, "step": 1927 }, { "epoch": 0.17137777777777777, "grad_norm": 490.247802734375, "learning_rate": 3e-06, "loss": -8.9601, "step": 1928 }, { "epoch": 0.17146666666666666, "grad_norm": 386.6977844238281, "learning_rate": 3e-06, "loss": -11.4419, "step": 1929 }, { "epoch": 0.17155555555555554, "grad_norm": 547.1292114257812, "learning_rate": 3e-06, "loss": -34.8083, "step": 1930 }, { "epoch": 0.17164444444444443, "grad_norm": 253.44793701171875, "learning_rate": 3e-06, "loss": -20.5172, "step": 1931 }, { "epoch": 0.17173333333333332, "grad_norm": 536.4385375976562, "learning_rate": 3e-06, "loss": -35.105, "step": 1932 }, { "completion_length": 254.0416717529297, "epoch": 0.1718222222222222, "grad_norm": 513.8276977539062, "learning_rate": 3e-06, "loss": 3.0418, "reward": 1.625, "reward_std": 0.39512956142425537, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 0.875, "step": 1933, "zero_std_ratio": 0.625 }, { "epoch": 0.1719111111111111, "grad_norm": 700.8008422851562, "learning_rate": 3e-06, "loss": -24.1725, "step": 1934 }, { "epoch": 0.172, "grad_norm": 640.3589477539062, "learning_rate": 3e-06, "loss": -18.554, "step": 1935 }, { "epoch": 0.1720888888888889, "grad_norm": 507.8769836425781, "learning_rate": 3e-06, "loss": -30.509, "step": 1936 }, { "epoch": 0.1721777777777778, "grad_norm": 414.5351257324219, "learning_rate": 3e-06, "loss": -39.3267, "step": 1937 }, { "epoch": 0.17226666666666668, "grad_norm": 445.7782897949219, "learning_rate": 3e-06, "loss": -25.2095, "step": 1938 }, { "epoch": 0.17235555555555557, "grad_norm": 665.8450927734375, "learning_rate": 3e-06, "loss": -6.8353, "step": 1939 }, { "epoch": 0.17244444444444446, "grad_norm": 558.6971435546875, "learning_rate": 3e-06, "loss": -33.1913, "step": 1940 }, { "epoch": 0.17253333333333334, "grad_norm": 500.55841064453125, "learning_rate": 3e-06, "loss": -30.2144, "step": 1941 }, { "epoch": 0.17262222222222223, "grad_norm": 530.8914184570312, "learning_rate": 3e-06, "loss": -38.4873, "step": 1942 }, { "epoch": 0.17271111111111112, "grad_norm": 427.3404846191406, "learning_rate": 3e-06, "loss": -45.8584, "step": 1943 }, { "epoch": 0.1728, "grad_norm": 391.1449279785156, "learning_rate": 3e-06, "loss": -29.2224, "step": 1944 }, { "completion_length": 255.1041717529297, "epoch": 0.1728888888888889, "grad_norm": 763.403076171875, "learning_rate": 3e-06, "loss": -9.7223, "reward": 1.8333333730697632, "reward_std": 0.4971916079521179, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 1.0833333134651184, "step": 1945, "zero_std_ratio": 0.5 }, { "epoch": 0.17297777777777779, "grad_norm": 562.3991088867188, "learning_rate": 3e-06, "loss": -7.2214, "step": 1946 }, { "epoch": 0.17306666666666667, "grad_norm": 686.5684814453125, "learning_rate": 3e-06, "loss": 2.4798, "step": 1947 }, { "epoch": 0.17315555555555556, "grad_norm": 897.48046875, "learning_rate": 3e-06, "loss": -0.4508, "step": 1948 }, { "epoch": 0.17324444444444445, "grad_norm": 439.28924560546875, "learning_rate": 3e-06, "loss": -15.6095, "step": 1949 }, { "epoch": 0.17333333333333334, "grad_norm": 501.46044921875, "learning_rate": 3e-06, "loss": -7.5293, "step": 1950 }, { "epoch": 0.17342222222222223, "grad_norm": 659.4769287109375, "learning_rate": 3e-06, "loss": -14.7485, "step": 1951 }, { "epoch": 0.17351111111111112, "grad_norm": 528.5435180664062, "learning_rate": 3e-06, "loss": -14.1578, "step": 1952 }, { "epoch": 0.1736, "grad_norm": 827.1624145507812, "learning_rate": 3e-06, "loss": -3.8977, "step": 1953 }, { "epoch": 0.1736888888888889, "grad_norm": 593.5472412109375, "learning_rate": 3e-06, "loss": -4.4025, "step": 1954 }, { "epoch": 0.17377777777777778, "grad_norm": 444.0652160644531, "learning_rate": 3e-06, "loss": -18.2012, "step": 1955 }, { "epoch": 0.17386666666666667, "grad_norm": 478.5816345214844, "learning_rate": 3e-06, "loss": -13.0243, "step": 1956 }, { "completion_length": 253.25000762939453, "epoch": 0.17395555555555556, "grad_norm": 375.89666748046875, "learning_rate": 3e-06, "loss": -12.1653, "reward": 1.2291666865348816, "reward_std": 0.10206207633018494, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.5416666567325592, "step": 1957, "zero_std_ratio": 0.875 }, { "epoch": 0.17404444444444445, "grad_norm": 902.545654296875, "learning_rate": 3e-06, "loss": -25.8535, "step": 1958 }, { "epoch": 0.17413333333333333, "grad_norm": 629.4489135742188, "learning_rate": 3e-06, "loss": -22.1681, "step": 1959 }, { "epoch": 0.17422222222222222, "grad_norm": 359.41644287109375, "learning_rate": 3e-06, "loss": -19.6096, "step": 1960 }, { "epoch": 0.1743111111111111, "grad_norm": 356.7577819824219, "learning_rate": 3e-06, "loss": -12.0564, "step": 1961 }, { "epoch": 0.1744, "grad_norm": 236.30433654785156, "learning_rate": 3e-06, "loss": -25.8836, "step": 1962 }, { "epoch": 0.1744888888888889, "grad_norm": 340.1228332519531, "learning_rate": 3e-06, "loss": -14.5898, "step": 1963 }, { "epoch": 0.17457777777777778, "grad_norm": 186.7640838623047, "learning_rate": 3e-06, "loss": -29.7451, "step": 1964 }, { "epoch": 0.17466666666666666, "grad_norm": 411.6400451660156, "learning_rate": 3e-06, "loss": -20.199, "step": 1965 }, { "epoch": 0.17475555555555555, "grad_norm": 319.7075500488281, "learning_rate": 3e-06, "loss": -22.8842, "step": 1966 }, { "epoch": 0.17484444444444444, "grad_norm": 417.3407287597656, "learning_rate": 3e-06, "loss": -16.5058, "step": 1967 }, { "epoch": 0.17493333333333333, "grad_norm": 243.57456970214844, "learning_rate": 3e-06, "loss": -31.0802, "step": 1968 }, { "completion_length": 249.93750762939453, "epoch": 0.17502222222222222, "grad_norm": 276.6730041503906, "learning_rate": 3e-06, "loss": -29.1006, "reward": 1.3958333730697632, "reward_std": 0.10206206887960434, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.7083333432674408, "step": 1969, "zero_std_ratio": 0.875 }, { "epoch": 0.1751111111111111, "grad_norm": 699.74658203125, "learning_rate": 3e-06, "loss": -14.0976, "step": 1970 }, { "epoch": 0.1752, "grad_norm": 471.5838623046875, "learning_rate": 3e-06, "loss": -20.9071, "step": 1971 }, { "epoch": 0.17528888888888888, "grad_norm": 472.9868469238281, "learning_rate": 3e-06, "loss": -25.4522, "step": 1972 }, { "epoch": 0.17537777777777777, "grad_norm": 254.69761657714844, "learning_rate": 3e-06, "loss": -29.7094, "step": 1973 }, { "epoch": 0.17546666666666666, "grad_norm": 248.12869262695312, "learning_rate": 3e-06, "loss": -28.2889, "step": 1974 }, { "epoch": 0.17555555555555555, "grad_norm": 236.1974639892578, "learning_rate": 3e-06, "loss": -32.1523, "step": 1975 }, { "epoch": 0.17564444444444444, "grad_norm": 648.0961303710938, "learning_rate": 3e-06, "loss": -16.4427, "step": 1976 }, { "epoch": 0.17573333333333332, "grad_norm": 378.50634765625, "learning_rate": 3e-06, "loss": -27.5418, "step": 1977 }, { "epoch": 0.1758222222222222, "grad_norm": 439.670654296875, "learning_rate": 3e-06, "loss": -30.8163, "step": 1978 }, { "epoch": 0.1759111111111111, "grad_norm": 216.43161010742188, "learning_rate": 3e-06, "loss": -34.2176, "step": 1979 }, { "epoch": 0.176, "grad_norm": 242.0641632080078, "learning_rate": 3e-06, "loss": -31.2334, "step": 1980 }, { "completion_length": 248.2916717529297, "epoch": 0.17608888888888888, "grad_norm": 431.9725646972656, "learning_rate": 3e-06, "loss": -90.337, "reward": 1.0208333730697632, "reward_std": 0.37592336535453796, "rewards/boxed_and_answer_tags_format_reward": 0.5625, "rewards/correctness_reward_func_math": 0.4583333432674408, "step": 1981, "zero_std_ratio": 0.625 }, { "epoch": 0.17617777777777777, "grad_norm": 421.96124267578125, "learning_rate": 3e-06, "loss": -74.4792, "step": 1982 }, { "epoch": 0.17626666666666665, "grad_norm": 376.8736877441406, "learning_rate": 3e-06, "loss": -66.9567, "step": 1983 }, { "epoch": 0.17635555555555554, "grad_norm": 489.1451110839844, "learning_rate": 3e-06, "loss": -61.4308, "step": 1984 }, { "epoch": 0.17644444444444443, "grad_norm": 597.60498046875, "learning_rate": 3e-06, "loss": -58.1928, "step": 1985 }, { "epoch": 0.17653333333333332, "grad_norm": 466.6503601074219, "learning_rate": 3e-06, "loss": -63.9999, "step": 1986 }, { "epoch": 0.17662222222222224, "grad_norm": 420.9534606933594, "learning_rate": 3e-06, "loss": -98.6249, "step": 1987 }, { "epoch": 0.17671111111111112, "grad_norm": 428.0660400390625, "learning_rate": 3e-06, "loss": -80.7811, "step": 1988 }, { "epoch": 0.1768, "grad_norm": 381.4080505371094, "learning_rate": 3e-06, "loss": -75.1067, "step": 1989 }, { "epoch": 0.1768888888888889, "grad_norm": 451.9751892089844, "learning_rate": 3e-06, "loss": -66.6592, "step": 1990 }, { "epoch": 0.1769777777777778, "grad_norm": 526.015625, "learning_rate": 3e-06, "loss": -69.5291, "step": 1991 }, { "epoch": 0.17706666666666668, "grad_norm": 532.5322265625, "learning_rate": 3e-06, "loss": -72.9259, "step": 1992 }, { "completion_length": 254.4166717529297, "epoch": 0.17715555555555557, "grad_norm": 836.5994873046875, "learning_rate": 3e-06, "loss": -56.0218, "reward": 1.0625000298023224, "reward_std": 0.6184598803520203, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.3749999925494194, "step": 1993, "zero_std_ratio": 0.375 }, { "epoch": 0.17724444444444445, "grad_norm": 828.1771240234375, "learning_rate": 3e-06, "loss": 7.2773, "step": 1994 }, { "epoch": 0.17733333333333334, "grad_norm": 832.0421752929688, "learning_rate": 3e-06, "loss": -26.279, "step": 1995 }, { "epoch": 0.17742222222222223, "grad_norm": 842.8486938476562, "learning_rate": 3e-06, "loss": -26.3621, "step": 1996 }, { "epoch": 0.17751111111111112, "grad_norm": 641.175048828125, "learning_rate": 3e-06, "loss": -35.8716, "step": 1997 }, { "epoch": 0.1776, "grad_norm": 621.6319580078125, "learning_rate": 3e-06, "loss": -46.1934, "step": 1998 }, { "epoch": 0.1776888888888889, "grad_norm": 908.200927734375, "learning_rate": 3e-06, "loss": -65.1173, "step": 1999 }, { "epoch": 0.17777777777777778, "grad_norm": 875.9900512695312, "learning_rate": 3e-06, "loss": 0.5691, "step": 2000 }, { "epoch": 0.17786666666666667, "grad_norm": 801.6747436523438, "learning_rate": 3e-06, "loss": -35.2798, "step": 2001 }, { "epoch": 0.17795555555555556, "grad_norm": 942.287353515625, "learning_rate": 3e-06, "loss": -34.8283, "step": 2002 }, { "epoch": 0.17804444444444445, "grad_norm": 616.319091796875, "learning_rate": 3e-06, "loss": -39.5028, "step": 2003 }, { "epoch": 0.17813333333333334, "grad_norm": 901.8914184570312, "learning_rate": 3e-06, "loss": -52.1208, "step": 2004 }, { "completion_length": 248.33333587646484, "epoch": 0.17822222222222223, "grad_norm": 706.6128540039062, "learning_rate": 3e-06, "loss": -29.2574, "reward": 1.5416666865348816, "reward_std": 0.6341222822666168, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 0.7916666865348816, "step": 2005, "zero_std_ratio": 0.375 }, { "epoch": 0.17831111111111111, "grad_norm": 781.2979736328125, "learning_rate": 3e-06, "loss": -8.4374, "step": 2006 }, { "epoch": 0.1784, "grad_norm": 799.8029174804688, "learning_rate": 3e-06, "loss": -62.7724, "step": 2007 }, { "epoch": 0.1784888888888889, "grad_norm": 783.2274780273438, "learning_rate": 3e-06, "loss": -32.8635, "step": 2008 }, { "epoch": 0.17857777777777778, "grad_norm": 657.0900268554688, "learning_rate": 3e-06, "loss": -6.8832, "step": 2009 }, { "epoch": 0.17866666666666667, "grad_norm": 736.504638671875, "learning_rate": 3e-06, "loss": -34.957, "step": 2010 }, { "epoch": 0.17875555555555556, "grad_norm": 723.53173828125, "learning_rate": 3e-06, "loss": -38.0641, "step": 2011 }, { "epoch": 0.17884444444444444, "grad_norm": 789.1529541015625, "learning_rate": 3e-06, "loss": -14.8114, "step": 2012 }, { "epoch": 0.17893333333333333, "grad_norm": 596.5152587890625, "learning_rate": 3e-06, "loss": -74.1531, "step": 2013 }, { "epoch": 0.17902222222222222, "grad_norm": 761.294189453125, "learning_rate": 3e-06, "loss": -44.1236, "step": 2014 }, { "epoch": 0.1791111111111111, "grad_norm": 669.9348754882812, "learning_rate": 3e-06, "loss": -13.3884, "step": 2015 }, { "epoch": 0.1792, "grad_norm": 624.5435180664062, "learning_rate": 3e-06, "loss": -43.6932, "step": 2016 }, { "completion_length": 248.64584350585938, "epoch": 0.1792888888888889, "grad_norm": 277.96478271484375, "learning_rate": 3e-06, "loss": 27.6711, "reward": 1.0625, "reward_std": 0.23116151988506317, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.375, "step": 2017, "zero_std_ratio": 0.75 }, { "epoch": 0.17937777777777777, "grad_norm": 346.2447814941406, "learning_rate": 3e-06, "loss": 23.3867, "step": 2018 }, { "epoch": 0.17946666666666666, "grad_norm": 250.9003448486328, "learning_rate": 3e-06, "loss": 24.0388, "step": 2019 }, { "epoch": 0.17955555555555555, "grad_norm": 308.0636291503906, "learning_rate": 3e-06, "loss": 22.8199, "step": 2020 }, { "epoch": 0.17964444444444444, "grad_norm": 356.0393371582031, "learning_rate": 3e-06, "loss": 17.5979, "step": 2021 }, { "epoch": 0.17973333333333333, "grad_norm": 350.8787841796875, "learning_rate": 3e-06, "loss": 21.2881, "step": 2022 }, { "epoch": 0.17982222222222222, "grad_norm": 290.56292724609375, "learning_rate": 3e-06, "loss": 24.3076, "step": 2023 }, { "epoch": 0.1799111111111111, "grad_norm": 291.3890075683594, "learning_rate": 3e-06, "loss": 19.0537, "step": 2024 }, { "epoch": 0.18, "grad_norm": 249.847412109375, "learning_rate": 3e-06, "loss": 20.9012, "step": 2025 }, { "epoch": 0.18008888888888888, "grad_norm": 343.4595031738281, "learning_rate": 3e-06, "loss": 17.5784, "step": 2026 }, { "epoch": 0.18017777777777777, "grad_norm": 551.070068359375, "learning_rate": 3e-06, "loss": 11.5671, "step": 2027 }, { "epoch": 0.18026666666666666, "grad_norm": 353.6826171875, "learning_rate": 3e-06, "loss": 13.8267, "step": 2028 }, { "completion_length": 252.8541717529297, "epoch": 0.18035555555555555, "grad_norm": 691.8173828125, "learning_rate": 3e-06, "loss": 19.1562, "reward": 1.3958333730697632, "reward_std": 0.505022794008255, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.7083333134651184, "step": 2029, "zero_std_ratio": 0.5 }, { "epoch": 0.18044444444444444, "grad_norm": 769.9490356445312, "learning_rate": 3e-06, "loss": -6.2182, "step": 2030 }, { "epoch": 0.18053333333333332, "grad_norm": 746.6094360351562, "learning_rate": 3e-06, "loss": 26.2743, "step": 2031 }, { "epoch": 0.1806222222222222, "grad_norm": 538.3868408203125, "learning_rate": 3e-06, "loss": 41.1944, "step": 2032 }, { "epoch": 0.1807111111111111, "grad_norm": 1304.790771484375, "learning_rate": 3e-06, "loss": -12.3747, "step": 2033 }, { "epoch": 0.1808, "grad_norm": 551.8775634765625, "learning_rate": 3e-06, "loss": 18.321, "step": 2034 }, { "epoch": 0.18088888888888888, "grad_norm": 639.1527709960938, "learning_rate": 3e-06, "loss": 9.5878, "step": 2035 }, { "epoch": 0.18097777777777777, "grad_norm": 659.2324829101562, "learning_rate": 3e-06, "loss": -12.4579, "step": 2036 }, { "epoch": 0.18106666666666665, "grad_norm": 709.0194702148438, "learning_rate": 3e-06, "loss": 14.4976, "step": 2037 }, { "epoch": 0.18115555555555554, "grad_norm": 522.2162475585938, "learning_rate": 3e-06, "loss": 31.9036, "step": 2038 }, { "epoch": 0.18124444444444446, "grad_norm": 605.5569458007812, "learning_rate": 3e-06, "loss": -16.3849, "step": 2039 }, { "epoch": 0.18133333333333335, "grad_norm": 586.4031982421875, "learning_rate": 3e-06, "loss": 7.3009, "step": 2040 }, { "completion_length": 255.25, "epoch": 0.18142222222222223, "grad_norm": 294.46368408203125, "learning_rate": 3e-06, "loss": -46.1645, "reward": 1.1666666865348816, "reward_std": 0.26603010296821594, "rewards/boxed_and_answer_tags_format_reward": 0.625, "rewards/correctness_reward_func_math": 0.5416666567325592, "step": 2041, "zero_std_ratio": 0.75 }, { "epoch": 0.18151111111111112, "grad_norm": 281.5460510253906, "learning_rate": 3e-06, "loss": -45.9692, "step": 2042 }, { "epoch": 0.1816, "grad_norm": 426.70758056640625, "learning_rate": 3e-06, "loss": -59.0595, "step": 2043 }, { "epoch": 0.1816888888888889, "grad_norm": 379.1589660644531, "learning_rate": 3e-06, "loss": -48.9399, "step": 2044 }, { "epoch": 0.1817777777777778, "grad_norm": 372.3189697265625, "learning_rate": 3e-06, "loss": -59.5334, "step": 2045 }, { "epoch": 0.18186666666666668, "grad_norm": 316.384765625, "learning_rate": 3e-06, "loss": -42.7701, "step": 2046 }, { "epoch": 0.18195555555555556, "grad_norm": 333.133056640625, "learning_rate": 3e-06, "loss": -52.4876, "step": 2047 }, { "epoch": 0.18204444444444445, "grad_norm": 302.69488525390625, "learning_rate": 3e-06, "loss": -50.558, "step": 2048 }, { "epoch": 0.18213333333333334, "grad_norm": 297.52264404296875, "learning_rate": 3e-06, "loss": -65.7042, "step": 2049 }, { "epoch": 0.18222222222222223, "grad_norm": 390.93719482421875, "learning_rate": 3e-06, "loss": -56.6311, "step": 2050 }, { "epoch": 0.18231111111111112, "grad_norm": 510.16064453125, "learning_rate": 3e-06, "loss": -66.4786, "step": 2051 }, { "epoch": 0.1824, "grad_norm": 386.4976501464844, "learning_rate": 3e-06, "loss": -55.7261, "step": 2052 }, { "completion_length": 231.77083587646484, "epoch": 0.1824888888888889, "grad_norm": 214.66683959960938, "learning_rate": 3e-06, "loss": -13.3666, "reward": 1.0208333730697632, "reward_std": 0.10206207633018494, "rewards/boxed_and_answer_tags_format_reward": 0.5625, "rewards/correctness_reward_func_math": 0.4583333283662796, "step": 2053, "zero_std_ratio": 0.875 }, { "epoch": 0.18257777777777778, "grad_norm": 241.9335174560547, "learning_rate": 3e-06, "loss": -16.4601, "step": 2054 }, { "epoch": 0.18266666666666667, "grad_norm": 227.05535888671875, "learning_rate": 3e-06, "loss": -15.4358, "step": 2055 }, { "epoch": 0.18275555555555556, "grad_norm": 280.6861267089844, "learning_rate": 3e-06, "loss": -19.7707, "step": 2056 }, { "epoch": 0.18284444444444445, "grad_norm": 211.3414306640625, "learning_rate": 3e-06, "loss": -11.4767, "step": 2057 }, { "epoch": 0.18293333333333334, "grad_norm": 332.11248779296875, "learning_rate": 3e-06, "loss": -11.748, "step": 2058 }, { "epoch": 0.18302222222222223, "grad_norm": 210.33470153808594, "learning_rate": 3e-06, "loss": -16.0377, "step": 2059 }, { "epoch": 0.1831111111111111, "grad_norm": 230.14593505859375, "learning_rate": 3e-06, "loss": -19.6348, "step": 2060 }, { "epoch": 0.1832, "grad_norm": 215.13331604003906, "learning_rate": 3e-06, "loss": -19.3665, "step": 2061 }, { "epoch": 0.1832888888888889, "grad_norm": 401.6134338378906, "learning_rate": 3e-06, "loss": -22.9875, "step": 2062 }, { "epoch": 0.18337777777777778, "grad_norm": 223.34193420410156, "learning_rate": 3e-06, "loss": -16.2301, "step": 2063 }, { "epoch": 0.18346666666666667, "grad_norm": 241.48159790039062, "learning_rate": 3e-06, "loss": -17.0173, "step": 2064 }, { "completion_length": 250.12500762939453, "epoch": 0.18355555555555556, "grad_norm": 783.0624389648438, "learning_rate": 3e-06, "loss": -7.0804, "reward": 1.2500000596046448, "reward_std": 0.3332235962152481, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 0.5000000111758709, "step": 2065, "zero_std_ratio": 0.625 }, { "epoch": 0.18364444444444444, "grad_norm": 456.5368957519531, "learning_rate": 3e-06, "loss": -5.7073, "step": 2066 }, { "epoch": 0.18373333333333333, "grad_norm": 801.0426635742188, "learning_rate": 3e-06, "loss": 24.9016, "step": 2067 }, { "epoch": 0.18382222222222222, "grad_norm": 452.8072204589844, "learning_rate": 3e-06, "loss": -16.3536, "step": 2068 }, { "epoch": 0.1839111111111111, "grad_norm": 573.7277221679688, "learning_rate": 3e-06, "loss": -14.2969, "step": 2069 }, { "epoch": 0.184, "grad_norm": 490.78375244140625, "learning_rate": 3e-06, "loss": -29.7209, "step": 2070 }, { "epoch": 0.18408888888888889, "grad_norm": 967.42578125, "learning_rate": 3e-06, "loss": -11.7546, "step": 2071 }, { "epoch": 0.18417777777777777, "grad_norm": 446.62945556640625, "learning_rate": 3e-06, "loss": -11.8012, "step": 2072 }, { "epoch": 0.18426666666666666, "grad_norm": 586.0079345703125, "learning_rate": 3e-06, "loss": 16.1318, "step": 2073 }, { "epoch": 0.18435555555555555, "grad_norm": 483.7483825683594, "learning_rate": 3e-06, "loss": -22.5055, "step": 2074 }, { "epoch": 0.18444444444444444, "grad_norm": 552.7943725585938, "learning_rate": 3e-06, "loss": -23.8956, "step": 2075 }, { "epoch": 0.18453333333333333, "grad_norm": 516.2462768554688, "learning_rate": 3e-06, "loss": -36.5617, "step": 2076 }, { "completion_length": 245.45833587646484, "epoch": 0.18462222222222222, "grad_norm": 526.9989013671875, "learning_rate": 3e-06, "loss": -1.3188, "reward": 1.8541667461395264, "reward_std": 0.3872983753681183, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 1.1666666269302368, "step": 2077, "zero_std_ratio": 0.625 }, { "epoch": 0.1847111111111111, "grad_norm": 695.4895629882812, "learning_rate": 3e-06, "loss": -7.7195, "step": 2078 }, { "epoch": 0.1848, "grad_norm": 732.2682495117188, "learning_rate": 3e-06, "loss": -3.051, "step": 2079 }, { "epoch": 0.18488888888888888, "grad_norm": 717.4251098632812, "learning_rate": 3e-06, "loss": 6.4156, "step": 2080 }, { "epoch": 0.18497777777777777, "grad_norm": 588.271484375, "learning_rate": 3e-06, "loss": -1.8704, "step": 2081 }, { "epoch": 0.18506666666666666, "grad_norm": 563.7857055664062, "learning_rate": 3e-06, "loss": 5.8266, "step": 2082 }, { "epoch": 0.18515555555555555, "grad_norm": 523.8809814453125, "learning_rate": 3e-06, "loss": -6.2269, "step": 2083 }, { "epoch": 0.18524444444444443, "grad_norm": 566.2587890625, "learning_rate": 3e-06, "loss": -11.1423, "step": 2084 }, { "epoch": 0.18533333333333332, "grad_norm": 575.639892578125, "learning_rate": 3e-06, "loss": -8.9986, "step": 2085 }, { "epoch": 0.1854222222222222, "grad_norm": 607.2640380859375, "learning_rate": 3e-06, "loss": 0.5698, "step": 2086 }, { "epoch": 0.1855111111111111, "grad_norm": 685.4660034179688, "learning_rate": 3e-06, "loss": -13.8418, "step": 2087 }, { "epoch": 0.1856, "grad_norm": 1059.2657470703125, "learning_rate": 3e-06, "loss": -3.093, "step": 2088 }, { "completion_length": 243.375, "epoch": 0.18568888888888888, "grad_norm": 533.3504638671875, "learning_rate": 3e-06, "loss": -2.4458, "reward": 1.7916667461395264, "reward_std": 0.43528565764427185, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 1.0416666567325592, "step": 2089, "zero_std_ratio": 0.5 }, { "epoch": 0.18577777777777776, "grad_norm": 466.76678466796875, "learning_rate": 3e-06, "loss": -25.5447, "step": 2090 }, { "epoch": 0.18586666666666668, "grad_norm": 482.4854736328125, "learning_rate": 3e-06, "loss": -11.9751, "step": 2091 }, { "epoch": 0.18595555555555557, "grad_norm": 617.836669921875, "learning_rate": 3e-06, "loss": -16.149, "step": 2092 }, { "epoch": 0.18604444444444446, "grad_norm": 806.7719116210938, "learning_rate": 3e-06, "loss": -12.1531, "step": 2093 }, { "epoch": 0.18613333333333335, "grad_norm": 436.9642333984375, "learning_rate": 3e-06, "loss": -20.6401, "step": 2094 }, { "epoch": 0.18622222222222223, "grad_norm": 533.0576171875, "learning_rate": 3e-06, "loss": -6.39, "step": 2095 }, { "epoch": 0.18631111111111112, "grad_norm": 578.9844360351562, "learning_rate": 3e-06, "loss": -28.3945, "step": 2096 }, { "epoch": 0.1864, "grad_norm": 518.856201171875, "learning_rate": 3e-06, "loss": -18.442, "step": 2097 }, { "epoch": 0.1864888888888889, "grad_norm": 681.1263427734375, "learning_rate": 3e-06, "loss": -22.9392, "step": 2098 }, { "epoch": 0.1865777777777778, "grad_norm": 759.8504028320312, "learning_rate": 3e-06, "loss": -19.4906, "step": 2099 }, { "epoch": 0.18666666666666668, "grad_norm": 497.5691833496094, "learning_rate": 3e-06, "loss": -25.688, "step": 2100 }, { "completion_length": 247.7291717529297, "epoch": 0.18675555555555556, "grad_norm": 766.0548706054688, "learning_rate": 3e-06, "loss": 9.0542, "reward": 1.145833358168602, "reward_std": 0.20412415266036987, "rewards/boxed_and_answer_tags_format_reward": 0.5625, "rewards/correctness_reward_func_math": 0.5833333544433117, "step": 2101, "zero_std_ratio": 0.75 }, { "epoch": 0.18684444444444445, "grad_norm": 274.5552062988281, "learning_rate": 3e-06, "loss": -14.4338, "step": 2102 }, { "epoch": 0.18693333333333334, "grad_norm": 357.243896484375, "learning_rate": 3e-06, "loss": -3.3275, "step": 2103 }, { "epoch": 0.18702222222222223, "grad_norm": 349.6654968261719, "learning_rate": 3e-06, "loss": -3.714, "step": 2104 }, { "epoch": 0.18711111111111112, "grad_norm": 252.651611328125, "learning_rate": 3e-06, "loss": 4.8234, "step": 2105 }, { "epoch": 0.1872, "grad_norm": 422.1712951660156, "learning_rate": 3e-06, "loss": 2.0141, "step": 2106 }, { "epoch": 0.1872888888888889, "grad_norm": 848.9307250976562, "learning_rate": 3e-06, "loss": 1.1462, "step": 2107 }, { "epoch": 0.18737777777777778, "grad_norm": 255.5582275390625, "learning_rate": 3e-06, "loss": -17.6895, "step": 2108 }, { "epoch": 0.18746666666666667, "grad_norm": 307.6992492675781, "learning_rate": 3e-06, "loss": -9.5895, "step": 2109 }, { "epoch": 0.18755555555555556, "grad_norm": 277.8653259277344, "learning_rate": 3e-06, "loss": -9.602, "step": 2110 }, { "epoch": 0.18764444444444445, "grad_norm": 234.34913635253906, "learning_rate": 3e-06, "loss": 0.8899, "step": 2111 }, { "epoch": 0.18773333333333334, "grad_norm": 355.8470764160156, "learning_rate": 3e-06, "loss": -4.6677, "step": 2112 }, { "completion_length": 250.89583587646484, "epoch": 0.18782222222222222, "grad_norm": 810.5846557617188, "learning_rate": 3e-06, "loss": 15.7377, "reward": 1.125, "reward_std": 0.23116151988506317, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 0.375, "step": 2113, "zero_std_ratio": 0.75 }, { "epoch": 0.1879111111111111, "grad_norm": 509.224853515625, "learning_rate": 3e-06, "loss": 18.5617, "step": 2114 }, { "epoch": 0.188, "grad_norm": 530.5845336914062, "learning_rate": 3e-06, "loss": 7.082, "step": 2115 }, { "epoch": 0.1880888888888889, "grad_norm": 672.1024169921875, "learning_rate": 3e-06, "loss": 1.3842, "step": 2116 }, { "epoch": 0.18817777777777778, "grad_norm": 408.4438171386719, "learning_rate": 3e-06, "loss": 16.7967, "step": 2117 }, { "epoch": 0.18826666666666667, "grad_norm": 569.5597534179688, "learning_rate": 3e-06, "loss": 15.6526, "step": 2118 }, { "epoch": 0.18835555555555555, "grad_norm": 613.7771606445312, "learning_rate": 3e-06, "loss": 11.1284, "step": 2119 }, { "epoch": 0.18844444444444444, "grad_norm": 443.5073547363281, "learning_rate": 3e-06, "loss": 13.1326, "step": 2120 }, { "epoch": 0.18853333333333333, "grad_norm": 745.5543823242188, "learning_rate": 3e-06, "loss": 3.177, "step": 2121 }, { "epoch": 0.18862222222222222, "grad_norm": 776.3263549804688, "learning_rate": 3e-06, "loss": -4.0693, "step": 2122 }, { "epoch": 0.1887111111111111, "grad_norm": 439.9760437011719, "learning_rate": 3e-06, "loss": 14.0792, "step": 2123 }, { "epoch": 0.1888, "grad_norm": 620.5515747070312, "learning_rate": 3e-06, "loss": 10.3075, "step": 2124 }, { "completion_length": 255.89583587646484, "epoch": 0.18888888888888888, "grad_norm": 1261.142578125, "learning_rate": 3e-06, "loss": -52.8242, "reward": 1.5416666865348816, "reward_std": 0.7283531129360199, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 0.7916666567325592, "step": 2125, "zero_std_ratio": 0.25 }, { "epoch": 0.18897777777777777, "grad_norm": 828.8499755859375, "learning_rate": 3e-06, "loss": -18.1417, "step": 2126 }, { "epoch": 0.18906666666666666, "grad_norm": 921.6665649414062, "learning_rate": 3e-06, "loss": -28.653, "step": 2127 }, { "epoch": 0.18915555555555555, "grad_norm": 1188.288330078125, "learning_rate": 3e-06, "loss": -14.6245, "step": 2128 }, { "epoch": 0.18924444444444444, "grad_norm": 875.7568969726562, "learning_rate": 3e-06, "loss": -19.076, "step": 2129 }, { "epoch": 0.18933333333333333, "grad_norm": 935.9678955078125, "learning_rate": 3e-06, "loss": -30.5958, "step": 2130 }, { "epoch": 0.18942222222222221, "grad_norm": 933.6688842773438, "learning_rate": 3e-06, "loss": -56.0788, "step": 2131 }, { "epoch": 0.1895111111111111, "grad_norm": 1004.11572265625, "learning_rate": 3e-06, "loss": -27.5338, "step": 2132 }, { "epoch": 0.1896, "grad_norm": 805.9441528320312, "learning_rate": 3e-06, "loss": -38.4037, "step": 2133 }, { "epoch": 0.18968888888888888, "grad_norm": 1125.8046875, "learning_rate": 3e-06, "loss": -21.7139, "step": 2134 }, { "epoch": 0.18977777777777777, "grad_norm": 892.211181640625, "learning_rate": 3e-06, "loss": -29.3145, "step": 2135 }, { "epoch": 0.18986666666666666, "grad_norm": 895.8474731445312, "learning_rate": 3e-06, "loss": -38.8515, "step": 2136 }, { "completion_length": 241.06250762939453, "epoch": 0.18995555555555554, "grad_norm": 587.2543334960938, "learning_rate": 3e-06, "loss": -30.24, "reward": 1.7083333730697632, "reward_std": 0.5128540322184563, "rewards/boxed_and_answer_tags_format_reward": 0.625, "rewards/correctness_reward_func_math": 1.0833333134651184, "step": 2137, "zero_std_ratio": 0.5 }, { "epoch": 0.19004444444444443, "grad_norm": 483.7323913574219, "learning_rate": 3e-06, "loss": -54.7314, "step": 2138 }, { "epoch": 0.19013333333333332, "grad_norm": 695.5252075195312, "learning_rate": 3e-06, "loss": -37.2405, "step": 2139 }, { "epoch": 0.1902222222222222, "grad_norm": 659.273681640625, "learning_rate": 3e-06, "loss": -54.6989, "step": 2140 }, { "epoch": 0.1903111111111111, "grad_norm": 665.203857421875, "learning_rate": 3e-06, "loss": -49.4977, "step": 2141 }, { "epoch": 0.1904, "grad_norm": 570.3987426757812, "learning_rate": 3e-06, "loss": -29.7209, "step": 2142 }, { "epoch": 0.1904888888888889, "grad_norm": 543.2655029296875, "learning_rate": 3e-06, "loss": -35.8428, "step": 2143 }, { "epoch": 0.1905777777777778, "grad_norm": 810.827880859375, "learning_rate": 3e-06, "loss": -59.7728, "step": 2144 }, { "epoch": 0.19066666666666668, "grad_norm": 702.2298583984375, "learning_rate": 3e-06, "loss": -49.0496, "step": 2145 }, { "epoch": 0.19075555555555557, "grad_norm": 575.1386108398438, "learning_rate": 3e-06, "loss": -64.258, "step": 2146 }, { "epoch": 0.19084444444444446, "grad_norm": 646.1061401367188, "learning_rate": 3e-06, "loss": -60.6597, "step": 2147 }, { "epoch": 0.19093333333333334, "grad_norm": 583.6048583984375, "learning_rate": 3e-06, "loss": -42.0407, "step": 2148 }, { "completion_length": 230.43750762939453, "epoch": 0.19102222222222223, "grad_norm": 553.2525024414062, "learning_rate": 3e-06, "loss": 31.5824, "reward": 1.2083333730697632, "reward_std": 0.46232303977012634, "rewards/boxed_and_answer_tags_format_reward": 0.625, "rewards/correctness_reward_func_math": 0.5833333283662796, "step": 2149, "zero_std_ratio": 0.5 }, { "epoch": 0.19111111111111112, "grad_norm": 672.8676147460938, "learning_rate": 3e-06, "loss": 26.2286, "step": 2150 }, { "epoch": 0.1912, "grad_norm": 594.0144653320312, "learning_rate": 3e-06, "loss": -2.6153, "step": 2151 }, { "epoch": 0.1912888888888889, "grad_norm": 828.8351440429688, "learning_rate": 3e-06, "loss": 2.5627, "step": 2152 }, { "epoch": 0.19137777777777779, "grad_norm": 601.149658203125, "learning_rate": 3e-06, "loss": -1.2538, "step": 2153 }, { "epoch": 0.19146666666666667, "grad_norm": 685.7401123046875, "learning_rate": 3e-06, "loss": 8.7495, "step": 2154 }, { "epoch": 0.19155555555555556, "grad_norm": 561.976318359375, "learning_rate": 3e-06, "loss": 22.0952, "step": 2155 }, { "epoch": 0.19164444444444445, "grad_norm": 583.328369140625, "learning_rate": 3e-06, "loss": 15.3746, "step": 2156 }, { "epoch": 0.19173333333333334, "grad_norm": 495.55609130859375, "learning_rate": 3e-06, "loss": -9.5814, "step": 2157 }, { "epoch": 0.19182222222222223, "grad_norm": 764.2197265625, "learning_rate": 3e-06, "loss": -2.7963, "step": 2158 }, { "epoch": 0.19191111111111112, "grad_norm": 2350.23779296875, "learning_rate": 3e-06, "loss": -8.0366, "step": 2159 }, { "epoch": 0.192, "grad_norm": 904.4937744140625, "learning_rate": 3e-06, "loss": -6.0055, "step": 2160 }, { "completion_length": 247.93750762939453, "epoch": 0.1920888888888889, "grad_norm": 899.4290771484375, "learning_rate": 3e-06, "loss": 51.9844, "reward": 1.5625000596046448, "reward_std": 0.5050228163599968, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.8749999701976776, "step": 2161, "zero_std_ratio": 0.5 }, { "epoch": 0.19217777777777778, "grad_norm": 936.0744018554688, "learning_rate": 3e-06, "loss": 24.5592, "step": 2162 }, { "epoch": 0.19226666666666667, "grad_norm": 865.0958251953125, "learning_rate": 3e-06, "loss": 4.2507, "step": 2163 }, { "epoch": 0.19235555555555556, "grad_norm": 819.485107421875, "learning_rate": 3e-06, "loss": -17.4293, "step": 2164 }, { "epoch": 0.19244444444444445, "grad_norm": 823.7230224609375, "learning_rate": 3e-06, "loss": -19.1261, "step": 2165 }, { "epoch": 0.19253333333333333, "grad_norm": 1002.2861328125, "learning_rate": 3e-06, "loss": -19.0886, "step": 2166 }, { "epoch": 0.19262222222222222, "grad_norm": 926.2667236328125, "learning_rate": 3e-06, "loss": 38.8715, "step": 2167 }, { "epoch": 0.1927111111111111, "grad_norm": 890.3988037109375, "learning_rate": 3e-06, "loss": 16.1366, "step": 2168 }, { "epoch": 0.1928, "grad_norm": 863.7974853515625, "learning_rate": 3e-06, "loss": -6.0243, "step": 2169 }, { "epoch": 0.1928888888888889, "grad_norm": 675.4772338867188, "learning_rate": 3e-06, "loss": -29.941, "step": 2170 }, { "epoch": 0.19297777777777778, "grad_norm": 795.9426879882812, "learning_rate": 3e-06, "loss": -37.6626, "step": 2171 }, { "epoch": 0.19306666666666666, "grad_norm": 1173.4658203125, "learning_rate": 3e-06, "loss": -42.7799, "step": 2172 }, { "completion_length": 255.27083587646484, "epoch": 0.19315555555555555, "grad_norm": 1052.441650390625, "learning_rate": 3e-06, "loss": 17.4183, "reward": 1.75, "reward_std": 0.720521941781044, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 1.0, "step": 2173, "zero_std_ratio": 0.25 }, { "epoch": 0.19324444444444444, "grad_norm": 1221.40673828125, "learning_rate": 3e-06, "loss": 10.0883, "step": 2174 }, { "epoch": 0.19333333333333333, "grad_norm": 733.4461059570312, "learning_rate": 3e-06, "loss": 7.5639, "step": 2175 }, { "epoch": 0.19342222222222222, "grad_norm": 981.289794921875, "learning_rate": 3e-06, "loss": 32.1803, "step": 2176 }, { "epoch": 0.1935111111111111, "grad_norm": 1169.4273681640625, "learning_rate": 3e-06, "loss": 38.9057, "step": 2177 }, { "epoch": 0.1936, "grad_norm": 1102.001220703125, "learning_rate": 3e-06, "loss": 11.3484, "step": 2178 }, { "epoch": 0.19368888888888888, "grad_norm": 992.0403442382812, "learning_rate": 3e-06, "loss": 17.4565, "step": 2179 }, { "epoch": 0.19377777777777777, "grad_norm": 1574.7171630859375, "learning_rate": 3e-06, "loss": -4.7767, "step": 2180 }, { "epoch": 0.19386666666666666, "grad_norm": 713.719482421875, "learning_rate": 3e-06, "loss": -2.3293, "step": 2181 }, { "epoch": 0.19395555555555555, "grad_norm": 999.0922241210938, "learning_rate": 3e-06, "loss": 20.7797, "step": 2182 }, { "epoch": 0.19404444444444444, "grad_norm": 1164.4508056640625, "learning_rate": 3e-06, "loss": 22.3378, "step": 2183 }, { "epoch": 0.19413333333333332, "grad_norm": 1133.1566162109375, "learning_rate": 3e-06, "loss": -3.4018, "step": 2184 }, { "completion_length": 246.64584350585938, "epoch": 0.1942222222222222, "grad_norm": 668.209716796875, "learning_rate": 3e-06, "loss": -9.4869, "reward": 1.7083333730697632, "reward_std": 0.3061862215399742, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 0.9583333432674408, "step": 2185, "zero_std_ratio": 0.625 }, { "epoch": 0.1943111111111111, "grad_norm": 433.6723937988281, "learning_rate": 3e-06, "loss": -11.4995, "step": 2186 }, { "epoch": 0.1944, "grad_norm": 682.7325439453125, "learning_rate": 3e-06, "loss": -28.6521, "step": 2187 }, { "epoch": 0.19448888888888888, "grad_norm": 636.8472900390625, "learning_rate": 3e-06, "loss": -13.4065, "step": 2188 }, { "epoch": 0.19457777777777777, "grad_norm": 447.03173828125, "learning_rate": 3e-06, "loss": -18.6822, "step": 2189 }, { "epoch": 0.19466666666666665, "grad_norm": 855.8515014648438, "learning_rate": 3e-06, "loss": -18.9689, "step": 2190 }, { "epoch": 0.19475555555555554, "grad_norm": 1044.5152587890625, "learning_rate": 3e-06, "loss": -20.745, "step": 2191 }, { "epoch": 0.19484444444444443, "grad_norm": 422.6670837402344, "learning_rate": 3e-06, "loss": -16.4546, "step": 2192 }, { "epoch": 0.19493333333333332, "grad_norm": 698.4715576171875, "learning_rate": 3e-06, "loss": -37.7735, "step": 2193 }, { "epoch": 0.19502222222222224, "grad_norm": 689.1241455078125, "learning_rate": 3e-06, "loss": -24.1425, "step": 2194 }, { "epoch": 0.19511111111111112, "grad_norm": 521.723876953125, "learning_rate": 3e-06, "loss": -25.6598, "step": 2195 }, { "epoch": 0.1952, "grad_norm": 641.6820678710938, "learning_rate": 3e-06, "loss": -27.7095, "step": 2196 }, { "completion_length": 250.18750762939453, "epoch": 0.1952888888888889, "grad_norm": 695.4179077148438, "learning_rate": 3e-06, "loss": -23.0851, "reward": 1.5625000596046448, "reward_std": 0.23116153478622437, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.8750000149011612, "step": 2197, "zero_std_ratio": 0.75 }, { "epoch": 0.1953777777777778, "grad_norm": 606.8045654296875, "learning_rate": 3e-06, "loss": 1.2567, "step": 2198 }, { "epoch": 0.19546666666666668, "grad_norm": 742.4933471679688, "learning_rate": 3e-06, "loss": 1.28, "step": 2199 }, { "epoch": 0.19555555555555557, "grad_norm": 555.1710205078125, "learning_rate": 3e-06, "loss": -2.6916, "step": 2200 }, { "epoch": 0.19564444444444445, "grad_norm": 658.5838012695312, "learning_rate": 3e-06, "loss": -3.5826, "step": 2201 }, { "epoch": 0.19573333333333334, "grad_norm": 481.04693603515625, "learning_rate": 3e-06, "loss": -9.7868, "step": 2202 }, { "epoch": 0.19582222222222223, "grad_norm": 583.1636352539062, "learning_rate": 3e-06, "loss": -26.7921, "step": 2203 }, { "epoch": 0.19591111111111112, "grad_norm": 582.9187622070312, "learning_rate": 3e-06, "loss": -1.0781, "step": 2204 }, { "epoch": 0.196, "grad_norm": 660.7078247070312, "learning_rate": 3e-06, "loss": -2.465, "step": 2205 }, { "epoch": 0.1960888888888889, "grad_norm": 522.7738647460938, "learning_rate": 3e-06, "loss": -3.8803, "step": 2206 }, { "epoch": 0.19617777777777778, "grad_norm": 1109.6839599609375, "learning_rate": 3e-06, "loss": -13.524, "step": 2207 }, { "epoch": 0.19626666666666667, "grad_norm": 588.8282470703125, "learning_rate": 3e-06, "loss": -12.7292, "step": 2208 }, { "completion_length": 244.9791717529297, "epoch": 0.19635555555555556, "grad_norm": 403.5253601074219, "learning_rate": 3e-06, "loss": -24.4641, "reward": 1.4895833730697632, "reward_std": 0.20219219475984573, "rewards/boxed_and_answer_tags_format_reward": 0.7395833432674408, "rewards/correctness_reward_func_math": 0.75, "step": 2209, "zero_std_ratio": 0.75 }, { "epoch": 0.19644444444444445, "grad_norm": 545.7481079101562, "learning_rate": 3e-06, "loss": -28.5028, "step": 2210 }, { "epoch": 0.19653333333333334, "grad_norm": 737.4202270507812, "learning_rate": 3e-06, "loss": -21.5566, "step": 2211 }, { "epoch": 0.19662222222222223, "grad_norm": 415.2235412597656, "learning_rate": 3e-06, "loss": -19.7415, "step": 2212 }, { "epoch": 0.19671111111111111, "grad_norm": 661.0859375, "learning_rate": 3e-06, "loss": -27.5604, "step": 2213 }, { "epoch": 0.1968, "grad_norm": 688.0073852539062, "learning_rate": 3e-06, "loss": -25.3044, "step": 2214 }, { "epoch": 0.1968888888888889, "grad_norm": 377.343505859375, "learning_rate": 3e-06, "loss": -30.7918, "step": 2215 }, { "epoch": 0.19697777777777778, "grad_norm": 535.9216918945312, "learning_rate": 3e-06, "loss": -28.4559, "step": 2216 }, { "epoch": 0.19706666666666667, "grad_norm": 631.3789672851562, "learning_rate": 3e-06, "loss": -28.4084, "step": 2217 }, { "epoch": 0.19715555555555556, "grad_norm": 467.3281555175781, "learning_rate": 3e-06, "loss": -24.7054, "step": 2218 }, { "epoch": 0.19724444444444444, "grad_norm": 673.6080932617188, "learning_rate": 3e-06, "loss": -30.1266, "step": 2219 }, { "epoch": 0.19733333333333333, "grad_norm": 650.60009765625, "learning_rate": 3e-06, "loss": -30.6526, "step": 2220 }, { "completion_length": 251.27083587646484, "epoch": 0.19742222222222222, "grad_norm": 937.9761962890625, "learning_rate": 3e-06, "loss": -1.5455, "reward": 1.9166667461395264, "reward_std": 0.4779854342341423, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 1.1666666567325592, "step": 2221, "zero_std_ratio": 0.5 }, { "epoch": 0.1975111111111111, "grad_norm": 618.4127807617188, "learning_rate": 3e-06, "loss": 8.2676, "step": 2222 }, { "epoch": 0.1976, "grad_norm": 681.9295654296875, "learning_rate": 3e-06, "loss": 5.7096, "step": 2223 }, { "epoch": 0.1976888888888889, "grad_norm": 1006.9591064453125, "learning_rate": 3e-06, "loss": 52.6603, "step": 2224 }, { "epoch": 0.19777777777777777, "grad_norm": 786.645263671875, "learning_rate": 3e-06, "loss": 7.371, "step": 2225 }, { "epoch": 0.19786666666666666, "grad_norm": 779.1517333984375, "learning_rate": 3e-06, "loss": 15.4347, "step": 2226 }, { "epoch": 0.19795555555555555, "grad_norm": 1029.3656005859375, "learning_rate": 3e-06, "loss": -12.7798, "step": 2227 }, { "epoch": 0.19804444444444444, "grad_norm": 1035.653076171875, "learning_rate": 3e-06, "loss": -0.4735, "step": 2228 }, { "epoch": 0.19813333333333333, "grad_norm": 829.162841796875, "learning_rate": 3e-06, "loss": -2.8614, "step": 2229 }, { "epoch": 0.19822222222222222, "grad_norm": 873.4863891601562, "learning_rate": 3e-06, "loss": 30.9012, "step": 2230 }, { "epoch": 0.1983111111111111, "grad_norm": 649.9591674804688, "learning_rate": 3e-06, "loss": -6.5108, "step": 2231 }, { "epoch": 0.1984, "grad_norm": 805.36328125, "learning_rate": 3e-06, "loss": 2.5386, "step": 2232 }, { "completion_length": 246.1041717529297, "epoch": 0.19848888888888888, "grad_norm": 895.270263671875, "learning_rate": 3e-06, "loss": -57.1749, "reward": 1.2500000596046448, "reward_std": 0.4779854342341423, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 0.5000000111758709, "step": 2233, "zero_std_ratio": 0.5 }, { "epoch": 0.19857777777777777, "grad_norm": 765.6522827148438, "learning_rate": 3e-06, "loss": -48.4423, "step": 2234 }, { "epoch": 0.19866666666666666, "grad_norm": 755.2459716796875, "learning_rate": 3e-06, "loss": -29.8722, "step": 2235 }, { "epoch": 0.19875555555555555, "grad_norm": 685.5021362304688, "learning_rate": 3e-06, "loss": -19.4638, "step": 2236 }, { "epoch": 0.19884444444444443, "grad_norm": 832.52880859375, "learning_rate": 3e-06, "loss": -41.564, "step": 2237 }, { "epoch": 0.19893333333333332, "grad_norm": 681.293701171875, "learning_rate": 3e-06, "loss": -56.8457, "step": 2238 }, { "epoch": 0.1990222222222222, "grad_norm": 733.9737548828125, "learning_rate": 3e-06, "loss": -71.0826, "step": 2239 }, { "epoch": 0.1991111111111111, "grad_norm": 759.0897216796875, "learning_rate": 3e-06, "loss": -59.8898, "step": 2240 }, { "epoch": 0.1992, "grad_norm": 752.1569213867188, "learning_rate": 3e-06, "loss": -35.6279, "step": 2241 }, { "epoch": 0.19928888888888888, "grad_norm": 793.9288330078125, "learning_rate": 3e-06, "loss": -31.1288, "step": 2242 }, { "epoch": 0.19937777777777776, "grad_norm": 875.0328979492188, "learning_rate": 3e-06, "loss": -54.7059, "step": 2243 }, { "epoch": 0.19946666666666665, "grad_norm": 735.8705444335938, "learning_rate": 3e-06, "loss": -64.119, "step": 2244 }, { "completion_length": 245.1875, "epoch": 0.19955555555555557, "grad_norm": 691.626953125, "learning_rate": 3e-06, "loss": 33.6045, "reward": 1.0729166865348816, "reward_std": 0.41281384229660034, "rewards/boxed_and_answer_tags_format_reward": 0.7395833432674408, "rewards/correctness_reward_func_math": 0.3333333432674408, "step": 2245, "zero_std_ratio": 0.5 }, { "epoch": 0.19964444444444446, "grad_norm": 847.1795654296875, "learning_rate": 3e-06, "loss": 36.038, "step": 2246 }, { "epoch": 0.19973333333333335, "grad_norm": 502.91473388671875, "learning_rate": 3e-06, "loss": 22.1079, "step": 2247 }, { "epoch": 0.19982222222222223, "grad_norm": 577.439697265625, "learning_rate": 3e-06, "loss": 33.3352, "step": 2248 }, { "epoch": 0.19991111111111112, "grad_norm": 830.4883422851562, "learning_rate": 3e-06, "loss": 41.7401, "step": 2249 }, { "epoch": 0.2, "grad_norm": 769.1807861328125, "learning_rate": 3e-06, "loss": 37.7994, "step": 2250 }, { "epoch": 0.2000888888888889, "grad_norm": 636.1270141601562, "learning_rate": 3e-06, "loss": 27.5784, "step": 2251 }, { "epoch": 0.2001777777777778, "grad_norm": 761.4356079101562, "learning_rate": 3e-06, "loss": 27.7986, "step": 2252 }, { "epoch": 0.20026666666666668, "grad_norm": 530.1929931640625, "learning_rate": 3e-06, "loss": 17.7146, "step": 2253 }, { "epoch": 0.20035555555555556, "grad_norm": 564.9805908203125, "learning_rate": 3e-06, "loss": 27.1709, "step": 2254 }, { "epoch": 0.20044444444444445, "grad_norm": 600.7700805664062, "learning_rate": 3e-06, "loss": 37.6539, "step": 2255 }, { "epoch": 0.20053333333333334, "grad_norm": 710.3458862304688, "learning_rate": 3e-06, "loss": 30.2361, "step": 2256 }, { "completion_length": 249.9791717529297, "epoch": 0.20062222222222223, "grad_norm": 404.9451904296875, "learning_rate": 3e-06, "loss": -93.8459, "reward": 1.1458333730697632, "reward_std": 0.23116151988506317, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.4583333283662796, "step": 2257, "zero_std_ratio": 0.75 }, { "epoch": 0.20071111111111112, "grad_norm": 394.6553649902344, "learning_rate": 3e-06, "loss": -76.6418, "step": 2258 }, { "epoch": 0.2008, "grad_norm": 366.0687255859375, "learning_rate": 3e-06, "loss": -73.8687, "step": 2259 }, { "epoch": 0.2008888888888889, "grad_norm": 464.67388916015625, "learning_rate": 3e-06, "loss": -82.5149, "step": 2260 }, { "epoch": 0.20097777777777778, "grad_norm": 437.32470703125, "learning_rate": 3e-06, "loss": -62.3202, "step": 2261 }, { "epoch": 0.20106666666666667, "grad_norm": 884.9143676757812, "learning_rate": 3e-06, "loss": -76.1318, "step": 2262 }, { "epoch": 0.20115555555555556, "grad_norm": 431.22882080078125, "learning_rate": 3e-06, "loss": -101.0349, "step": 2263 }, { "epoch": 0.20124444444444445, "grad_norm": 407.47344970703125, "learning_rate": 3e-06, "loss": -83.3404, "step": 2264 }, { "epoch": 0.20133333333333334, "grad_norm": 450.6275634765625, "learning_rate": 3e-06, "loss": -81.2565, "step": 2265 }, { "epoch": 0.20142222222222222, "grad_norm": 446.26715087890625, "learning_rate": 3e-06, "loss": -92.9891, "step": 2266 }, { "epoch": 0.2015111111111111, "grad_norm": 421.58514404296875, "learning_rate": 3e-06, "loss": -72.7071, "step": 2267 }, { "epoch": 0.2016, "grad_norm": 499.9620361328125, "learning_rate": 3e-06, "loss": -86.6616, "step": 2268 }, { "completion_length": 251.43750762939453, "epoch": 0.2016888888888889, "grad_norm": 1807.4974365234375, "learning_rate": 3e-06, "loss": -2.4199, "reward": 1.4375, "reward_std": 0.46232306957244873, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.75, "step": 2269, "zero_std_ratio": 0.5 }, { "epoch": 0.20177777777777778, "grad_norm": 1014.031005859375, "learning_rate": 3e-06, "loss": -11.0913, "step": 2270 }, { "epoch": 0.20186666666666667, "grad_norm": 969.154541015625, "learning_rate": 3e-06, "loss": -45.8343, "step": 2271 }, { "epoch": 0.20195555555555555, "grad_norm": 1514.218994140625, "learning_rate": 3e-06, "loss": -6.767, "step": 2272 }, { "epoch": 0.20204444444444444, "grad_norm": 1047.0552978515625, "learning_rate": 3e-06, "loss": -41.9022, "step": 2273 }, { "epoch": 0.20213333333333333, "grad_norm": 1069.608642578125, "learning_rate": 3e-06, "loss": -32.74, "step": 2274 }, { "epoch": 0.20222222222222222, "grad_norm": 1186.3797607421875, "learning_rate": 3e-06, "loss": -5.627, "step": 2275 }, { "epoch": 0.2023111111111111, "grad_norm": 1353.1217041015625, "learning_rate": 3e-06, "loss": -20.7514, "step": 2276 }, { "epoch": 0.2024, "grad_norm": 973.7822875976562, "learning_rate": 3e-06, "loss": -57.7389, "step": 2277 }, { "epoch": 0.20248888888888888, "grad_norm": 1122.6533203125, "learning_rate": 3e-06, "loss": -19.221, "step": 2278 }, { "epoch": 0.20257777777777777, "grad_norm": 1007.2978515625, "learning_rate": 3e-06, "loss": -49.3046, "step": 2279 }, { "epoch": 0.20266666666666666, "grad_norm": 1087.270751953125, "learning_rate": 3e-06, "loss": -46.9498, "step": 2280 }, { "completion_length": 248.0, "epoch": 0.20275555555555555, "grad_norm": 684.866455078125, "learning_rate": 3e-06, "loss": -24.7332, "reward": 1.0416666865348816, "reward_std": 0.40296074748039246, "rewards/boxed_and_answer_tags_format_reward": 0.625, "rewards/correctness_reward_func_math": 0.4166666716337204, "step": 2281, "zero_std_ratio": 0.625 }, { "epoch": 0.20284444444444444, "grad_norm": 685.3329467773438, "learning_rate": 3e-06, "loss": 10.4014, "step": 2282 }, { "epoch": 0.20293333333333333, "grad_norm": 828.5563354492188, "learning_rate": 3e-06, "loss": -17.5656, "step": 2283 }, { "epoch": 0.20302222222222222, "grad_norm": 797.2943115234375, "learning_rate": 3e-06, "loss": -24.5962, "step": 2284 }, { "epoch": 0.2031111111111111, "grad_norm": 778.4367065429688, "learning_rate": 3e-06, "loss": 13.921, "step": 2285 }, { "epoch": 0.2032, "grad_norm": 694.8433837890625, "learning_rate": 3e-06, "loss": -1.7194, "step": 2286 }, { "epoch": 0.20328888888888888, "grad_norm": 808.0003662109375, "learning_rate": 3e-06, "loss": -30.4908, "step": 2287 }, { "epoch": 0.20337777777777777, "grad_norm": 724.6696166992188, "learning_rate": 3e-06, "loss": 2.6068, "step": 2288 }, { "epoch": 0.20346666666666666, "grad_norm": 830.2708129882812, "learning_rate": 3e-06, "loss": -28.4917, "step": 2289 }, { "epoch": 0.20355555555555555, "grad_norm": 785.9896850585938, "learning_rate": 3e-06, "loss": -31.1686, "step": 2290 }, { "epoch": 0.20364444444444443, "grad_norm": 851.0347290039062, "learning_rate": 3e-06, "loss": 0.2481, "step": 2291 }, { "epoch": 0.20373333333333332, "grad_norm": 774.1303100585938, "learning_rate": 3e-06, "loss": -6.2776, "step": 2292 }, { "completion_length": 246.0625, "epoch": 0.2038222222222222, "grad_norm": 635.4057006835938, "learning_rate": 3e-06, "loss": 4.3122, "reward": 1.5104167461395264, "reward_std": 0.34120412170886993, "rewards/boxed_and_answer_tags_format_reward": 0.6770833432674408, "rewards/correctness_reward_func_math": 0.8333333283662796, "step": 2293, "zero_std_ratio": 0.625 }, { "epoch": 0.2039111111111111, "grad_norm": 653.86328125, "learning_rate": 3e-06, "loss": 12.7972, "step": 2294 }, { "epoch": 0.204, "grad_norm": 610.3839721679688, "learning_rate": 3e-06, "loss": -2.9267, "step": 2295 }, { "epoch": 0.20408888888888888, "grad_norm": 714.1361083984375, "learning_rate": 3e-06, "loss": -23.5832, "step": 2296 }, { "epoch": 0.2041777777777778, "grad_norm": 718.49658203125, "learning_rate": 3e-06, "loss": -4.627, "step": 2297 }, { "epoch": 0.20426666666666668, "grad_norm": 696.8006591796875, "learning_rate": 3e-06, "loss": -11.8357, "step": 2298 }, { "epoch": 0.20435555555555557, "grad_norm": 814.3312377929688, "learning_rate": 3e-06, "loss": -1.5097, "step": 2299 }, { "epoch": 0.20444444444444446, "grad_norm": 556.5509643554688, "learning_rate": 3e-06, "loss": 10.6618, "step": 2300 }, { "epoch": 0.20453333333333334, "grad_norm": 667.0651245117188, "learning_rate": 3e-06, "loss": -6.0333, "step": 2301 }, { "epoch": 0.20462222222222223, "grad_norm": 694.1640625, "learning_rate": 3e-06, "loss": -24.5284, "step": 2302 }, { "epoch": 0.20471111111111112, "grad_norm": 923.972900390625, "learning_rate": 3e-06, "loss": -11.2173, "step": 2303 }, { "epoch": 0.2048, "grad_norm": 659.1800537109375, "learning_rate": 3e-06, "loss": -16.7782, "step": 2304 }, { "completion_length": 254.4375, "epoch": 0.2048888888888889, "grad_norm": 971.201171875, "learning_rate": 3e-06, "loss": -6.4646, "reward": 2.125, "reward_std": 0.43528565764427185, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 1.375, "step": 2305, "zero_std_ratio": 0.5 }, { "epoch": 0.2049777777777778, "grad_norm": 1799.6265869140625, "learning_rate": 3e-06, "loss": 13.6479, "step": 2306 }, { "epoch": 0.20506666666666667, "grad_norm": 1432.3759765625, "learning_rate": 3e-06, "loss": -12.8131, "step": 2307 }, { "epoch": 0.20515555555555556, "grad_norm": 1100.3665771484375, "learning_rate": 3e-06, "loss": 19.3096, "step": 2308 }, { "epoch": 0.20524444444444445, "grad_norm": 954.0089111328125, "learning_rate": 3e-06, "loss": -5.624, "step": 2309 }, { "epoch": 0.20533333333333334, "grad_norm": 1022.4109497070312, "learning_rate": 3e-06, "loss": -26.2045, "step": 2310 }, { "epoch": 0.20542222222222223, "grad_norm": 1009.416015625, "learning_rate": 3e-06, "loss": -9.7483, "step": 2311 }, { "epoch": 0.20551111111111112, "grad_norm": 1540.92333984375, "learning_rate": 3e-06, "loss": 0.92, "step": 2312 }, { "epoch": 0.2056, "grad_norm": 1027.509765625, "learning_rate": 3e-06, "loss": -24.657, "step": 2313 }, { "epoch": 0.2056888888888889, "grad_norm": 948.5579833984375, "learning_rate": 3e-06, "loss": 8.6437, "step": 2314 }, { "epoch": 0.20577777777777778, "grad_norm": 919.4548950195312, "learning_rate": 3e-06, "loss": -21.9334, "step": 2315 }, { "epoch": 0.20586666666666667, "grad_norm": 979.2217407226562, "learning_rate": 3e-06, "loss": -36.1305, "step": 2316 }, { "completion_length": 253.52083587646484, "epoch": 0.20595555555555556, "grad_norm": 382.2681579589844, "learning_rate": 3e-06, "loss": 13.32, "reward": 0.9791666865348816, "reward_std": 0.10206206887960434, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.2916666679084301, "step": 2317, "zero_std_ratio": 0.875 }, { "epoch": 0.20604444444444445, "grad_norm": 585.9896850585938, "learning_rate": 3e-06, "loss": 14.567, "step": 2318 }, { "epoch": 0.20613333333333334, "grad_norm": 433.8167419433594, "learning_rate": 3e-06, "loss": 0.179, "step": 2319 }, { "epoch": 0.20622222222222222, "grad_norm": 473.40289306640625, "learning_rate": 3e-06, "loss": -8.1342, "step": 2320 }, { "epoch": 0.2063111111111111, "grad_norm": 454.9488220214844, "learning_rate": 3e-06, "loss": 6.4596, "step": 2321 }, { "epoch": 0.2064, "grad_norm": 545.6451416015625, "learning_rate": 3e-06, "loss": -0.0272, "step": 2322 }, { "epoch": 0.2064888888888889, "grad_norm": 393.2840576171875, "learning_rate": 3e-06, "loss": 10.1428, "step": 2323 }, { "epoch": 0.20657777777777778, "grad_norm": 624.9196166992188, "learning_rate": 3e-06, "loss": 10.519, "step": 2324 }, { "epoch": 0.20666666666666667, "grad_norm": 423.4202880859375, "learning_rate": 3e-06, "loss": -6.3045, "step": 2325 }, { "epoch": 0.20675555555555555, "grad_norm": 396.0754699707031, "learning_rate": 3e-06, "loss": -11.096, "step": 2326 }, { "epoch": 0.20684444444444444, "grad_norm": 583.927734375, "learning_rate": 3e-06, "loss": 2.5989, "step": 2327 }, { "epoch": 0.20693333333333333, "grad_norm": 597.1937255859375, "learning_rate": 3e-06, "loss": -8.4196, "step": 2328 }, { "completion_length": 242.62500762939453, "epoch": 0.20702222222222222, "grad_norm": 800.9533081054688, "learning_rate": 3e-06, "loss": -33.827, "reward": 1.4375000596046448, "reward_std": 0.46232303977012634, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.75, "step": 2329, "zero_std_ratio": 0.5 }, { "epoch": 0.2071111111111111, "grad_norm": 696.8334350585938, "learning_rate": 3e-06, "loss": -20.2565, "step": 2330 }, { "epoch": 0.2072, "grad_norm": 728.7371826171875, "learning_rate": 3e-06, "loss": -55.6491, "step": 2331 }, { "epoch": 0.20728888888888888, "grad_norm": 845.1964111328125, "learning_rate": 3e-06, "loss": -34.6344, "step": 2332 }, { "epoch": 0.20737777777777777, "grad_norm": 797.7058715820312, "learning_rate": 3e-06, "loss": -38.3381, "step": 2333 }, { "epoch": 0.20746666666666666, "grad_norm": 815.6392211914062, "learning_rate": 3e-06, "loss": -39.5545, "step": 2334 }, { "epoch": 0.20755555555555555, "grad_norm": 824.5341796875, "learning_rate": 3e-06, "loss": -42.9804, "step": 2335 }, { "epoch": 0.20764444444444444, "grad_norm": 839.4075927734375, "learning_rate": 3e-06, "loss": -31.0013, "step": 2336 }, { "epoch": 0.20773333333333333, "grad_norm": 808.272705078125, "learning_rate": 3e-06, "loss": -63.1393, "step": 2337 }, { "epoch": 0.2078222222222222, "grad_norm": 937.5029296875, "learning_rate": 3e-06, "loss": -49.1489, "step": 2338 }, { "epoch": 0.2079111111111111, "grad_norm": 852.005859375, "learning_rate": 3e-06, "loss": -50.4064, "step": 2339 }, { "epoch": 0.208, "grad_norm": 897.2970581054688, "learning_rate": 3e-06, "loss": -52.8892, "step": 2340 }, { "completion_length": 222.70834350585938, "epoch": 0.20808888888888888, "grad_norm": 757.0648803710938, "learning_rate": 3e-06, "loss": 35.128, "reward": 1.7083333730697632, "reward_std": 0.3332235887646675, "rewards/boxed_and_answer_tags_format_reward": 0.625, "rewards/correctness_reward_func_math": 1.0833333134651184, "step": 2341, "zero_std_ratio": 0.625 }, { "epoch": 0.20817777777777777, "grad_norm": 974.6918334960938, "learning_rate": 3e-06, "loss": 26.4346, "step": 2342 }, { "epoch": 0.20826666666666666, "grad_norm": 960.5851440429688, "learning_rate": 3e-06, "loss": 18.2117, "step": 2343 }, { "epoch": 0.20835555555555554, "grad_norm": 748.3045654296875, "learning_rate": 3e-06, "loss": 19.8586, "step": 2344 }, { "epoch": 0.20844444444444443, "grad_norm": 900.9320068359375, "learning_rate": 3e-06, "loss": 9.7996, "step": 2345 }, { "epoch": 0.20853333333333332, "grad_norm": 769.8694458007812, "learning_rate": 3e-06, "loss": 25.5797, "step": 2346 }, { "epoch": 0.2086222222222222, "grad_norm": 734.0509033203125, "learning_rate": 3e-06, "loss": 27.9682, "step": 2347 }, { "epoch": 0.2087111111111111, "grad_norm": 869.8523559570312, "learning_rate": 3e-06, "loss": 21.5981, "step": 2348 }, { "epoch": 0.2088, "grad_norm": 1000.9803466796875, "learning_rate": 3e-06, "loss": 4.3671, "step": 2349 }, { "epoch": 0.2088888888888889, "grad_norm": 766.8132934570312, "learning_rate": 3e-06, "loss": 10.1062, "step": 2350 }, { "epoch": 0.2089777777777778, "grad_norm": 1140.35986328125, "learning_rate": 3e-06, "loss": -0.0209, "step": 2351 }, { "epoch": 0.20906666666666668, "grad_norm": 769.6608276367188, "learning_rate": 3e-06, "loss": 17.1774, "step": 2352 }, { "completion_length": 235.12500762939453, "epoch": 0.20915555555555557, "grad_norm": 1358.110595703125, "learning_rate": 3e-06, "loss": -29.3384, "reward": 1.5625000596046448, "reward_std": 0.599253699183464, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.8750000298023224, "step": 2353, "zero_std_ratio": 0.375 }, { "epoch": 0.20924444444444446, "grad_norm": 1078.0203857421875, "learning_rate": 3e-06, "loss": -4.6919, "step": 2354 }, { "epoch": 0.20933333333333334, "grad_norm": 1239.9210205078125, "learning_rate": 3e-06, "loss": -58.9311, "step": 2355 }, { "epoch": 0.20942222222222223, "grad_norm": 1190.312255859375, "learning_rate": 3e-06, "loss": -41.3338, "step": 2356 }, { "epoch": 0.20951111111111112, "grad_norm": 1177.6978759765625, "learning_rate": 3e-06, "loss": -77.211, "step": 2357 }, { "epoch": 0.2096, "grad_norm": 1262.98876953125, "learning_rate": 3e-06, "loss": -30.6502, "step": 2358 }, { "epoch": 0.2096888888888889, "grad_norm": 1281.507568359375, "learning_rate": 3e-06, "loss": -46.3909, "step": 2359 }, { "epoch": 0.20977777777777779, "grad_norm": 1242.6148681640625, "learning_rate": 3e-06, "loss": -20.0434, "step": 2360 }, { "epoch": 0.20986666666666667, "grad_norm": 1216.9324951171875, "learning_rate": 3e-06, "loss": -75.5012, "step": 2361 }, { "epoch": 0.20995555555555556, "grad_norm": 1148.396240234375, "learning_rate": 3e-06, "loss": -60.3074, "step": 2362 }, { "epoch": 0.21004444444444445, "grad_norm": 1038.05224609375, "learning_rate": 3e-06, "loss": -90.1416, "step": 2363 }, { "epoch": 0.21013333333333334, "grad_norm": 1481.82470703125, "learning_rate": 3e-06, "loss": -41.4112, "step": 2364 }, { "completion_length": 236.4375, "epoch": 0.21022222222222223, "grad_norm": 1174.8533935546875, "learning_rate": 3e-06, "loss": -15.5789, "reward": 1.2500000298023224, "reward_std": 0.4431168735027313, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 0.4999999925494194, "step": 2365, "zero_std_ratio": 0.5 }, { "epoch": 0.21031111111111112, "grad_norm": 3015.200927734375, "learning_rate": 3e-06, "loss": -27.8838, "step": 2366 }, { "epoch": 0.2104, "grad_norm": 1439.3358154296875, "learning_rate": 3e-06, "loss": -38.5885, "step": 2367 }, { "epoch": 0.2104888888888889, "grad_norm": 869.1331787109375, "learning_rate": 3e-06, "loss": -19.1413, "step": 2368 }, { "epoch": 0.21057777777777778, "grad_norm": 954.2206420898438, "learning_rate": 3e-06, "loss": -22.8123, "step": 2369 }, { "epoch": 0.21066666666666667, "grad_norm": 923.0101928710938, "learning_rate": 3e-06, "loss": -34.3305, "step": 2370 }, { "epoch": 0.21075555555555556, "grad_norm": 1316.6534423828125, "learning_rate": 3e-06, "loss": -30.9986, "step": 2371 }, { "epoch": 0.21084444444444445, "grad_norm": 1236.0667724609375, "learning_rate": 3e-06, "loss": -48.6076, "step": 2372 }, { "epoch": 0.21093333333333333, "grad_norm": 1252.537109375, "learning_rate": 3e-06, "loss": -61.8291, "step": 2373 }, { "epoch": 0.21102222222222222, "grad_norm": 1343.861328125, "learning_rate": 3e-06, "loss": -29.769, "step": 2374 }, { "epoch": 0.2111111111111111, "grad_norm": 1069.728515625, "learning_rate": 3e-06, "loss": -31.3108, "step": 2375 }, { "epoch": 0.2112, "grad_norm": 898.3480834960938, "learning_rate": 3e-06, "loss": -53.5648, "step": 2376 }, { "completion_length": 250.2916717529297, "epoch": 0.2112888888888889, "grad_norm": 1555.4439697265625, "learning_rate": 3e-06, "loss": 23.924, "reward": 0.9583333432674408, "reward_std": 0.43528565764427185, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 0.2083333358168602, "step": 2377, "zero_std_ratio": 0.5 }, { "epoch": 0.21137777777777778, "grad_norm": 1271.503662109375, "learning_rate": 3e-06, "loss": -28.6245, "step": 2378 }, { "epoch": 0.21146666666666666, "grad_norm": 1083.2822265625, "learning_rate": 3e-06, "loss": -61.1598, "step": 2379 }, { "epoch": 0.21155555555555555, "grad_norm": 950.6062622070312, "learning_rate": 3e-06, "loss": -19.0397, "step": 2380 }, { "epoch": 0.21164444444444444, "grad_norm": 1111.03857421875, "learning_rate": 3e-06, "loss": 11.29, "step": 2381 }, { "epoch": 0.21173333333333333, "grad_norm": 1028.947509765625, "learning_rate": 3e-06, "loss": -16.2803, "step": 2382 }, { "epoch": 0.21182222222222222, "grad_norm": 1200.6107177734375, "learning_rate": 3e-06, "loss": 14.5257, "step": 2383 }, { "epoch": 0.2119111111111111, "grad_norm": 1430.796630859375, "learning_rate": 3e-06, "loss": -34.2578, "step": 2384 }, { "epoch": 0.212, "grad_norm": 1028.548583984375, "learning_rate": 3e-06, "loss": -68.5076, "step": 2385 }, { "epoch": 0.21208888888888888, "grad_norm": 1001.2411499023438, "learning_rate": 3e-06, "loss": -25.6736, "step": 2386 }, { "epoch": 0.21217777777777777, "grad_norm": 984.0308227539062, "learning_rate": 3e-06, "loss": -2.7987, "step": 2387 }, { "epoch": 0.21226666666666666, "grad_norm": 965.3121337890625, "learning_rate": 3e-06, "loss": -28.9898, "step": 2388 }, { "completion_length": 229.2916717529297, "epoch": 0.21235555555555555, "grad_norm": 557.446533203125, "learning_rate": 3e-06, "loss": 21.0555, "reward": 1.8437500596046448, "reward_std": 0.1546149756759405, "rewards/boxed_and_answer_tags_format_reward": 0.6770833432674408, "rewards/correctness_reward_func_math": 1.1666666716337204, "step": 2389, "zero_std_ratio": 0.75 }, { "epoch": 0.21244444444444444, "grad_norm": 575.1875610351562, "learning_rate": 3e-06, "loss": 23.3964, "step": 2390 }, { "epoch": 0.21253333333333332, "grad_norm": 360.0423278808594, "learning_rate": 3e-06, "loss": 33.3316, "step": 2391 }, { "epoch": 0.2126222222222222, "grad_norm": 255.54953002929688, "learning_rate": 3e-06, "loss": 31.5129, "step": 2392 }, { "epoch": 0.2127111111111111, "grad_norm": 265.9867248535156, "learning_rate": 3e-06, "loss": 16.7137, "step": 2393 }, { "epoch": 0.2128, "grad_norm": 356.7539978027344, "learning_rate": 3e-06, "loss": 29.8486, "step": 2394 }, { "epoch": 0.21288888888888888, "grad_norm": 380.1522521972656, "learning_rate": 3e-06, "loss": 18.9171, "step": 2395 }, { "epoch": 0.21297777777777777, "grad_norm": 627.226806640625, "learning_rate": 3e-06, "loss": 14.277, "step": 2396 }, { "epoch": 0.21306666666666665, "grad_norm": 473.0029296875, "learning_rate": 3e-06, "loss": 26.1599, "step": 2397 }, { "epoch": 0.21315555555555554, "grad_norm": 256.2850646972656, "learning_rate": 3e-06, "loss": 27.8036, "step": 2398 }, { "epoch": 0.21324444444444443, "grad_norm": 294.08056640625, "learning_rate": 3e-06, "loss": 11.133, "step": 2399 }, { "epoch": 0.21333333333333335, "grad_norm": 292.5602111816406, "learning_rate": 3e-06, "loss": 21.8162, "step": 2400 }, { "completion_length": 248.7291717529297, "epoch": 0.21342222222222224, "grad_norm": 991.9778442382812, "learning_rate": 3e-06, "loss": 12.964, "reward": 1.5000000596046448, "reward_std": 0.20412413775920868, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 0.75, "step": 2401, "zero_std_ratio": 0.75 }, { "epoch": 0.21351111111111112, "grad_norm": 644.521484375, "learning_rate": 3e-06, "loss": -4.3122, "step": 2402 }, { "epoch": 0.2136, "grad_norm": 680.9287719726562, "learning_rate": 3e-06, "loss": 3.5692, "step": 2403 }, { "epoch": 0.2136888888888889, "grad_norm": 824.704345703125, "learning_rate": 3e-06, "loss": 20.7909, "step": 2404 }, { "epoch": 0.2137777777777778, "grad_norm": 956.5078125, "learning_rate": 3e-06, "loss": 10.9202, "step": 2405 }, { "epoch": 0.21386666666666668, "grad_norm": 1738.125244140625, "learning_rate": 3e-06, "loss": 5.5927, "step": 2406 }, { "epoch": 0.21395555555555557, "grad_norm": 920.435546875, "learning_rate": 3e-06, "loss": 2.9809, "step": 2407 }, { "epoch": 0.21404444444444445, "grad_norm": 677.7364501953125, "learning_rate": 3e-06, "loss": -7.6728, "step": 2408 }, { "epoch": 0.21413333333333334, "grad_norm": 724.0176391601562, "learning_rate": 3e-06, "loss": -5.1334, "step": 2409 }, { "epoch": 0.21422222222222223, "grad_norm": 885.7861938476562, "learning_rate": 3e-06, "loss": 17.665, "step": 2410 }, { "epoch": 0.21431111111111112, "grad_norm": 884.716552734375, "learning_rate": 3e-06, "loss": 3.5877, "step": 2411 }, { "epoch": 0.2144, "grad_norm": 1398.6461181640625, "learning_rate": 3e-06, "loss": 3.2704, "step": 2412 }, { "completion_length": 234.20834350585938, "epoch": 0.2144888888888889, "grad_norm": 1070.3880615234375, "learning_rate": 3e-06, "loss": -4.4756, "reward": 1.3750000596046448, "reward_std": 0.23116153478622437, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 0.6249999813735485, "step": 2413, "zero_std_ratio": 0.75 }, { "epoch": 0.21457777777777778, "grad_norm": 1026.6502685546875, "learning_rate": 3e-06, "loss": 44.4837, "step": 2414 }, { "epoch": 0.21466666666666667, "grad_norm": 772.33837890625, "learning_rate": 3e-06, "loss": 37.0703, "step": 2415 }, { "epoch": 0.21475555555555556, "grad_norm": 1085.96875, "learning_rate": 3e-06, "loss": 30.7537, "step": 2416 }, { "epoch": 0.21484444444444445, "grad_norm": 827.8580932617188, "learning_rate": 3e-06, "loss": 48.8112, "step": 2417 }, { "epoch": 0.21493333333333334, "grad_norm": 781.8536376953125, "learning_rate": 3e-06, "loss": 35.331, "step": 2418 }, { "epoch": 0.21502222222222223, "grad_norm": 830.3689575195312, "learning_rate": 3e-06, "loss": -6.2381, "step": 2419 }, { "epoch": 0.21511111111111111, "grad_norm": 1895.36572265625, "learning_rate": 3e-06, "loss": 38.7443, "step": 2420 }, { "epoch": 0.2152, "grad_norm": 1093.376220703125, "learning_rate": 3e-06, "loss": 32.4039, "step": 2421 }, { "epoch": 0.2152888888888889, "grad_norm": 1121.695556640625, "learning_rate": 3e-06, "loss": 21.949, "step": 2422 }, { "epoch": 0.21537777777777778, "grad_norm": 908.1818237304688, "learning_rate": 3e-06, "loss": 39.3356, "step": 2423 }, { "epoch": 0.21546666666666667, "grad_norm": 780.5909423828125, "learning_rate": 3e-06, "loss": 28.4789, "step": 2424 }, { "completion_length": 240.37500762939453, "epoch": 0.21555555555555556, "grad_norm": 1291.5950927734375, "learning_rate": 3e-06, "loss": -48.4281, "reward": 1.625, "reward_std": 0.564385175704956, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 0.875, "step": 2425, "zero_std_ratio": 0.375 }, { "epoch": 0.21564444444444444, "grad_norm": 1240.4404296875, "learning_rate": 3e-06, "loss": -54.5814, "step": 2426 }, { "epoch": 0.21573333333333333, "grad_norm": 1328.792724609375, "learning_rate": 3e-06, "loss": -77.3574, "step": 2427 }, { "epoch": 0.21582222222222222, "grad_norm": 1365.1212158203125, "learning_rate": 3e-06, "loss": -56.5684, "step": 2428 }, { "epoch": 0.2159111111111111, "grad_norm": 1141.714111328125, "learning_rate": 3e-06, "loss": -52.8116, "step": 2429 }, { "epoch": 0.216, "grad_norm": 1254.617919921875, "learning_rate": 3e-06, "loss": -45.3142, "step": 2430 }, { "epoch": 0.21608888888888889, "grad_norm": 1215.14404296875, "learning_rate": 3e-06, "loss": -61.5184, "step": 2431 }, { "epoch": 0.21617777777777777, "grad_norm": 2824.7529296875, "learning_rate": 3e-06, "loss": -67.1598, "step": 2432 }, { "epoch": 0.21626666666666666, "grad_norm": 1158.3133544921875, "learning_rate": 3e-06, "loss": -84.968, "step": 2433 }, { "epoch": 0.21635555555555555, "grad_norm": 948.725341796875, "learning_rate": 3e-06, "loss": -67.6487, "step": 2434 }, { "epoch": 0.21644444444444444, "grad_norm": 1137.3834228515625, "learning_rate": 3e-06, "loss": -60.4848, "step": 2435 }, { "epoch": 0.21653333333333333, "grad_norm": 1104.3291015625, "learning_rate": 3e-06, "loss": -61.2857, "step": 2436 }, { "completion_length": 240.14583587646484, "epoch": 0.21662222222222222, "grad_norm": 668.9032592773438, "learning_rate": 3e-06, "loss": 11.3774, "reward": 1.2291667461395264, "reward_std": 0.23116152733564377, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.5416666716337204, "step": 2437, "zero_std_ratio": 0.75 }, { "epoch": 0.2167111111111111, "grad_norm": 636.1585693359375, "learning_rate": 3e-06, "loss": -7.5818, "step": 2438 }, { "epoch": 0.2168, "grad_norm": 497.7554626464844, "learning_rate": 3e-06, "loss": 6.8708, "step": 2439 }, { "epoch": 0.21688888888888888, "grad_norm": 553.154052734375, "learning_rate": 3e-06, "loss": 24.3757, "step": 2440 }, { "epoch": 0.21697777777777777, "grad_norm": 1472.5419921875, "learning_rate": 3e-06, "loss": -3.4629, "step": 2441 }, { "epoch": 0.21706666666666666, "grad_norm": 652.6502685546875, "learning_rate": 3e-06, "loss": 18.709, "step": 2442 }, { "epoch": 0.21715555555555555, "grad_norm": 599.662353515625, "learning_rate": 3e-06, "loss": 6.6085, "step": 2443 }, { "epoch": 0.21724444444444443, "grad_norm": 596.9681396484375, "learning_rate": 3e-06, "loss": -10.8602, "step": 2444 }, { "epoch": 0.21733333333333332, "grad_norm": 487.63580322265625, "learning_rate": 3e-06, "loss": -1.5804, "step": 2445 }, { "epoch": 0.2174222222222222, "grad_norm": 1273.145751953125, "learning_rate": 3e-06, "loss": 16.3577, "step": 2446 }, { "epoch": 0.2175111111111111, "grad_norm": 771.951904296875, "learning_rate": 3e-06, "loss": -6.3023, "step": 2447 }, { "epoch": 0.2176, "grad_norm": 712.5418701171875, "learning_rate": 3e-06, "loss": 16.2355, "step": 2448 }, { "completion_length": 248.37500762939453, "epoch": 0.21768888888888888, "grad_norm": 790.022216796875, "learning_rate": 3e-06, "loss": 7.116, "reward": 2.2291667461395264, "reward_std": 0.26603007316589355, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 1.5416666865348816, "step": 2449, "zero_std_ratio": 0.75 }, { "epoch": 0.21777777777777776, "grad_norm": 546.7139892578125, "learning_rate": 3e-06, "loss": -10.9262, "step": 2450 }, { "epoch": 0.21786666666666665, "grad_norm": 517.8162231445312, "learning_rate": 3e-06, "loss": 0.3609, "step": 2451 }, { "epoch": 0.21795555555555557, "grad_norm": 682.6878051757812, "learning_rate": 3e-06, "loss": 0.4365, "step": 2452 }, { "epoch": 0.21804444444444446, "grad_norm": 607.7827758789062, "learning_rate": 3e-06, "loss": 10.3393, "step": 2453 }, { "epoch": 0.21813333333333335, "grad_norm": 518.8916625976562, "learning_rate": 3e-06, "loss": 0.2425, "step": 2454 }, { "epoch": 0.21822222222222223, "grad_norm": 629.7891235351562, "learning_rate": 3e-06, "loss": -3.1223, "step": 2455 }, { "epoch": 0.21831111111111112, "grad_norm": 510.4330139160156, "learning_rate": 3e-06, "loss": -15.683, "step": 2456 }, { "epoch": 0.2184, "grad_norm": 553.461669921875, "learning_rate": 3e-06, "loss": -4.2699, "step": 2457 }, { "epoch": 0.2184888888888889, "grad_norm": 526.62109375, "learning_rate": 3e-06, "loss": -6.1042, "step": 2458 }, { "epoch": 0.2185777777777778, "grad_norm": 562.5404052734375, "learning_rate": 3e-06, "loss": -0.4739, "step": 2459 }, { "epoch": 0.21866666666666668, "grad_norm": 544.2666625976562, "learning_rate": 3e-06, "loss": -6.7032, "step": 2460 }, { "completion_length": 246.02083587646484, "epoch": 0.21875555555555556, "grad_norm": 1100.9122314453125, "learning_rate": 3e-06, "loss": 12.7639, "reward": 1.8750001192092896, "reward_std": 0.3061862289905548, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 1.1250000298023224, "step": 2461, "zero_std_ratio": 0.625 }, { "epoch": 0.21884444444444445, "grad_norm": 897.7620239257812, "learning_rate": 3e-06, "loss": 20.9614, "step": 2462 }, { "epoch": 0.21893333333333334, "grad_norm": 986.5072021484375, "learning_rate": 3e-06, "loss": 42.7016, "step": 2463 }, { "epoch": 0.21902222222222223, "grad_norm": 1005.2491455078125, "learning_rate": 3e-06, "loss": 59.914, "step": 2464 }, { "epoch": 0.21911111111111112, "grad_norm": 723.766357421875, "learning_rate": 3e-06, "loss": -18.4072, "step": 2465 }, { "epoch": 0.2192, "grad_norm": 814.1770629882812, "learning_rate": 3e-06, "loss": 56.8266, "step": 2466 }, { "epoch": 0.2192888888888889, "grad_norm": 880.131591796875, "learning_rate": 3e-06, "loss": 0.9147, "step": 2467 }, { "epoch": 0.21937777777777778, "grad_norm": 1002.6812744140625, "learning_rate": 3e-06, "loss": 10.1945, "step": 2468 }, { "epoch": 0.21946666666666667, "grad_norm": 928.541748046875, "learning_rate": 3e-06, "loss": 28.8268, "step": 2469 }, { "epoch": 0.21955555555555556, "grad_norm": 1081.3568115234375, "learning_rate": 3e-06, "loss": 42.3389, "step": 2470 }, { "epoch": 0.21964444444444445, "grad_norm": 849.524658203125, "learning_rate": 3e-06, "loss": -28.4933, "step": 2471 }, { "epoch": 0.21973333333333334, "grad_norm": 800.930419921875, "learning_rate": 3e-06, "loss": 48.021, "step": 2472 }, { "completion_length": 243.7291717529297, "epoch": 0.21982222222222222, "grad_norm": 840.5211181640625, "learning_rate": 3e-06, "loss": -29.1071, "reward": 1.4062500596046448, "reward_std": 0.47030356526374817, "rewards/boxed_and_answer_tags_format_reward": 0.7395833432674408, "rewards/correctness_reward_func_math": 0.6666666567325592, "step": 2473, "zero_std_ratio": 0.5 }, { "epoch": 0.2199111111111111, "grad_norm": 766.2994384765625, "learning_rate": 3e-06, "loss": -22.7254, "step": 2474 }, { "epoch": 0.22, "grad_norm": 1315.990966796875, "learning_rate": 3e-06, "loss": -2.9843, "step": 2475 }, { "epoch": 0.2200888888888889, "grad_norm": 823.3301391601562, "learning_rate": 3e-06, "loss": -45.7856, "step": 2476 }, { "epoch": 0.22017777777777778, "grad_norm": 969.9705810546875, "learning_rate": 3e-06, "loss": -26.3312, "step": 2477 }, { "epoch": 0.22026666666666667, "grad_norm": 1052.6732177734375, "learning_rate": 3e-06, "loss": -6.0265, "step": 2478 }, { "epoch": 0.22035555555555555, "grad_norm": 810.92041015625, "learning_rate": 3e-06, "loss": -35.8816, "step": 2479 }, { "epoch": 0.22044444444444444, "grad_norm": 716.6881713867188, "learning_rate": 3e-06, "loss": -34.8377, "step": 2480 }, { "epoch": 0.22053333333333333, "grad_norm": 1079.2554931640625, "learning_rate": 3e-06, "loss": -13.4226, "step": 2481 }, { "epoch": 0.22062222222222222, "grad_norm": 770.211669921875, "learning_rate": 3e-06, "loss": -50.3465, "step": 2482 }, { "epoch": 0.2207111111111111, "grad_norm": 886.2747802734375, "learning_rate": 3e-06, "loss": -33.0956, "step": 2483 }, { "epoch": 0.2208, "grad_norm": 890.3179321289062, "learning_rate": 3e-06, "loss": -15.2544, "step": 2484 }, { "completion_length": 253.58333587646484, "epoch": 0.22088888888888888, "grad_norm": 303.0287780761719, "learning_rate": 3e-06, "loss": -15.111, "reward": 1.2916666865348816, "reward_std": 0.10206207633018494, "rewards/boxed_and_answer_tags_format_reward": 0.75, "rewards/correctness_reward_func_math": 0.5416666567325592, "step": 2485, "zero_std_ratio": 0.875 }, { "epoch": 0.22097777777777777, "grad_norm": 377.69757080078125, "learning_rate": 3e-06, "loss": -16.8297, "step": 2486 }, { "epoch": 0.22106666666666666, "grad_norm": 388.11773681640625, "learning_rate": 3e-06, "loss": -14.6188, "step": 2487 }, { "epoch": 0.22115555555555555, "grad_norm": 291.3501892089844, "learning_rate": 3e-06, "loss": -13.4672, "step": 2488 }, { "epoch": 0.22124444444444444, "grad_norm": 439.7605285644531, "learning_rate": 3e-06, "loss": -23.8357, "step": 2489 }, { "epoch": 0.22133333333333333, "grad_norm": 379.401611328125, "learning_rate": 3e-06, "loss": -9.9997, "step": 2490 }, { "epoch": 0.22142222222222221, "grad_norm": 297.3055725097656, "learning_rate": 3e-06, "loss": -18.6912, "step": 2491 }, { "epoch": 0.2215111111111111, "grad_norm": 412.62890625, "learning_rate": 3e-06, "loss": -20.8417, "step": 2492 }, { "epoch": 0.2216, "grad_norm": 300.6817321777344, "learning_rate": 3e-06, "loss": -19.9718, "step": 2493 }, { "epoch": 0.22168888888888888, "grad_norm": 268.96551513671875, "learning_rate": 3e-06, "loss": -18.9416, "step": 2494 }, { "epoch": 0.22177777777777777, "grad_norm": 426.9893798828125, "learning_rate": 3e-06, "loss": -27.8923, "step": 2495 }, { "epoch": 0.22186666666666666, "grad_norm": 381.15704345703125, "learning_rate": 3e-06, "loss": -19.2658, "step": 2496 }, { "completion_length": 249.95833587646484, "epoch": 0.22195555555555554, "grad_norm": 1263.931640625, "learning_rate": 3e-06, "loss": -12.4574, "reward": 1.1875, "reward_std": 0.46232303977012634, "rewards/boxed_and_answer_tags_format_reward": 0.6875, "rewards/correctness_reward_func_math": 0.5, "step": 2497, "zero_std_ratio": 0.5 }, { "epoch": 0.22204444444444443, "grad_norm": 963.8367919921875, "learning_rate": 3e-06, "loss": 15.1327, "step": 2498 }, { "epoch": 0.22213333333333332, "grad_norm": 1354.34619140625, "learning_rate": 3e-06, "loss": -37.5826, "step": 2499 }, { "epoch": 0.2222222222222222, "grad_norm": 1030.41650390625, "learning_rate": 3e-06, "loss": 3.9406, "step": 2500 } ], "logging_steps": 1, "max_steps": 112500, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }