| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 9.62962962962963, | |
| "eval_steps": 500, | |
| "global_step": 130, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.07407407407407407, | |
| "grad_norm": 5.0625, | |
| "learning_rate": 1.5384615384615387e-05, | |
| "loss": 3.0102, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.37037037037037035, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 7.692307692307693e-05, | |
| "loss": 3.0215, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.7407407407407407, | |
| "grad_norm": 11.5, | |
| "learning_rate": 0.00015384615384615385, | |
| "loss": 2.7544, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.9629629629629629, | |
| "eval_loss": 2.8721797466278076, | |
| "eval_runtime": 0.652, | |
| "eval_samples_per_second": 15.338, | |
| "eval_steps_per_second": 1.534, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 1.1111111111111112, | |
| "grad_norm": 1.3671875, | |
| "learning_rate": 0.00019985583705641418, | |
| "loss": 2.379, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 1.4814814814814814, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 0.00019823877374156647, | |
| "loss": 2.0489, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 1.8518518518518519, | |
| "grad_norm": 1.5234375, | |
| "learning_rate": 0.00019485364419471454, | |
| "loss": 1.7723, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 2.60640549659729, | |
| "eval_runtime": 0.5469, | |
| "eval_samples_per_second": 18.283, | |
| "eval_steps_per_second": 1.828, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 2.2222222222222223, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.0001897613727639014, | |
| "loss": 1.6022, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 2.5925925925925926, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 0.00018305360832480117, | |
| "loss": 1.4855, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 2.962962962962963, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 0.00017485107481711012, | |
| "loss": 1.4023, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 2.962962962962963, | |
| "eval_loss": 2.5709500312805176, | |
| "eval_runtime": 0.7217, | |
| "eval_samples_per_second": 13.857, | |
| "eval_steps_per_second": 1.386, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 3.3333333333333335, | |
| "grad_norm": 0.458984375, | |
| "learning_rate": 0.0001653013984983585, | |
| "loss": 1.3253, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 3.7037037037037037, | |
| "grad_norm": 0.50390625, | |
| "learning_rate": 0.00015457645101945046, | |
| "loss": 1.2778, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 2.5349316596984863, | |
| "eval_runtime": 0.547, | |
| "eval_samples_per_second": 18.283, | |
| "eval_steps_per_second": 1.828, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 4.074074074074074, | |
| "grad_norm": 0.376953125, | |
| "learning_rate": 0.00014286925614030542, | |
| "loss": 1.2498, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 4.444444444444445, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 0.0001303905157574247, | |
| "loss": 1.2221, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 4.814814814814815, | |
| "grad_norm": 0.296875, | |
| "learning_rate": 0.00011736481776669306, | |
| "loss": 1.1848, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 4.962962962962963, | |
| "eval_loss": 2.5175788402557373, | |
| "eval_runtime": 0.6693, | |
| "eval_samples_per_second": 14.942, | |
| "eval_steps_per_second": 1.494, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 5.185185185185185, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 0.00010402659401094152, | |
| "loss": 1.1814, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 5.555555555555555, | |
| "grad_norm": 0.33203125, | |
| "learning_rate": 9.061590105968208e-05, | |
| "loss": 1.1574, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 5.925925925925926, | |
| "grad_norm": 0.349609375, | |
| "learning_rate": 7.73740997570278e-05, | |
| "loss": 1.1522, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_loss": 2.5044538974761963, | |
| "eval_runtime": 0.5444, | |
| "eval_samples_per_second": 18.369, | |
| "eval_steps_per_second": 1.837, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 6.296296296296296, | |
| "grad_norm": 0.2412109375, | |
| "learning_rate": 6.453951129574644e-05, | |
| "loss": 1.1367, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 6.666666666666667, | |
| "grad_norm": 0.28515625, | |
| "learning_rate": 5.234312799786921e-05, | |
| "loss": 1.1305, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 6.962962962962963, | |
| "eval_loss": 2.506514310836792, | |
| "eval_runtime": 0.685, | |
| "eval_samples_per_second": 14.598, | |
| "eval_steps_per_second": 1.46, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 7.037037037037037, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 4.100445599768774e-05, | |
| "loss": 1.1188, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 7.407407407407407, | |
| "grad_norm": 0.2099609375, | |
| "learning_rate": 3.072756464904006e-05, | |
| "loss": 1.1222, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 7.777777777777778, | |
| "grad_norm": 0.248046875, | |
| "learning_rate": 2.1697413758237784e-05, | |
| "loss": 1.1075, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_loss": 2.5136494636535645, | |
| "eval_runtime": 0.5462, | |
| "eval_samples_per_second": 18.307, | |
| "eval_steps_per_second": 1.831, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 8.148148148148149, | |
| "grad_norm": 0.2470703125, | |
| "learning_rate": 1.4076524743778319e-05, | |
| "loss": 1.1126, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 8.518518518518519, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 8.002055634117578e-06, | |
| "loss": 1.1118, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 8.88888888888889, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 3.5833325466437694e-06, | |
| "loss": 1.1049, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 8.962962962962964, | |
| "eval_loss": 2.512882709503174, | |
| "eval_runtime": 0.6584, | |
| "eval_samples_per_second": 15.188, | |
| "eval_steps_per_second": 1.519, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 9.25925925925926, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 8.998820754091531e-07, | |
| "loss": 1.1116, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 9.62962962962963, | |
| "grad_norm": 0.20703125, | |
| "learning_rate": 0.0, | |
| "loss": 1.1048, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 9.62962962962963, | |
| "eval_loss": 2.513336658477783, | |
| "eval_runtime": 0.5375, | |
| "eval_samples_per_second": 18.606, | |
| "eval_steps_per_second": 1.861, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 9.62962962962963, | |
| "step": 130, | |
| "total_flos": 1.018113810235392e+17, | |
| "train_loss": 1.4375356710874116, | |
| "train_runtime": 456.268, | |
| "train_samples_per_second": 18.301, | |
| "train_steps_per_second": 0.285 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 130, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 10, | |
| "save_steps": 100, | |
| "total_flos": 1.018113810235392e+17, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |