{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 60.0, "eval_steps": 500, "global_step": 60, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "grad_norm": 2.099515914916992, "learning_rate": 0.0, "loss": 2.6549, "step": 1 }, { "epoch": 2.0, "grad_norm": 2.1028008460998535, "learning_rate": 4e-05, "loss": 2.6566, "step": 2 }, { "epoch": 3.0, "grad_norm": 2.096004009246826, "learning_rate": 8e-05, "loss": 2.6172, "step": 3 }, { "epoch": 4.0, "grad_norm": 2.2220966815948486, "learning_rate": 0.00012, "loss": 2.4567, "step": 4 }, { "epoch": 5.0, "grad_norm": 2.1684439182281494, "learning_rate": 0.00016, "loss": 2.0182, "step": 5 }, { "epoch": 6.0, "grad_norm": 2.1147849559783936, "learning_rate": 0.0002, "loss": 1.4709, "step": 6 }, { "epoch": 7.0, "grad_norm": 5.454170227050781, "learning_rate": 0.00019636363636363636, "loss": 1.0252, "step": 7 }, { "epoch": 8.0, "grad_norm": 2.096377372741699, "learning_rate": 0.00019272727272727274, "loss": 0.7436, "step": 8 }, { "epoch": 9.0, "grad_norm": 1.4990514516830444, "learning_rate": 0.0001890909090909091, "loss": 0.4376, "step": 9 }, { "epoch": 10.0, "grad_norm": 0.6785975098609924, "learning_rate": 0.00018545454545454545, "loss": 0.3168, "step": 10 }, { "epoch": 11.0, "grad_norm": 0.8837186098098755, "learning_rate": 0.00018181818181818183, "loss": 0.2594, "step": 11 }, { "epoch": 12.0, "grad_norm": 0.6441168189048767, "learning_rate": 0.0001781818181818182, "loss": 0.2084, "step": 12 }, { "epoch": 13.0, "grad_norm": 0.7236498594284058, "learning_rate": 0.00017454545454545454, "loss": 0.1832, "step": 13 }, { "epoch": 14.0, "grad_norm": 0.7865864038467407, "learning_rate": 0.0001709090909090909, "loss": 0.1614, "step": 14 }, { "epoch": 15.0, "grad_norm": 0.8386063575744629, "learning_rate": 0.00016727272727272728, "loss": 0.145, "step": 15 }, { "epoch": 16.0, "grad_norm": 0.9354585409164429, "learning_rate": 0.00016363636363636366, "loss": 0.1331, "step": 16 }, { "epoch": 17.0, "grad_norm": 1.0136165618896484, "learning_rate": 0.00016, "loss": 0.119, "step": 17 }, { "epoch": 18.0, "grad_norm": 1.152154564857483, "learning_rate": 0.00015636363636363637, "loss": 0.1078, "step": 18 }, { "epoch": 19.0, "grad_norm": 1.3258788585662842, "learning_rate": 0.00015272727272727275, "loss": 0.0945, "step": 19 }, { "epoch": 20.0, "grad_norm": 1.4479986429214478, "learning_rate": 0.0001490909090909091, "loss": 0.0782, "step": 20 }, { "epoch": 21.0, "grad_norm": 1.5057014226913452, "learning_rate": 0.00014545454545454546, "loss": 0.0627, "step": 21 }, { "epoch": 22.0, "grad_norm": 1.353816270828247, "learning_rate": 0.00014181818181818184, "loss": 0.0479, "step": 22 }, { "epoch": 23.0, "grad_norm": 0.9007766842842102, "learning_rate": 0.0001381818181818182, "loss": 0.037, "step": 23 }, { "epoch": 24.0, "grad_norm": 0.44463616609573364, "learning_rate": 0.00013454545454545455, "loss": 0.0317, "step": 24 }, { "epoch": 25.0, "grad_norm": 0.15833492577075958, "learning_rate": 0.00013090909090909093, "loss": 0.0279, "step": 25 }, { "epoch": 26.0, "grad_norm": 0.11187602579593658, "learning_rate": 0.00012727272727272728, "loss": 0.0279, "step": 26 }, { "epoch": 27.0, "grad_norm": 0.07458829134702682, "learning_rate": 0.00012363636363636364, "loss": 0.0274, "step": 27 }, { "epoch": 28.0, "grad_norm": 0.05743066221475601, "learning_rate": 0.00012, "loss": 0.0271, "step": 28 }, { "epoch": 29.0, "grad_norm": 0.08287172019481659, "learning_rate": 0.00011636363636363636, "loss": 0.0273, "step": 29 }, { "epoch": 30.0, "grad_norm": 0.06937997788190842, "learning_rate": 0.00011272727272727272, "loss": 0.027, "step": 30 }, { "epoch": 31.0, "grad_norm": 0.04856820032000542, "learning_rate": 0.00010909090909090909, "loss": 0.027, "step": 31 }, { "epoch": 32.0, "grad_norm": 0.07065649330615997, "learning_rate": 0.00010545454545454545, "loss": 0.027, "step": 32 }, { "epoch": 33.0, "grad_norm": 0.0560951754450798, "learning_rate": 0.00010181818181818181, "loss": 0.0268, "step": 33 }, { "epoch": 34.0, "grad_norm": 0.06166462227702141, "learning_rate": 9.818181818181818e-05, "loss": 0.0268, "step": 34 }, { "epoch": 35.0, "grad_norm": 0.06152236461639404, "learning_rate": 9.454545454545455e-05, "loss": 0.0269, "step": 35 }, { "epoch": 36.0, "grad_norm": 0.045099712908267975, "learning_rate": 9.090909090909092e-05, "loss": 0.0268, "step": 36 }, { "epoch": 37.0, "grad_norm": 0.01705729216337204, "learning_rate": 8.727272727272727e-05, "loss": 0.0267, "step": 37 }, { "epoch": 38.0, "grad_norm": 0.05170425772666931, "learning_rate": 8.363636363636364e-05, "loss": 0.0266, "step": 38 }, { "epoch": 39.0, "grad_norm": 0.06034626066684723, "learning_rate": 8e-05, "loss": 0.0269, "step": 39 }, { "epoch": 40.0, "grad_norm": 0.03125019744038582, "learning_rate": 7.636363636363637e-05, "loss": 0.0268, "step": 40 }, { "epoch": 41.0, "grad_norm": 0.03972804546356201, "learning_rate": 7.272727272727273e-05, "loss": 0.0266, "step": 41 }, { "epoch": 42.0, "grad_norm": 0.05231940746307373, "learning_rate": 6.90909090909091e-05, "loss": 0.0267, "step": 42 }, { "epoch": 43.0, "grad_norm": 0.03405136987566948, "learning_rate": 6.545454545454546e-05, "loss": 0.0265, "step": 43 }, { "epoch": 44.0, "grad_norm": 0.010633698664605618, "learning_rate": 6.181818181818182e-05, "loss": 0.0265, "step": 44 }, { "epoch": 45.0, "grad_norm": 0.03109440766274929, "learning_rate": 5.818181818181818e-05, "loss": 0.0267, "step": 45 }, { "epoch": 46.0, "grad_norm": 0.027693096548318863, "learning_rate": 5.4545454545454546e-05, "loss": 0.0267, "step": 46 }, { "epoch": 47.0, "grad_norm": 0.03655220940709114, "learning_rate": 5.090909090909091e-05, "loss": 0.0266, "step": 47 }, { "epoch": 48.0, "grad_norm": 0.022899532690644264, "learning_rate": 4.7272727272727275e-05, "loss": 0.0268, "step": 48 }, { "epoch": 49.0, "grad_norm": 0.01907823234796524, "learning_rate": 4.3636363636363636e-05, "loss": 0.0265, "step": 49 }, { "epoch": 50.0, "grad_norm": 0.03373124822974205, "learning_rate": 4e-05, "loss": 0.0263, "step": 50 }, { "epoch": 51.0, "grad_norm": 0.03454992547631264, "learning_rate": 3.6363636363636364e-05, "loss": 0.0266, "step": 51 }, { "epoch": 52.0, "grad_norm": 0.019019270315766335, "learning_rate": 3.272727272727273e-05, "loss": 0.0267, "step": 52 }, { "epoch": 53.0, "grad_norm": 0.0051805428229272366, "learning_rate": 2.909090909090909e-05, "loss": 0.0266, "step": 53 }, { "epoch": 54.0, "grad_norm": 0.005404070485383272, "learning_rate": 2.5454545454545454e-05, "loss": 0.0266, "step": 54 }, { "epoch": 55.0, "grad_norm": 0.012555314227938652, "learning_rate": 2.1818181818181818e-05, "loss": 0.0264, "step": 55 }, { "epoch": 56.0, "grad_norm": 0.02643151953816414, "learning_rate": 1.8181818181818182e-05, "loss": 0.0266, "step": 56 }, { "epoch": 57.0, "grad_norm": 0.015639500692486763, "learning_rate": 1.4545454545454545e-05, "loss": 0.0269, "step": 57 }, { "epoch": 58.0, "grad_norm": 0.032283101230859756, "learning_rate": 1.0909090909090909e-05, "loss": 0.0264, "step": 58 }, { "epoch": 59.0, "grad_norm": 0.016383344307541847, "learning_rate": 7.272727272727272e-06, "loss": 0.0266, "step": 59 }, { "epoch": 60.0, "grad_norm": 0.005381885915994644, "learning_rate": 3.636363636363636e-06, "loss": 0.0267, "step": 60 } ], "logging_steps": 1, "max_steps": 60, "num_input_tokens_seen": 0, "num_train_epochs": 60, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 886560737599488.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }