{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.2496759559300066, "eval_steps": 500, "global_step": 4623, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 2.699784017278618e-07, "loss": 1.0969, "step": 25 }, { "epoch": 0.01, "learning_rate": 5.399568034557235e-07, "loss": 1.0905, "step": 50 }, { "epoch": 0.01, "learning_rate": 8.099352051835855e-07, "loss": 1.0853, "step": 75 }, { "epoch": 0.02, "learning_rate": 1.079913606911447e-06, "loss": 1.0712, "step": 100 }, { "epoch": 0.02, "learning_rate": 1.349892008639309e-06, "loss": 1.0577, "step": 125 }, { "epoch": 0.02, "learning_rate": 1.619870410367171e-06, "loss": 1.0345, "step": 150 }, { "epoch": 0.03, "learning_rate": 1.8898488120950326e-06, "loss": 1.0094, "step": 175 }, { "epoch": 0.03, "learning_rate": 2.159827213822894e-06, "loss": 0.9685, "step": 200 }, { "epoch": 0.04, "learning_rate": 2.429805615550756e-06, "loss": 0.9441, "step": 225 }, { "epoch": 0.04, "learning_rate": 2.699784017278618e-06, "loss": 0.9111, "step": 250 }, { "epoch": 0.04, "learning_rate": 2.96976241900648e-06, "loss": 0.8798, "step": 275 }, { "epoch": 0.05, "learning_rate": 3.239740820734342e-06, "loss": 0.8499, "step": 300 }, { "epoch": 0.05, "learning_rate": 3.5097192224622035e-06, "loss": 0.8209, "step": 325 }, { "epoch": 0.06, "learning_rate": 3.779697624190065e-06, "loss": 0.7895, "step": 350 }, { "epoch": 0.06, "learning_rate": 4.049676025917927e-06, "loss": 0.7687, "step": 375 }, { "epoch": 0.06, "learning_rate": 4.319654427645788e-06, "loss": 0.7366, "step": 400 }, { "epoch": 0.07, "learning_rate": 4.58963282937365e-06, "loss": 0.717, "step": 425 }, { "epoch": 0.07, "learning_rate": 4.859611231101512e-06, "loss": 0.6993, "step": 450 }, { "epoch": 0.08, "learning_rate": 5.129589632829374e-06, "loss": 0.6828, "step": 475 }, { "epoch": 0.08, "learning_rate": 5.399568034557236e-06, "loss": 0.6755, "step": 500 }, { "epoch": 0.09, "learning_rate": 5.669546436285097e-06, "loss": 0.6622, "step": 525 }, { "epoch": 0.09, "learning_rate": 5.93952483801296e-06, "loss": 0.6537, "step": 550 }, { "epoch": 0.09, "learning_rate": 6.209503239740822e-06, "loss": 0.6405, "step": 575 }, { "epoch": 0.1, "learning_rate": 6.479481641468684e-06, "loss": 0.638, "step": 600 }, { "epoch": 0.1, "learning_rate": 6.749460043196545e-06, "loss": 0.6309, "step": 625 }, { "epoch": 0.11, "learning_rate": 7.019438444924407e-06, "loss": 0.6236, "step": 650 }, { "epoch": 0.11, "learning_rate": 7.289416846652268e-06, "loss": 0.6211, "step": 675 }, { "epoch": 0.11, "learning_rate": 7.55939524838013e-06, "loss": 0.6163, "step": 700 }, { "epoch": 0.12, "learning_rate": 7.829373650107991e-06, "loss": 0.6114, "step": 725 }, { "epoch": 0.12, "learning_rate": 8.099352051835854e-06, "loss": 0.6075, "step": 750 }, { "epoch": 0.13, "learning_rate": 8.369330453563716e-06, "loss": 0.6041, "step": 775 }, { "epoch": 0.13, "learning_rate": 8.639308855291577e-06, "loss": 0.6029, "step": 800 }, { "epoch": 0.13, "learning_rate": 8.90928725701944e-06, "loss": 0.5985, "step": 825 }, { "epoch": 0.14, "learning_rate": 9.1792656587473e-06, "loss": 0.5971, "step": 850 }, { "epoch": 0.14, "learning_rate": 9.449244060475162e-06, "loss": 0.5889, "step": 875 }, { "epoch": 0.15, "learning_rate": 9.719222462203023e-06, "loss": 0.5939, "step": 900 }, { "epoch": 0.15, "learning_rate": 9.989200863930886e-06, "loss": 0.5902, "step": 925 }, { "epoch": 0.15, "learning_rate": 1.0259179265658747e-05, "loss": 0.5884, "step": 950 }, { "epoch": 0.16, "learning_rate": 1.0529157667386608e-05, "loss": 0.5854, "step": 975 }, { "epoch": 0.16, "learning_rate": 1.0799136069114471e-05, "loss": 0.5849, "step": 1000 }, { "epoch": 0.17, "learning_rate": 1.1069114470842333e-05, "loss": 0.5839, "step": 1025 }, { "epoch": 0.17, "learning_rate": 1.1339092872570194e-05, "loss": 0.5832, "step": 1050 }, { "epoch": 0.17, "learning_rate": 1.1609071274298057e-05, "loss": 0.5784, "step": 1075 }, { "epoch": 0.18, "learning_rate": 1.187904967602592e-05, "loss": 0.5753, "step": 1100 }, { "epoch": 0.18, "learning_rate": 1.2149028077753782e-05, "loss": 0.5746, "step": 1125 }, { "epoch": 0.19, "learning_rate": 1.2419006479481644e-05, "loss": 0.5724, "step": 1150 }, { "epoch": 0.19, "learning_rate": 1.2688984881209505e-05, "loss": 0.5659, "step": 1175 }, { "epoch": 0.19, "learning_rate": 1.2958963282937368e-05, "loss": 0.5716, "step": 1200 }, { "epoch": 0.2, "learning_rate": 1.3228941684665229e-05, "loss": 0.5691, "step": 1225 }, { "epoch": 0.2, "learning_rate": 1.349892008639309e-05, "loss": 0.5641, "step": 1250 }, { "epoch": 0.21, "learning_rate": 1.3768898488120951e-05, "loss": 0.5669, "step": 1275 }, { "epoch": 0.21, "learning_rate": 1.4038876889848814e-05, "loss": 0.5654, "step": 1300 }, { "epoch": 0.21, "learning_rate": 1.4308855291576675e-05, "loss": 0.5651, "step": 1325 }, { "epoch": 0.22, "learning_rate": 1.4578833693304537e-05, "loss": 0.5619, "step": 1350 }, { "epoch": 0.22, "learning_rate": 1.48488120950324e-05, "loss": 0.5621, "step": 1375 }, { "epoch": 0.23, "learning_rate": 1.511879049676026e-05, "loss": 0.5603, "step": 1400 }, { "epoch": 0.23, "learning_rate": 1.5388768898488122e-05, "loss": 0.5597, "step": 1425 }, { "epoch": 0.23, "learning_rate": 1.5658747300215983e-05, "loss": 0.564, "step": 1450 }, { "epoch": 0.24, "learning_rate": 1.5928725701943844e-05, "loss": 0.5581, "step": 1475 }, { "epoch": 0.24, "learning_rate": 1.619870410367171e-05, "loss": 0.5554, "step": 1500 }, { "epoch": 0.25, "learning_rate": 1.646868250539957e-05, "loss": 0.5507, "step": 1525 }, { "epoch": 1.0, "learning_rate": 1.673866090712743e-05, "loss": 0.5533, "step": 1550 }, { "epoch": 1.01, "learning_rate": 1.7008639308855292e-05, "loss": 0.5498, "step": 1575 }, { "epoch": 1.01, "learning_rate": 1.7278617710583154e-05, "loss": 0.5526, "step": 1600 }, { "epoch": 1.01, "learning_rate": 1.7548596112311015e-05, "loss": 0.5503, "step": 1625 }, { "epoch": 1.02, "learning_rate": 1.781857451403888e-05, "loss": 0.5503, "step": 1650 }, { "epoch": 1.02, "learning_rate": 1.808855291576674e-05, "loss": 0.5485, "step": 1675 }, { "epoch": 1.03, "learning_rate": 1.83585313174946e-05, "loss": 0.5502, "step": 1700 }, { "epoch": 1.03, "learning_rate": 1.8628509719222463e-05, "loss": 0.5446, "step": 1725 }, { "epoch": 1.03, "learning_rate": 1.8898488120950324e-05, "loss": 0.5375, "step": 1750 }, { "epoch": 1.04, "learning_rate": 1.9168466522678185e-05, "loss": 0.5407, "step": 1775 }, { "epoch": 1.04, "learning_rate": 1.9438444924406046e-05, "loss": 0.5453, "step": 1800 }, { "epoch": 1.05, "learning_rate": 1.970842332613391e-05, "loss": 0.5447, "step": 1825 }, { "epoch": 1.05, "learning_rate": 1.9978401727861772e-05, "loss": 0.5421, "step": 1850 }, { "epoch": 1.05, "learning_rate": 1.997239558329333e-05, "loss": 0.5425, "step": 1875 }, { "epoch": 1.06, "learning_rate": 1.9942390782525205e-05, "loss": 0.5418, "step": 1900 }, { "epoch": 1.06, "learning_rate": 1.9912385981757083e-05, "loss": 0.5429, "step": 1925 }, { "epoch": 1.07, "learning_rate": 1.9882381180988958e-05, "loss": 0.5388, "step": 1950 }, { "epoch": 1.07, "learning_rate": 1.9852376380220836e-05, "loss": 0.5402, "step": 1975 }, { "epoch": 1.07, "learning_rate": 1.9822371579452714e-05, "loss": 0.5348, "step": 2000 }, { "epoch": 1.08, "learning_rate": 1.9792366778684592e-05, "loss": 0.5385, "step": 2025 }, { "epoch": 1.08, "learning_rate": 1.976236197791647e-05, "loss": 0.5415, "step": 2050 }, { "epoch": 1.09, "learning_rate": 1.9732357177148345e-05, "loss": 0.5346, "step": 2075 }, { "epoch": 1.09, "learning_rate": 1.9702352376380223e-05, "loss": 0.5333, "step": 2100 }, { "epoch": 1.09, "learning_rate": 1.9672347575612098e-05, "loss": 0.5339, "step": 2125 }, { "epoch": 1.1, "learning_rate": 1.9642342774843976e-05, "loss": 0.5325, "step": 2150 }, { "epoch": 1.1, "learning_rate": 1.9612337974075854e-05, "loss": 0.5346, "step": 2175 }, { "epoch": 1.11, "learning_rate": 1.958233317330773e-05, "loss": 0.527, "step": 2200 }, { "epoch": 1.11, "learning_rate": 1.9552328372539607e-05, "loss": 0.5343, "step": 2225 }, { "epoch": 1.11, "learning_rate": 1.9522323571771486e-05, "loss": 0.5282, "step": 2250 }, { "epoch": 1.12, "learning_rate": 1.9492318771003364e-05, "loss": 0.527, "step": 2275 }, { "epoch": 1.12, "learning_rate": 1.946231397023524e-05, "loss": 0.5287, "step": 2300 }, { "epoch": 1.13, "learning_rate": 1.9432309169467117e-05, "loss": 0.5287, "step": 2325 }, { "epoch": 1.13, "learning_rate": 1.9402304368698995e-05, "loss": 0.5296, "step": 2350 }, { "epoch": 1.14, "learning_rate": 1.937229956793087e-05, "loss": 0.5265, "step": 2375 }, { "epoch": 1.14, "learning_rate": 1.9342294767162748e-05, "loss": 0.5225, "step": 2400 }, { "epoch": 1.14, "learning_rate": 1.9312289966394622e-05, "loss": 0.5254, "step": 2425 }, { "epoch": 1.15, "learning_rate": 1.92822851656265e-05, "loss": 0.5273, "step": 2450 }, { "epoch": 1.15, "learning_rate": 1.925228036485838e-05, "loss": 0.5248, "step": 2475 }, { "epoch": 1.16, "learning_rate": 1.9222275564090257e-05, "loss": 0.5255, "step": 2500 }, { "epoch": 1.16, "learning_rate": 1.9192270763322135e-05, "loss": 0.5252, "step": 2525 }, { "epoch": 1.16, "learning_rate": 1.916226596255401e-05, "loss": 0.5255, "step": 2550 }, { "epoch": 1.17, "learning_rate": 1.9132261161785888e-05, "loss": 0.5273, "step": 2575 }, { "epoch": 1.17, "learning_rate": 1.9102256361017763e-05, "loss": 0.522, "step": 2600 }, { "epoch": 1.18, "learning_rate": 1.907225156024964e-05, "loss": 0.5244, "step": 2625 }, { "epoch": 1.18, "learning_rate": 1.904224675948152e-05, "loss": 0.5195, "step": 2650 }, { "epoch": 1.18, "learning_rate": 1.9012241958713397e-05, "loss": 0.5204, "step": 2675 }, { "epoch": 1.19, "learning_rate": 1.8982237157945272e-05, "loss": 0.5153, "step": 2700 }, { "epoch": 1.19, "learning_rate": 1.895223235717715e-05, "loss": 0.5173, "step": 2725 }, { "epoch": 1.2, "learning_rate": 1.8922227556409028e-05, "loss": 0.5196, "step": 2750 }, { "epoch": 1.2, "learning_rate": 1.8892222755640903e-05, "loss": 0.5153, "step": 2775 }, { "epoch": 1.2, "learning_rate": 1.886221795487278e-05, "loss": 0.518, "step": 2800 }, { "epoch": 1.21, "learning_rate": 1.883221315410466e-05, "loss": 0.5162, "step": 2825 }, { "epoch": 1.21, "learning_rate": 1.8802208353336534e-05, "loss": 0.5177, "step": 2850 }, { "epoch": 1.22, "learning_rate": 1.8772203552568412e-05, "loss": 0.5187, "step": 2875 }, { "epoch": 1.22, "learning_rate": 1.874219875180029e-05, "loss": 0.5164, "step": 2900 }, { "epoch": 1.22, "learning_rate": 1.871219395103217e-05, "loss": 0.5142, "step": 2925 }, { "epoch": 1.23, "learning_rate": 1.8682189150264043e-05, "loss": 0.5183, "step": 2950 }, { "epoch": 1.23, "learning_rate": 1.865218434949592e-05, "loss": 0.5146, "step": 2975 }, { "epoch": 1.24, "learning_rate": 1.86221795487278e-05, "loss": 0.5187, "step": 3000 }, { "epoch": 1.24, "learning_rate": 1.8592174747959674e-05, "loss": 0.5147, "step": 3025 }, { "epoch": 1.24, "learning_rate": 1.8562169947191553e-05, "loss": 0.5127, "step": 3050 }, { "epoch": 1.25, "learning_rate": 1.8532165146423427e-05, "loss": 0.5089, "step": 3075 }, { "epoch": 2.0, "learning_rate": 1.8502160345655305e-05, "loss": 0.5117, "step": 3100 }, { "epoch": 2.01, "learning_rate": 1.8472155544887184e-05, "loss": 0.5101, "step": 3125 }, { "epoch": 2.01, "learning_rate": 1.8442150744119062e-05, "loss": 0.5113, "step": 3150 }, { "epoch": 2.02, "learning_rate": 1.841214594335094e-05, "loss": 0.5109, "step": 3175 }, { "epoch": 2.02, "learning_rate": 1.8382141142582815e-05, "loss": 0.5108, "step": 3200 }, { "epoch": 2.02, "learning_rate": 1.8352136341814693e-05, "loss": 0.5094, "step": 3225 }, { "epoch": 2.03, "learning_rate": 1.8322131541046568e-05, "loss": 0.5118, "step": 3250 }, { "epoch": 2.03, "learning_rate": 1.8292126740278446e-05, "loss": 0.5035, "step": 3275 }, { "epoch": 2.04, "learning_rate": 1.826212193951032e-05, "loss": 0.5029, "step": 3300 }, { "epoch": 2.04, "learning_rate": 1.82321171387422e-05, "loss": 0.5042, "step": 3325 }, { "epoch": 2.04, "learning_rate": 1.8202112337974077e-05, "loss": 0.5079, "step": 3350 }, { "epoch": 2.05, "learning_rate": 1.8172107537205955e-05, "loss": 0.506, "step": 3375 }, { "epoch": 2.05, "learning_rate": 1.8142102736437833e-05, "loss": 0.5114, "step": 3400 }, { "epoch": 2.06, "learning_rate": 1.8112097935669708e-05, "loss": 0.5023, "step": 3425 }, { "epoch": 2.06, "learning_rate": 1.8082093134901586e-05, "loss": 0.5104, "step": 3450 }, { "epoch": 2.06, "learning_rate": 1.805208833413346e-05, "loss": 0.5091, "step": 3475 }, { "epoch": 2.07, "learning_rate": 1.802208353336534e-05, "loss": 0.504, "step": 3500 }, { "epoch": 2.07, "learning_rate": 1.7992078732597217e-05, "loss": 0.5054, "step": 3525 }, { "epoch": 2.08, "learning_rate": 1.7962073931829095e-05, "loss": 0.5013, "step": 3550 }, { "epoch": 2.08, "learning_rate": 1.7932069131060973e-05, "loss": 0.5077, "step": 3575 }, { "epoch": 2.08, "learning_rate": 1.7902064330292848e-05, "loss": 0.509, "step": 3600 }, { "epoch": 2.09, "learning_rate": 1.7872059529524726e-05, "loss": 0.5014, "step": 3625 }, { "epoch": 2.09, "learning_rate": 1.78420547287566e-05, "loss": 0.5001, "step": 3650 }, { "epoch": 2.1, "learning_rate": 1.7813250120019206e-05, "loss": 0.5027, "step": 3675 }, { "epoch": 2.1, "learning_rate": 1.778324531925108e-05, "loss": 0.501, "step": 3700 }, { "epoch": 2.1, "learning_rate": 1.7754440710513683e-05, "loss": 0.4997, "step": 3725 }, { "epoch": 2.11, "learning_rate": 1.772443590974556e-05, "loss": 0.5, "step": 3750 }, { "epoch": 2.11, "learning_rate": 1.7694431108977435e-05, "loss": 0.5038, "step": 3775 }, { "epoch": 2.12, "learning_rate": 1.7664426308209314e-05, "loss": 0.4962, "step": 3800 }, { "epoch": 2.12, "learning_rate": 1.7634421507441192e-05, "loss": 0.4995, "step": 3825 }, { "epoch": 2.12, "learning_rate": 1.760441670667307e-05, "loss": 0.5, "step": 3850 }, { "epoch": 2.13, "learning_rate": 1.7574411905904948e-05, "loss": 0.4991, "step": 3875 }, { "epoch": 2.13, "learning_rate": 1.7544407105136823e-05, "loss": 0.4985, "step": 3900 }, { "epoch": 2.14, "learning_rate": 1.75144023043687e-05, "loss": 0.4981, "step": 3925 }, { "epoch": 2.14, "learning_rate": 1.7484397503600576e-05, "loss": 0.4935, "step": 3950 }, { "epoch": 2.14, "learning_rate": 1.7454392702832454e-05, "loss": 0.4998, "step": 3975 }, { "epoch": 2.15, "learning_rate": 1.7424387902064332e-05, "loss": 0.4972, "step": 4000 }, { "epoch": 2.15, "learning_rate": 1.7394383101296207e-05, "loss": 0.4972, "step": 4025 }, { "epoch": 2.16, "learning_rate": 1.7364378300528085e-05, "loss": 0.4981, "step": 4050 }, { "epoch": 2.16, "learning_rate": 1.7334373499759963e-05, "loss": 0.4989, "step": 4075 }, { "epoch": 2.16, "learning_rate": 1.730436869899184e-05, "loss": 0.4961, "step": 4100 }, { "epoch": 2.17, "learning_rate": 1.727436389822372e-05, "loss": 0.5008, "step": 4125 }, { "epoch": 2.17, "learning_rate": 1.7244359097455594e-05, "loss": 0.4949, "step": 4150 }, { "epoch": 2.18, "learning_rate": 1.7214354296687472e-05, "loss": 0.4978, "step": 4175 }, { "epoch": 2.18, "learning_rate": 1.7184349495919347e-05, "loss": 0.4931, "step": 4200 }, { "epoch": 2.19, "learning_rate": 1.7154344695151225e-05, "loss": 0.491, "step": 4225 }, { "epoch": 2.19, "learning_rate": 1.7124339894383103e-05, "loss": 0.4905, "step": 4250 }, { "epoch": 2.19, "learning_rate": 1.7094335093614978e-05, "loss": 0.4935, "step": 4275 }, { "epoch": 2.2, "learning_rate": 1.7064330292846856e-05, "loss": 0.494, "step": 4300 }, { "epoch": 2.2, "learning_rate": 1.7034325492078735e-05, "loss": 0.4882, "step": 4325 }, { "epoch": 2.21, "learning_rate": 1.7004320691310613e-05, "loss": 0.4906, "step": 4350 }, { "epoch": 2.21, "learning_rate": 1.6974315890542487e-05, "loss": 0.4927, "step": 4375 }, { "epoch": 2.21, "learning_rate": 1.6944311089774366e-05, "loss": 0.4929, "step": 4400 }, { "epoch": 2.22, "learning_rate": 1.691430628900624e-05, "loss": 0.4928, "step": 4425 }, { "epoch": 2.22, "learning_rate": 1.688430148823812e-05, "loss": 0.4907, "step": 4450 }, { "epoch": 2.23, "learning_rate": 1.6854296687469997e-05, "loss": 0.4909, "step": 4475 }, { "epoch": 2.23, "learning_rate": 1.6824291886701875e-05, "loss": 0.492, "step": 4500 }, { "epoch": 2.23, "learning_rate": 1.6794287085933753e-05, "loss": 0.4925, "step": 4525 }, { "epoch": 2.24, "learning_rate": 1.6764282285165628e-05, "loss": 0.4917, "step": 4550 }, { "epoch": 2.24, "learning_rate": 1.6734277484397506e-05, "loss": 0.4899, "step": 4575 }, { "epoch": 2.25, "learning_rate": 1.670427268362938e-05, "loss": 0.4876, "step": 4600 } ], "logging_steps": 25, "max_steps": 18516, "num_train_epochs": 3, "save_steps": 500, "total_flos": 1.2933954243483664e+19, "trial_name": null, "trial_params": null }