{ "best_metric": 0.41398245096206665, "best_model_checkpoint": "autotrain-y4yyk-zwyl5/checkpoint-60204", "epoch": 3.0, "eval_steps": 500, "global_step": 60204, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 52.348899841308594, "learning_rate": 2.0760670984886232e-07, "loss": 2.2769, "step": 25 }, { "epoch": 0.0, "grad_norm": 28.035192489624023, "learning_rate": 4.1521341969772463e-07, "loss": 1.9582, "step": 50 }, { "epoch": 0.0, "grad_norm": 32.72040557861328, "learning_rate": 6.22820129546587e-07, "loss": 1.5803, "step": 75 }, { "epoch": 0.0, "grad_norm": 26.133230209350586, "learning_rate": 8.304268393954493e-07, "loss": 1.1819, "step": 100 }, { "epoch": 0.01, "grad_norm": 22.792537689208984, "learning_rate": 1.0380335492443115e-06, "loss": 0.927, "step": 125 }, { "epoch": 0.01, "grad_norm": 32.68214797973633, "learning_rate": 1.245640259093174e-06, "loss": 0.8114, "step": 150 }, { "epoch": 0.01, "grad_norm": 22.632478713989258, "learning_rate": 1.4532469689420362e-06, "loss": 0.8253, "step": 175 }, { "epoch": 0.01, "grad_norm": 19.682205200195312, "learning_rate": 1.6608536787908985e-06, "loss": 0.8464, "step": 200 }, { "epoch": 0.01, "grad_norm": 12.069725036621094, "learning_rate": 1.868460388639761e-06, "loss": 0.7647, "step": 225 }, { "epoch": 0.01, "grad_norm": 24.726076126098633, "learning_rate": 2.076067098488623e-06, "loss": 0.8112, "step": 250 }, { "epoch": 0.01, "grad_norm": 33.87165451049805, "learning_rate": 2.2836738083374856e-06, "loss": 0.7807, "step": 275 }, { "epoch": 0.01, "grad_norm": 18.317752838134766, "learning_rate": 2.491280518186348e-06, "loss": 0.7254, "step": 300 }, { "epoch": 0.02, "grad_norm": 15.429110527038574, "learning_rate": 2.6988872280352103e-06, "loss": 0.8093, "step": 325 }, { "epoch": 0.02, "grad_norm": 24.479049682617188, "learning_rate": 2.9064939378840724e-06, "loss": 0.8076, "step": 350 }, { "epoch": 0.02, "grad_norm": 38.9715461730957, "learning_rate": 3.114100647732935e-06, "loss": 0.819, "step": 375 }, { "epoch": 0.02, "grad_norm": 39.28196716308594, "learning_rate": 3.321707357581797e-06, "loss": 0.7896, "step": 400 }, { "epoch": 0.02, "grad_norm": 20.000776290893555, "learning_rate": 3.529314067430659e-06, "loss": 0.8073, "step": 425 }, { "epoch": 0.02, "grad_norm": 38.01788330078125, "learning_rate": 3.736920777279522e-06, "loss": 0.763, "step": 450 }, { "epoch": 0.02, "grad_norm": 23.642269134521484, "learning_rate": 3.944527487128384e-06, "loss": 0.7345, "step": 475 }, { "epoch": 0.02, "grad_norm": 11.121870994567871, "learning_rate": 4.152134196977246e-06, "loss": 0.8294, "step": 500 }, { "epoch": 0.03, "grad_norm": 13.160523414611816, "learning_rate": 4.359740906826109e-06, "loss": 0.7715, "step": 525 }, { "epoch": 0.03, "grad_norm": 26.109317779541016, "learning_rate": 4.567347616674971e-06, "loss": 0.6893, "step": 550 }, { "epoch": 0.03, "grad_norm": 13.156538963317871, "learning_rate": 4.774954326523833e-06, "loss": 0.7625, "step": 575 }, { "epoch": 0.03, "grad_norm": 16.808387756347656, "learning_rate": 4.982561036372696e-06, "loss": 0.743, "step": 600 }, { "epoch": 0.03, "grad_norm": 14.919551849365234, "learning_rate": 5.190167746221558e-06, "loss": 0.8217, "step": 625 }, { "epoch": 0.03, "grad_norm": 19.161149978637695, "learning_rate": 5.3977744560704205e-06, "loss": 0.7677, "step": 650 }, { "epoch": 0.03, "grad_norm": 15.927726745605469, "learning_rate": 5.605381165919283e-06, "loss": 0.7144, "step": 675 }, { "epoch": 0.03, 
"grad_norm": 13.279458045959473, "learning_rate": 5.812987875768145e-06, "loss": 0.7376, "step": 700 }, { "epoch": 0.04, "grad_norm": 38.41741943359375, "learning_rate": 6.020594585617007e-06, "loss": 0.7485, "step": 725 }, { "epoch": 0.04, "grad_norm": 14.47808837890625, "learning_rate": 6.22820129546587e-06, "loss": 0.7733, "step": 750 }, { "epoch": 0.04, "grad_norm": 15.392128944396973, "learning_rate": 6.4358080053147324e-06, "loss": 0.6558, "step": 775 }, { "epoch": 0.04, "grad_norm": 42.021183013916016, "learning_rate": 6.643414715163594e-06, "loss": 0.7423, "step": 800 }, { "epoch": 0.04, "grad_norm": 21.30915641784668, "learning_rate": 6.851021425012457e-06, "loss": 0.7232, "step": 825 }, { "epoch": 0.04, "grad_norm": 38.241329193115234, "learning_rate": 7.058628134861318e-06, "loss": 0.6753, "step": 850 }, { "epoch": 0.04, "grad_norm": 23.84617042541504, "learning_rate": 7.266234844710182e-06, "loss": 0.6784, "step": 875 }, { "epoch": 0.04, "grad_norm": 18.533485412597656, "learning_rate": 7.473841554559044e-06, "loss": 0.7696, "step": 900 }, { "epoch": 0.05, "grad_norm": 54.84447479248047, "learning_rate": 7.681448264407906e-06, "loss": 0.6925, "step": 925 }, { "epoch": 0.05, "grad_norm": 17.614309310913086, "learning_rate": 7.889054974256769e-06, "loss": 0.6852, "step": 950 }, { "epoch": 0.05, "grad_norm": 19.179882049560547, "learning_rate": 8.09666168410563e-06, "loss": 0.6878, "step": 975 }, { "epoch": 0.05, "grad_norm": 22.098026275634766, "learning_rate": 8.304268393954492e-06, "loss": 0.7911, "step": 1000 }, { "epoch": 0.05, "grad_norm": 17.752079010009766, "learning_rate": 8.511875103803356e-06, "loss": 0.6564, "step": 1025 }, { "epoch": 0.05, "grad_norm": 8.17482852935791, "learning_rate": 8.719481813652217e-06, "loss": 0.7486, "step": 1050 }, { "epoch": 0.05, "grad_norm": 22.92099952697754, "learning_rate": 8.92708852350108e-06, "loss": 0.6488, "step": 1075 }, { "epoch": 0.05, "grad_norm": 16.468435287475586, "learning_rate": 9.134695233349942e-06, "loss": 0.7229, "step": 1100 }, { "epoch": 0.06, "grad_norm": 17.73703384399414, "learning_rate": 9.342301943198805e-06, "loss": 0.6508, "step": 1125 }, { "epoch": 0.06, "grad_norm": 15.647977828979492, "learning_rate": 9.549908653047666e-06, "loss": 0.6739, "step": 1150 }, { "epoch": 0.06, "grad_norm": 13.75230884552002, "learning_rate": 9.75751536289653e-06, "loss": 0.5561, "step": 1175 }, { "epoch": 0.06, "grad_norm": 43.96486282348633, "learning_rate": 9.965122072745393e-06, "loss": 0.7703, "step": 1200 }, { "epoch": 0.06, "grad_norm": 14.04170036315918, "learning_rate": 1.0172728782594253e-05, "loss": 0.7327, "step": 1225 }, { "epoch": 0.06, "grad_norm": 7.018296718597412, "learning_rate": 1.0380335492443116e-05, "loss": 0.6681, "step": 1250 }, { "epoch": 0.06, "grad_norm": 31.058786392211914, "learning_rate": 1.0587942202291978e-05, "loss": 0.71, "step": 1275 }, { "epoch": 0.06, "grad_norm": 28.84793472290039, "learning_rate": 1.0795548912140841e-05, "loss": 0.5982, "step": 1300 }, { "epoch": 0.07, "grad_norm": 34.36916732788086, "learning_rate": 1.1003155621989704e-05, "loss": 0.7612, "step": 1325 }, { "epoch": 0.07, "grad_norm": 22.128067016601562, "learning_rate": 1.1210762331838566e-05, "loss": 0.6835, "step": 1350 }, { "epoch": 0.07, "grad_norm": 73.33786010742188, "learning_rate": 1.1418369041687429e-05, "loss": 0.5649, "step": 1375 }, { "epoch": 0.07, "grad_norm": 12.824825286865234, "learning_rate": 1.162597575153629e-05, "loss": 0.638, "step": 1400 }, { "epoch": 0.07, "grad_norm": 11.171917915344238, 
"learning_rate": 1.1833582461385152e-05, "loss": 0.6838, "step": 1425 }, { "epoch": 0.07, "grad_norm": 19.83733558654785, "learning_rate": 1.2041189171234015e-05, "loss": 0.6683, "step": 1450 }, { "epoch": 0.07, "grad_norm": 19.352357864379883, "learning_rate": 1.2248795881082877e-05, "loss": 0.7044, "step": 1475 }, { "epoch": 0.07, "grad_norm": 33.03659439086914, "learning_rate": 1.245640259093174e-05, "loss": 0.6281, "step": 1500 }, { "epoch": 0.08, "grad_norm": 160.98451232910156, "learning_rate": 1.2664009300780602e-05, "loss": 0.7068, "step": 1525 }, { "epoch": 0.08, "grad_norm": 16.802547454833984, "learning_rate": 1.2871616010629465e-05, "loss": 0.676, "step": 1550 }, { "epoch": 0.08, "grad_norm": 19.06223487854004, "learning_rate": 1.3079222720478326e-05, "loss": 0.7226, "step": 1575 }, { "epoch": 0.08, "grad_norm": 18.991416931152344, "learning_rate": 1.3286829430327188e-05, "loss": 0.6897, "step": 1600 }, { "epoch": 0.08, "grad_norm": 3.39277982711792, "learning_rate": 1.3494436140176051e-05, "loss": 0.4943, "step": 1625 }, { "epoch": 0.08, "grad_norm": 14.731120109558105, "learning_rate": 1.3702042850024913e-05, "loss": 0.7963, "step": 1650 }, { "epoch": 0.08, "grad_norm": 17.09242057800293, "learning_rate": 1.3909649559873774e-05, "loss": 0.6485, "step": 1675 }, { "epoch": 0.08, "grad_norm": 49.163719177246094, "learning_rate": 1.4117256269722637e-05, "loss": 0.5697, "step": 1700 }, { "epoch": 0.09, "grad_norm": 6.796900749206543, "learning_rate": 1.4324862979571501e-05, "loss": 0.6677, "step": 1725 }, { "epoch": 0.09, "grad_norm": 13.346964836120605, "learning_rate": 1.4532469689420364e-05, "loss": 0.632, "step": 1750 }, { "epoch": 0.09, "grad_norm": 19.15069007873535, "learning_rate": 1.4740076399269226e-05, "loss": 0.6362, "step": 1775 }, { "epoch": 0.09, "grad_norm": 14.083284378051758, "learning_rate": 1.4947683109118089e-05, "loss": 0.6034, "step": 1800 }, { "epoch": 0.09, "grad_norm": 23.270427703857422, "learning_rate": 1.515528981896695e-05, "loss": 0.6496, "step": 1825 }, { "epoch": 0.09, "grad_norm": 27.949331283569336, "learning_rate": 1.5362896528815812e-05, "loss": 0.6456, "step": 1850 }, { "epoch": 0.09, "grad_norm": 9.217957496643066, "learning_rate": 1.5570503238664673e-05, "loss": 0.5383, "step": 1875 }, { "epoch": 0.09, "grad_norm": 26.01340675354004, "learning_rate": 1.5778109948513537e-05, "loss": 0.5334, "step": 1900 }, { "epoch": 0.1, "grad_norm": 4.734766483306885, "learning_rate": 1.5985716658362398e-05, "loss": 0.6377, "step": 1925 }, { "epoch": 0.1, "grad_norm": 9.05659008026123, "learning_rate": 1.619332336821126e-05, "loss": 0.6443, "step": 1950 }, { "epoch": 0.1, "grad_norm": 12.056586265563965, "learning_rate": 1.6400930078060123e-05, "loss": 0.6515, "step": 1975 }, { "epoch": 0.1, "grad_norm": 9.164318084716797, "learning_rate": 1.6608536787908984e-05, "loss": 0.6515, "step": 2000 }, { "epoch": 0.1, "grad_norm": 7.113138675689697, "learning_rate": 1.681614349775785e-05, "loss": 0.5369, "step": 2025 }, { "epoch": 0.1, "grad_norm": 3.411200523376465, "learning_rate": 1.7023750207606713e-05, "loss": 0.5783, "step": 2050 }, { "epoch": 0.1, "grad_norm": 11.924202919006348, "learning_rate": 1.7231356917455573e-05, "loss": 0.5549, "step": 2075 }, { "epoch": 0.1, "grad_norm": 5.846712112426758, "learning_rate": 1.7438963627304434e-05, "loss": 0.5598, "step": 2100 }, { "epoch": 0.11, "grad_norm": 16.387100219726562, "learning_rate": 1.76465703371533e-05, "loss": 0.5761, "step": 2125 }, { "epoch": 0.11, "grad_norm": 5.820246696472168, "learning_rate": 
1.785417704700216e-05, "loss": 0.5706, "step": 2150 }, { "epoch": 0.11, "grad_norm": 5.995615482330322, "learning_rate": 1.8061783756851024e-05, "loss": 0.5396, "step": 2175 }, { "epoch": 0.11, "grad_norm": 11.19775104522705, "learning_rate": 1.8269390466699885e-05, "loss": 0.5248, "step": 2200 }, { "epoch": 0.11, "grad_norm": 12.269454956054688, "learning_rate": 1.8476997176548745e-05, "loss": 0.5094, "step": 2225 }, { "epoch": 0.11, "grad_norm": 13.744296073913574, "learning_rate": 1.868460388639761e-05, "loss": 0.4868, "step": 2250 }, { "epoch": 0.11, "grad_norm": 8.654863357543945, "learning_rate": 1.889221059624647e-05, "loss": 0.6201, "step": 2275 }, { "epoch": 0.11, "grad_norm": 26.114696502685547, "learning_rate": 1.909981730609533e-05, "loss": 0.5775, "step": 2300 }, { "epoch": 0.12, "grad_norm": 5.729313373565674, "learning_rate": 1.9307424015944196e-05, "loss": 0.8127, "step": 2325 }, { "epoch": 0.12, "grad_norm": 10.743517875671387, "learning_rate": 1.951503072579306e-05, "loss": 0.7149, "step": 2350 }, { "epoch": 0.12, "grad_norm": 24.40814781188965, "learning_rate": 1.972263743564192e-05, "loss": 0.633, "step": 2375 }, { "epoch": 0.12, "grad_norm": 50.178707122802734, "learning_rate": 1.9930244145490785e-05, "loss": 0.6349, "step": 2400 }, { "epoch": 0.12, "grad_norm": 29.33345603942871, "learning_rate": 2.0137850855339646e-05, "loss": 0.5914, "step": 2425 }, { "epoch": 0.12, "grad_norm": 21.70145606994629, "learning_rate": 2.0345457565188507e-05, "loss": 0.6143, "step": 2450 }, { "epoch": 0.12, "grad_norm": 40.49439239501953, "learning_rate": 2.055306427503737e-05, "loss": 0.7122, "step": 2475 }, { "epoch": 0.12, "grad_norm": 14.44583797454834, "learning_rate": 2.0760670984886232e-05, "loss": 0.5066, "step": 2500 }, { "epoch": 0.13, "grad_norm": 8.811293601989746, "learning_rate": 2.0968277694735096e-05, "loss": 0.6458, "step": 2525 }, { "epoch": 0.13, "grad_norm": 10.258934020996094, "learning_rate": 2.1175884404583957e-05, "loss": 0.596, "step": 2550 }, { "epoch": 0.13, "grad_norm": 9.4044828414917, "learning_rate": 2.1383491114432818e-05, "loss": 0.5651, "step": 2575 }, { "epoch": 0.13, "grad_norm": 11.9262056350708, "learning_rate": 2.1591097824281682e-05, "loss": 0.5821, "step": 2600 }, { "epoch": 0.13, "grad_norm": 4.892654895782471, "learning_rate": 2.1798704534130543e-05, "loss": 0.5606, "step": 2625 }, { "epoch": 0.13, "grad_norm": 58.10175323486328, "learning_rate": 2.2006311243979407e-05, "loss": 0.5521, "step": 2650 }, { "epoch": 0.13, "grad_norm": 22.51751708984375, "learning_rate": 2.221391795382827e-05, "loss": 0.5206, "step": 2675 }, { "epoch": 0.13, "grad_norm": 12.897263526916504, "learning_rate": 2.2421524663677132e-05, "loss": 0.6067, "step": 2700 }, { "epoch": 0.14, "grad_norm": 16.60735511779785, "learning_rate": 2.2629131373525993e-05, "loss": 0.5365, "step": 2725 }, { "epoch": 0.14, "grad_norm": 7.0244622230529785, "learning_rate": 2.2836738083374857e-05, "loss": 0.5689, "step": 2750 }, { "epoch": 0.14, "grad_norm": 13.883830070495605, "learning_rate": 2.3044344793223718e-05, "loss": 0.5949, "step": 2775 }, { "epoch": 0.14, "grad_norm": 6.281734466552734, "learning_rate": 2.325195150307258e-05, "loss": 0.5586, "step": 2800 }, { "epoch": 0.14, "grad_norm": 15.306391716003418, "learning_rate": 2.3459558212921443e-05, "loss": 0.5077, "step": 2825 }, { "epoch": 0.14, "grad_norm": 7.228984355926514, "learning_rate": 2.3667164922770304e-05, "loss": 0.5468, "step": 2850 }, { "epoch": 0.14, "grad_norm": 21.7454891204834, "learning_rate": 
2.3874771632619165e-05, "loss": 0.6936, "step": 2875 }, { "epoch": 0.14, "grad_norm": 7.734707355499268, "learning_rate": 2.408237834246803e-05, "loss": 0.4939, "step": 2900 }, { "epoch": 0.15, "grad_norm": 3.2207603454589844, "learning_rate": 2.428998505231689e-05, "loss": 0.5253, "step": 2925 }, { "epoch": 0.15, "grad_norm": 22.73624038696289, "learning_rate": 2.4497591762165754e-05, "loss": 0.5403, "step": 2950 }, { "epoch": 0.15, "grad_norm": 82.33477020263672, "learning_rate": 2.470519847201462e-05, "loss": 0.5683, "step": 2975 }, { "epoch": 0.15, "grad_norm": 19.013668060302734, "learning_rate": 2.491280518186348e-05, "loss": 0.56, "step": 3000 }, { "epoch": 0.15, "grad_norm": 11.089810371398926, "learning_rate": 2.512041189171234e-05, "loss": 0.5, "step": 3025 }, { "epoch": 0.15, "grad_norm": 10.986812591552734, "learning_rate": 2.5328018601561205e-05, "loss": 0.5554, "step": 3050 }, { "epoch": 0.15, "grad_norm": 6.21177339553833, "learning_rate": 2.5535625311410066e-05, "loss": 0.4463, "step": 3075 }, { "epoch": 0.15, "grad_norm": 8.5263090133667, "learning_rate": 2.574323202125893e-05, "loss": 0.5294, "step": 3100 }, { "epoch": 0.16, "grad_norm": 11.42646598815918, "learning_rate": 2.595083873110779e-05, "loss": 0.5584, "step": 3125 }, { "epoch": 0.16, "grad_norm": 46.24335479736328, "learning_rate": 2.615844544095665e-05, "loss": 0.5024, "step": 3150 }, { "epoch": 0.16, "grad_norm": 7.996013164520264, "learning_rate": 2.6366052150805516e-05, "loss": 0.474, "step": 3175 }, { "epoch": 0.16, "grad_norm": 53.60097122192383, "learning_rate": 2.6573658860654377e-05, "loss": 0.5053, "step": 3200 }, { "epoch": 0.16, "grad_norm": 19.06157112121582, "learning_rate": 2.6781265570503237e-05, "loss": 0.5921, "step": 3225 }, { "epoch": 0.16, "grad_norm": 37.257633209228516, "learning_rate": 2.6988872280352102e-05, "loss": 0.5614, "step": 3250 }, { "epoch": 0.16, "grad_norm": 16.04386329650879, "learning_rate": 2.7196478990200963e-05, "loss": 0.6043, "step": 3275 }, { "epoch": 0.16, "grad_norm": 3.808250665664673, "learning_rate": 2.7404085700049827e-05, "loss": 0.5175, "step": 3300 }, { "epoch": 0.17, "grad_norm": 10.740408897399902, "learning_rate": 2.7611692409898688e-05, "loss": 0.4591, "step": 3325 }, { "epoch": 0.17, "grad_norm": 10.769336700439453, "learning_rate": 2.781929911974755e-05, "loss": 0.6154, "step": 3350 }, { "epoch": 0.17, "grad_norm": 11.40727424621582, "learning_rate": 2.8026905829596413e-05, "loss": 0.4908, "step": 3375 }, { "epoch": 0.17, "grad_norm": 3.83750319480896, "learning_rate": 2.8234512539445274e-05, "loss": 0.5967, "step": 3400 }, { "epoch": 0.17, "grad_norm": 27.651927947998047, "learning_rate": 2.844211924929414e-05, "loss": 0.5095, "step": 3425 }, { "epoch": 0.17, "grad_norm": 4.459410190582275, "learning_rate": 2.8649725959143002e-05, "loss": 0.5288, "step": 3450 }, { "epoch": 0.17, "grad_norm": 5.757278919219971, "learning_rate": 2.8857332668991866e-05, "loss": 0.5613, "step": 3475 }, { "epoch": 0.17, "grad_norm": 20.3660888671875, "learning_rate": 2.9064939378840727e-05, "loss": 0.5968, "step": 3500 }, { "epoch": 0.18, "grad_norm": 4.496192932128906, "learning_rate": 2.9272546088689588e-05, "loss": 0.4816, "step": 3525 }, { "epoch": 0.18, "grad_norm": 10.203181266784668, "learning_rate": 2.9480152798538452e-05, "loss": 0.5615, "step": 3550 }, { "epoch": 0.18, "grad_norm": 18.76901626586914, "learning_rate": 2.9687759508387313e-05, "loss": 0.5196, "step": 3575 }, { "epoch": 0.18, "grad_norm": 51.70024490356445, "learning_rate": 2.9895366218236178e-05, 
"loss": 0.6804, "step": 3600 }, { "epoch": 0.18, "grad_norm": 2.5735995769500732, "learning_rate": 3.010297292808504e-05, "loss": 0.5755, "step": 3625 }, { "epoch": 0.18, "grad_norm": 11.979140281677246, "learning_rate": 3.03105796379339e-05, "loss": 0.5352, "step": 3650 }, { "epoch": 0.18, "grad_norm": 2.2723140716552734, "learning_rate": 3.051818634778276e-05, "loss": 0.5037, "step": 3675 }, { "epoch": 0.18, "grad_norm": 1.0300755500793457, "learning_rate": 3.0725793057631624e-05, "loss": 0.4458, "step": 3700 }, { "epoch": 0.19, "grad_norm": 7.549310207366943, "learning_rate": 3.093339976748049e-05, "loss": 0.5912, "step": 3725 }, { "epoch": 0.19, "grad_norm": 1.4594346284866333, "learning_rate": 3.1141006477329346e-05, "loss": 0.6347, "step": 3750 }, { "epoch": 0.19, "grad_norm": 8.113776206970215, "learning_rate": 3.134861318717821e-05, "loss": 0.5614, "step": 3775 }, { "epoch": 0.19, "grad_norm": 1.6310756206512451, "learning_rate": 3.1556219897027075e-05, "loss": 0.5177, "step": 3800 }, { "epoch": 0.19, "grad_norm": 5.420190811157227, "learning_rate": 3.176382660687593e-05, "loss": 0.5223, "step": 3825 }, { "epoch": 0.19, "grad_norm": 7.095544338226318, "learning_rate": 3.1971433316724796e-05, "loss": 0.4932, "step": 3850 }, { "epoch": 0.19, "grad_norm": 17.664386749267578, "learning_rate": 3.217904002657366e-05, "loss": 0.5635, "step": 3875 }, { "epoch": 0.19, "grad_norm": 25.511335372924805, "learning_rate": 3.238664673642252e-05, "loss": 0.4911, "step": 3900 }, { "epoch": 0.2, "grad_norm": 20.20481300354004, "learning_rate": 3.259425344627138e-05, "loss": 0.481, "step": 3925 }, { "epoch": 0.2, "grad_norm": 1.4173545837402344, "learning_rate": 3.2801860156120247e-05, "loss": 0.4748, "step": 3950 }, { "epoch": 0.2, "grad_norm": 15.927542686462402, "learning_rate": 3.300946686596911e-05, "loss": 0.3865, "step": 3975 }, { "epoch": 0.2, "grad_norm": 7.046652317047119, "learning_rate": 3.321707357581797e-05, "loss": 0.6209, "step": 4000 }, { "epoch": 0.2, "grad_norm": 5.920745849609375, "learning_rate": 3.342468028566683e-05, "loss": 0.5593, "step": 4025 }, { "epoch": 0.2, "grad_norm": 13.87470817565918, "learning_rate": 3.36322869955157e-05, "loss": 0.5426, "step": 4050 }, { "epoch": 0.2, "grad_norm": 1.830471396446228, "learning_rate": 3.383989370536456e-05, "loss": 0.3738, "step": 4075 }, { "epoch": 0.2, "grad_norm": 57.95819091796875, "learning_rate": 3.4047500415213425e-05, "loss": 0.7062, "step": 4100 }, { "epoch": 0.21, "grad_norm": 6.367457389831543, "learning_rate": 3.425510712506228e-05, "loss": 0.5627, "step": 4125 }, { "epoch": 0.21, "grad_norm": 26.165958404541016, "learning_rate": 3.446271383491115e-05, "loss": 0.5509, "step": 4150 }, { "epoch": 0.21, "grad_norm": 32.05531692504883, "learning_rate": 3.467032054476001e-05, "loss": 0.5006, "step": 4175 }, { "epoch": 0.21, "grad_norm": 6.160696029663086, "learning_rate": 3.487792725460887e-05, "loss": 0.5143, "step": 4200 }, { "epoch": 0.21, "grad_norm": 5.989635944366455, "learning_rate": 3.508553396445773e-05, "loss": 0.5025, "step": 4225 }, { "epoch": 0.21, "grad_norm": 10.148011207580566, "learning_rate": 3.52931406743066e-05, "loss": 0.626, "step": 4250 }, { "epoch": 0.21, "grad_norm": 28.23525619506836, "learning_rate": 3.550074738415546e-05, "loss": 0.4305, "step": 4275 }, { "epoch": 0.21, "grad_norm": 3.1064085960388184, "learning_rate": 3.570835409400432e-05, "loss": 0.5236, "step": 4300 }, { "epoch": 0.22, "grad_norm": 3.7058463096618652, "learning_rate": 3.591596080385318e-05, "loss": 0.5068, "step": 4325 }, { 
"epoch": 0.22, "grad_norm": 8.256972312927246, "learning_rate": 3.612356751370205e-05, "loss": 0.4297, "step": 4350 }, { "epoch": 0.22, "grad_norm": 6.961075782775879, "learning_rate": 3.6331174223550905e-05, "loss": 0.4994, "step": 4375 }, { "epoch": 0.22, "grad_norm": 1.5909925699234009, "learning_rate": 3.653878093339977e-05, "loss": 0.764, "step": 4400 }, { "epoch": 0.22, "grad_norm": 15.037834167480469, "learning_rate": 3.674638764324863e-05, "loss": 0.5137, "step": 4425 }, { "epoch": 0.22, "grad_norm": 3.972102642059326, "learning_rate": 3.695399435309749e-05, "loss": 0.4061, "step": 4450 }, { "epoch": 0.22, "grad_norm": 26.55154800415039, "learning_rate": 3.7161601062946355e-05, "loss": 0.59, "step": 4475 }, { "epoch": 0.22, "grad_norm": 8.56869888305664, "learning_rate": 3.736920777279522e-05, "loss": 0.544, "step": 4500 }, { "epoch": 0.23, "grad_norm": 20.557531356811523, "learning_rate": 3.757681448264408e-05, "loss": 0.5488, "step": 4525 }, { "epoch": 0.23, "grad_norm": 5.790639877319336, "learning_rate": 3.778442119249294e-05, "loss": 0.4406, "step": 4550 }, { "epoch": 0.23, "grad_norm": 23.40848731994629, "learning_rate": 3.7992027902341805e-05, "loss": 0.6572, "step": 4575 }, { "epoch": 0.23, "grad_norm": 1.9709395170211792, "learning_rate": 3.819963461219066e-05, "loss": 0.4951, "step": 4600 }, { "epoch": 0.23, "grad_norm": 4.339324951171875, "learning_rate": 3.840724132203953e-05, "loss": 0.5135, "step": 4625 }, { "epoch": 0.23, "grad_norm": 80.32009887695312, "learning_rate": 3.861484803188839e-05, "loss": 0.4385, "step": 4650 }, { "epoch": 0.23, "grad_norm": 22.651634216308594, "learning_rate": 3.8822454741737256e-05, "loss": 0.533, "step": 4675 }, { "epoch": 0.23, "grad_norm": 2.43072247505188, "learning_rate": 3.903006145158612e-05, "loss": 0.523, "step": 4700 }, { "epoch": 0.24, "grad_norm": 19.212390899658203, "learning_rate": 3.9237668161434984e-05, "loss": 0.6725, "step": 4725 }, { "epoch": 0.24, "grad_norm": 2.255681276321411, "learning_rate": 3.944527487128384e-05, "loss": 0.4607, "step": 4750 }, { "epoch": 0.24, "grad_norm": 11.483686447143555, "learning_rate": 3.9652881581132706e-05, "loss": 0.4615, "step": 4775 }, { "epoch": 0.24, "grad_norm": 4.3608527183532715, "learning_rate": 3.986048829098157e-05, "loss": 0.5218, "step": 4800 }, { "epoch": 0.24, "grad_norm": 36.954200744628906, "learning_rate": 4.006809500083043e-05, "loss": 0.4754, "step": 4825 }, { "epoch": 0.24, "grad_norm": 1.0579265356063843, "learning_rate": 4.027570171067929e-05, "loss": 0.4862, "step": 4850 }, { "epoch": 0.24, "grad_norm": 3.5092215538024902, "learning_rate": 4.0483308420528156e-05, "loss": 0.6089, "step": 4875 }, { "epoch": 0.24, "grad_norm": 7.657075881958008, "learning_rate": 4.0690915130377013e-05, "loss": 0.4453, "step": 4900 }, { "epoch": 0.25, "grad_norm": 121.91183471679688, "learning_rate": 4.089852184022588e-05, "loss": 0.4193, "step": 4925 }, { "epoch": 0.25, "grad_norm": 23.41520118713379, "learning_rate": 4.110612855007474e-05, "loss": 0.5111, "step": 4950 }, { "epoch": 0.25, "grad_norm": 8.844143867492676, "learning_rate": 4.13137352599236e-05, "loss": 0.5808, "step": 4975 }, { "epoch": 0.25, "grad_norm": 2.992230176925659, "learning_rate": 4.1521341969772464e-05, "loss": 0.4588, "step": 5000 }, { "epoch": 0.25, "grad_norm": 3.5053603649139404, "learning_rate": 4.172894867962133e-05, "loss": 0.3961, "step": 5025 }, { "epoch": 0.25, "grad_norm": 6.6925740242004395, "learning_rate": 4.193655538947019e-05, "loss": 0.5423, "step": 5050 }, { "epoch": 0.25, "grad_norm": 
15.226505279541016, "learning_rate": 4.214416209931905e-05, "loss": 0.5825, "step": 5075 }, { "epoch": 0.25, "grad_norm": 3.675689220428467, "learning_rate": 4.2351768809167914e-05, "loss": 0.53, "step": 5100 }, { "epoch": 0.26, "grad_norm": 17.315486907958984, "learning_rate": 4.255937551901678e-05, "loss": 0.4808, "step": 5125 }, { "epoch": 0.26, "grad_norm": 51.50734329223633, "learning_rate": 4.2766982228865636e-05, "loss": 0.5167, "step": 5150 }, { "epoch": 0.26, "grad_norm": 0.9922437071800232, "learning_rate": 4.29745889387145e-05, "loss": 0.6093, "step": 5175 }, { "epoch": 0.26, "grad_norm": 4.319682598114014, "learning_rate": 4.3182195648563364e-05, "loss": 0.4104, "step": 5200 }, { "epoch": 0.26, "grad_norm": 38.934505462646484, "learning_rate": 4.338980235841222e-05, "loss": 0.524, "step": 5225 }, { "epoch": 0.26, "grad_norm": 19.051517486572266, "learning_rate": 4.3597409068261086e-05, "loss": 0.5192, "step": 5250 }, { "epoch": 0.26, "grad_norm": 25.00204849243164, "learning_rate": 4.380501577810995e-05, "loss": 0.5483, "step": 5275 }, { "epoch": 0.26, "grad_norm": 46.21389389038086, "learning_rate": 4.4012622487958814e-05, "loss": 0.4743, "step": 5300 }, { "epoch": 0.27, "grad_norm": 6.3099565505981445, "learning_rate": 4.422022919780768e-05, "loss": 0.5155, "step": 5325 }, { "epoch": 0.27, "grad_norm": 17.59408950805664, "learning_rate": 4.442783590765654e-05, "loss": 0.5393, "step": 5350 }, { "epoch": 0.27, "grad_norm": 2.886939287185669, "learning_rate": 4.46354426175054e-05, "loss": 0.6733, "step": 5375 }, { "epoch": 0.27, "grad_norm": 3.5486369132995605, "learning_rate": 4.4843049327354265e-05, "loss": 0.6322, "step": 5400 }, { "epoch": 0.27, "grad_norm": 26.038715362548828, "learning_rate": 4.505065603720313e-05, "loss": 0.5641, "step": 5425 }, { "epoch": 0.27, "grad_norm": 5.1668901443481445, "learning_rate": 4.5258262747051986e-05, "loss": 0.5498, "step": 5450 }, { "epoch": 0.27, "grad_norm": 4.700604438781738, "learning_rate": 4.546586945690085e-05, "loss": 0.4702, "step": 5475 }, { "epoch": 0.27, "grad_norm": 16.96105194091797, "learning_rate": 4.5673476166749715e-05, "loss": 0.465, "step": 5500 }, { "epoch": 0.28, "grad_norm": 4.128838062286377, "learning_rate": 4.588108287659857e-05, "loss": 0.519, "step": 5525 }, { "epoch": 0.28, "grad_norm": 18.705781936645508, "learning_rate": 4.6088689586447437e-05, "loss": 0.4802, "step": 5550 }, { "epoch": 0.28, "grad_norm": 1.0547155141830444, "learning_rate": 4.62962962962963e-05, "loss": 0.556, "step": 5575 }, { "epoch": 0.28, "grad_norm": 7.441427230834961, "learning_rate": 4.650390300614516e-05, "loss": 0.6299, "step": 5600 }, { "epoch": 0.28, "grad_norm": 2.871375799179077, "learning_rate": 4.671150971599402e-05, "loss": 0.6662, "step": 5625 }, { "epoch": 0.28, "grad_norm": 5.272790431976318, "learning_rate": 4.691911642584289e-05, "loss": 0.5033, "step": 5650 }, { "epoch": 0.28, "grad_norm": 4.835639476776123, "learning_rate": 4.7126723135691744e-05, "loss": 0.3825, "step": 5675 }, { "epoch": 0.28, "grad_norm": 8.706759452819824, "learning_rate": 4.733432984554061e-05, "loss": 0.5942, "step": 5700 }, { "epoch": 0.29, "grad_norm": 7.657235145568848, "learning_rate": 4.754193655538947e-05, "loss": 0.5206, "step": 5725 }, { "epoch": 0.29, "grad_norm": 7.8590874671936035, "learning_rate": 4.774954326523833e-05, "loss": 0.4882, "step": 5750 }, { "epoch": 0.29, "grad_norm": 8.485298156738281, "learning_rate": 4.7957149975087194e-05, "loss": 0.4087, "step": 5775 }, { "epoch": 0.29, "grad_norm": 2.479032039642334, 
"learning_rate": 4.816475668493606e-05, "loss": 0.5665, "step": 5800 }, { "epoch": 0.29, "grad_norm": 50.31918716430664, "learning_rate": 4.837236339478492e-05, "loss": 0.5743, "step": 5825 }, { "epoch": 0.29, "grad_norm": 22.77115821838379, "learning_rate": 4.857997010463378e-05, "loss": 0.5766, "step": 5850 }, { "epoch": 0.29, "grad_norm": 13.277716636657715, "learning_rate": 4.8787576814482645e-05, "loss": 0.6116, "step": 5875 }, { "epoch": 0.29, "grad_norm": 6.051513671875, "learning_rate": 4.899518352433151e-05, "loss": 0.4923, "step": 5900 }, { "epoch": 0.3, "grad_norm": 0.7650220394134521, "learning_rate": 4.920279023418037e-05, "loss": 0.4387, "step": 5925 }, { "epoch": 0.3, "grad_norm": 3.7846858501434326, "learning_rate": 4.941039694402924e-05, "loss": 0.5334, "step": 5950 }, { "epoch": 0.3, "grad_norm": 6.026222229003906, "learning_rate": 4.9618003653878095e-05, "loss": 0.4932, "step": 5975 }, { "epoch": 0.3, "grad_norm": 16.16804313659668, "learning_rate": 4.982561036372696e-05, "loss": 0.4351, "step": 6000 }, { "epoch": 0.3, "grad_norm": 20.083784103393555, "learning_rate": 4.9996308805344855e-05, "loss": 0.3674, "step": 6025 }, { "epoch": 0.3, "grad_norm": 7.546694755554199, "learning_rate": 4.9973238838750166e-05, "loss": 0.5934, "step": 6050 }, { "epoch": 0.3, "grad_norm": 33.574642181396484, "learning_rate": 4.995016887215547e-05, "loss": 0.4848, "step": 6075 }, { "epoch": 0.3, "grad_norm": 1.2845981121063232, "learning_rate": 4.992709890556079e-05, "loss": 0.4931, "step": 6100 }, { "epoch": 0.31, "grad_norm": 1.067752718925476, "learning_rate": 4.9904028938966094e-05, "loss": 0.5156, "step": 6125 }, { "epoch": 0.31, "grad_norm": 8.510303497314453, "learning_rate": 4.988095897237141e-05, "loss": 0.4853, "step": 6150 }, { "epoch": 0.31, "grad_norm": 24.643705368041992, "learning_rate": 4.985788900577672e-05, "loss": 0.5828, "step": 6175 }, { "epoch": 0.31, "grad_norm": 3.2508111000061035, "learning_rate": 4.9834819039182034e-05, "loss": 0.528, "step": 6200 }, { "epoch": 0.31, "grad_norm": 3.186040163040161, "learning_rate": 4.9811749072587346e-05, "loss": 0.4924, "step": 6225 }, { "epoch": 0.31, "grad_norm": 5.922456741333008, "learning_rate": 4.978867910599266e-05, "loss": 0.4725, "step": 6250 }, { "epoch": 0.31, "grad_norm": 18.49318504333496, "learning_rate": 4.976560913939797e-05, "loss": 0.5518, "step": 6275 }, { "epoch": 0.31, "grad_norm": 16.115798950195312, "learning_rate": 4.974253917280328e-05, "loss": 0.5084, "step": 6300 }, { "epoch": 0.32, "grad_norm": 27.040874481201172, "learning_rate": 4.971946920620859e-05, "loss": 0.6729, "step": 6325 }, { "epoch": 0.32, "grad_norm": 3.820512056350708, "learning_rate": 4.96963992396139e-05, "loss": 0.5121, "step": 6350 }, { "epoch": 0.32, "grad_norm": 46.661659240722656, "learning_rate": 4.9673329273019214e-05, "loss": 0.4332, "step": 6375 }, { "epoch": 0.32, "grad_norm": 5.819820404052734, "learning_rate": 4.965025930642453e-05, "loss": 0.4221, "step": 6400 }, { "epoch": 0.32, "grad_norm": 2.76592755317688, "learning_rate": 4.962718933982984e-05, "loss": 0.4391, "step": 6425 }, { "epoch": 0.32, "grad_norm": 3.112344980239868, "learning_rate": 4.960411937323515e-05, "loss": 0.3965, "step": 6450 }, { "epoch": 0.32, "grad_norm": 0.7750204205513, "learning_rate": 4.958104940664046e-05, "loss": 0.4986, "step": 6475 }, { "epoch": 0.32, "grad_norm": 108.8819808959961, "learning_rate": 4.955797944004577e-05, "loss": 0.511, "step": 6500 }, { "epoch": 0.33, "grad_norm": 13.656049728393555, "learning_rate": 4.953490947345108e-05, 
"loss": 0.4491, "step": 6525 }, { "epoch": 0.33, "grad_norm": 33.91952896118164, "learning_rate": 4.9511839506856393e-05, "loss": 0.4709, "step": 6550 }, { "epoch": 0.33, "grad_norm": 142.18125915527344, "learning_rate": 4.948876954026171e-05, "loss": 0.5995, "step": 6575 }, { "epoch": 0.33, "grad_norm": 5.344429016113281, "learning_rate": 4.9465699573667016e-05, "loss": 0.602, "step": 6600 }, { "epoch": 0.33, "grad_norm": 0.9350224137306213, "learning_rate": 4.9442629607072334e-05, "loss": 0.4517, "step": 6625 }, { "epoch": 0.33, "grad_norm": 10.79479694366455, "learning_rate": 4.941955964047764e-05, "loss": 0.482, "step": 6650 }, { "epoch": 0.33, "grad_norm": 22.03511619567871, "learning_rate": 4.939648967388296e-05, "loss": 0.4952, "step": 6675 }, { "epoch": 0.33, "grad_norm": 3.9321937561035156, "learning_rate": 4.937341970728827e-05, "loss": 0.4992, "step": 6700 }, { "epoch": 0.34, "grad_norm": 2.330125331878662, "learning_rate": 4.935034974069358e-05, "loss": 0.5021, "step": 6725 }, { "epoch": 0.34, "grad_norm": 230.00167846679688, "learning_rate": 4.932727977409889e-05, "loss": 0.4186, "step": 6750 }, { "epoch": 0.34, "grad_norm": 2.341181516647339, "learning_rate": 4.93042098075042e-05, "loss": 0.519, "step": 6775 }, { "epoch": 0.34, "grad_norm": 0.8861842155456543, "learning_rate": 4.9281139840909514e-05, "loss": 0.4363, "step": 6800 }, { "epoch": 0.34, "grad_norm": 41.04738998413086, "learning_rate": 4.925806987431482e-05, "loss": 0.5219, "step": 6825 }, { "epoch": 0.34, "grad_norm": 6.22245454788208, "learning_rate": 4.9234999907720137e-05, "loss": 0.5745, "step": 6850 }, { "epoch": 0.34, "grad_norm": 1.7552106380462646, "learning_rate": 4.921192994112545e-05, "loss": 0.5304, "step": 6875 }, { "epoch": 0.34, "grad_norm": 1.7670912742614746, "learning_rate": 4.918885997453076e-05, "loss": 0.5369, "step": 6900 }, { "epoch": 0.35, "grad_norm": 2.4497487545013428, "learning_rate": 4.916579000793607e-05, "loss": 0.4918, "step": 6925 }, { "epoch": 0.35, "grad_norm": 1.5100780725479126, "learning_rate": 4.914272004134138e-05, "loss": 0.4696, "step": 6950 }, { "epoch": 0.35, "grad_norm": 8.250646591186523, "learning_rate": 4.911965007474669e-05, "loss": 0.4365, "step": 6975 }, { "epoch": 0.35, "grad_norm": 6.092109203338623, "learning_rate": 4.9096580108152005e-05, "loss": 0.5336, "step": 7000 }, { "epoch": 0.35, "grad_norm": 57.527923583984375, "learning_rate": 4.9073510141557316e-05, "loss": 0.5304, "step": 7025 }, { "epoch": 0.35, "grad_norm": 5.689362049102783, "learning_rate": 4.905044017496263e-05, "loss": 0.471, "step": 7050 }, { "epoch": 0.35, "grad_norm": 5.691891670227051, "learning_rate": 4.902737020836794e-05, "loss": 0.5668, "step": 7075 }, { "epoch": 0.35, "grad_norm": 8.563892364501953, "learning_rate": 4.900430024177326e-05, "loss": 0.492, "step": 7100 }, { "epoch": 0.36, "grad_norm": 18.62669563293457, "learning_rate": 4.898123027517856e-05, "loss": 0.5833, "step": 7125 }, { "epoch": 0.36, "grad_norm": 60.172019958496094, "learning_rate": 4.895816030858388e-05, "loss": 0.4275, "step": 7150 }, { "epoch": 0.36, "grad_norm": 10.911247253417969, "learning_rate": 4.8935090341989184e-05, "loss": 0.6028, "step": 7175 }, { "epoch": 0.36, "grad_norm": 105.57086181640625, "learning_rate": 4.8912020375394496e-05, "loss": 0.5596, "step": 7200 }, { "epoch": 0.36, "grad_norm": 19.55571746826172, "learning_rate": 4.8888950408799814e-05, "loss": 0.6494, "step": 7225 }, { "epoch": 0.36, "grad_norm": 2.126455307006836, "learning_rate": 4.886588044220512e-05, "loss": 0.4685, "step": 7250 
}, { "epoch": 0.36, "grad_norm": 6.115291118621826, "learning_rate": 4.8842810475610436e-05, "loss": 0.5611, "step": 7275 }, { "epoch": 0.36, "grad_norm": 3.771648645401001, "learning_rate": 4.881974050901574e-05, "loss": 0.5599, "step": 7300 }, { "epoch": 0.37, "grad_norm": 6.9629225730896, "learning_rate": 4.879667054242106e-05, "loss": 0.5673, "step": 7325 }, { "epoch": 0.37, "grad_norm": 18.98270034790039, "learning_rate": 4.8773600575826364e-05, "loss": 0.3507, "step": 7350 }, { "epoch": 0.37, "grad_norm": 14.278607368469238, "learning_rate": 4.875053060923168e-05, "loss": 0.4875, "step": 7375 }, { "epoch": 0.37, "grad_norm": 15.664734840393066, "learning_rate": 4.872746064263699e-05, "loss": 0.6233, "step": 7400 }, { "epoch": 0.37, "grad_norm": 17.01627540588379, "learning_rate": 4.8704390676042305e-05, "loss": 0.4525, "step": 7425 }, { "epoch": 0.37, "grad_norm": 3.3598997592926025, "learning_rate": 4.8681320709447616e-05, "loss": 0.517, "step": 7450 }, { "epoch": 0.37, "grad_norm": 11.31663703918457, "learning_rate": 4.865825074285293e-05, "loss": 0.482, "step": 7475 }, { "epoch": 0.37, "grad_norm": 3.1184511184692383, "learning_rate": 4.863518077625824e-05, "loss": 0.5363, "step": 7500 }, { "epoch": 0.37, "grad_norm": 60.25141906738281, "learning_rate": 4.861211080966355e-05, "loss": 0.5031, "step": 7525 }, { "epoch": 0.38, "grad_norm": 36.034324645996094, "learning_rate": 4.858904084306886e-05, "loss": 0.5599, "step": 7550 }, { "epoch": 0.38, "grad_norm": 11.539170265197754, "learning_rate": 4.856597087647417e-05, "loss": 0.5183, "step": 7575 }, { "epoch": 0.38, "grad_norm": 28.977872848510742, "learning_rate": 4.8542900909879484e-05, "loss": 0.5144, "step": 7600 }, { "epoch": 0.38, "grad_norm": 1.6298623085021973, "learning_rate": 4.8519830943284796e-05, "loss": 0.4624, "step": 7625 }, { "epoch": 0.38, "grad_norm": 12.326157569885254, "learning_rate": 4.849676097669011e-05, "loss": 0.4219, "step": 7650 }, { "epoch": 0.38, "grad_norm": 11.35020923614502, "learning_rate": 4.847369101009542e-05, "loss": 0.5089, "step": 7675 }, { "epoch": 0.38, "grad_norm": 0.975980281829834, "learning_rate": 4.845062104350073e-05, "loss": 0.4878, "step": 7700 }, { "epoch": 0.38, "grad_norm": 21.873689651489258, "learning_rate": 4.842755107690604e-05, "loss": 0.491, "step": 7725 }, { "epoch": 0.39, "grad_norm": 2.1827073097229004, "learning_rate": 4.840448111031136e-05, "loss": 0.4746, "step": 7750 }, { "epoch": 0.39, "grad_norm": 19.413707733154297, "learning_rate": 4.8381411143716664e-05, "loss": 0.686, "step": 7775 }, { "epoch": 0.39, "grad_norm": 68.73217010498047, "learning_rate": 4.835834117712198e-05, "loss": 0.4889, "step": 7800 }, { "epoch": 0.39, "grad_norm": 3.8209142684936523, "learning_rate": 4.8335271210527286e-05, "loss": 0.4441, "step": 7825 }, { "epoch": 0.39, "grad_norm": 8.087347984313965, "learning_rate": 4.8312201243932605e-05, "loss": 0.396, "step": 7850 }, { "epoch": 0.39, "grad_norm": 24.778383255004883, "learning_rate": 4.828913127733791e-05, "loss": 0.5427, "step": 7875 }, { "epoch": 0.39, "grad_norm": 2.647791624069214, "learning_rate": 4.826606131074323e-05, "loss": 0.504, "step": 7900 }, { "epoch": 0.39, "grad_norm": 15.89461612701416, "learning_rate": 4.824299134414854e-05, "loss": 0.4808, "step": 7925 }, { "epoch": 0.4, "grad_norm": 6.659229755401611, "learning_rate": 4.821992137755384e-05, "loss": 0.598, "step": 7950 }, { "epoch": 0.4, "grad_norm": 8.807088851928711, "learning_rate": 4.819685141095916e-05, "loss": 0.3952, "step": 7975 }, { "epoch": 0.4, "grad_norm": 
4.321648120880127, "learning_rate": 4.8173781444364466e-05, "loss": 0.5091, "step": 8000 }, { "epoch": 0.4, "grad_norm": 2.7659878730773926, "learning_rate": 4.8150711477769784e-05, "loss": 0.4897, "step": 8025 }, { "epoch": 0.4, "grad_norm": 2.758430004119873, "learning_rate": 4.8127641511175095e-05, "loss": 0.5376, "step": 8050 }, { "epoch": 0.4, "grad_norm": 8.904570579528809, "learning_rate": 4.810457154458041e-05, "loss": 0.5006, "step": 8075 }, { "epoch": 0.4, "grad_norm": 45.822593688964844, "learning_rate": 4.808150157798572e-05, "loss": 0.4573, "step": 8100 }, { "epoch": 0.4, "grad_norm": 3.2372379302978516, "learning_rate": 4.805843161139103e-05, "loss": 0.3971, "step": 8125 }, { "epoch": 0.41, "grad_norm": 84.43883514404297, "learning_rate": 4.803536164479634e-05, "loss": 0.3685, "step": 8150 }, { "epoch": 0.41, "grad_norm": 5.58286714553833, "learning_rate": 4.801229167820165e-05, "loss": 0.5961, "step": 8175 }, { "epoch": 0.41, "grad_norm": 1.6324478387832642, "learning_rate": 4.7989221711606964e-05, "loss": 0.6016, "step": 8200 }, { "epoch": 0.41, "grad_norm": 4.677681922912598, "learning_rate": 4.7966151745012275e-05, "loss": 0.4963, "step": 8225 }, { "epoch": 0.41, "grad_norm": 9.602376937866211, "learning_rate": 4.7943081778417586e-05, "loss": 0.5354, "step": 8250 }, { "epoch": 0.41, "grad_norm": 2.9427170753479004, "learning_rate": 4.7920011811822904e-05, "loss": 0.4566, "step": 8275 }, { "epoch": 0.41, "grad_norm": 11.23338794708252, "learning_rate": 4.789694184522821e-05, "loss": 0.5302, "step": 8300 }, { "epoch": 0.41, "grad_norm": 15.995217323303223, "learning_rate": 4.787387187863352e-05, "loss": 0.5199, "step": 8325 }, { "epoch": 0.42, "grad_norm": 24.4979190826416, "learning_rate": 4.785080191203883e-05, "loss": 0.3849, "step": 8350 }, { "epoch": 0.42, "grad_norm": 1.894063949584961, "learning_rate": 4.782773194544414e-05, "loss": 0.5529, "step": 8375 }, { "epoch": 0.42, "grad_norm": 18.658781051635742, "learning_rate": 4.7804661978849454e-05, "loss": 0.5758, "step": 8400 }, { "epoch": 0.42, "grad_norm": 4.197040557861328, "learning_rate": 4.7781592012254766e-05, "loss": 0.4704, "step": 8425 }, { "epoch": 0.42, "grad_norm": 1.2483537197113037, "learning_rate": 4.7758522045660084e-05, "loss": 0.4748, "step": 8450 }, { "epoch": 0.42, "grad_norm": 23.75387954711914, "learning_rate": 4.773545207906539e-05, "loss": 0.5097, "step": 8475 }, { "epoch": 0.42, "grad_norm": 3.9099292755126953, "learning_rate": 4.771238211247071e-05, "loss": 0.4713, "step": 8500 }, { "epoch": 0.42, "grad_norm": 0.816892683506012, "learning_rate": 4.768931214587601e-05, "loss": 0.4907, "step": 8525 }, { "epoch": 0.43, "grad_norm": 3.892925977706909, "learning_rate": 4.766624217928133e-05, "loss": 0.6005, "step": 8550 }, { "epoch": 0.43, "grad_norm": 3.5819435119628906, "learning_rate": 4.764317221268664e-05, "loss": 0.4862, "step": 8575 }, { "epoch": 0.43, "grad_norm": 4.449002265930176, "learning_rate": 4.762010224609195e-05, "loss": 0.557, "step": 8600 }, { "epoch": 0.43, "grad_norm": 28.792911529541016, "learning_rate": 4.7597032279497263e-05, "loss": 0.4196, "step": 8625 }, { "epoch": 0.43, "grad_norm": 66.64269256591797, "learning_rate": 4.7573962312902575e-05, "loss": 0.4574, "step": 8650 }, { "epoch": 0.43, "grad_norm": 37.438262939453125, "learning_rate": 4.7550892346307886e-05, "loss": 0.4338, "step": 8675 }, { "epoch": 0.43, "grad_norm": 6.17923641204834, "learning_rate": 4.752782237971319e-05, "loss": 0.461, "step": 8700 }, { "epoch": 0.43, "grad_norm": 6.727214336395264, 
"learning_rate": 4.750475241311851e-05, "loss": 0.5826, "step": 8725 }, { "epoch": 0.44, "grad_norm": 14.901581764221191, "learning_rate": 4.748168244652382e-05, "loss": 0.5953, "step": 8750 }, { "epoch": 0.44, "grad_norm": 11.645687103271484, "learning_rate": 4.745861247992913e-05, "loss": 0.5776, "step": 8775 }, { "epoch": 0.44, "grad_norm": 2.653366804122925, "learning_rate": 4.743554251333444e-05, "loss": 0.5414, "step": 8800 }, { "epoch": 0.44, "grad_norm": 95.31356811523438, "learning_rate": 4.7412472546739754e-05, "loss": 0.527, "step": 8825 }, { "epoch": 0.44, "grad_norm": 6.815218925476074, "learning_rate": 4.7389402580145066e-05, "loss": 0.4701, "step": 8850 }, { "epoch": 0.44, "grad_norm": 2.4729785919189453, "learning_rate": 4.736633261355038e-05, "loss": 0.4491, "step": 8875 }, { "epoch": 0.44, "grad_norm": 13.656978607177734, "learning_rate": 4.734326264695569e-05, "loss": 0.4799, "step": 8900 }, { "epoch": 0.44, "grad_norm": 2.3728275299072266, "learning_rate": 4.7320192680361e-05, "loss": 0.4938, "step": 8925 }, { "epoch": 0.45, "grad_norm": 20.797361373901367, "learning_rate": 4.729712271376631e-05, "loss": 0.5013, "step": 8950 }, { "epoch": 0.45, "grad_norm": 0.904093325138092, "learning_rate": 4.727405274717163e-05, "loss": 0.3779, "step": 8975 }, { "epoch": 0.45, "grad_norm": 9.007451057434082, "learning_rate": 4.7250982780576934e-05, "loss": 0.7228, "step": 9000 }, { "epoch": 0.45, "grad_norm": 2.9835093021392822, "learning_rate": 4.722791281398225e-05, "loss": 0.3662, "step": 9025 }, { "epoch": 0.45, "grad_norm": 13.8798189163208, "learning_rate": 4.7204842847387557e-05, "loss": 0.4275, "step": 9050 }, { "epoch": 0.45, "grad_norm": 6.156960487365723, "learning_rate": 4.718177288079287e-05, "loss": 0.4141, "step": 9075 }, { "epoch": 0.45, "grad_norm": 27.34996795654297, "learning_rate": 4.7158702914198186e-05, "loss": 0.423, "step": 9100 }, { "epoch": 0.45, "grad_norm": 1.5032697916030884, "learning_rate": 4.713563294760349e-05, "loss": 0.465, "step": 9125 }, { "epoch": 0.46, "grad_norm": 3.1437747478485107, "learning_rate": 4.711256298100881e-05, "loss": 0.4046, "step": 9150 }, { "epoch": 0.46, "grad_norm": 3.2048637866973877, "learning_rate": 4.708949301441411e-05, "loss": 0.4301, "step": 9175 }, { "epoch": 0.46, "grad_norm": 4.686911582946777, "learning_rate": 4.706642304781943e-05, "loss": 0.5915, "step": 9200 }, { "epoch": 0.46, "grad_norm": 4.331536769866943, "learning_rate": 4.7043353081224736e-05, "loss": 0.5044, "step": 9225 }, { "epoch": 0.46, "grad_norm": 5.540689468383789, "learning_rate": 4.7020283114630054e-05, "loss": 0.5204, "step": 9250 }, { "epoch": 0.46, "grad_norm": 15.265934944152832, "learning_rate": 4.6997213148035366e-05, "loss": 0.4567, "step": 9275 }, { "epoch": 0.46, "grad_norm": 4.857421398162842, "learning_rate": 4.697414318144068e-05, "loss": 0.5886, "step": 9300 }, { "epoch": 0.46, "grad_norm": 1.7066471576690674, "learning_rate": 4.695107321484599e-05, "loss": 0.4756, "step": 9325 }, { "epoch": 0.47, "grad_norm": 20.88972282409668, "learning_rate": 4.69280032482513e-05, "loss": 0.4356, "step": 9350 }, { "epoch": 0.47, "grad_norm": 11.743831634521484, "learning_rate": 4.690493328165661e-05, "loss": 0.3707, "step": 9375 }, { "epoch": 0.47, "grad_norm": 19.77294158935547, "learning_rate": 4.688186331506192e-05, "loss": 0.519, "step": 9400 }, { "epoch": 0.47, "grad_norm": 0.8997837901115417, "learning_rate": 4.6858793348467234e-05, "loss": 0.4333, "step": 9425 }, { "epoch": 0.47, "grad_norm": 3.1104369163513184, "learning_rate": 
4.6835723381872545e-05, "loss": 0.4259, "step": 9450 }, { "epoch": 0.47, "grad_norm": 9.724773406982422, "learning_rate": 4.6812653415277856e-05, "loss": 0.5336, "step": 9475 }, { "epoch": 0.47, "grad_norm": 6.291820049285889, "learning_rate": 4.678958344868317e-05, "loss": 0.4473, "step": 9500 }, { "epoch": 0.47, "grad_norm": 37.74559783935547, "learning_rate": 4.676651348208848e-05, "loss": 0.5415, "step": 9525 }, { "epoch": 0.48, "grad_norm": 2.051961898803711, "learning_rate": 4.674344351549379e-05, "loss": 0.4485, "step": 9550 }, { "epoch": 0.48, "grad_norm": 6.569771766662598, "learning_rate": 4.67203735488991e-05, "loss": 0.5229, "step": 9575 }, { "epoch": 0.48, "grad_norm": 2.9228270053863525, "learning_rate": 4.669730358230441e-05, "loss": 0.3779, "step": 9600 }, { "epoch": 0.48, "grad_norm": 3.5740840435028076, "learning_rate": 4.667423361570973e-05, "loss": 0.4852, "step": 9625 }, { "epoch": 0.48, "grad_norm": 5.161360740661621, "learning_rate": 4.6651163649115036e-05, "loss": 0.483, "step": 9650 }, { "epoch": 0.48, "grad_norm": 1.6016749143600464, "learning_rate": 4.6628093682520354e-05, "loss": 0.3926, "step": 9675 }, { "epoch": 0.48, "grad_norm": 28.20662498474121, "learning_rate": 4.660502371592566e-05, "loss": 0.4485, "step": 9700 }, { "epoch": 0.48, "grad_norm": 3.143033742904663, "learning_rate": 4.658195374933098e-05, "loss": 0.523, "step": 9725 }, { "epoch": 0.49, "grad_norm": 4.893307685852051, "learning_rate": 4.655888378273628e-05, "loss": 0.586, "step": 9750 }, { "epoch": 0.49, "grad_norm": 2.808121681213379, "learning_rate": 4.65358138161416e-05, "loss": 0.4798, "step": 9775 }, { "epoch": 0.49, "grad_norm": 1.5118587017059326, "learning_rate": 4.651274384954691e-05, "loss": 0.5001, "step": 9800 }, { "epoch": 0.49, "grad_norm": 1.5315314531326294, "learning_rate": 4.6489673882952215e-05, "loss": 0.4757, "step": 9825 }, { "epoch": 0.49, "grad_norm": 26.32784080505371, "learning_rate": 4.6466603916357534e-05, "loss": 0.5262, "step": 9850 }, { "epoch": 0.49, "grad_norm": 11.232955932617188, "learning_rate": 4.644353394976284e-05, "loss": 0.4995, "step": 9875 }, { "epoch": 0.49, "grad_norm": 2.1047329902648926, "learning_rate": 4.6420463983168156e-05, "loss": 0.4882, "step": 9900 }, { "epoch": 0.49, "grad_norm": 1.3566862344741821, "learning_rate": 4.639739401657347e-05, "loss": 0.5503, "step": 9925 }, { "epoch": 0.5, "grad_norm": 2.307016134262085, "learning_rate": 4.637432404997878e-05, "loss": 0.359, "step": 9950 }, { "epoch": 0.5, "grad_norm": 6.331679821014404, "learning_rate": 4.635125408338409e-05, "loss": 0.4108, "step": 9975 }, { "epoch": 0.5, "grad_norm": 5.821734428405762, "learning_rate": 4.63281841167894e-05, "loss": 0.438, "step": 10000 }, { "epoch": 0.5, "grad_norm": 8.801206588745117, "learning_rate": 4.630511415019471e-05, "loss": 0.5269, "step": 10025 }, { "epoch": 0.5, "grad_norm": 1.4149796962738037, "learning_rate": 4.6282044183600025e-05, "loss": 0.5253, "step": 10050 }, { "epoch": 0.5, "grad_norm": 1.5596532821655273, "learning_rate": 4.6258974217005336e-05, "loss": 0.4622, "step": 10075 }, { "epoch": 0.5, "grad_norm": 0.8093569874763489, "learning_rate": 4.623590425041065e-05, "loss": 0.3444, "step": 10100 }, { "epoch": 0.5, "grad_norm": 4.408332824707031, "learning_rate": 4.621283428381596e-05, "loss": 0.5996, "step": 10125 }, { "epoch": 0.51, "grad_norm": 1.2513713836669922, "learning_rate": 4.618976431722128e-05, "loss": 0.3927, "step": 10150 }, { "epoch": 0.51, "grad_norm": 118.37767791748047, "learning_rate": 4.616669435062658e-05, "loss": 
0.4931, "step": 10175 }, { "epoch": 0.51, "grad_norm": 3.265443801879883, "learning_rate": 4.614362438403189e-05, "loss": 0.5738, "step": 10200 }, { "epoch": 0.51, "grad_norm": 3.070544719696045, "learning_rate": 4.6120554417437204e-05, "loss": 0.4596, "step": 10225 }, { "epoch": 0.51, "grad_norm": 6.112723350524902, "learning_rate": 4.6097484450842515e-05, "loss": 0.4394, "step": 10250 }, { "epoch": 0.51, "grad_norm": 6.3622307777404785, "learning_rate": 4.607441448424783e-05, "loss": 0.3585, "step": 10275 }, { "epoch": 0.51, "grad_norm": 4.152879238128662, "learning_rate": 4.605134451765314e-05, "loss": 0.5883, "step": 10300 }, { "epoch": 0.51, "grad_norm": 3.7687737941741943, "learning_rate": 4.6028274551058456e-05, "loss": 0.3813, "step": 10325 }, { "epoch": 0.52, "grad_norm": 3.0186991691589355, "learning_rate": 4.600520458446376e-05, "loss": 0.5107, "step": 10350 }, { "epoch": 0.52, "grad_norm": 0.6705989837646484, "learning_rate": 4.598213461786908e-05, "loss": 0.3563, "step": 10375 }, { "epoch": 0.52, "grad_norm": 3.8952481746673584, "learning_rate": 4.5959064651274384e-05, "loss": 0.4218, "step": 10400 }, { "epoch": 0.52, "grad_norm": 1.3632725477218628, "learning_rate": 4.59359946846797e-05, "loss": 0.4832, "step": 10425 }, { "epoch": 0.52, "grad_norm": 1.1923586130142212, "learning_rate": 4.591292471808501e-05, "loss": 0.4351, "step": 10450 }, { "epoch": 0.52, "grad_norm": 3.5886104106903076, "learning_rate": 4.5889854751490324e-05, "loss": 0.5084, "step": 10475 }, { "epoch": 0.52, "grad_norm": 0.7589617967605591, "learning_rate": 4.5866784784895636e-05, "loss": 0.4147, "step": 10500 }, { "epoch": 0.52, "grad_norm": 16.9898681640625, "learning_rate": 4.584371481830095e-05, "loss": 0.5198, "step": 10525 }, { "epoch": 0.53, "grad_norm": 3.4914662837982178, "learning_rate": 4.582064485170626e-05, "loss": 0.4613, "step": 10550 }, { "epoch": 0.53, "grad_norm": 8.139269828796387, "learning_rate": 4.579757488511156e-05, "loss": 0.4438, "step": 10575 }, { "epoch": 0.53, "grad_norm": 5.789552688598633, "learning_rate": 4.577450491851688e-05, "loss": 0.4678, "step": 10600 }, { "epoch": 0.53, "grad_norm": 3.183175802230835, "learning_rate": 4.575143495192219e-05, "loss": 0.3469, "step": 10625 }, { "epoch": 0.53, "grad_norm": 2.14797043800354, "learning_rate": 4.5728364985327504e-05, "loss": 0.5492, "step": 10650 }, { "epoch": 0.53, "grad_norm": 6.795634746551514, "learning_rate": 4.5705295018732815e-05, "loss": 0.4079, "step": 10675 }, { "epoch": 0.53, "grad_norm": 4.230640888214111, "learning_rate": 4.568222505213813e-05, "loss": 0.5704, "step": 10700 }, { "epoch": 0.53, "grad_norm": 3.867367744445801, "learning_rate": 4.565915508554344e-05, "loss": 0.4377, "step": 10725 }, { "epoch": 0.54, "grad_norm": 3.2728750705718994, "learning_rate": 4.563608511894875e-05, "loss": 0.55, "step": 10750 }, { "epoch": 0.54, "grad_norm": 4.1966552734375, "learning_rate": 4.561301515235406e-05, "loss": 0.4194, "step": 10775 }, { "epoch": 0.54, "grad_norm": 0.6582638025283813, "learning_rate": 4.558994518575937e-05, "loss": 0.4398, "step": 10800 }, { "epoch": 0.54, "grad_norm": 0.6245179176330566, "learning_rate": 4.5566875219164683e-05, "loss": 0.3498, "step": 10825 }, { "epoch": 0.54, "grad_norm": 18.104259490966797, "learning_rate": 4.554380525257e-05, "loss": 0.6087, "step": 10850 }, { "epoch": 0.54, "grad_norm": 0.6365619897842407, "learning_rate": 4.5520735285975306e-05, "loss": 0.4309, "step": 10875 }, { "epoch": 0.54, "grad_norm": 15.83073616027832, "learning_rate": 4.5497665319380624e-05, "loss": 
0.566, "step": 10900 }, { "epoch": 0.54, "grad_norm": 32.08317947387695, "learning_rate": 4.547459535278593e-05, "loss": 0.4114, "step": 10925 }, { "epoch": 0.55, "grad_norm": 74.06217956542969, "learning_rate": 4.545152538619124e-05, "loss": 0.3804, "step": 10950 }, { "epoch": 0.55, "grad_norm": 6.568449974060059, "learning_rate": 4.542845541959656e-05, "loss": 0.4735, "step": 10975 }, { "epoch": 0.55, "grad_norm": 3.6441125869750977, "learning_rate": 4.540538545300186e-05, "loss": 0.5924, "step": 11000 }, { "epoch": 0.55, "grad_norm": 5.191139221191406, "learning_rate": 4.538231548640718e-05, "loss": 0.4658, "step": 11025 }, { "epoch": 0.55, "grad_norm": 185.90740966796875, "learning_rate": 4.5359245519812486e-05, "loss": 0.4915, "step": 11050 }, { "epoch": 0.55, "grad_norm": 6.593203067779541, "learning_rate": 4.5336175553217804e-05, "loss": 0.4706, "step": 11075 }, { "epoch": 0.55, "grad_norm": 9.717700004577637, "learning_rate": 4.531310558662311e-05, "loss": 0.4965, "step": 11100 }, { "epoch": 0.55, "grad_norm": 74.32408905029297, "learning_rate": 4.5290035620028427e-05, "loss": 0.5012, "step": 11125 }, { "epoch": 0.56, "grad_norm": 3.009906530380249, "learning_rate": 4.526696565343374e-05, "loss": 0.5012, "step": 11150 }, { "epoch": 0.56, "grad_norm": 7.148441314697266, "learning_rate": 4.524389568683905e-05, "loss": 0.4644, "step": 11175 }, { "epoch": 0.56, "grad_norm": 3.9309608936309814, "learning_rate": 4.522082572024436e-05, "loss": 0.5467, "step": 11200 }, { "epoch": 0.56, "grad_norm": 2.191103219985962, "learning_rate": 4.519775575364967e-05, "loss": 0.4983, "step": 11225 }, { "epoch": 0.56, "grad_norm": 29.83028793334961, "learning_rate": 4.517468578705498e-05, "loss": 0.4761, "step": 11250 }, { "epoch": 0.56, "grad_norm": 3.0909762382507324, "learning_rate": 4.5151615820460295e-05, "loss": 0.5817, "step": 11275 }, { "epoch": 0.56, "grad_norm": 7.159379482269287, "learning_rate": 4.5128545853865606e-05, "loss": 0.6388, "step": 11300 }, { "epoch": 0.56, "grad_norm": 1.8289567232131958, "learning_rate": 4.510547588727092e-05, "loss": 0.5193, "step": 11325 }, { "epoch": 0.57, "grad_norm": 1.2371580600738525, "learning_rate": 4.508240592067623e-05, "loss": 0.4833, "step": 11350 }, { "epoch": 0.57, "grad_norm": 2.108119010925293, "learning_rate": 4.505933595408154e-05, "loss": 0.4918, "step": 11375 }, { "epoch": 0.57, "grad_norm": 1.026004433631897, "learning_rate": 4.503626598748685e-05, "loss": 0.4207, "step": 11400 }, { "epoch": 0.57, "grad_norm": 1.4752520322799683, "learning_rate": 4.501319602089216e-05, "loss": 0.4227, "step": 11425 }, { "epoch": 0.57, "grad_norm": 3.9238433837890625, "learning_rate": 4.4990126054297474e-05, "loss": 0.3833, "step": 11450 }, { "epoch": 0.57, "grad_norm": 1.4511189460754395, "learning_rate": 4.4967056087702786e-05, "loss": 0.6285, "step": 11475 }, { "epoch": 0.57, "grad_norm": 4.272202968597412, "learning_rate": 4.4943986121108104e-05, "loss": 0.4541, "step": 11500 }, { "epoch": 0.57, "grad_norm": 5.563125133514404, "learning_rate": 4.492091615451341e-05, "loss": 0.4257, "step": 11525 }, { "epoch": 0.58, "grad_norm": 7.777960300445557, "learning_rate": 4.4897846187918726e-05, "loss": 0.5858, "step": 11550 }, { "epoch": 0.58, "grad_norm": 24.097171783447266, "learning_rate": 4.487477622132403e-05, "loss": 0.4463, "step": 11575 }, { "epoch": 0.58, "grad_norm": 0.9840202927589417, "learning_rate": 4.485170625472935e-05, "loss": 0.5353, "step": 11600 }, { "epoch": 0.58, "grad_norm": 37.77027130126953, "learning_rate": 4.4828636288134654e-05, 
"loss": 0.5239, "step": 11625 }, { "epoch": 0.58, "grad_norm": 0.9202004075050354, "learning_rate": 4.480556632153997e-05, "loss": 0.4119, "step": 11650 }, { "epoch": 0.58, "grad_norm": 5.5572428703308105, "learning_rate": 4.478249635494528e-05, "loss": 0.553, "step": 11675 }, { "epoch": 0.58, "grad_norm": 6.949309825897217, "learning_rate": 4.475942638835059e-05, "loss": 0.4679, "step": 11700 }, { "epoch": 0.58, "grad_norm": 2.6731739044189453, "learning_rate": 4.4736356421755906e-05, "loss": 0.5218, "step": 11725 }, { "epoch": 0.59, "grad_norm": 1.7549457550048828, "learning_rate": 4.471328645516121e-05, "loss": 0.4876, "step": 11750 }, { "epoch": 0.59, "grad_norm": 3.102992534637451, "learning_rate": 4.469021648856653e-05, "loss": 0.5375, "step": 11775 }, { "epoch": 0.59, "grad_norm": 6.229030132293701, "learning_rate": 4.466714652197183e-05, "loss": 0.4951, "step": 11800 }, { "epoch": 0.59, "grad_norm": 9.176125526428223, "learning_rate": 4.464407655537715e-05, "loss": 0.5659, "step": 11825 }, { "epoch": 0.59, "grad_norm": 19.079057693481445, "learning_rate": 4.462100658878246e-05, "loss": 0.3716, "step": 11850 }, { "epoch": 0.59, "grad_norm": 0.6645026206970215, "learning_rate": 4.4597936622187774e-05, "loss": 0.5027, "step": 11875 }, { "epoch": 0.59, "grad_norm": 2.9463725090026855, "learning_rate": 4.4574866655593085e-05, "loss": 0.5442, "step": 11900 }, { "epoch": 0.59, "grad_norm": 3.1484124660491943, "learning_rate": 4.45517966889984e-05, "loss": 0.4664, "step": 11925 }, { "epoch": 0.6, "grad_norm": 24.31441307067871, "learning_rate": 4.452872672240371e-05, "loss": 0.5003, "step": 11950 }, { "epoch": 0.6, "grad_norm": 3.507324695587158, "learning_rate": 4.450565675580902e-05, "loss": 0.4564, "step": 11975 }, { "epoch": 0.6, "grad_norm": 18.16114044189453, "learning_rate": 4.448258678921433e-05, "loss": 0.4471, "step": 12000 }, { "epoch": 0.6, "grad_norm": 4.681582927703857, "learning_rate": 4.445951682261965e-05, "loss": 0.5089, "step": 12025 }, { "epoch": 0.6, "grad_norm": 27.28392791748047, "learning_rate": 4.4436446856024954e-05, "loss": 0.49, "step": 12050 }, { "epoch": 0.6, "grad_norm": 4.916137218475342, "learning_rate": 4.4413376889430265e-05, "loss": 0.4045, "step": 12075 }, { "epoch": 0.6, "grad_norm": 7.058281898498535, "learning_rate": 4.4390306922835576e-05, "loss": 0.3812, "step": 12100 }, { "epoch": 0.6, "grad_norm": 4.769257545471191, "learning_rate": 4.436723695624089e-05, "loss": 0.4888, "step": 12125 }, { "epoch": 0.61, "grad_norm": 7.5978288650512695, "learning_rate": 4.43441669896462e-05, "loss": 0.6009, "step": 12150 }, { "epoch": 0.61, "grad_norm": 1.2456187009811401, "learning_rate": 4.432109702305151e-05, "loss": 0.4086, "step": 12175 }, { "epoch": 0.61, "grad_norm": 5.981049537658691, "learning_rate": 4.429802705645683e-05, "loss": 0.4829, "step": 12200 }, { "epoch": 0.61, "grad_norm": 0.44680818915367126, "learning_rate": 4.427495708986213e-05, "loss": 0.3603, "step": 12225 }, { "epoch": 0.61, "grad_norm": 33.1859016418457, "learning_rate": 4.425188712326745e-05, "loss": 0.4617, "step": 12250 }, { "epoch": 0.61, "grad_norm": 0.866825520992279, "learning_rate": 4.4228817156672756e-05, "loss": 0.509, "step": 12275 }, { "epoch": 0.61, "grad_norm": 20.547454833984375, "learning_rate": 4.4205747190078074e-05, "loss": 0.5846, "step": 12300 }, { "epoch": 0.61, "grad_norm": 2.952171564102173, "learning_rate": 4.418267722348338e-05, "loss": 0.4625, "step": 12325 }, { "epoch": 0.62, "grad_norm": 3.1319684982299805, "learning_rate": 4.41596072568887e-05, "loss": 
0.4154, "step": 12350 }, { "epoch": 0.62, "grad_norm": 6.610553741455078, "learning_rate": 4.413653729029401e-05, "loss": 0.3864, "step": 12375 }, { "epoch": 0.62, "grad_norm": 8.751646041870117, "learning_rate": 4.411346732369932e-05, "loss": 0.3609, "step": 12400 }, { "epoch": 0.62, "grad_norm": 3.6355435848236084, "learning_rate": 4.409039735710463e-05, "loss": 0.5158, "step": 12425 }, { "epoch": 0.62, "grad_norm": 5.307819366455078, "learning_rate": 4.4067327390509935e-05, "loss": 0.4856, "step": 12450 }, { "epoch": 0.62, "grad_norm": 2.587369918823242, "learning_rate": 4.4044257423915254e-05, "loss": 0.4296, "step": 12475 }, { "epoch": 0.62, "grad_norm": 3.888181686401367, "learning_rate": 4.4021187457320565e-05, "loss": 0.5317, "step": 12500 }, { "epoch": 0.62, "grad_norm": 15.251875877380371, "learning_rate": 4.3998117490725876e-05, "loss": 0.4947, "step": 12525 }, { "epoch": 0.63, "grad_norm": 2.4063093662261963, "learning_rate": 4.397504752413119e-05, "loss": 0.4243, "step": 12550 }, { "epoch": 0.63, "grad_norm": 13.604540824890137, "learning_rate": 4.39519775575365e-05, "loss": 0.5706, "step": 12575 }, { "epoch": 0.63, "grad_norm": 4.274982929229736, "learning_rate": 4.392890759094181e-05, "loss": 0.3754, "step": 12600 }, { "epoch": 0.63, "grad_norm": 2.1278367042541504, "learning_rate": 4.390583762434712e-05, "loss": 0.45, "step": 12625 }, { "epoch": 0.63, "grad_norm": 25.345394134521484, "learning_rate": 4.388276765775243e-05, "loss": 0.5732, "step": 12650 }, { "epoch": 0.63, "grad_norm": 2.4565417766571045, "learning_rate": 4.3859697691157744e-05, "loss": 0.554, "step": 12675 }, { "epoch": 0.63, "grad_norm": 16.725379943847656, "learning_rate": 4.3836627724563056e-05, "loss": 0.3648, "step": 12700 }, { "epoch": 0.63, "grad_norm": 14.322278022766113, "learning_rate": 4.3813557757968374e-05, "loss": 0.4051, "step": 12725 }, { "epoch": 0.64, "grad_norm": 3.2702584266662598, "learning_rate": 4.379048779137368e-05, "loss": 0.4747, "step": 12750 }, { "epoch": 0.64, "grad_norm": 25.298032760620117, "learning_rate": 4.3767417824778997e-05, "loss": 0.495, "step": 12775 }, { "epoch": 0.64, "grad_norm": 5.421559810638428, "learning_rate": 4.37443478581843e-05, "loss": 0.5211, "step": 12800 }, { "epoch": 0.64, "grad_norm": 11.022579193115234, "learning_rate": 4.372127789158961e-05, "loss": 0.4934, "step": 12825 }, { "epoch": 0.64, "grad_norm": 2.5336997509002686, "learning_rate": 4.3698207924994924e-05, "loss": 0.5397, "step": 12850 }, { "epoch": 0.64, "grad_norm": 23.128211975097656, "learning_rate": 4.3675137958400235e-05, "loss": 0.4359, "step": 12875 }, { "epoch": 0.64, "grad_norm": 25.914918899536133, "learning_rate": 4.3652067991805553e-05, "loss": 0.3482, "step": 12900 }, { "epoch": 0.64, "grad_norm": 2.854637861251831, "learning_rate": 4.362899802521086e-05, "loss": 0.4871, "step": 12925 }, { "epoch": 0.65, "grad_norm": 1.9324437379837036, "learning_rate": 4.3605928058616176e-05, "loss": 0.5043, "step": 12950 }, { "epoch": 0.65, "grad_norm": 12.712705612182617, "learning_rate": 4.358285809202148e-05, "loss": 0.4016, "step": 12975 }, { "epoch": 0.65, "grad_norm": 3.4530861377716064, "learning_rate": 4.35597881254268e-05, "loss": 0.5581, "step": 13000 }, { "epoch": 0.65, "grad_norm": 3.0090839862823486, "learning_rate": 4.353671815883211e-05, "loss": 0.4368, "step": 13025 }, { "epoch": 0.65, "grad_norm": 1.4557719230651855, "learning_rate": 4.351364819223742e-05, "loss": 0.4501, "step": 13050 }, { "epoch": 0.65, "grad_norm": 4.6525750160217285, "learning_rate": 
4.349057822564273e-05, "loss": 0.4408, "step": 13075 }, { "epoch": 0.65, "grad_norm": 4.434586524963379, "learning_rate": 4.3467508259048044e-05, "loss": 0.415, "step": 13100 }, { "epoch": 0.65, "grad_norm": 9.57250690460205, "learning_rate": 4.3444438292453356e-05, "loss": 0.3814, "step": 13125 }, { "epoch": 0.66, "grad_norm": 1.1053205728530884, "learning_rate": 4.342136832585867e-05, "loss": 0.6334, "step": 13150 }, { "epoch": 0.66, "grad_norm": 2.1338109970092773, "learning_rate": 4.339829835926398e-05, "loss": 0.5457, "step": 13175 }, { "epoch": 0.66, "grad_norm": 3.43638277053833, "learning_rate": 4.337522839266929e-05, "loss": 0.6277, "step": 13200 }, { "epoch": 0.66, "grad_norm": 5.341610908508301, "learning_rate": 4.33521584260746e-05, "loss": 0.533, "step": 13225 }, { "epoch": 0.66, "grad_norm": 12.302016258239746, "learning_rate": 4.332908845947991e-05, "loss": 0.4382, "step": 13250 }, { "epoch": 0.66, "grad_norm": 1.702040195465088, "learning_rate": 4.3306018492885224e-05, "loss": 0.4355, "step": 13275 }, { "epoch": 0.66, "grad_norm": 0.8742398023605347, "learning_rate": 4.3282948526290535e-05, "loss": 0.4374, "step": 13300 }, { "epoch": 0.66, "grad_norm": 1.9180805683135986, "learning_rate": 4.3259878559695847e-05, "loss": 0.4913, "step": 13325 }, { "epoch": 0.67, "grad_norm": 1.864307165145874, "learning_rate": 4.323680859310116e-05, "loss": 0.7072, "step": 13350 }, { "epoch": 0.67, "grad_norm": 5.213928699493408, "learning_rate": 4.321373862650647e-05, "loss": 0.428, "step": 13375 }, { "epoch": 0.67, "grad_norm": 3.17887020111084, "learning_rate": 4.319066865991178e-05, "loss": 0.4272, "step": 13400 }, { "epoch": 0.67, "grad_norm": 2.970327377319336, "learning_rate": 4.31675986933171e-05, "loss": 0.472, "step": 13425 }, { "epoch": 0.67, "grad_norm": 1.9041247367858887, "learning_rate": 4.31445287267224e-05, "loss": 0.4437, "step": 13450 }, { "epoch": 0.67, "grad_norm": 4.694064617156982, "learning_rate": 4.312145876012772e-05, "loss": 0.4872, "step": 13475 }, { "epoch": 0.67, "grad_norm": 21.480302810668945, "learning_rate": 4.3098388793533026e-05, "loss": 0.4416, "step": 13500 }, { "epoch": 0.67, "grad_norm": 19.836280822753906, "learning_rate": 4.3075318826938344e-05, "loss": 0.5722, "step": 13525 }, { "epoch": 0.68, "grad_norm": 2.7376246452331543, "learning_rate": 4.3052248860343656e-05, "loss": 0.5197, "step": 13550 }, { "epoch": 0.68, "grad_norm": 7.109920024871826, "learning_rate": 4.302917889374896e-05, "loss": 0.5437, "step": 13575 }, { "epoch": 0.68, "grad_norm": 50.2864875793457, "learning_rate": 4.300610892715428e-05, "loss": 0.5431, "step": 13600 }, { "epoch": 0.68, "grad_norm": 19.68246078491211, "learning_rate": 4.298303896055958e-05, "loss": 0.4416, "step": 13625 }, { "epoch": 0.68, "grad_norm": 17.506366729736328, "learning_rate": 4.29599689939649e-05, "loss": 0.5243, "step": 13650 }, { "epoch": 0.68, "grad_norm": 2.211810827255249, "learning_rate": 4.2936899027370206e-05, "loss": 0.4889, "step": 13675 }, { "epoch": 0.68, "grad_norm": 4.403297424316406, "learning_rate": 4.2913829060775524e-05, "loss": 0.4649, "step": 13700 }, { "epoch": 0.68, "grad_norm": 1.4525673389434814, "learning_rate": 4.2890759094180835e-05, "loss": 0.4872, "step": 13725 }, { "epoch": 0.69, "grad_norm": 24.48954200744629, "learning_rate": 4.2867689127586146e-05, "loss": 0.5875, "step": 13750 }, { "epoch": 0.69, "grad_norm": 9.83705997467041, "learning_rate": 4.284461916099146e-05, "loss": 0.4716, "step": 13775 }, { "epoch": 0.69, "grad_norm": 2.3815503120422363, "learning_rate": 
4.282154919439677e-05, "loss": 0.5094, "step": 13800 }, { "epoch": 0.69, "grad_norm": 7.4601149559021, "learning_rate": 4.279847922780208e-05, "loss": 0.4932, "step": 13825 }, { "epoch": 0.69, "grad_norm": 2.108978033065796, "learning_rate": 4.277540926120739e-05, "loss": 0.5116, "step": 13850 }, { "epoch": 0.69, "grad_norm": 2.290255546569824, "learning_rate": 4.27523392946127e-05, "loss": 0.4903, "step": 13875 }, { "epoch": 0.69, "grad_norm": 3.3242883682250977, "learning_rate": 4.2729269328018015e-05, "loss": 0.5187, "step": 13900 }, { "epoch": 0.69, "grad_norm": 1.0054996013641357, "learning_rate": 4.2706199361423326e-05, "loss": 0.4068, "step": 13925 }, { "epoch": 0.7, "grad_norm": 5.556132793426514, "learning_rate": 4.268312939482864e-05, "loss": 0.6164, "step": 13950 }, { "epoch": 0.7, "grad_norm": 4.102598190307617, "learning_rate": 4.266005942823395e-05, "loss": 0.4948, "step": 13975 }, { "epoch": 0.7, "grad_norm": 1.5323078632354736, "learning_rate": 4.263698946163926e-05, "loss": 0.5195, "step": 14000 }, { "epoch": 0.7, "grad_norm": 4.196296215057373, "learning_rate": 4.261391949504457e-05, "loss": 0.5766, "step": 14025 }, { "epoch": 0.7, "grad_norm": 12.170926094055176, "learning_rate": 4.259084952844988e-05, "loss": 0.4864, "step": 14050 }, { "epoch": 0.7, "grad_norm": 174.35714721679688, "learning_rate": 4.25677795618552e-05, "loss": 0.4185, "step": 14075 }, { "epoch": 0.7, "grad_norm": 0.7264081835746765, "learning_rate": 4.2544709595260505e-05, "loss": 0.364, "step": 14100 }, { "epoch": 0.7, "grad_norm": 3.4145443439483643, "learning_rate": 4.2521639628665824e-05, "loss": 0.4507, "step": 14125 }, { "epoch": 0.71, "grad_norm": 14.267853736877441, "learning_rate": 4.249856966207113e-05, "loss": 0.563, "step": 14150 }, { "epoch": 0.71, "grad_norm": 2.8452975749969482, "learning_rate": 4.2475499695476446e-05, "loss": 0.5582, "step": 14175 }, { "epoch": 0.71, "grad_norm": 1.986310362815857, "learning_rate": 4.245242972888175e-05, "loss": 0.539, "step": 14200 }, { "epoch": 0.71, "grad_norm": 279.5508117675781, "learning_rate": 4.242935976228707e-05, "loss": 0.4617, "step": 14225 }, { "epoch": 0.71, "grad_norm": 0.829439640045166, "learning_rate": 4.240628979569238e-05, "loss": 0.4024, "step": 14250 }, { "epoch": 0.71, "grad_norm": 5.14355993270874, "learning_rate": 4.2383219829097685e-05, "loss": 0.4953, "step": 14275 }, { "epoch": 0.71, "grad_norm": 69.40645599365234, "learning_rate": 4.2360149862503e-05, "loss": 0.6197, "step": 14300 }, { "epoch": 0.71, "grad_norm": 53.14515686035156, "learning_rate": 4.233707989590831e-05, "loss": 0.5586, "step": 14325 }, { "epoch": 0.72, "grad_norm": 60.66070556640625, "learning_rate": 4.2314009929313626e-05, "loss": 0.4631, "step": 14350 }, { "epoch": 0.72, "grad_norm": 1.0162568092346191, "learning_rate": 4.229093996271894e-05, "loss": 0.5302, "step": 14375 }, { "epoch": 0.72, "grad_norm": 0.6008943319320679, "learning_rate": 4.226786999612425e-05, "loss": 0.3437, "step": 14400 }, { "epoch": 0.72, "grad_norm": 3.1900744438171387, "learning_rate": 4.224480002952956e-05, "loss": 0.3703, "step": 14425 }, { "epoch": 0.72, "grad_norm": 11.4000825881958, "learning_rate": 4.222173006293487e-05, "loss": 0.4234, "step": 14450 }, { "epoch": 0.72, "grad_norm": 5.070300102233887, "learning_rate": 4.219866009634018e-05, "loss": 0.6395, "step": 14475 }, { "epoch": 0.72, "grad_norm": 1.2910149097442627, "learning_rate": 4.2175590129745494e-05, "loss": 0.4914, "step": 14500 }, { "epoch": 0.72, "grad_norm": 7.105856895446777, "learning_rate": 
4.2152520163150805e-05, "loss": 0.549, "step": 14525 }, { "epoch": 0.73, "grad_norm": 3.345780611038208, "learning_rate": 4.212945019655612e-05, "loss": 0.4593, "step": 14550 }, { "epoch": 0.73, "grad_norm": 8.449474334716797, "learning_rate": 4.210638022996143e-05, "loss": 0.4493, "step": 14575 }, { "epoch": 0.73, "grad_norm": 40.08137130737305, "learning_rate": 4.2083310263366746e-05, "loss": 0.4946, "step": 14600 }, { "epoch": 0.73, "grad_norm": 2.8794825077056885, "learning_rate": 4.206024029677205e-05, "loss": 0.4632, "step": 14625 }, { "epoch": 0.73, "grad_norm": 1.4181504249572754, "learning_rate": 4.203717033017736e-05, "loss": 0.554, "step": 14650 }, { "epoch": 0.73, "grad_norm": 34.15278244018555, "learning_rate": 4.2014100363582673e-05, "loss": 0.4662, "step": 14675 }, { "epoch": 0.73, "grad_norm": 1.2469291687011719, "learning_rate": 4.1991030396987985e-05, "loss": 0.4408, "step": 14700 }, { "epoch": 0.73, "grad_norm": 2.87349009513855, "learning_rate": 4.1967960430393296e-05, "loss": 0.3956, "step": 14725 }, { "epoch": 0.74, "grad_norm": 21.91674041748047, "learning_rate": 4.194489046379861e-05, "loss": 0.5344, "step": 14750 }, { "epoch": 0.74, "grad_norm": 38.84528350830078, "learning_rate": 4.1921820497203926e-05, "loss": 0.5154, "step": 14775 }, { "epoch": 0.74, "grad_norm": 122.549072265625, "learning_rate": 4.189875053060923e-05, "loss": 0.5487, "step": 14800 }, { "epoch": 0.74, "grad_norm": 198.7172088623047, "learning_rate": 4.187568056401455e-05, "loss": 0.4841, "step": 14825 }, { "epoch": 0.74, "grad_norm": 1.6492382287979126, "learning_rate": 4.185261059741985e-05, "loss": 0.4998, "step": 14850 }, { "epoch": 0.74, "grad_norm": 1.0393104553222656, "learning_rate": 4.182954063082517e-05, "loss": 0.4709, "step": 14875 }, { "epoch": 0.74, "grad_norm": 10.945531845092773, "learning_rate": 4.180647066423048e-05, "loss": 0.429, "step": 14900 }, { "epoch": 0.74, "grad_norm": 3.315153121948242, "learning_rate": 4.1783400697635794e-05, "loss": 0.5257, "step": 14925 }, { "epoch": 0.74, "grad_norm": 3.1690900325775146, "learning_rate": 4.1760330731041105e-05, "loss": 0.423, "step": 14950 }, { "epoch": 0.75, "grad_norm": 30.004865646362305, "learning_rate": 4.1737260764446417e-05, "loss": 0.4509, "step": 14975 }, { "epoch": 0.75, "grad_norm": 2.8555517196655273, "learning_rate": 4.171419079785173e-05, "loss": 0.4077, "step": 15000 }, { "epoch": 0.75, "grad_norm": 0.5392211079597473, "learning_rate": 4.169112083125703e-05, "loss": 0.3312, "step": 15025 }, { "epoch": 0.75, "grad_norm": 1.0116759538650513, "learning_rate": 4.166805086466235e-05, "loss": 0.5331, "step": 15050 }, { "epoch": 0.75, "grad_norm": 0.6311452388763428, "learning_rate": 4.164498089806766e-05, "loss": 0.4219, "step": 15075 }, { "epoch": 0.75, "grad_norm": 2.8859007358551025, "learning_rate": 4.162191093147297e-05, "loss": 0.5315, "step": 15100 }, { "epoch": 0.75, "grad_norm": 12.960403442382812, "learning_rate": 4.1598840964878285e-05, "loss": 0.5065, "step": 15125 }, { "epoch": 0.75, "grad_norm": 65.0334701538086, "learning_rate": 4.1575770998283596e-05, "loss": 0.3939, "step": 15150 }, { "epoch": 0.76, "grad_norm": 0.8992050886154175, "learning_rate": 4.155270103168891e-05, "loss": 0.5015, "step": 15175 }, { "epoch": 0.76, "grad_norm": 6.5797834396362305, "learning_rate": 4.152963106509422e-05, "loss": 0.4902, "step": 15200 }, { "epoch": 0.76, "grad_norm": 90.81952667236328, "learning_rate": 4.150656109849953e-05, "loss": 0.6276, "step": 15225 }, { "epoch": 0.76, "grad_norm": 12.694025993347168, 
"learning_rate": 4.148349113190484e-05, "loss": 0.521, "step": 15250 }, { "epoch": 0.76, "grad_norm": 3.315805673599243, "learning_rate": 4.146042116531015e-05, "loss": 0.3754, "step": 15275 }, { "epoch": 0.76, "grad_norm": 6.213558197021484, "learning_rate": 4.143735119871547e-05, "loss": 0.4641, "step": 15300 }, { "epoch": 0.76, "grad_norm": 5.649552345275879, "learning_rate": 4.1414281232120776e-05, "loss": 0.5123, "step": 15325 }, { "epoch": 0.76, "grad_norm": 1.9310487508773804, "learning_rate": 4.1391211265526094e-05, "loss": 0.4957, "step": 15350 }, { "epoch": 0.77, "grad_norm": 7.260229110717773, "learning_rate": 4.13681412989314e-05, "loss": 0.4961, "step": 15375 }, { "epoch": 0.77, "grad_norm": 2.5693869590759277, "learning_rate": 4.134507133233671e-05, "loss": 0.4218, "step": 15400 }, { "epoch": 0.77, "grad_norm": 0.8206908106803894, "learning_rate": 4.132200136574203e-05, "loss": 0.5148, "step": 15425 }, { "epoch": 0.77, "grad_norm": 0.8234782814979553, "learning_rate": 4.129893139914733e-05, "loss": 0.5661, "step": 15450 }, { "epoch": 0.77, "grad_norm": 3.116828680038452, "learning_rate": 4.127586143255265e-05, "loss": 0.4217, "step": 15475 }, { "epoch": 0.77, "grad_norm": 94.01576232910156, "learning_rate": 4.1252791465957955e-05, "loss": 0.5409, "step": 15500 }, { "epoch": 0.77, "grad_norm": 1.1207560300827026, "learning_rate": 4.122972149936327e-05, "loss": 0.5052, "step": 15525 }, { "epoch": 0.77, "grad_norm": 0.757140576839447, "learning_rate": 4.120665153276858e-05, "loss": 0.4548, "step": 15550 }, { "epoch": 0.78, "grad_norm": 6.338544845581055, "learning_rate": 4.1183581566173896e-05, "loss": 0.6972, "step": 15575 }, { "epoch": 0.78, "grad_norm": 10.121586799621582, "learning_rate": 4.116051159957921e-05, "loss": 0.51, "step": 15600 }, { "epoch": 0.78, "grad_norm": 2.8469491004943848, "learning_rate": 4.113744163298452e-05, "loss": 0.4761, "step": 15625 }, { "epoch": 0.78, "grad_norm": 3.0769906044006348, "learning_rate": 4.111437166638983e-05, "loss": 0.4206, "step": 15650 }, { "epoch": 0.78, "grad_norm": 4.13899040222168, "learning_rate": 4.109130169979514e-05, "loss": 0.6207, "step": 15675 }, { "epoch": 0.78, "grad_norm": 1.6825227737426758, "learning_rate": 4.106823173320045e-05, "loss": 0.4976, "step": 15700 }, { "epoch": 0.78, "grad_norm": 3.750791311264038, "learning_rate": 4.1045161766605764e-05, "loss": 0.6228, "step": 15725 }, { "epoch": 0.78, "grad_norm": 1.2766634225845337, "learning_rate": 4.1022091800011076e-05, "loss": 0.4583, "step": 15750 }, { "epoch": 0.79, "grad_norm": 23.536924362182617, "learning_rate": 4.099902183341639e-05, "loss": 0.5756, "step": 15775 }, { "epoch": 0.79, "grad_norm": 262.17529296875, "learning_rate": 4.09759518668217e-05, "loss": 0.4269, "step": 15800 }, { "epoch": 0.79, "grad_norm": 5.392236709594727, "learning_rate": 4.095288190022701e-05, "loss": 0.4978, "step": 15825 }, { "epoch": 0.79, "grad_norm": 36.02292251586914, "learning_rate": 4.092981193363232e-05, "loss": 0.4801, "step": 15850 }, { "epoch": 0.79, "grad_norm": 2.4638888835906982, "learning_rate": 4.090674196703763e-05, "loss": 0.4653, "step": 15875 }, { "epoch": 0.79, "grad_norm": 1.074580430984497, "learning_rate": 4.0883672000442944e-05, "loss": 0.4546, "step": 15900 }, { "epoch": 0.79, "grad_norm": 4.202010154724121, "learning_rate": 4.0860602033848255e-05, "loss": 0.5812, "step": 15925 }, { "epoch": 0.79, "grad_norm": 25.171972274780273, "learning_rate": 4.083753206725357e-05, "loss": 0.511, "step": 15950 }, { "epoch": 0.8, "grad_norm": 39.12727355957031, 
"learning_rate": 4.081446210065888e-05, "loss": 0.481, "step": 15975 }, { "epoch": 0.8, "grad_norm": 1.353677749633789, "learning_rate": 4.0791392134064196e-05, "loss": 0.462, "step": 16000 }, { "epoch": 0.8, "grad_norm": 39.51011657714844, "learning_rate": 4.07683221674695e-05, "loss": 0.4089, "step": 16025 }, { "epoch": 0.8, "grad_norm": 11.377568244934082, "learning_rate": 4.074525220087482e-05, "loss": 0.4299, "step": 16050 }, { "epoch": 0.8, "grad_norm": 4.243354797363281, "learning_rate": 4.072218223428012e-05, "loss": 0.4012, "step": 16075 }, { "epoch": 0.8, "grad_norm": 1.647400975227356, "learning_rate": 4.069911226768544e-05, "loss": 0.4251, "step": 16100 }, { "epoch": 0.8, "grad_norm": 1.1065517663955688, "learning_rate": 4.067604230109075e-05, "loss": 0.4315, "step": 16125 }, { "epoch": 0.8, "grad_norm": 3.7527124881744385, "learning_rate": 4.065297233449606e-05, "loss": 0.466, "step": 16150 }, { "epoch": 0.81, "grad_norm": 7.145634174346924, "learning_rate": 4.0629902367901375e-05, "loss": 0.4597, "step": 16175 }, { "epoch": 0.81, "grad_norm": 3.0760304927825928, "learning_rate": 4.060683240130668e-05, "loss": 0.5086, "step": 16200 }, { "epoch": 0.81, "grad_norm": 2.9849870204925537, "learning_rate": 4.0583762434712e-05, "loss": 0.5209, "step": 16225 }, { "epoch": 0.81, "grad_norm": 24.513986587524414, "learning_rate": 4.056069246811731e-05, "loss": 0.4571, "step": 16250 }, { "epoch": 0.81, "grad_norm": 2.497976541519165, "learning_rate": 4.053762250152262e-05, "loss": 0.4176, "step": 16275 }, { "epoch": 0.81, "grad_norm": 2.574549674987793, "learning_rate": 4.051455253492793e-05, "loss": 0.3929, "step": 16300 }, { "epoch": 0.81, "grad_norm": 0.813434898853302, "learning_rate": 4.0491482568333244e-05, "loss": 0.4107, "step": 16325 }, { "epoch": 0.81, "grad_norm": 3.409647226333618, "learning_rate": 4.0468412601738555e-05, "loss": 0.5186, "step": 16350 }, { "epoch": 0.82, "grad_norm": 0.8358980417251587, "learning_rate": 4.0445342635143866e-05, "loss": 0.4876, "step": 16375 }, { "epoch": 0.82, "grad_norm": 1.4873727560043335, "learning_rate": 4.042227266854918e-05, "loss": 0.5257, "step": 16400 }, { "epoch": 0.82, "grad_norm": 13.529160499572754, "learning_rate": 4.039920270195449e-05, "loss": 0.4294, "step": 16425 }, { "epoch": 0.82, "grad_norm": 1.2024579048156738, "learning_rate": 4.03761327353598e-05, "loss": 0.5386, "step": 16450 }, { "epoch": 0.82, "grad_norm": 2.024953842163086, "learning_rate": 4.035306276876512e-05, "loss": 0.5225, "step": 16475 }, { "epoch": 0.82, "grad_norm": 9.599167823791504, "learning_rate": 4.032999280217042e-05, "loss": 0.4629, "step": 16500 }, { "epoch": 0.82, "grad_norm": 2.511319160461426, "learning_rate": 4.0306922835575734e-05, "loss": 0.6207, "step": 16525 }, { "epoch": 0.82, "grad_norm": 2.126314163208008, "learning_rate": 4.0283852868981046e-05, "loss": 0.4982, "step": 16550 }, { "epoch": 0.83, "grad_norm": 2.2506449222564697, "learning_rate": 4.026078290238636e-05, "loss": 0.3821, "step": 16575 }, { "epoch": 0.83, "grad_norm": 7.21665620803833, "learning_rate": 4.023771293579167e-05, "loss": 0.5274, "step": 16600 }, { "epoch": 0.83, "grad_norm": 1.9467825889587402, "learning_rate": 4.021464296919698e-05, "loss": 0.6044, "step": 16625 }, { "epoch": 0.83, "grad_norm": 1.0800065994262695, "learning_rate": 4.01915730026023e-05, "loss": 0.3538, "step": 16650 }, { "epoch": 0.83, "grad_norm": 18.513242721557617, "learning_rate": 4.01685030360076e-05, "loss": 0.5133, "step": 16675 }, { "epoch": 0.83, "grad_norm": 1.1052954196929932, 
"learning_rate": 4.014543306941292e-05, "loss": 0.5397, "step": 16700 }, { "epoch": 0.83, "grad_norm": 5.234750270843506, "learning_rate": 4.0122363102818225e-05, "loss": 0.5056, "step": 16725 }, { "epoch": 0.83, "grad_norm": 3.360717296600342, "learning_rate": 4.0099293136223543e-05, "loss": 0.4306, "step": 16750 }, { "epoch": 0.84, "grad_norm": 10.082225799560547, "learning_rate": 4.0076223169628855e-05, "loss": 0.4634, "step": 16775 }, { "epoch": 0.84, "grad_norm": 5.244103908538818, "learning_rate": 4.0053153203034166e-05, "loss": 0.4331, "step": 16800 }, { "epoch": 0.84, "grad_norm": 9.171998977661133, "learning_rate": 4.003008323643948e-05, "loss": 0.5922, "step": 16825 }, { "epoch": 0.84, "grad_norm": 1.5789293050765991, "learning_rate": 4.000701326984479e-05, "loss": 0.4436, "step": 16850 }, { "epoch": 0.84, "grad_norm": 13.523552894592285, "learning_rate": 3.99839433032501e-05, "loss": 0.5009, "step": 16875 }, { "epoch": 0.84, "grad_norm": 31.91691780090332, "learning_rate": 3.9960873336655405e-05, "loss": 0.477, "step": 16900 }, { "epoch": 0.84, "grad_norm": 3.873971939086914, "learning_rate": 3.993780337006072e-05, "loss": 0.4808, "step": 16925 }, { "epoch": 0.84, "grad_norm": 1.8899236917495728, "learning_rate": 3.9914733403466034e-05, "loss": 0.4754, "step": 16950 }, { "epoch": 0.85, "grad_norm": 5.686938285827637, "learning_rate": 3.9891663436871346e-05, "loss": 0.4095, "step": 16975 }, { "epoch": 0.85, "grad_norm": 3.4337737560272217, "learning_rate": 3.986859347027666e-05, "loss": 0.534, "step": 17000 }, { "epoch": 0.85, "grad_norm": 1.897147536277771, "learning_rate": 3.984552350368197e-05, "loss": 0.4931, "step": 17025 }, { "epoch": 0.85, "grad_norm": 5.539985179901123, "learning_rate": 3.982245353708728e-05, "loss": 0.4568, "step": 17050 }, { "epoch": 0.85, "grad_norm": 1.8197060823440552, "learning_rate": 3.979938357049259e-05, "loss": 0.4894, "step": 17075 }, { "epoch": 0.85, "grad_norm": 2.8475852012634277, "learning_rate": 3.97763136038979e-05, "loss": 0.6445, "step": 17100 }, { "epoch": 0.85, "grad_norm": 18.02142906188965, "learning_rate": 3.9753243637303214e-05, "loss": 0.4647, "step": 17125 }, { "epoch": 0.85, "grad_norm": 2.6763222217559814, "learning_rate": 3.9730173670708525e-05, "loss": 0.4201, "step": 17150 }, { "epoch": 0.86, "grad_norm": 2.0824146270751953, "learning_rate": 3.970710370411384e-05, "loss": 0.4473, "step": 17175 }, { "epoch": 0.86, "grad_norm": 2.7089695930480957, "learning_rate": 3.968403373751915e-05, "loss": 0.4026, "step": 17200 }, { "epoch": 0.86, "grad_norm": 8.540483474731445, "learning_rate": 3.9660963770924466e-05, "loss": 0.5615, "step": 17225 }, { "epoch": 0.86, "grad_norm": 0.8385511040687561, "learning_rate": 3.963789380432977e-05, "loss": 0.5148, "step": 17250 }, { "epoch": 0.86, "grad_norm": 7.858611583709717, "learning_rate": 3.961482383773508e-05, "loss": 0.6374, "step": 17275 }, { "epoch": 0.86, "grad_norm": 1.7679189443588257, "learning_rate": 3.95917538711404e-05, "loss": 0.5359, "step": 17300 }, { "epoch": 0.86, "grad_norm": 14.490900039672852, "learning_rate": 3.9568683904545705e-05, "loss": 1.2369, "step": 17325 }, { "epoch": 0.86, "grad_norm": 14.354183197021484, "learning_rate": 3.954561393795102e-05, "loss": 0.823, "step": 17350 }, { "epoch": 0.87, "grad_norm": 4.188384532928467, "learning_rate": 3.952254397135633e-05, "loss": 0.5664, "step": 17375 }, { "epoch": 0.87, "grad_norm": 5.154536247253418, "learning_rate": 3.9499474004761646e-05, "loss": 0.5828, "step": 17400 }, { "epoch": 0.87, "grad_norm": 
4.994510173797607, "learning_rate": 3.947640403816695e-05, "loss": 0.4817, "step": 17425 }, { "epoch": 0.87, "grad_norm": 138.496337890625, "learning_rate": 3.945333407157227e-05, "loss": 0.5828, "step": 17450 }, { "epoch": 0.87, "grad_norm": 4.370205879211426, "learning_rate": 3.943026410497758e-05, "loss": 0.5262, "step": 17475 }, { "epoch": 0.87, "grad_norm": 16.109251022338867, "learning_rate": 3.940719413838289e-05, "loss": 0.4149, "step": 17500 }, { "epoch": 0.87, "grad_norm": 10.229596138000488, "learning_rate": 3.93841241717882e-05, "loss": 0.5403, "step": 17525 }, { "epoch": 0.87, "grad_norm": 3.327038288116455, "learning_rate": 3.9361054205193514e-05, "loss": 0.6732, "step": 17550 }, { "epoch": 0.88, "grad_norm": 4.285974502563477, "learning_rate": 3.9337984238598825e-05, "loss": 0.5621, "step": 17575 }, { "epoch": 0.88, "grad_norm": 3.3494739532470703, "learning_rate": 3.9314914272004136e-05, "loss": 0.5229, "step": 17600 }, { "epoch": 0.88, "grad_norm": 1.3700764179229736, "learning_rate": 3.929184430540945e-05, "loss": 0.5192, "step": 17625 }, { "epoch": 0.88, "grad_norm": 2.759855031967163, "learning_rate": 3.926877433881476e-05, "loss": 0.4136, "step": 17650 }, { "epoch": 0.88, "grad_norm": 3.0871315002441406, "learning_rate": 3.924570437222007e-05, "loss": 0.553, "step": 17675 }, { "epoch": 0.88, "grad_norm": 93.34503936767578, "learning_rate": 3.922263440562538e-05, "loss": 0.4719, "step": 17700 }, { "epoch": 0.88, "grad_norm": 3.604959726333618, "learning_rate": 3.919956443903069e-05, "loss": 0.5601, "step": 17725 }, { "epoch": 0.88, "grad_norm": 38.621551513671875, "learning_rate": 3.9176494472436005e-05, "loss": 0.6033, "step": 17750 }, { "epoch": 0.89, "grad_norm": 1.2768781185150146, "learning_rate": 3.9153424505841316e-05, "loss": 0.4388, "step": 17775 }, { "epoch": 0.89, "grad_norm": 3.988166332244873, "learning_rate": 3.913035453924663e-05, "loss": 0.4533, "step": 17800 }, { "epoch": 0.89, "grad_norm": 1.5384948253631592, "learning_rate": 3.9107284572651945e-05, "loss": 0.6735, "step": 17825 }, { "epoch": 0.89, "grad_norm": 7.3451032638549805, "learning_rate": 3.908421460605725e-05, "loss": 0.5567, "step": 17850 }, { "epoch": 0.89, "grad_norm": 2.0618393421173096, "learning_rate": 3.906114463946257e-05, "loss": 0.5047, "step": 17875 }, { "epoch": 0.89, "grad_norm": 2.9151611328125, "learning_rate": 3.903807467286787e-05, "loss": 0.4682, "step": 17900 }, { "epoch": 0.89, "grad_norm": 3.500939130783081, "learning_rate": 3.901500470627319e-05, "loss": 0.5246, "step": 17925 }, { "epoch": 0.89, "grad_norm": 8.571981430053711, "learning_rate": 3.8991934739678495e-05, "loss": 0.5462, "step": 17950 }, { "epoch": 0.9, "grad_norm": 39.09160614013672, "learning_rate": 3.8968864773083814e-05, "loss": 0.4873, "step": 17975 }, { "epoch": 0.9, "grad_norm": 16.766748428344727, "learning_rate": 3.8945794806489125e-05, "loss": 0.4964, "step": 18000 }, { "epoch": 0.9, "grad_norm": 18.48149299621582, "learning_rate": 3.892272483989443e-05, "loss": 0.4481, "step": 18025 }, { "epoch": 0.9, "grad_norm": 1.355155110359192, "learning_rate": 3.889965487329975e-05, "loss": 0.4815, "step": 18050 }, { "epoch": 0.9, "grad_norm": 35.24787139892578, "learning_rate": 3.887658490670505e-05, "loss": 0.4782, "step": 18075 }, { "epoch": 0.9, "grad_norm": 3.5112314224243164, "learning_rate": 3.885351494011037e-05, "loss": 0.4727, "step": 18100 }, { "epoch": 0.9, "grad_norm": 5.651079177856445, "learning_rate": 3.8830444973515675e-05, "loss": 0.5468, "step": 18125 }, { "epoch": 0.9, "grad_norm": 
3.5044541358947754, "learning_rate": 3.880737500692099e-05, "loss": 0.5182, "step": 18150 }, { "epoch": 0.91, "grad_norm": 1.015687346458435, "learning_rate": 3.8784305040326305e-05, "loss": 0.3501, "step": 18175 }, { "epoch": 0.91, "grad_norm": 2.154855728149414, "learning_rate": 3.8761235073731616e-05, "loss": 0.4771, "step": 18200 }, { "epoch": 0.91, "grad_norm": 41.30935287475586, "learning_rate": 3.873816510713693e-05, "loss": 0.3895, "step": 18225 }, { "epoch": 0.91, "grad_norm": 3.7652475833892822, "learning_rate": 3.871509514054224e-05, "loss": 0.522, "step": 18250 }, { "epoch": 0.91, "grad_norm": 6.1274943351745605, "learning_rate": 3.869202517394755e-05, "loss": 0.5085, "step": 18275 }, { "epoch": 0.91, "grad_norm": 5.170544624328613, "learning_rate": 3.866895520735286e-05, "loss": 0.4827, "step": 18300 }, { "epoch": 0.91, "grad_norm": 1.431235671043396, "learning_rate": 3.864588524075817e-05, "loss": 0.4614, "step": 18325 }, { "epoch": 0.91, "grad_norm": 0.8098027110099792, "learning_rate": 3.862281527416349e-05, "loss": 0.4926, "step": 18350 }, { "epoch": 0.92, "grad_norm": 3.5741729736328125, "learning_rate": 3.8599745307568795e-05, "loss": 0.3676, "step": 18375 }, { "epoch": 0.92, "grad_norm": 3.1784658432006836, "learning_rate": 3.857667534097411e-05, "loss": 0.6016, "step": 18400 }, { "epoch": 0.92, "grad_norm": 4.085948944091797, "learning_rate": 3.855360537437942e-05, "loss": 0.5894, "step": 18425 }, { "epoch": 0.92, "grad_norm": 4.510144233703613, "learning_rate": 3.853053540778473e-05, "loss": 0.6006, "step": 18450 }, { "epoch": 0.92, "grad_norm": 2.849971055984497, "learning_rate": 3.850746544119004e-05, "loss": 0.5905, "step": 18475 }, { "epoch": 0.92, "grad_norm": 53.399227142333984, "learning_rate": 3.848439547459535e-05, "loss": 0.511, "step": 18500 }, { "epoch": 0.92, "grad_norm": 1.8678909540176392, "learning_rate": 3.846132550800067e-05, "loss": 0.5009, "step": 18525 }, { "epoch": 0.92, "grad_norm": 4.071389198303223, "learning_rate": 3.8438255541405975e-05, "loss": 0.5701, "step": 18550 }, { "epoch": 0.93, "grad_norm": 9.301192283630371, "learning_rate": 3.841518557481129e-05, "loss": 0.5051, "step": 18575 }, { "epoch": 0.93, "grad_norm": 6.390158653259277, "learning_rate": 3.83921156082166e-05, "loss": 0.6236, "step": 18600 }, { "epoch": 0.93, "grad_norm": 1.5711665153503418, "learning_rate": 3.8369045641621916e-05, "loss": 0.5415, "step": 18625 }, { "epoch": 0.93, "grad_norm": 100.99102783203125, "learning_rate": 3.834597567502722e-05, "loss": 0.5929, "step": 18650 }, { "epoch": 0.93, "grad_norm": 2.2588348388671875, "learning_rate": 3.832290570843254e-05, "loss": 0.4933, "step": 18675 }, { "epoch": 0.93, "grad_norm": 12.144977569580078, "learning_rate": 3.829983574183785e-05, "loss": 0.5072, "step": 18700 }, { "epoch": 0.93, "grad_norm": 16.152042388916016, "learning_rate": 3.827676577524316e-05, "loss": 0.5485, "step": 18725 }, { "epoch": 0.93, "grad_norm": 1.2403534650802612, "learning_rate": 3.825369580864847e-05, "loss": 0.4722, "step": 18750 }, { "epoch": 0.94, "grad_norm": 1.4384055137634277, "learning_rate": 3.823062584205378e-05, "loss": 0.5137, "step": 18775 }, { "epoch": 0.94, "grad_norm": 40.7564697265625, "learning_rate": 3.8207555875459095e-05, "loss": 0.513, "step": 18800 }, { "epoch": 0.94, "grad_norm": 71.0677490234375, "learning_rate": 3.818448590886441e-05, "loss": 0.4625, "step": 18825 }, { "epoch": 0.94, "grad_norm": 1.8693602085113525, "learning_rate": 3.816141594226972e-05, "loss": 0.3826, "step": 18850 }, { "epoch": 0.94, "grad_norm": 
8.29848575592041, "learning_rate": 3.813834597567503e-05, "loss": 0.5599, "step": 18875 }, { "epoch": 0.94, "grad_norm": 2.729771375656128, "learning_rate": 3.811527600908034e-05, "loss": 0.4568, "step": 18900 }, { "epoch": 0.94, "grad_norm": 76.36131286621094, "learning_rate": 3.809220604248565e-05, "loss": 0.6015, "step": 18925 }, { "epoch": 0.94, "grad_norm": 0.9493357539176941, "learning_rate": 3.8069136075890963e-05, "loss": 0.4385, "step": 18950 }, { "epoch": 0.95, "grad_norm": 3.290364980697632, "learning_rate": 3.8046066109296275e-05, "loss": 0.465, "step": 18975 }, { "epoch": 0.95, "grad_norm": 9.84317684173584, "learning_rate": 3.8022996142701586e-05, "loss": 0.5778, "step": 19000 }, { "epoch": 0.95, "grad_norm": 1.622100830078125, "learning_rate": 3.79999261761069e-05, "loss": 0.4833, "step": 19025 }, { "epoch": 0.95, "grad_norm": 4.288991928100586, "learning_rate": 3.7976856209512216e-05, "loss": 0.5373, "step": 19050 }, { "epoch": 0.95, "grad_norm": 3.3950583934783936, "learning_rate": 3.795378624291752e-05, "loss": 0.3924, "step": 19075 }, { "epoch": 0.95, "grad_norm": 28.451183319091797, "learning_rate": 3.793071627632284e-05, "loss": 0.4863, "step": 19100 }, { "epoch": 0.95, "grad_norm": 6.996535301208496, "learning_rate": 3.790764630972814e-05, "loss": 0.6353, "step": 19125 }, { "epoch": 0.95, "grad_norm": 3.060659885406494, "learning_rate": 3.7884576343133454e-05, "loss": 0.5, "step": 19150 }, { "epoch": 0.96, "grad_norm": 14.792171478271484, "learning_rate": 3.7861506376538766e-05, "loss": 0.5163, "step": 19175 }, { "epoch": 0.96, "grad_norm": 39.17354202270508, "learning_rate": 3.783843640994408e-05, "loss": 0.6595, "step": 19200 }, { "epoch": 0.96, "grad_norm": 51.1176872253418, "learning_rate": 3.7815366443349395e-05, "loss": 0.5817, "step": 19225 }, { "epoch": 0.96, "grad_norm": 1.3194643259048462, "learning_rate": 3.77922964767547e-05, "loss": 0.4769, "step": 19250 }, { "epoch": 0.96, "grad_norm": 7.984396457672119, "learning_rate": 3.776922651016002e-05, "loss": 0.4827, "step": 19275 }, { "epoch": 0.96, "grad_norm": 2.9708173274993896, "learning_rate": 3.774615654356532e-05, "loss": 0.4073, "step": 19300 }, { "epoch": 0.96, "grad_norm": 5.974608421325684, "learning_rate": 3.772308657697064e-05, "loss": 0.5785, "step": 19325 }, { "epoch": 0.96, "grad_norm": 26.67017364501953, "learning_rate": 3.770001661037595e-05, "loss": 0.5199, "step": 19350 }, { "epoch": 0.97, "grad_norm": 69.32394409179688, "learning_rate": 3.767694664378126e-05, "loss": 0.5085, "step": 19375 }, { "epoch": 0.97, "grad_norm": 16.79060935974121, "learning_rate": 3.7653876677186575e-05, "loss": 0.5602, "step": 19400 }, { "epoch": 0.97, "grad_norm": 4.172871112823486, "learning_rate": 3.7630806710591886e-05, "loss": 0.3412, "step": 19425 }, { "epoch": 0.97, "grad_norm": 4.1554975509643555, "learning_rate": 3.76077367439972e-05, "loss": 0.5426, "step": 19450 }, { "epoch": 0.97, "grad_norm": 20.442163467407227, "learning_rate": 3.758466677740251e-05, "loss": 0.543, "step": 19475 }, { "epoch": 0.97, "grad_norm": 0.9975307583808899, "learning_rate": 3.756159681080782e-05, "loss": 0.511, "step": 19500 }, { "epoch": 0.97, "grad_norm": 6.242922782897949, "learning_rate": 3.753852684421313e-05, "loss": 0.5415, "step": 19525 }, { "epoch": 0.97, "grad_norm": 5.49556827545166, "learning_rate": 3.751545687761844e-05, "loss": 0.3906, "step": 19550 }, { "epoch": 0.98, "grad_norm": 1.480525255203247, "learning_rate": 3.7492386911023754e-05, "loss": 0.3721, "step": 19575 }, { "epoch": 0.98, "grad_norm": 
7.201631546020508, "learning_rate": 3.7469316944429066e-05, "loss": 0.6057, "step": 19600 }, { "epoch": 0.98, "grad_norm": 3.685736894607544, "learning_rate": 3.744624697783438e-05, "loss": 0.5243, "step": 19625 }, { "epoch": 0.98, "grad_norm": 8.694090843200684, "learning_rate": 3.742317701123969e-05, "loss": 0.4655, "step": 19650 }, { "epoch": 0.98, "grad_norm": 3.63700795173645, "learning_rate": 3.7400107044645e-05, "loss": 0.7424, "step": 19675 }, { "epoch": 0.98, "grad_norm": 3.734907627105713, "learning_rate": 3.737703707805031e-05, "loss": 0.516, "step": 19700 }, { "epoch": 0.98, "grad_norm": 14.383971214294434, "learning_rate": 3.735396711145562e-05, "loss": 0.4659, "step": 19725 }, { "epoch": 0.98, "grad_norm": 4.4923930168151855, "learning_rate": 3.733089714486094e-05, "loss": 0.4505, "step": 19750 }, { "epoch": 0.99, "grad_norm": 2.624192714691162, "learning_rate": 3.7307827178266245e-05, "loss": 0.5293, "step": 19775 }, { "epoch": 0.99, "grad_norm": 3.719897985458374, "learning_rate": 3.728475721167156e-05, "loss": 0.5854, "step": 19800 }, { "epoch": 0.99, "grad_norm": 1.4075324535369873, "learning_rate": 3.726168724507687e-05, "loss": 0.5475, "step": 19825 }, { "epoch": 0.99, "grad_norm": 87.31964874267578, "learning_rate": 3.7238617278482186e-05, "loss": 0.6098, "step": 19850 }, { "epoch": 0.99, "grad_norm": 66.36661529541016, "learning_rate": 3.72155473118875e-05, "loss": 0.4597, "step": 19875 }, { "epoch": 0.99, "grad_norm": 3.5049967765808105, "learning_rate": 3.71924773452928e-05, "loss": 0.5159, "step": 19900 }, { "epoch": 0.99, "grad_norm": 1.5183825492858887, "learning_rate": 3.716940737869812e-05, "loss": 0.4156, "step": 19925 }, { "epoch": 0.99, "grad_norm": 5.584221363067627, "learning_rate": 3.7146337412103425e-05, "loss": 0.4939, "step": 19950 }, { "epoch": 1.0, "grad_norm": 32.5567512512207, "learning_rate": 3.712326744550874e-05, "loss": 0.6205, "step": 19975 }, { "epoch": 1.0, "grad_norm": 4.678618907928467, "learning_rate": 3.710019747891405e-05, "loss": 0.554, "step": 20000 }, { "epoch": 1.0, "grad_norm": 2.869126796722412, "learning_rate": 3.7077127512319365e-05, "loss": 0.5592, "step": 20025 }, { "epoch": 1.0, "grad_norm": 4.049469470977783, "learning_rate": 3.705405754572468e-05, "loss": 0.4635, "step": 20050 }, { "epoch": 1.0, "eval_accuracy": 0.852423072131556, "eval_f1_macro": 0.740011717394454, "eval_f1_micro": 0.852423072131556, "eval_f1_weighted": 0.8490416400281636, "eval_loss": 0.46900421380996704, "eval_precision_macro": 0.8007344689241704, "eval_precision_micro": 0.852423072131556, "eval_precision_weighted": 0.8510035524325744, "eval_recall_macro": 0.7068611044442278, "eval_recall_micro": 0.852423072131556, "eval_recall_weighted": 0.852423072131556, "eval_runtime": 8175.3699, "eval_samples_per_second": 4.909, "eval_steps_per_second": 0.307, "step": 20068 }, { "epoch": 1.0, "grad_norm": 2.8573710918426514, "learning_rate": 3.703098757912999e-05, "loss": 0.6037, "step": 20075 }, { "epoch": 1.0, "grad_norm": 1.9019801616668701, "learning_rate": 3.70079176125353e-05, "loss": 0.524, "step": 20100 }, { "epoch": 1.0, "grad_norm": 43.846824645996094, "learning_rate": 3.698484764594061e-05, "loss": 0.4374, "step": 20125 }, { "epoch": 1.0, "grad_norm": 5.801967144012451, "learning_rate": 3.696177767934592e-05, "loss": 0.5659, "step": 20150 }, { "epoch": 1.01, "grad_norm": 41.790992736816406, "learning_rate": 3.6938707712751234e-05, "loss": 0.5, "step": 20175 }, { "epoch": 1.01, "grad_norm": 1.0711537599563599, "learning_rate": 3.6915637746156545e-05, 
"loss": 0.4129, "step": 20200 }, { "epoch": 1.01, "grad_norm": 1.601976990699768, "learning_rate": 3.6892567779561856e-05, "loss": 0.4549, "step": 20225 }, { "epoch": 1.01, "grad_norm": 5.723679065704346, "learning_rate": 3.686949781296717e-05, "loss": 0.4861, "step": 20250 }, { "epoch": 1.01, "grad_norm": 0.9833848476409912, "learning_rate": 3.684642784637248e-05, "loss": 0.4366, "step": 20275 }, { "epoch": 1.01, "grad_norm": 2.463611125946045, "learning_rate": 3.682335787977779e-05, "loss": 0.4352, "step": 20300 }, { "epoch": 1.01, "grad_norm": 3.361196994781494, "learning_rate": 3.68002879131831e-05, "loss": 0.4889, "step": 20325 }, { "epoch": 1.01, "grad_norm": 3.9204843044281006, "learning_rate": 3.677721794658841e-05, "loss": 0.4408, "step": 20350 }, { "epoch": 1.02, "grad_norm": 5.163851737976074, "learning_rate": 3.6754147979993724e-05, "loss": 0.4778, "step": 20375 }, { "epoch": 1.02, "grad_norm": 1.4100154638290405, "learning_rate": 3.673107801339904e-05, "loss": 0.4129, "step": 20400 }, { "epoch": 1.02, "grad_norm": 1.0082415342330933, "learning_rate": 3.670800804680435e-05, "loss": 0.3991, "step": 20425 }, { "epoch": 1.02, "grad_norm": 3.754681348800659, "learning_rate": 3.6684938080209665e-05, "loss": 0.4297, "step": 20450 }, { "epoch": 1.02, "grad_norm": 3.615309000015259, "learning_rate": 3.666186811361497e-05, "loss": 0.5264, "step": 20475 }, { "epoch": 1.02, "grad_norm": 5.391434192657471, "learning_rate": 3.663879814702029e-05, "loss": 0.5161, "step": 20500 }, { "epoch": 1.02, "grad_norm": 0.9345567226409912, "learning_rate": 3.661572818042559e-05, "loss": 0.4751, "step": 20525 }, { "epoch": 1.02, "grad_norm": 83.7790298461914, "learning_rate": 3.659265821383091e-05, "loss": 0.5471, "step": 20550 }, { "epoch": 1.03, "grad_norm": 5.505739212036133, "learning_rate": 3.656958824723622e-05, "loss": 0.442, "step": 20575 }, { "epoch": 1.03, "grad_norm": 2.3913259506225586, "learning_rate": 3.6546518280641534e-05, "loss": 0.3406, "step": 20600 }, { "epoch": 1.03, "grad_norm": 3.856517791748047, "learning_rate": 3.6523448314046845e-05, "loss": 0.4651, "step": 20625 }, { "epoch": 1.03, "grad_norm": 36.363807678222656, "learning_rate": 3.650037834745215e-05, "loss": 0.3299, "step": 20650 }, { "epoch": 1.03, "grad_norm": 4.59905481338501, "learning_rate": 3.647730838085747e-05, "loss": 0.4673, "step": 20675 }, { "epoch": 1.03, "grad_norm": 2.024817705154419, "learning_rate": 3.645423841426278e-05, "loss": 0.6025, "step": 20700 }, { "epoch": 1.03, "grad_norm": 0.8249455094337463, "learning_rate": 3.643116844766809e-05, "loss": 0.4823, "step": 20725 }, { "epoch": 1.03, "grad_norm": 3.736368179321289, "learning_rate": 3.64080984810734e-05, "loss": 0.4693, "step": 20750 }, { "epoch": 1.04, "grad_norm": 1.0846278667449951, "learning_rate": 3.638502851447871e-05, "loss": 0.5291, "step": 20775 }, { "epoch": 1.04, "grad_norm": 3.0749387741088867, "learning_rate": 3.6361958547884024e-05, "loss": 0.4587, "step": 20800 }, { "epoch": 1.04, "grad_norm": 1.6911934614181519, "learning_rate": 3.6338888581289336e-05, "loss": 0.4702, "step": 20825 }, { "epoch": 1.04, "grad_norm": 1.1063579320907593, "learning_rate": 3.631581861469465e-05, "loss": 0.5319, "step": 20850 }, { "epoch": 1.04, "grad_norm": 2.875474691390991, "learning_rate": 3.629274864809996e-05, "loss": 0.657, "step": 20875 }, { "epoch": 1.04, "grad_norm": 10.830842018127441, "learning_rate": 3.626967868150527e-05, "loss": 0.4379, "step": 20900 }, { "epoch": 1.04, "grad_norm": 5.642008304595947, "learning_rate": 3.624660871491059e-05, 
"loss": 0.6474, "step": 20925 }, { "epoch": 1.04, "grad_norm": 24.012779235839844, "learning_rate": 3.622353874831589e-05, "loss": 0.5086, "step": 20950 }, { "epoch": 1.05, "grad_norm": 1.3956573009490967, "learning_rate": 3.620046878172121e-05, "loss": 0.5026, "step": 20975 }, { "epoch": 1.05, "grad_norm": 0.8949878215789795, "learning_rate": 3.6177398815126515e-05, "loss": 0.4177, "step": 21000 }, { "epoch": 1.05, "grad_norm": 8.444169044494629, "learning_rate": 3.615432884853183e-05, "loss": 0.5521, "step": 21025 }, { "epoch": 1.05, "grad_norm": 1.7907155752182007, "learning_rate": 3.613125888193714e-05, "loss": 0.4044, "step": 21050 }, { "epoch": 1.05, "grad_norm": 3.08500599861145, "learning_rate": 3.610818891534245e-05, "loss": 0.4453, "step": 21075 }, { "epoch": 1.05, "grad_norm": 4.027458667755127, "learning_rate": 3.608511894874777e-05, "loss": 0.4518, "step": 21100 }, { "epoch": 1.05, "grad_norm": 3.554227590560913, "learning_rate": 3.606204898215307e-05, "loss": 0.3573, "step": 21125 }, { "epoch": 1.05, "grad_norm": 1.3447580337524414, "learning_rate": 3.603897901555839e-05, "loss": 0.415, "step": 21150 }, { "epoch": 1.06, "grad_norm": 4.002223014831543, "learning_rate": 3.6015909048963695e-05, "loss": 0.4981, "step": 21175 }, { "epoch": 1.06, "grad_norm": 13.900261878967285, "learning_rate": 3.599283908236901e-05, "loss": 0.5881, "step": 21200 }, { "epoch": 1.06, "grad_norm": 6.735679626464844, "learning_rate": 3.5969769115774324e-05, "loss": 0.4258, "step": 21225 }, { "epoch": 1.06, "grad_norm": 3.4372642040252686, "learning_rate": 3.5946699149179636e-05, "loss": 0.4882, "step": 21250 }, { "epoch": 1.06, "grad_norm": 7.4543890953063965, "learning_rate": 3.592362918258495e-05, "loss": 0.5204, "step": 21275 }, { "epoch": 1.06, "grad_norm": 27.05744743347168, "learning_rate": 3.590055921599026e-05, "loss": 0.3268, "step": 21300 }, { "epoch": 1.06, "grad_norm": 3.965268850326538, "learning_rate": 3.587748924939557e-05, "loss": 0.5526, "step": 21325 }, { "epoch": 1.06, "grad_norm": 2.135265827178955, "learning_rate": 3.585441928280088e-05, "loss": 0.3802, "step": 21350 }, { "epoch": 1.07, "grad_norm": 65.5523452758789, "learning_rate": 3.583134931620619e-05, "loss": 0.4903, "step": 21375 }, { "epoch": 1.07, "grad_norm": 3.186577558517456, "learning_rate": 3.5808279349611504e-05, "loss": 0.5125, "step": 21400 }, { "epoch": 1.07, "grad_norm": 54.349143981933594, "learning_rate": 3.5785209383016815e-05, "loss": 0.3316, "step": 21425 }, { "epoch": 1.07, "grad_norm": 3.5075273513793945, "learning_rate": 3.5762139416422127e-05, "loss": 0.3988, "step": 21450 }, { "epoch": 1.07, "grad_norm": 5.802154064178467, "learning_rate": 3.573906944982744e-05, "loss": 0.565, "step": 21475 }, { "epoch": 1.07, "grad_norm": 5.111006259918213, "learning_rate": 3.571599948323275e-05, "loss": 0.5855, "step": 21500 }, { "epoch": 1.07, "grad_norm": 6.996493339538574, "learning_rate": 3.569292951663806e-05, "loss": 0.4446, "step": 21525 }, { "epoch": 1.07, "grad_norm": 16.910449981689453, "learning_rate": 3.566985955004337e-05, "loss": 0.4844, "step": 21550 }, { "epoch": 1.08, "grad_norm": 6.590620994567871, "learning_rate": 3.564678958344868e-05, "loss": 0.5894, "step": 21575 }, { "epoch": 1.08, "grad_norm": 3.3420209884643555, "learning_rate": 3.5623719616853995e-05, "loss": 0.398, "step": 21600 }, { "epoch": 1.08, "grad_norm": 2.7199301719665527, "learning_rate": 3.560064965025931e-05, "loss": 0.5236, "step": 21625 }, { "epoch": 1.08, "grad_norm": 4.216751575469971, "learning_rate": 3.557757968366462e-05, 
"loss": 0.3373, "step": 21650 }, { "epoch": 1.08, "grad_norm": 28.876190185546875, "learning_rate": 3.5554509717069936e-05, "loss": 0.4816, "step": 21675 }, { "epoch": 1.08, "grad_norm": 6.362488269805908, "learning_rate": 3.553143975047524e-05, "loss": 0.4663, "step": 21700 }, { "epoch": 1.08, "grad_norm": 2.0933680534362793, "learning_rate": 3.550836978388056e-05, "loss": 0.5506, "step": 21725 }, { "epoch": 1.08, "grad_norm": 6.475625514984131, "learning_rate": 3.548529981728587e-05, "loss": 0.4748, "step": 21750 }, { "epoch": 1.09, "grad_norm": 6.892195701599121, "learning_rate": 3.5462229850691174e-05, "loss": 0.5035, "step": 21775 }, { "epoch": 1.09, "grad_norm": 2.857177495956421, "learning_rate": 3.543915988409649e-05, "loss": 0.5092, "step": 21800 }, { "epoch": 1.09, "grad_norm": 6.091822624206543, "learning_rate": 3.54160899175018e-05, "loss": 0.4765, "step": 21825 }, { "epoch": 1.09, "grad_norm": 2.356370687484741, "learning_rate": 3.5393019950907115e-05, "loss": 0.484, "step": 21850 }, { "epoch": 1.09, "grad_norm": 8.744856834411621, "learning_rate": 3.536994998431242e-05, "loss": 0.4418, "step": 21875 }, { "epoch": 1.09, "grad_norm": 2.4514973163604736, "learning_rate": 3.534688001771774e-05, "loss": 0.4823, "step": 21900 }, { "epoch": 1.09, "grad_norm": 1.6482782363891602, "learning_rate": 3.532381005112305e-05, "loss": 0.409, "step": 21925 }, { "epoch": 1.09, "grad_norm": 6.030589580535889, "learning_rate": 3.530074008452836e-05, "loss": 0.4702, "step": 21950 }, { "epoch": 1.1, "grad_norm": 3.8618252277374268, "learning_rate": 3.527767011793367e-05, "loss": 0.4308, "step": 21975 }, { "epoch": 1.1, "grad_norm": 0.9432287812232971, "learning_rate": 3.525460015133898e-05, "loss": 0.4436, "step": 22000 }, { "epoch": 1.1, "grad_norm": 47.525272369384766, "learning_rate": 3.5231530184744295e-05, "loss": 0.5559, "step": 22025 }, { "epoch": 1.1, "grad_norm": 3.386293411254883, "learning_rate": 3.5208460218149606e-05, "loss": 0.4641, "step": 22050 }, { "epoch": 1.1, "grad_norm": 6.038970947265625, "learning_rate": 3.518539025155492e-05, "loss": 0.4247, "step": 22075 }, { "epoch": 1.1, "grad_norm": 8.06563663482666, "learning_rate": 3.516232028496023e-05, "loss": 0.4515, "step": 22100 }, { "epoch": 1.1, "grad_norm": 5.882358551025391, "learning_rate": 3.513925031836554e-05, "loss": 0.5434, "step": 22125 }, { "epoch": 1.1, "grad_norm": 18.23955726623535, "learning_rate": 3.511618035177085e-05, "loss": 0.4401, "step": 22150 }, { "epoch": 1.1, "grad_norm": 16.53597068786621, "learning_rate": 3.509311038517616e-05, "loss": 0.5405, "step": 22175 }, { "epoch": 1.11, "grad_norm": 6.848310947418213, "learning_rate": 3.5070040418581474e-05, "loss": 0.4074, "step": 22200 }, { "epoch": 1.11, "grad_norm": 2.1513144969940186, "learning_rate": 3.5046970451986785e-05, "loss": 0.4812, "step": 22225 }, { "epoch": 1.11, "grad_norm": 15.589600563049316, "learning_rate": 3.50239004853921e-05, "loss": 0.4576, "step": 22250 }, { "epoch": 1.11, "grad_norm": 5.110909938812256, "learning_rate": 3.5000830518797415e-05, "loss": 0.5207, "step": 22275 }, { "epoch": 1.11, "grad_norm": 1.8498058319091797, "learning_rate": 3.497776055220272e-05, "loss": 0.5286, "step": 22300 }, { "epoch": 1.11, "grad_norm": 3.860257387161255, "learning_rate": 3.495469058560804e-05, "loss": 0.5053, "step": 22325 }, { "epoch": 1.11, "grad_norm": 3.398019313812256, "learning_rate": 3.493162061901334e-05, "loss": 0.4769, "step": 22350 }, { "epoch": 1.11, "grad_norm": 2.5294039249420166, "learning_rate": 3.490855065241866e-05, "loss": 
0.4796, "step": 22375 }, { "epoch": 1.12, "grad_norm": 3.215113639831543, "learning_rate": 3.4885480685823965e-05, "loss": 0.6287, "step": 22400 }, { "epoch": 1.12, "grad_norm": 8.600980758666992, "learning_rate": 3.486241071922928e-05, "loss": 0.5, "step": 22425 }, { "epoch": 1.12, "grad_norm": 3.411648988723755, "learning_rate": 3.4839340752634594e-05, "loss": 0.4635, "step": 22450 }, { "epoch": 1.12, "grad_norm": 1.5979714393615723, "learning_rate": 3.4816270786039906e-05, "loss": 0.5175, "step": 22475 }, { "epoch": 1.12, "grad_norm": 15.22208023071289, "learning_rate": 3.479320081944522e-05, "loss": 0.5597, "step": 22500 }, { "epoch": 1.12, "grad_norm": 6.04987096786499, "learning_rate": 3.477013085285052e-05, "loss": 0.5439, "step": 22525 }, { "epoch": 1.12, "grad_norm": 6.620021343231201, "learning_rate": 3.474706088625584e-05, "loss": 0.4821, "step": 22550 }, { "epoch": 1.12, "grad_norm": 1.7440756559371948, "learning_rate": 3.472399091966115e-05, "loss": 0.4697, "step": 22575 }, { "epoch": 1.13, "grad_norm": 122.33167266845703, "learning_rate": 3.470092095306646e-05, "loss": 0.5446, "step": 22600 }, { "epoch": 1.13, "grad_norm": 1.3191596269607544, "learning_rate": 3.4677850986471774e-05, "loss": 0.4383, "step": 22625 }, { "epoch": 1.13, "grad_norm": 2.9673454761505127, "learning_rate": 3.4654781019877085e-05, "loss": 0.6085, "step": 22650 }, { "epoch": 1.13, "grad_norm": 2.2541136741638184, "learning_rate": 3.46317110532824e-05, "loss": 0.5107, "step": 22675 }, { "epoch": 1.13, "grad_norm": 3.6315643787384033, "learning_rate": 3.460864108668771e-05, "loss": 0.4303, "step": 22700 }, { "epoch": 1.13, "grad_norm": 9.80652141571045, "learning_rate": 3.458557112009302e-05, "loss": 0.5278, "step": 22725 }, { "epoch": 1.13, "grad_norm": 306.9204406738281, "learning_rate": 3.456250115349833e-05, "loss": 0.5635, "step": 22750 }, { "epoch": 1.13, "grad_norm": 7.718824863433838, "learning_rate": 3.453943118690364e-05, "loss": 0.5989, "step": 22775 }, { "epoch": 1.14, "grad_norm": 6.900171279907227, "learning_rate": 3.451636122030896e-05, "loss": 0.5765, "step": 22800 }, { "epoch": 1.14, "grad_norm": 2.9811198711395264, "learning_rate": 3.4493291253714265e-05, "loss": 0.6049, "step": 22825 }, { "epoch": 1.14, "grad_norm": 3.2900378704071045, "learning_rate": 3.447022128711958e-05, "loss": 0.7042, "step": 22850 }, { "epoch": 1.14, "grad_norm": 1.831827163696289, "learning_rate": 3.444715132052489e-05, "loss": 0.5314, "step": 22875 }, { "epoch": 1.14, "grad_norm": 251.12879943847656, "learning_rate": 3.44240813539302e-05, "loss": 0.5267, "step": 22900 }, { "epoch": 1.14, "grad_norm": 4.73264217376709, "learning_rate": 3.440101138733551e-05, "loss": 0.5608, "step": 22925 }, { "epoch": 1.14, "grad_norm": 130.6006622314453, "learning_rate": 3.437794142074082e-05, "loss": 0.532, "step": 22950 }, { "epoch": 1.14, "grad_norm": 1.9377081394195557, "learning_rate": 3.435487145414614e-05, "loss": 0.4259, "step": 22975 }, { "epoch": 1.15, "grad_norm": 2.885848045349121, "learning_rate": 3.4331801487551444e-05, "loss": 0.5345, "step": 23000 }, { "epoch": 1.15, "grad_norm": 2.7751710414886475, "learning_rate": 3.430873152095676e-05, "loss": 0.4862, "step": 23025 }, { "epoch": 1.15, "grad_norm": 2.8456525802612305, "learning_rate": 3.428566155436207e-05, "loss": 0.5253, "step": 23050 }, { "epoch": 1.15, "grad_norm": 3.7487411499023438, "learning_rate": 3.4262591587767385e-05, "loss": 0.5467, "step": 23075 }, { "epoch": 1.15, "grad_norm": 1.3666393756866455, "learning_rate": 3.4239521621172697e-05, "loss": 
0.4968, "step": 23100 }, { "epoch": 1.15, "grad_norm": 1.7452878952026367, "learning_rate": 3.421645165457801e-05, "loss": 0.5518, "step": 23125 }, { "epoch": 1.15, "grad_norm": 2.3537375926971436, "learning_rate": 3.419338168798332e-05, "loss": 0.4887, "step": 23150 }, { "epoch": 1.15, "grad_norm": 3.2377073764801025, "learning_rate": 3.417031172138863e-05, "loss": 0.6111, "step": 23175 }, { "epoch": 1.16, "grad_norm": 16.65485954284668, "learning_rate": 3.414724175479394e-05, "loss": 0.4488, "step": 23200 }, { "epoch": 1.16, "grad_norm": 5.436980247497559, "learning_rate": 3.412417178819925e-05, "loss": 0.5589, "step": 23225 }, { "epoch": 1.16, "grad_norm": 4.784524440765381, "learning_rate": 3.4101101821604565e-05, "loss": 0.5464, "step": 23250 }, { "epoch": 1.16, "grad_norm": 1.517666220664978, "learning_rate": 3.4078031855009876e-05, "loss": 0.4781, "step": 23275 }, { "epoch": 1.16, "grad_norm": 6.419158935546875, "learning_rate": 3.405496188841519e-05, "loss": 0.3523, "step": 23300 }, { "epoch": 1.16, "grad_norm": 14.737177848815918, "learning_rate": 3.40318919218205e-05, "loss": 0.574, "step": 23325 }, { "epoch": 1.16, "grad_norm": 1.729177713394165, "learning_rate": 3.400882195522581e-05, "loss": 0.5458, "step": 23350 }, { "epoch": 1.16, "grad_norm": 1.4529409408569336, "learning_rate": 3.398575198863112e-05, "loss": 0.4995, "step": 23375 }, { "epoch": 1.17, "grad_norm": 1.5424673557281494, "learning_rate": 3.396268202203643e-05, "loss": 0.4908, "step": 23400 }, { "epoch": 1.17, "grad_norm": 3.3175251483917236, "learning_rate": 3.3939612055441744e-05, "loss": 0.5051, "step": 23425 }, { "epoch": 1.17, "grad_norm": 6.083924770355225, "learning_rate": 3.3916542088847056e-05, "loss": 0.4548, "step": 23450 }, { "epoch": 1.17, "grad_norm": 4.571666717529297, "learning_rate": 3.389347212225237e-05, "loss": 0.4662, "step": 23475 }, { "epoch": 1.17, "grad_norm": 2.936976671218872, "learning_rate": 3.3870402155657685e-05, "loss": 0.4087, "step": 23500 }, { "epoch": 1.17, "grad_norm": 6.462584495544434, "learning_rate": 3.384733218906299e-05, "loss": 0.4337, "step": 23525 }, { "epoch": 1.17, "grad_norm": 0.8171650171279907, "learning_rate": 3.382426222246831e-05, "loss": 0.4279, "step": 23550 }, { "epoch": 1.17, "grad_norm": 27.758996963500977, "learning_rate": 3.380119225587361e-05, "loss": 0.4673, "step": 23575 }, { "epoch": 1.18, "grad_norm": 1.5759187936782837, "learning_rate": 3.3778122289278924e-05, "loss": 0.4649, "step": 23600 }, { "epoch": 1.18, "grad_norm": 4.116755962371826, "learning_rate": 3.375505232268424e-05, "loss": 0.5568, "step": 23625 }, { "epoch": 1.18, "grad_norm": 12.596151351928711, "learning_rate": 3.3731982356089547e-05, "loss": 0.6374, "step": 23650 }, { "epoch": 1.18, "grad_norm": 2.9202468395233154, "learning_rate": 3.3708912389494865e-05, "loss": 0.4618, "step": 23675 }, { "epoch": 1.18, "grad_norm": 2.892897367477417, "learning_rate": 3.368584242290017e-05, "loss": 0.385, "step": 23700 }, { "epoch": 1.18, "grad_norm": 4.933587551116943, "learning_rate": 3.366277245630549e-05, "loss": 0.5328, "step": 23725 }, { "epoch": 1.18, "grad_norm": 1.407938003540039, "learning_rate": 3.363970248971079e-05, "loss": 0.4736, "step": 23750 }, { "epoch": 1.18, "grad_norm": 1.8879458904266357, "learning_rate": 3.361663252311611e-05, "loss": 0.582, "step": 23775 }, { "epoch": 1.19, "grad_norm": 1.3088123798370361, "learning_rate": 3.359356255652142e-05, "loss": 0.4891, "step": 23800 }, { "epoch": 1.19, "grad_norm": 2.8642852306365967, "learning_rate": 3.357049258992673e-05, 
"loss": 0.4502, "step": 23825 }, { "epoch": 1.19, "grad_norm": 2.1138124465942383, "learning_rate": 3.3547422623332044e-05, "loss": 0.5602, "step": 23850 }, { "epoch": 1.19, "grad_norm": 22.736108779907227, "learning_rate": 3.3524352656737356e-05, "loss": 0.4927, "step": 23875 }, { "epoch": 1.19, "grad_norm": 19.614036560058594, "learning_rate": 3.350128269014267e-05, "loss": 0.4496, "step": 23900 }, { "epoch": 1.19, "grad_norm": 6.80272912979126, "learning_rate": 3.347821272354798e-05, "loss": 0.5073, "step": 23925 }, { "epoch": 1.19, "grad_norm": 1.0270265340805054, "learning_rate": 3.345514275695329e-05, "loss": 0.3707, "step": 23950 }, { "epoch": 1.19, "grad_norm": 44.71577072143555, "learning_rate": 3.34320727903586e-05, "loss": 0.4172, "step": 23975 }, { "epoch": 1.2, "grad_norm": 3.711918354034424, "learning_rate": 3.340900282376391e-05, "loss": 0.5641, "step": 24000 }, { "epoch": 1.2, "grad_norm": 0.8723378777503967, "learning_rate": 3.3385932857169224e-05, "loss": 0.4355, "step": 24025 }, { "epoch": 1.2, "grad_norm": 0.9061195850372314, "learning_rate": 3.3362862890574535e-05, "loss": 0.4956, "step": 24050 }, { "epoch": 1.2, "grad_norm": 12.934722900390625, "learning_rate": 3.3339792923979846e-05, "loss": 0.3641, "step": 24075 }, { "epoch": 1.2, "grad_norm": 3.357600450515747, "learning_rate": 3.331672295738516e-05, "loss": 0.4216, "step": 24100 }, { "epoch": 1.2, "grad_norm": 2.2739009857177734, "learning_rate": 3.329365299079047e-05, "loss": 0.6057, "step": 24125 }, { "epoch": 1.2, "grad_norm": 12.44058895111084, "learning_rate": 3.327058302419579e-05, "loss": 0.567, "step": 24150 }, { "epoch": 1.2, "grad_norm": 6.135155200958252, "learning_rate": 3.324751305760109e-05, "loss": 0.4215, "step": 24175 }, { "epoch": 1.21, "grad_norm": 18.950393676757812, "learning_rate": 3.322444309100641e-05, "loss": 0.5002, "step": 24200 }, { "epoch": 1.21, "grad_norm": 18.473573684692383, "learning_rate": 3.3201373124411715e-05, "loss": 0.6292, "step": 24225 }, { "epoch": 1.21, "grad_norm": 2.5561163425445557, "learning_rate": 3.317830315781703e-05, "loss": 0.4706, "step": 24250 }, { "epoch": 1.21, "grad_norm": 1.382630467414856, "learning_rate": 3.315523319122234e-05, "loss": 0.295, "step": 24275 }, { "epoch": 1.21, "grad_norm": 0.9985918998718262, "learning_rate": 3.3132163224627655e-05, "loss": 0.5048, "step": 24300 }, { "epoch": 1.21, "grad_norm": 22.975221633911133, "learning_rate": 3.310909325803297e-05, "loss": 0.4621, "step": 24325 }, { "epoch": 1.21, "grad_norm": 3.7666635513305664, "learning_rate": 3.308602329143827e-05, "loss": 0.4485, "step": 24350 }, { "epoch": 1.21, "grad_norm": 3.244364023208618, "learning_rate": 3.306295332484359e-05, "loss": 0.4557, "step": 24375 }, { "epoch": 1.22, "grad_norm": 1.085253119468689, "learning_rate": 3.3039883358248894e-05, "loss": 0.4611, "step": 24400 }, { "epoch": 1.22, "grad_norm": 3.019808292388916, "learning_rate": 3.301681339165421e-05, "loss": 0.534, "step": 24425 }, { "epoch": 1.22, "grad_norm": 3.1418190002441406, "learning_rate": 3.299374342505952e-05, "loss": 0.5937, "step": 24450 }, { "epoch": 1.22, "grad_norm": 1.0198239088058472, "learning_rate": 3.2970673458464835e-05, "loss": 0.513, "step": 24475 }, { "epoch": 1.22, "grad_norm": 3.4835383892059326, "learning_rate": 3.2947603491870146e-05, "loss": 0.3619, "step": 24500 }, { "epoch": 1.22, "grad_norm": 2.089794635772705, "learning_rate": 3.292453352527546e-05, "loss": 0.5536, "step": 24525 }, { "epoch": 1.22, "grad_norm": 1.106353998184204, "learning_rate": 3.290146355868077e-05, 
"loss": 0.4232, "step": 24550 }, { "epoch": 1.22, "grad_norm": 3.080213785171509, "learning_rate": 3.287839359208608e-05, "loss": 0.5168, "step": 24575 }, { "epoch": 1.23, "grad_norm": 5.004645824432373, "learning_rate": 3.285532362549139e-05, "loss": 0.6027, "step": 24600 }, { "epoch": 1.23, "grad_norm": 4.234391689300537, "learning_rate": 3.28322536588967e-05, "loss": 0.3551, "step": 24625 }, { "epoch": 1.23, "grad_norm": 66.61769104003906, "learning_rate": 3.2809183692302014e-05, "loss": 0.513, "step": 24650 }, { "epoch": 1.23, "grad_norm": 1.416574239730835, "learning_rate": 3.278611372570733e-05, "loss": 0.3775, "step": 24675 }, { "epoch": 1.23, "grad_norm": 6.356552600860596, "learning_rate": 3.276304375911264e-05, "loss": 0.4529, "step": 24700 }, { "epoch": 1.23, "grad_norm": 1.3848146200180054, "learning_rate": 3.273997379251795e-05, "loss": 0.4677, "step": 24725 }, { "epoch": 1.23, "grad_norm": 2.0824124813079834, "learning_rate": 3.271690382592326e-05, "loss": 0.5075, "step": 24750 }, { "epoch": 1.23, "grad_norm": 1.3122801780700684, "learning_rate": 3.269383385932857e-05, "loss": 0.4526, "step": 24775 }, { "epoch": 1.24, "grad_norm": 1.8558235168457031, "learning_rate": 3.267076389273388e-05, "loss": 0.5952, "step": 24800 }, { "epoch": 1.24, "grad_norm": 1.6498969793319702, "learning_rate": 3.2647693926139194e-05, "loss": 0.5372, "step": 24825 }, { "epoch": 1.24, "grad_norm": 1.6553959846496582, "learning_rate": 3.262462395954451e-05, "loss": 0.4644, "step": 24850 }, { "epoch": 1.24, "grad_norm": 9.277535438537598, "learning_rate": 3.260155399294982e-05, "loss": 0.568, "step": 24875 }, { "epoch": 1.24, "grad_norm": 1.7730066776275635, "learning_rate": 3.2578484026355135e-05, "loss": 0.3625, "step": 24900 }, { "epoch": 1.24, "grad_norm": 1.2242295742034912, "learning_rate": 3.255541405976044e-05, "loss": 0.411, "step": 24925 }, { "epoch": 1.24, "grad_norm": 3.2038660049438477, "learning_rate": 3.253234409316576e-05, "loss": 0.316, "step": 24950 }, { "epoch": 1.24, "grad_norm": 6.638166427612305, "learning_rate": 3.250927412657106e-05, "loss": 0.5497, "step": 24975 }, { "epoch": 1.25, "grad_norm": 0.8951555490493774, "learning_rate": 3.248620415997638e-05, "loss": 0.432, "step": 25000 }, { "epoch": 1.25, "grad_norm": 3.3353822231292725, "learning_rate": 3.246313419338169e-05, "loss": 0.4795, "step": 25025 }, { "epoch": 1.25, "grad_norm": 2.7513251304626465, "learning_rate": 3.2440064226787e-05, "loss": 0.5432, "step": 25050 }, { "epoch": 1.25, "grad_norm": 0.9779685735702515, "learning_rate": 3.2416994260192314e-05, "loss": 0.5657, "step": 25075 }, { "epoch": 1.25, "grad_norm": 4.009624004364014, "learning_rate": 3.239392429359762e-05, "loss": 0.4931, "step": 25100 }, { "epoch": 1.25, "grad_norm": 1.116674542427063, "learning_rate": 3.237085432700294e-05, "loss": 0.5125, "step": 25125 }, { "epoch": 1.25, "grad_norm": 5.3419976234436035, "learning_rate": 3.234778436040825e-05, "loss": 0.5456, "step": 25150 }, { "epoch": 1.25, "grad_norm": 4.242710590362549, "learning_rate": 3.232471439381356e-05, "loss": 0.4269, "step": 25175 }, { "epoch": 1.26, "grad_norm": 1.4048486948013306, "learning_rate": 3.230164442721887e-05, "loss": 0.348, "step": 25200 }, { "epoch": 1.26, "grad_norm": 8.610922813415527, "learning_rate": 3.227857446062418e-05, "loss": 0.5558, "step": 25225 }, { "epoch": 1.26, "grad_norm": 0.9171805381774902, "learning_rate": 3.2255504494029494e-05, "loss": 0.547, "step": 25250 }, { "epoch": 1.26, "grad_norm": 5.653646945953369, "learning_rate": 3.2232434527434805e-05, 
"loss": 0.529, "step": 25275 }, { "epoch": 1.26, "grad_norm": 4.338497638702393, "learning_rate": 3.2209364560840117e-05, "loss": 0.5316, "step": 25300 }, { "epoch": 1.26, "grad_norm": 1.1895941495895386, "learning_rate": 3.218629459424543e-05, "loss": 0.3412, "step": 25325 }, { "epoch": 1.26, "grad_norm": 3.513719081878662, "learning_rate": 3.216322462765074e-05, "loss": 0.5027, "step": 25350 }, { "epoch": 1.26, "grad_norm": 3.2415273189544678, "learning_rate": 3.214015466105606e-05, "loss": 0.416, "step": 25375 }, { "epoch": 1.27, "grad_norm": 3.46956205368042, "learning_rate": 3.211708469446136e-05, "loss": 0.5001, "step": 25400 }, { "epoch": 1.27, "grad_norm": 5523.3828125, "learning_rate": 3.209401472786668e-05, "loss": 0.5623, "step": 25425 }, { "epoch": 1.27, "grad_norm": 3.629899024963379, "learning_rate": 3.2070944761271985e-05, "loss": 0.4153, "step": 25450 }, { "epoch": 1.27, "grad_norm": 11.945967674255371, "learning_rate": 3.2047874794677296e-05, "loss": 0.5597, "step": 25475 }, { "epoch": 1.27, "grad_norm": 1.553520917892456, "learning_rate": 3.202480482808261e-05, "loss": 0.4993, "step": 25500 }, { "epoch": 1.27, "grad_norm": 1.838877558708191, "learning_rate": 3.200173486148792e-05, "loss": 0.3805, "step": 25525 }, { "epoch": 1.27, "grad_norm": 1.2891180515289307, "learning_rate": 3.197866489489324e-05, "loss": 0.6021, "step": 25550 }, { "epoch": 1.27, "grad_norm": 1.362278699874878, "learning_rate": 3.195559492829854e-05, "loss": 0.4973, "step": 25575 }, { "epoch": 1.28, "grad_norm": 50.55386734008789, "learning_rate": 3.193252496170386e-05, "loss": 0.403, "step": 25600 }, { "epoch": 1.28, "grad_norm": 3.0399601459503174, "learning_rate": 3.1909454995109164e-05, "loss": 0.6989, "step": 25625 }, { "epoch": 1.28, "grad_norm": 2.681670665740967, "learning_rate": 3.188638502851448e-05, "loss": 0.5274, "step": 25650 }, { "epoch": 1.28, "grad_norm": 6.848381042480469, "learning_rate": 3.1863315061919794e-05, "loss": 0.3641, "step": 25675 }, { "epoch": 1.28, "grad_norm": 1.4316284656524658, "learning_rate": 3.1840245095325105e-05, "loss": 0.4657, "step": 25700 }, { "epoch": 1.28, "grad_norm": 1.1102467775344849, "learning_rate": 3.1817175128730416e-05, "loss": 0.4894, "step": 25725 }, { "epoch": 1.28, "grad_norm": 8.56735610961914, "learning_rate": 3.179410516213573e-05, "loss": 0.5228, "step": 25750 }, { "epoch": 1.28, "grad_norm": 2.549102544784546, "learning_rate": 3.177103519554104e-05, "loss": 0.4387, "step": 25775 }, { "epoch": 1.29, "grad_norm": 8.965352058410645, "learning_rate": 3.174796522894635e-05, "loss": 0.6523, "step": 25800 }, { "epoch": 1.29, "grad_norm": 3.4664347171783447, "learning_rate": 3.172489526235166e-05, "loss": 0.6127, "step": 25825 }, { "epoch": 1.29, "grad_norm": 2.6698806285858154, "learning_rate": 3.170182529575697e-05, "loss": 0.4529, "step": 25850 }, { "epoch": 1.29, "grad_norm": 6.293718338012695, "learning_rate": 3.1678755329162285e-05, "loss": 0.5818, "step": 25875 }, { "epoch": 1.29, "grad_norm": 1.8514574766159058, "learning_rate": 3.1655685362567596e-05, "loss": 0.4983, "step": 25900 }, { "epoch": 1.29, "grad_norm": 1.4830529689788818, "learning_rate": 3.163261539597291e-05, "loss": 0.5502, "step": 25925 }, { "epoch": 1.29, "grad_norm": 1.423803687095642, "learning_rate": 3.160954542937822e-05, "loss": 0.442, "step": 25950 }, { "epoch": 1.29, "grad_norm": 1.1572527885437012, "learning_rate": 3.158647546278353e-05, "loss": 0.5089, "step": 25975 }, { "epoch": 1.3, "grad_norm": 2.6536920070648193, "learning_rate": 3.156340549618884e-05, 
"loss": 0.4971, "step": 26000 }, { "epoch": 1.3, "grad_norm": 3.075524091720581, "learning_rate": 3.154033552959415e-05, "loss": 0.5367, "step": 26025 }, { "epoch": 1.3, "grad_norm": 1.6290290355682373, "learning_rate": 3.1517265562999464e-05, "loss": 0.529, "step": 26050 }, { "epoch": 1.3, "grad_norm": 2.615154266357422, "learning_rate": 3.149419559640478e-05, "loss": 0.5062, "step": 26075 }, { "epoch": 1.3, "grad_norm": 4.603691101074219, "learning_rate": 3.147112562981009e-05, "loss": 0.443, "step": 26100 }, { "epoch": 1.3, "grad_norm": 1.1716935634613037, "learning_rate": 3.1448055663215405e-05, "loss": 0.5058, "step": 26125 }, { "epoch": 1.3, "grad_norm": 1.8815289735794067, "learning_rate": 3.142498569662071e-05, "loss": 0.3954, "step": 26150 }, { "epoch": 1.3, "grad_norm": 3.429013252258301, "learning_rate": 3.140191573002603e-05, "loss": 0.538, "step": 26175 }, { "epoch": 1.31, "grad_norm": 4.653341293334961, "learning_rate": 3.137884576343134e-05, "loss": 0.5478, "step": 26200 }, { "epoch": 1.31, "grad_norm": 51.642330169677734, "learning_rate": 3.1355775796836644e-05, "loss": 0.5739, "step": 26225 }, { "epoch": 1.31, "grad_norm": 3.7197000980377197, "learning_rate": 3.133270583024196e-05, "loss": 0.5179, "step": 26250 }, { "epoch": 1.31, "grad_norm": 5.795961380004883, "learning_rate": 3.1309635863647266e-05, "loss": 0.4714, "step": 26275 }, { "epoch": 1.31, "grad_norm": 4.7175703048706055, "learning_rate": 3.1286565897052585e-05, "loss": 0.4698, "step": 26300 }, { "epoch": 1.31, "grad_norm": 2.4297192096710205, "learning_rate": 3.126349593045789e-05, "loss": 0.4828, "step": 26325 }, { "epoch": 1.31, "grad_norm": 3.88303279876709, "learning_rate": 3.124042596386321e-05, "loss": 0.5602, "step": 26350 }, { "epoch": 1.31, "grad_norm": 6.631831169128418, "learning_rate": 3.121735599726852e-05, "loss": 0.4625, "step": 26375 }, { "epoch": 1.32, "grad_norm": 5.661457061767578, "learning_rate": 3.119428603067383e-05, "loss": 0.5065, "step": 26400 }, { "epoch": 1.32, "grad_norm": 11.636333465576172, "learning_rate": 3.117121606407914e-05, "loss": 0.3525, "step": 26425 }, { "epoch": 1.32, "grad_norm": 1.0581868886947632, "learning_rate": 3.114814609748445e-05, "loss": 0.4466, "step": 26450 }, { "epoch": 1.32, "grad_norm": 3.096194267272949, "learning_rate": 3.1125076130889764e-05, "loss": 0.4712, "step": 26475 }, { "epoch": 1.32, "grad_norm": 6.5298075675964355, "learning_rate": 3.1102006164295075e-05, "loss": 0.5473, "step": 26500 }, { "epoch": 1.32, "grad_norm": 3.341557025909424, "learning_rate": 3.107893619770039e-05, "loss": 0.5103, "step": 26525 }, { "epoch": 1.32, "grad_norm": 7.310173988342285, "learning_rate": 3.10558662311057e-05, "loss": 0.4153, "step": 26550 }, { "epoch": 1.32, "grad_norm": 1.1792340278625488, "learning_rate": 3.103279626451101e-05, "loss": 0.5409, "step": 26575 }, { "epoch": 1.33, "grad_norm": 14.870302200317383, "learning_rate": 3.100972629791632e-05, "loss": 0.4116, "step": 26600 }, { "epoch": 1.33, "grad_norm": 1.2999331951141357, "learning_rate": 3.098665633132163e-05, "loss": 0.4566, "step": 26625 }, { "epoch": 1.33, "grad_norm": 6.613809585571289, "learning_rate": 3.0963586364726944e-05, "loss": 0.534, "step": 26650 }, { "epoch": 1.33, "grad_norm": 2.9065253734588623, "learning_rate": 3.0940516398132255e-05, "loss": 0.5131, "step": 26675 }, { "epoch": 1.33, "grad_norm": 2.785900115966797, "learning_rate": 3.0917446431537566e-05, "loss": 0.442, "step": 26700 }, { "epoch": 1.33, "grad_norm": 14.88037109375, "learning_rate": 3.0894376464942884e-05, "loss": 
0.4574, "step": 26725 }, { "epoch": 1.33, "grad_norm": 4.1157636642456055, "learning_rate": 3.087130649834819e-05, "loss": 0.5106, "step": 26750 }, { "epoch": 1.33, "grad_norm": 2.2302122116088867, "learning_rate": 3.084823653175351e-05, "loss": 0.4049, "step": 26775 }, { "epoch": 1.34, "grad_norm": 4.412403106689453, "learning_rate": 3.082516656515881e-05, "loss": 0.4754, "step": 26800 }, { "epoch": 1.34, "grad_norm": 5.906436443328857, "learning_rate": 3.080209659856413e-05, "loss": 0.537, "step": 26825 }, { "epoch": 1.34, "grad_norm": 5.8190836906433105, "learning_rate": 3.0779026631969434e-05, "loss": 0.5667, "step": 26850 }, { "epoch": 1.34, "grad_norm": 2.78393292427063, "learning_rate": 3.075595666537475e-05, "loss": 0.4906, "step": 26875 }, { "epoch": 1.34, "grad_norm": 3.552785873413086, "learning_rate": 3.0732886698780064e-05, "loss": 0.4337, "step": 26900 }, { "epoch": 1.34, "grad_norm": 4.454800128936768, "learning_rate": 3.0709816732185375e-05, "loss": 0.4113, "step": 26925 }, { "epoch": 1.34, "grad_norm": 4.132376670837402, "learning_rate": 3.068674676559069e-05, "loss": 0.4546, "step": 26950 }, { "epoch": 1.34, "grad_norm": 5.598781108856201, "learning_rate": 3.066367679899599e-05, "loss": 0.4533, "step": 26975 }, { "epoch": 1.35, "grad_norm": 19.695556640625, "learning_rate": 3.064060683240131e-05, "loss": 0.5621, "step": 27000 }, { "epoch": 1.35, "grad_norm": 1.143239140510559, "learning_rate": 3.061753686580662e-05, "loss": 0.5251, "step": 27025 }, { "epoch": 1.35, "grad_norm": 1.0635647773742676, "learning_rate": 3.059446689921193e-05, "loss": 0.4429, "step": 27050 }, { "epoch": 1.35, "grad_norm": 43.61861801147461, "learning_rate": 3.0571396932617243e-05, "loss": 0.4055, "step": 27075 }, { "epoch": 1.35, "grad_norm": 6.694369316101074, "learning_rate": 3.0548326966022555e-05, "loss": 0.4343, "step": 27100 }, { "epoch": 1.35, "grad_norm": 3.3620946407318115, "learning_rate": 3.0525256999427866e-05, "loss": 0.5503, "step": 27125 }, { "epoch": 1.35, "grad_norm": 1.006640911102295, "learning_rate": 3.050218703283318e-05, "loss": 0.5391, "step": 27150 }, { "epoch": 1.35, "grad_norm": 3.3535425662994385, "learning_rate": 3.047911706623849e-05, "loss": 0.6485, "step": 27175 }, { "epoch": 1.36, "grad_norm": 1.531585931777954, "learning_rate": 3.0456047099643804e-05, "loss": 0.4886, "step": 27200 }, { "epoch": 1.36, "grad_norm": 5.89702844619751, "learning_rate": 3.043297713304911e-05, "loss": 0.6445, "step": 27225 }, { "epoch": 1.36, "grad_norm": 2.3690192699432373, "learning_rate": 3.0409907166454426e-05, "loss": 0.4703, "step": 27250 }, { "epoch": 1.36, "grad_norm": 2.31122088432312, "learning_rate": 3.0386837199859734e-05, "loss": 0.5204, "step": 27275 }, { "epoch": 1.36, "grad_norm": 1.4227750301361084, "learning_rate": 3.036376723326505e-05, "loss": 0.464, "step": 27300 }, { "epoch": 1.36, "grad_norm": 1.3377498388290405, "learning_rate": 3.034069726667036e-05, "loss": 0.4762, "step": 27325 }, { "epoch": 1.36, "grad_norm": 4.721393585205078, "learning_rate": 3.031762730007567e-05, "loss": 0.4241, "step": 27350 }, { "epoch": 1.36, "grad_norm": 1.1271649599075317, "learning_rate": 3.0294557333480983e-05, "loss": 0.3842, "step": 27375 }, { "epoch": 1.37, "grad_norm": 49.33847427368164, "learning_rate": 3.027148736688629e-05, "loss": 0.4582, "step": 27400 }, { "epoch": 1.37, "grad_norm": 6.304983615875244, "learning_rate": 3.0248417400291606e-05, "loss": 0.447, "step": 27425 }, { "epoch": 1.37, "grad_norm": 20.872663497924805, "learning_rate": 3.0225347433696917e-05, "loss": 
0.461, "step": 27450 }, { "epoch": 1.37, "grad_norm": 0.8352044224739075, "learning_rate": 3.020227746710223e-05, "loss": 0.4513, "step": 27475 }, { "epoch": 1.37, "grad_norm": 3.6428568363189697, "learning_rate": 3.017920750050754e-05, "loss": 0.4375, "step": 27500 }, { "epoch": 1.37, "grad_norm": 0.8028072714805603, "learning_rate": 3.0156137533912855e-05, "loss": 0.3919, "step": 27525 }, { "epoch": 1.37, "grad_norm": 3.726179838180542, "learning_rate": 3.0133067567318163e-05, "loss": 0.5349, "step": 27550 }, { "epoch": 1.37, "grad_norm": 2.6823785305023193, "learning_rate": 3.0109997600723477e-05, "loss": 0.6074, "step": 27575 }, { "epoch": 1.38, "grad_norm": 6.498818397521973, "learning_rate": 3.0086927634128785e-05, "loss": 0.3749, "step": 27600 }, { "epoch": 1.38, "grad_norm": 4.5009026527404785, "learning_rate": 3.00638576675341e-05, "loss": 0.4709, "step": 27625 }, { "epoch": 1.38, "grad_norm": 64.75154113769531, "learning_rate": 3.004078770093941e-05, "loss": 0.3149, "step": 27650 }, { "epoch": 1.38, "grad_norm": 3.188356637954712, "learning_rate": 3.0017717734344726e-05, "loss": 0.4817, "step": 27675 }, { "epoch": 1.38, "grad_norm": 1.605098009109497, "learning_rate": 2.9994647767750034e-05, "loss": 0.532, "step": 27700 }, { "epoch": 1.38, "grad_norm": 3.195401191711426, "learning_rate": 2.9971577801155342e-05, "loss": 0.3489, "step": 27725 }, { "epoch": 1.38, "grad_norm": 1.9191489219665527, "learning_rate": 2.9948507834560657e-05, "loss": 0.4145, "step": 27750 }, { "epoch": 1.38, "grad_norm": 0.909359335899353, "learning_rate": 2.9925437867965965e-05, "loss": 0.4814, "step": 27775 }, { "epoch": 1.39, "grad_norm": 3497.900634765625, "learning_rate": 2.990236790137128e-05, "loss": 0.6498, "step": 27800 }, { "epoch": 1.39, "grad_norm": 3.320150375366211, "learning_rate": 2.987929793477659e-05, "loss": 0.3288, "step": 27825 }, { "epoch": 1.39, "grad_norm": 20.31009292602539, "learning_rate": 2.9856227968181906e-05, "loss": 0.5846, "step": 27850 }, { "epoch": 1.39, "grad_norm": 11.665755271911621, "learning_rate": 2.9833158001587214e-05, "loss": 0.4474, "step": 27875 }, { "epoch": 1.39, "grad_norm": 3.6207375526428223, "learning_rate": 2.981008803499253e-05, "loss": 0.6815, "step": 27900 }, { "epoch": 1.39, "grad_norm": 5.2110419273376465, "learning_rate": 2.9787018068397836e-05, "loss": 0.5543, "step": 27925 }, { "epoch": 1.39, "grad_norm": 1.0339158773422241, "learning_rate": 2.976394810180315e-05, "loss": 0.4515, "step": 27950 }, { "epoch": 1.39, "grad_norm": 3.7517521381378174, "learning_rate": 2.9740878135208463e-05, "loss": 0.5052, "step": 27975 }, { "epoch": 1.4, "grad_norm": 4.305375576019287, "learning_rate": 2.9717808168613774e-05, "loss": 0.5438, "step": 28000 }, { "epoch": 1.4, "grad_norm": 2.0723867416381836, "learning_rate": 2.9694738202019085e-05, "loss": 0.6635, "step": 28025 }, { "epoch": 1.4, "grad_norm": 1.6509737968444824, "learning_rate": 2.96716682354244e-05, "loss": 0.4156, "step": 28050 }, { "epoch": 1.4, "grad_norm": 3.1618893146514893, "learning_rate": 2.9648598268829708e-05, "loss": 0.4622, "step": 28075 }, { "epoch": 1.4, "grad_norm": 6.950445175170898, "learning_rate": 2.9625528302235016e-05, "loss": 0.4638, "step": 28100 }, { "epoch": 1.4, "grad_norm": 2.7668297290802, "learning_rate": 2.960245833564033e-05, "loss": 0.3667, "step": 28125 }, { "epoch": 1.4, "grad_norm": 1.7094498872756958, "learning_rate": 2.9579388369045642e-05, "loss": 0.5978, "step": 28150 }, { "epoch": 1.4, "grad_norm": 5.577686786651611, "learning_rate": 2.9556318402450957e-05, 
"loss": 0.5579, "step": 28175 }, { "epoch": 1.41, "grad_norm": 2.0594027042388916, "learning_rate": 2.9533248435856265e-05, "loss": 0.4377, "step": 28200 }, { "epoch": 1.41, "grad_norm": 3.223324775695801, "learning_rate": 2.951017846926158e-05, "loss": 0.4961, "step": 28225 }, { "epoch": 1.41, "grad_norm": 3.0184011459350586, "learning_rate": 2.9487108502666888e-05, "loss": 0.4569, "step": 28250 }, { "epoch": 1.41, "grad_norm": 4.878507614135742, "learning_rate": 2.9464038536072202e-05, "loss": 0.6598, "step": 28275 }, { "epoch": 1.41, "grad_norm": 2.9261884689331055, "learning_rate": 2.944096856947751e-05, "loss": 0.5696, "step": 28300 }, { "epoch": 1.41, "grad_norm": 1.7874714136123657, "learning_rate": 2.9417898602882825e-05, "loss": 0.6261, "step": 28325 }, { "epoch": 1.41, "grad_norm": 2.966109275817871, "learning_rate": 2.9394828636288136e-05, "loss": 0.4914, "step": 28350 }, { "epoch": 1.41, "grad_norm": 2.941281795501709, "learning_rate": 2.937175866969345e-05, "loss": 0.451, "step": 28375 }, { "epoch": 1.42, "grad_norm": 3.6125543117523193, "learning_rate": 2.934868870309876e-05, "loss": 0.5107, "step": 28400 }, { "epoch": 1.42, "grad_norm": 4.665797710418701, "learning_rate": 2.9325618736504074e-05, "loss": 0.6126, "step": 28425 }, { "epoch": 1.42, "grad_norm": 1.2434524297714233, "learning_rate": 2.9302548769909382e-05, "loss": 0.4911, "step": 28450 }, { "epoch": 1.42, "grad_norm": 93.26859283447266, "learning_rate": 2.9279478803314693e-05, "loss": 0.4562, "step": 28475 }, { "epoch": 1.42, "grad_norm": 5.395887851715088, "learning_rate": 2.9256408836720008e-05, "loss": 0.5041, "step": 28500 }, { "epoch": 1.42, "grad_norm": 6.143950939178467, "learning_rate": 2.9233338870125316e-05, "loss": 0.5255, "step": 28525 }, { "epoch": 1.42, "grad_norm": 1.3129888772964478, "learning_rate": 2.921026890353063e-05, "loss": 0.5034, "step": 28550 }, { "epoch": 1.42, "grad_norm": 3.8713059425354004, "learning_rate": 2.918719893693594e-05, "loss": 0.3438, "step": 28575 }, { "epoch": 1.43, "grad_norm": 0.9864943027496338, "learning_rate": 2.9164128970341253e-05, "loss": 0.5203, "step": 28600 }, { "epoch": 1.43, "grad_norm": 3.4831087589263916, "learning_rate": 2.914105900374656e-05, "loss": 0.5231, "step": 28625 }, { "epoch": 1.43, "grad_norm": 4.226259708404541, "learning_rate": 2.9117989037151876e-05, "loss": 0.5852, "step": 28650 }, { "epoch": 1.43, "grad_norm": 1.6076862812042236, "learning_rate": 2.9094919070557187e-05, "loss": 0.5976, "step": 28675 }, { "epoch": 1.43, "grad_norm": 2.1014654636383057, "learning_rate": 2.9071849103962502e-05, "loss": 0.4824, "step": 28700 }, { "epoch": 1.43, "grad_norm": 1.2649108171463013, "learning_rate": 2.904877913736781e-05, "loss": 0.5148, "step": 28725 }, { "epoch": 1.43, "grad_norm": 1.9344068765640259, "learning_rate": 2.9025709170773125e-05, "loss": 0.5006, "step": 28750 }, { "epoch": 1.43, "grad_norm": 1.1153630018234253, "learning_rate": 2.9002639204178433e-05, "loss": 0.3174, "step": 28775 }, { "epoch": 1.44, "grad_norm": 3.08996319770813, "learning_rate": 2.8979569237583748e-05, "loss": 0.4734, "step": 28800 }, { "epoch": 1.44, "grad_norm": 0.8343068361282349, "learning_rate": 2.8956499270989056e-05, "loss": 0.4373, "step": 28825 }, { "epoch": 1.44, "grad_norm": 2.541003465652466, "learning_rate": 2.8933429304394367e-05, "loss": 0.5624, "step": 28850 }, { "epoch": 1.44, "grad_norm": 1.278167486190796, "learning_rate": 2.891035933779968e-05, "loss": 0.3731, "step": 28875 }, { "epoch": 1.44, "grad_norm": 3.0557451248168945, "learning_rate": 
2.888728937120499e-05, "loss": 0.3506, "step": 28900 }, { "epoch": 1.44, "grad_norm": 4.243703365325928, "learning_rate": 2.8864219404610304e-05, "loss": 0.4886, "step": 28925 }, { "epoch": 1.44, "grad_norm": 3.341449737548828, "learning_rate": 2.8841149438015612e-05, "loss": 0.4455, "step": 28950 }, { "epoch": 1.44, "grad_norm": 2.9293224811553955, "learning_rate": 2.8818079471420927e-05, "loss": 0.4343, "step": 28975 }, { "epoch": 1.45, "grad_norm": 5.4256205558776855, "learning_rate": 2.879500950482624e-05, "loss": 0.5741, "step": 29000 }, { "epoch": 1.45, "grad_norm": 0.9868775606155396, "learning_rate": 2.877193953823155e-05, "loss": 0.4025, "step": 29025 }, { "epoch": 1.45, "grad_norm": 0.9488250017166138, "learning_rate": 2.874886957163686e-05, "loss": 0.4695, "step": 29050 }, { "epoch": 1.45, "grad_norm": 1.0704574584960938, "learning_rate": 2.8725799605042176e-05, "loss": 0.4965, "step": 29075 }, { "epoch": 1.45, "grad_norm": 1.0031787157058716, "learning_rate": 2.8702729638447484e-05, "loss": 0.4797, "step": 29100 }, { "epoch": 1.45, "grad_norm": 2.660698413848877, "learning_rate": 2.86796596718528e-05, "loss": 0.6116, "step": 29125 }, { "epoch": 1.45, "grad_norm": 2.4585986137390137, "learning_rate": 2.8656589705258107e-05, "loss": 0.4994, "step": 29150 }, { "epoch": 1.45, "grad_norm": 1.0483205318450928, "learning_rate": 2.863351973866342e-05, "loss": 0.5072, "step": 29175 }, { "epoch": 1.46, "grad_norm": 5.655982494354248, "learning_rate": 2.8610449772068733e-05, "loss": 0.5917, "step": 29200 }, { "epoch": 1.46, "grad_norm": 3.0197324752807617, "learning_rate": 2.858737980547404e-05, "loss": 0.3059, "step": 29225 }, { "epoch": 1.46, "grad_norm": 3.459602117538452, "learning_rate": 2.8564309838879355e-05, "loss": 0.5433, "step": 29250 }, { "epoch": 1.46, "grad_norm": 1.178184986114502, "learning_rate": 2.8541239872284663e-05, "loss": 0.4661, "step": 29275 }, { "epoch": 1.46, "grad_norm": 16.78719711303711, "learning_rate": 2.8518169905689978e-05, "loss": 0.4727, "step": 29300 }, { "epoch": 1.46, "grad_norm": 9.509754180908203, "learning_rate": 2.8495099939095286e-05, "loss": 0.488, "step": 29325 }, { "epoch": 1.46, "grad_norm": 9.90145492553711, "learning_rate": 2.84720299725006e-05, "loss": 0.5822, "step": 29350 }, { "epoch": 1.46, "grad_norm": 12.901089668273926, "learning_rate": 2.8448960005905912e-05, "loss": 0.4885, "step": 29375 }, { "epoch": 1.47, "grad_norm": 3.088007926940918, "learning_rate": 2.8425890039311227e-05, "loss": 0.6254, "step": 29400 }, { "epoch": 1.47, "grad_norm": 3.0785930156707764, "learning_rate": 2.8402820072716535e-05, "loss": 0.4276, "step": 29425 }, { "epoch": 1.47, "grad_norm": 6.4305572509765625, "learning_rate": 2.837975010612185e-05, "loss": 0.53, "step": 29450 }, { "epoch": 1.47, "grad_norm": 2.6937947273254395, "learning_rate": 2.8356680139527158e-05, "loss": 0.4267, "step": 29475 }, { "epoch": 1.47, "grad_norm": 30.603422164916992, "learning_rate": 2.8333610172932472e-05, "loss": 0.3964, "step": 29500 }, { "epoch": 1.47, "grad_norm": 3.560316562652588, "learning_rate": 2.8310540206337784e-05, "loss": 0.4128, "step": 29525 }, { "epoch": 1.47, "grad_norm": 16.946557998657227, "learning_rate": 2.8287470239743095e-05, "loss": 0.5844, "step": 29550 }, { "epoch": 1.47, "grad_norm": 3.1420838832855225, "learning_rate": 2.8264400273148407e-05, "loss": 0.5387, "step": 29575 }, { "epoch": 1.47, "grad_norm": 8.17186450958252, "learning_rate": 2.8241330306553714e-05, "loss": 0.4568, "step": 29600 }, { "epoch": 1.48, "grad_norm": 4.252970218658447, 
"learning_rate": 2.821826033995903e-05, "loss": 0.4813, "step": 29625 }, { "epoch": 1.48, "grad_norm": 17.932771682739258, "learning_rate": 2.8195190373364337e-05, "loss": 0.5344, "step": 29650 }, { "epoch": 1.48, "grad_norm": 1.9828652143478394, "learning_rate": 2.8172120406769652e-05, "loss": 0.5218, "step": 29675 }, { "epoch": 1.48, "grad_norm": 7.818408489227295, "learning_rate": 2.8149050440174963e-05, "loss": 0.4881, "step": 29700 }, { "epoch": 1.48, "grad_norm": 1.1083383560180664, "learning_rate": 2.8125980473580278e-05, "loss": 0.5118, "step": 29725 }, { "epoch": 1.48, "grad_norm": 1.450125813484192, "learning_rate": 2.8102910506985586e-05, "loss": 0.6522, "step": 29750 }, { "epoch": 1.48, "grad_norm": 6.853457927703857, "learning_rate": 2.80798405403909e-05, "loss": 0.5363, "step": 29775 }, { "epoch": 1.48, "grad_norm": 6.707515716552734, "learning_rate": 2.805677057379621e-05, "loss": 0.493, "step": 29800 }, { "epoch": 1.49, "grad_norm": 1.163601279258728, "learning_rate": 2.8033700607201524e-05, "loss": 0.3708, "step": 29825 }, { "epoch": 1.49, "grad_norm": 8.384958267211914, "learning_rate": 2.801063064060683e-05, "loss": 0.4337, "step": 29850 }, { "epoch": 1.49, "grad_norm": 3.240912914276123, "learning_rate": 2.7987560674012146e-05, "loss": 0.4612, "step": 29875 }, { "epoch": 1.49, "grad_norm": 1.8176313638687134, "learning_rate": 2.7964490707417458e-05, "loss": 0.5031, "step": 29900 }, { "epoch": 1.49, "grad_norm": 1.002866506576538, "learning_rate": 2.7941420740822772e-05, "loss": 0.4217, "step": 29925 }, { "epoch": 1.49, "grad_norm": 2.510270357131958, "learning_rate": 2.791835077422808e-05, "loss": 0.5367, "step": 29950 }, { "epoch": 1.49, "grad_norm": 16.336475372314453, "learning_rate": 2.7895280807633388e-05, "loss": 0.5, "step": 29975 }, { "epoch": 1.49, "grad_norm": 1.0017398595809937, "learning_rate": 2.7872210841038703e-05, "loss": 0.2951, "step": 30000 }, { "epoch": 1.5, "grad_norm": 3.2033021450042725, "learning_rate": 2.7849140874444014e-05, "loss": 0.5003, "step": 30025 }, { "epoch": 1.5, "grad_norm": 1.3507925271987915, "learning_rate": 2.782607090784933e-05, "loss": 0.4622, "step": 30050 }, { "epoch": 1.5, "grad_norm": 1.161468505859375, "learning_rate": 2.7803000941254637e-05, "loss": 0.4648, "step": 30075 }, { "epoch": 1.5, "grad_norm": 3.3827836513519287, "learning_rate": 2.7779930974659952e-05, "loss": 0.414, "step": 30100 }, { "epoch": 1.5, "grad_norm": 3.53877329826355, "learning_rate": 2.775686100806526e-05, "loss": 0.4735, "step": 30125 }, { "epoch": 1.5, "grad_norm": 14.931818962097168, "learning_rate": 2.7733791041470575e-05, "loss": 0.4685, "step": 30150 }, { "epoch": 1.5, "grad_norm": 1.213295340538025, "learning_rate": 2.7710721074875883e-05, "loss": 0.4797, "step": 30175 }, { "epoch": 1.5, "grad_norm": 16.991479873657227, "learning_rate": 2.7687651108281197e-05, "loss": 0.5182, "step": 30200 }, { "epoch": 1.51, "grad_norm": 4.055248260498047, "learning_rate": 2.766458114168651e-05, "loss": 0.5566, "step": 30225 }, { "epoch": 1.51, "grad_norm": 3.4977238178253174, "learning_rate": 2.7641511175091823e-05, "loss": 0.5738, "step": 30250 }, { "epoch": 1.51, "grad_norm": 111.34226989746094, "learning_rate": 2.761844120849713e-05, "loss": 0.4432, "step": 30275 }, { "epoch": 1.51, "grad_norm": 7.632834434509277, "learning_rate": 2.7595371241902446e-05, "loss": 0.5878, "step": 30300 }, { "epoch": 1.51, "grad_norm": 13.939888954162598, "learning_rate": 2.7572301275307754e-05, "loss": 0.6465, "step": 30325 }, { "epoch": 1.51, "grad_norm": 
6.650832653045654, "learning_rate": 2.7549231308713065e-05, "loss": 0.5544, "step": 30350 }, { "epoch": 1.51, "grad_norm": 3.3435723781585693, "learning_rate": 2.7526161342118377e-05, "loss": 0.5943, "step": 30375 }, { "epoch": 1.51, "grad_norm": 1.8094165325164795, "learning_rate": 2.7503091375523688e-05, "loss": 0.4553, "step": 30400 }, { "epoch": 1.52, "grad_norm": 13.593605041503906, "learning_rate": 2.7480021408929003e-05, "loss": 0.5378, "step": 30425 }, { "epoch": 1.52, "grad_norm": 2.0035171508789062, "learning_rate": 2.745695144233431e-05, "loss": 0.3657, "step": 30450 }, { "epoch": 1.52, "grad_norm": 5.202415943145752, "learning_rate": 2.7433881475739626e-05, "loss": 0.4987, "step": 30475 }, { "epoch": 1.52, "grad_norm": 2.9139647483825684, "learning_rate": 2.7410811509144934e-05, "loss": 0.4868, "step": 30500 }, { "epoch": 1.52, "grad_norm": 56.17787551879883, "learning_rate": 2.738774154255025e-05, "loss": 0.5008, "step": 30525 }, { "epoch": 1.52, "grad_norm": 3.269890308380127, "learning_rate": 2.736467157595556e-05, "loss": 0.505, "step": 30550 }, { "epoch": 1.52, "grad_norm": 0.9548706412315369, "learning_rate": 2.7341601609360874e-05, "loss": 0.39, "step": 30575 }, { "epoch": 1.52, "grad_norm": 1.014391541481018, "learning_rate": 2.7318531642766182e-05, "loss": 0.5306, "step": 30600 }, { "epoch": 1.53, "grad_norm": 337.91876220703125, "learning_rate": 2.7295461676171497e-05, "loss": 0.5413, "step": 30625 }, { "epoch": 1.53, "grad_norm": 4.666881084442139, "learning_rate": 2.7272391709576805e-05, "loss": 0.5424, "step": 30650 }, { "epoch": 1.53, "grad_norm": 0.9372049570083618, "learning_rate": 2.724932174298212e-05, "loss": 0.4929, "step": 30675 }, { "epoch": 1.53, "grad_norm": 1.1089849472045898, "learning_rate": 2.7226251776387428e-05, "loss": 0.3852, "step": 30700 }, { "epoch": 1.53, "grad_norm": 4.649927616119385, "learning_rate": 2.720318180979274e-05, "loss": 0.5104, "step": 30725 }, { "epoch": 1.53, "grad_norm": 2.7719218730926514, "learning_rate": 2.7180111843198054e-05, "loss": 0.5801, "step": 30750 }, { "epoch": 1.53, "grad_norm": 3.2304704189300537, "learning_rate": 2.7157041876603362e-05, "loss": 0.559, "step": 30775 }, { "epoch": 1.53, "grad_norm": 1.006312370300293, "learning_rate": 2.7133971910008677e-05, "loss": 0.3751, "step": 30800 }, { "epoch": 1.54, "grad_norm": 6.04379940032959, "learning_rate": 2.7110901943413985e-05, "loss": 0.4593, "step": 30825 }, { "epoch": 1.54, "grad_norm": 1.1266270875930786, "learning_rate": 2.70878319768193e-05, "loss": 0.4689, "step": 30850 }, { "epoch": 1.54, "grad_norm": 1.9402873516082764, "learning_rate": 2.706476201022461e-05, "loss": 0.4564, "step": 30875 }, { "epoch": 1.54, "grad_norm": 3.6099956035614014, "learning_rate": 2.7041692043629922e-05, "loss": 0.4079, "step": 30900 }, { "epoch": 1.54, "grad_norm": 1.433536171913147, "learning_rate": 2.7018622077035234e-05, "loss": 0.4365, "step": 30925 }, { "epoch": 1.54, "grad_norm": 3.8893444538116455, "learning_rate": 2.6995552110440548e-05, "loss": 0.5526, "step": 30950 }, { "epoch": 1.54, "grad_norm": 1.160085916519165, "learning_rate": 2.6972482143845856e-05, "loss": 0.4243, "step": 30975 }, { "epoch": 1.54, "grad_norm": 6.14224100112915, "learning_rate": 2.694941217725117e-05, "loss": 0.4999, "step": 31000 }, { "epoch": 1.55, "grad_norm": 6.147780895233154, "learning_rate": 2.692634221065648e-05, "loss": 0.4961, "step": 31025 }, { "epoch": 1.55, "grad_norm": 2.3592782020568848, "learning_rate": 2.6903272244061794e-05, "loss": 0.5433, "step": 31050 }, { "epoch": 1.55, 
"grad_norm": 1.0809153318405151, "learning_rate": 2.6880202277467105e-05, "loss": 0.5113, "step": 31075 }, { "epoch": 1.55, "grad_norm": 3.5836567878723145, "learning_rate": 2.6857132310872413e-05, "loss": 0.4979, "step": 31100 }, { "epoch": 1.55, "grad_norm": 0.9541612267494202, "learning_rate": 2.6834062344277728e-05, "loss": 0.3614, "step": 31125 }, { "epoch": 1.55, "grad_norm": 3.576939821243286, "learning_rate": 2.6810992377683036e-05, "loss": 0.4558, "step": 31150 }, { "epoch": 1.55, "grad_norm": 7.013927459716797, "learning_rate": 2.678792241108835e-05, "loss": 0.4799, "step": 31175 }, { "epoch": 1.55, "grad_norm": 5.320522308349609, "learning_rate": 2.676485244449366e-05, "loss": 0.518, "step": 31200 }, { "epoch": 1.56, "grad_norm": 1.2402065992355347, "learning_rate": 2.6741782477898973e-05, "loss": 0.409, "step": 31225 }, { "epoch": 1.56, "grad_norm": 1.8643181324005127, "learning_rate": 2.6718712511304285e-05, "loss": 0.5644, "step": 31250 }, { "epoch": 1.56, "grad_norm": 3.7191162109375, "learning_rate": 2.66956425447096e-05, "loss": 0.5179, "step": 31275 }, { "epoch": 1.56, "grad_norm": 3.634350061416626, "learning_rate": 2.6672572578114907e-05, "loss": 0.5734, "step": 31300 }, { "epoch": 1.56, "grad_norm": 1.830993890762329, "learning_rate": 2.6649502611520222e-05, "loss": 0.6335, "step": 31325 }, { "epoch": 1.56, "grad_norm": 2.8177618980407715, "learning_rate": 2.662643264492553e-05, "loss": 0.5966, "step": 31350 }, { "epoch": 1.56, "grad_norm": 4.917094707489014, "learning_rate": 2.6603362678330845e-05, "loss": 0.6458, "step": 31375 }, { "epoch": 1.56, "grad_norm": 3.4671552181243896, "learning_rate": 2.6580292711736156e-05, "loss": 0.6744, "step": 31400 }, { "epoch": 1.57, "grad_norm": 3.410064458847046, "learning_rate": 2.6557222745141467e-05, "loss": 0.6604, "step": 31425 }, { "epoch": 1.57, "grad_norm": 5.093179225921631, "learning_rate": 2.653415277854678e-05, "loss": 0.6048, "step": 31450 }, { "epoch": 1.57, "grad_norm": 1.9258631467819214, "learning_rate": 2.6511082811952087e-05, "loss": 0.6169, "step": 31475 }, { "epoch": 1.57, "grad_norm": 2.269477605819702, "learning_rate": 2.64880128453574e-05, "loss": 0.6868, "step": 31500 }, { "epoch": 1.57, "grad_norm": 1.7670726776123047, "learning_rate": 2.646494287876271e-05, "loss": 0.6663, "step": 31525 }, { "epoch": 1.57, "grad_norm": 1.7811284065246582, "learning_rate": 2.6441872912168024e-05, "loss": 0.7178, "step": 31550 }, { "epoch": 1.57, "grad_norm": 6.065280437469482, "learning_rate": 2.6418802945573336e-05, "loss": 0.6134, "step": 31575 }, { "epoch": 1.57, "grad_norm": 2.023129463195801, "learning_rate": 2.639573297897865e-05, "loss": 0.637, "step": 31600 }, { "epoch": 1.58, "grad_norm": 4.560355186462402, "learning_rate": 2.637266301238396e-05, "loss": 0.7594, "step": 31625 }, { "epoch": 1.58, "grad_norm": 3.2106659412384033, "learning_rate": 2.6349593045789273e-05, "loss": 0.7643, "step": 31650 }, { "epoch": 1.58, "grad_norm": 2.52014422416687, "learning_rate": 2.632652307919458e-05, "loss": 0.749, "step": 31675 }, { "epoch": 1.58, "grad_norm": 4.3201751708984375, "learning_rate": 2.6303453112599896e-05, "loss": 0.8099, "step": 31700 }, { "epoch": 1.58, "grad_norm": 3.502185344696045, "learning_rate": 2.6280383146005204e-05, "loss": 0.8316, "step": 31725 }, { "epoch": 1.58, "grad_norm": 5.395108699798584, "learning_rate": 2.625731317941052e-05, "loss": 0.8306, "step": 31750 }, { "epoch": 1.58, "grad_norm": 2.213697910308838, "learning_rate": 2.623424321281583e-05, "loss": 0.7331, "step": 31775 }, { "epoch": 
1.58, "grad_norm": 3.1291141510009766, "learning_rate": 2.6211173246221145e-05, "loss": 0.7876, "step": 31800 }, { "epoch": 1.59, "grad_norm": 1.7159450054168701, "learning_rate": 2.6188103279626453e-05, "loss": 0.7596, "step": 31825 }, { "epoch": 1.59, "grad_norm": 2.6130173206329346, "learning_rate": 2.616503331303176e-05, "loss": 0.8343, "step": 31850 }, { "epoch": 1.59, "grad_norm": 3.8219878673553467, "learning_rate": 2.6141963346437075e-05, "loss": 0.7811, "step": 31875 }, { "epoch": 1.59, "grad_norm": 2.0162181854248047, "learning_rate": 2.6118893379842387e-05, "loss": 0.7997, "step": 31900 }, { "epoch": 1.59, "grad_norm": 3.441479444503784, "learning_rate": 2.6095823413247698e-05, "loss": 0.8247, "step": 31925 }, { "epoch": 1.59, "grad_norm": 6.634566307067871, "learning_rate": 2.607275344665301e-05, "loss": 0.751, "step": 31950 }, { "epoch": 1.59, "grad_norm": 2.1950550079345703, "learning_rate": 2.6049683480058324e-05, "loss": 0.7754, "step": 31975 }, { "epoch": 1.59, "grad_norm": 2.526723623275757, "learning_rate": 2.6026613513463632e-05, "loss": 0.9298, "step": 32000 }, { "epoch": 1.6, "grad_norm": 4.879652976989746, "learning_rate": 2.6003543546868947e-05, "loss": 0.751, "step": 32025 }, { "epoch": 1.6, "grad_norm": 6.0123186111450195, "learning_rate": 2.5980473580274255e-05, "loss": 0.7762, "step": 32050 }, { "epoch": 1.6, "grad_norm": 4.540564060211182, "learning_rate": 2.595740361367957e-05, "loss": 0.8824, "step": 32075 }, { "epoch": 1.6, "grad_norm": 2.412745952606201, "learning_rate": 2.593433364708488e-05, "loss": 0.8124, "step": 32100 }, { "epoch": 1.6, "grad_norm": 5.162590503692627, "learning_rate": 2.5911263680490196e-05, "loss": 0.7411, "step": 32125 }, { "epoch": 1.6, "grad_norm": 3.4849741458892822, "learning_rate": 2.5888193713895504e-05, "loss": 0.7982, "step": 32150 }, { "epoch": 1.6, "grad_norm": 2.908165693283081, "learning_rate": 2.586512374730082e-05, "loss": 0.826, "step": 32175 }, { "epoch": 1.6, "grad_norm": 2.439852237701416, "learning_rate": 2.5842053780706126e-05, "loss": 0.8829, "step": 32200 }, { "epoch": 1.61, "grad_norm": 3.7252159118652344, "learning_rate": 2.5818983814111434e-05, "loss": 0.8104, "step": 32225 }, { "epoch": 1.61, "grad_norm": 1.851833701133728, "learning_rate": 2.579591384751675e-05, "loss": 0.8495, "step": 32250 }, { "epoch": 1.61, "grad_norm": 3.105205774307251, "learning_rate": 2.577284388092206e-05, "loss": 0.7484, "step": 32275 }, { "epoch": 1.61, "grad_norm": 5.3158650398254395, "learning_rate": 2.5749773914327375e-05, "loss": 0.8756, "step": 32300 }, { "epoch": 1.61, "grad_norm": 6.524014949798584, "learning_rate": 2.5726703947732683e-05, "loss": 0.8681, "step": 32325 }, { "epoch": 1.61, "grad_norm": 2.6202030181884766, "learning_rate": 2.5703633981137998e-05, "loss": 0.8961, "step": 32350 }, { "epoch": 1.61, "grad_norm": 3.272714853286743, "learning_rate": 2.5680564014543306e-05, "loss": 0.8115, "step": 32375 }, { "epoch": 1.61, "grad_norm": 5.991594314575195, "learning_rate": 2.565749404794862e-05, "loss": 0.8041, "step": 32400 }, { "epoch": 1.62, "grad_norm": 4.424630641937256, "learning_rate": 2.5634424081353932e-05, "loss": 0.8302, "step": 32425 }, { "epoch": 1.62, "grad_norm": 4.359757900238037, "learning_rate": 2.5611354114759243e-05, "loss": 0.824, "step": 32450 }, { "epoch": 1.62, "grad_norm": 3.0658962726593018, "learning_rate": 2.5588284148164555e-05, "loss": 0.8463, "step": 32475 }, { "epoch": 1.62, "grad_norm": 9.23780632019043, "learning_rate": 2.556521418156987e-05, "loss": 0.7813, "step": 32500 }, { 
"epoch": 1.62, "grad_norm": 3.0023868083953857, "learning_rate": 2.5542144214975177e-05, "loss": 0.8218, "step": 32525 }, { "epoch": 1.62, "grad_norm": 2.9863386154174805, "learning_rate": 2.5519074248380492e-05, "loss": 0.8183, "step": 32550 }, { "epoch": 1.62, "grad_norm": 5.385082244873047, "learning_rate": 2.54960042817858e-05, "loss": 0.7862, "step": 32575 }, { "epoch": 1.62, "grad_norm": 2.6118578910827637, "learning_rate": 2.547293431519111e-05, "loss": 0.8312, "step": 32600 }, { "epoch": 1.63, "grad_norm": 3.4267027378082275, "learning_rate": 2.5449864348596426e-05, "loss": 0.8242, "step": 32625 }, { "epoch": 1.63, "grad_norm": 5.174554347991943, "learning_rate": 2.5426794382001734e-05, "loss": 0.7855, "step": 32650 }, { "epoch": 1.63, "grad_norm": 4.1581807136535645, "learning_rate": 2.540372441540705e-05, "loss": 0.8032, "step": 32675 }, { "epoch": 1.63, "grad_norm": 4.021723747253418, "learning_rate": 2.5380654448812357e-05, "loss": 0.7469, "step": 32700 }, { "epoch": 1.63, "grad_norm": 3.3047544956207275, "learning_rate": 2.5357584482217672e-05, "loss": 0.8432, "step": 32725 }, { "epoch": 1.63, "grad_norm": 1.8279719352722168, "learning_rate": 2.533451451562298e-05, "loss": 0.8011, "step": 32750 }, { "epoch": 1.63, "grad_norm": 3.484745979309082, "learning_rate": 2.5311444549028294e-05, "loss": 0.77, "step": 32775 }, { "epoch": 1.63, "grad_norm": 2.0162298679351807, "learning_rate": 2.5288374582433606e-05, "loss": 0.7745, "step": 32800 }, { "epoch": 1.64, "grad_norm": 2.171647071838379, "learning_rate": 2.526530461583892e-05, "loss": 0.8607, "step": 32825 }, { "epoch": 1.64, "grad_norm": 3.357196807861328, "learning_rate": 2.524223464924423e-05, "loss": 0.7857, "step": 32850 }, { "epoch": 1.64, "grad_norm": 6.561802864074707, "learning_rate": 2.5219164682649543e-05, "loss": 0.838, "step": 32875 }, { "epoch": 1.64, "grad_norm": 4.378182888031006, "learning_rate": 2.519609471605485e-05, "loss": 0.8374, "step": 32900 }, { "epoch": 1.64, "grad_norm": 1.8582767248153687, "learning_rate": 2.5173024749460166e-05, "loss": 0.7927, "step": 32925 }, { "epoch": 1.64, "grad_norm": 6.331790447235107, "learning_rate": 2.5149954782865477e-05, "loss": 0.7299, "step": 32950 }, { "epoch": 1.64, "grad_norm": 4.185370922088623, "learning_rate": 2.5126884816270785e-05, "loss": 0.8594, "step": 32975 }, { "epoch": 1.64, "grad_norm": 6.100175380706787, "learning_rate": 2.51038148496761e-05, "loss": 0.7837, "step": 33000 }, { "epoch": 1.65, "grad_norm": 6.206463813781738, "learning_rate": 2.5080744883081408e-05, "loss": 0.7516, "step": 33025 }, { "epoch": 1.65, "grad_norm": 5.179276943206787, "learning_rate": 2.5057674916486723e-05, "loss": 0.7947, "step": 33050 }, { "epoch": 1.65, "grad_norm": 5.463741302490234, "learning_rate": 2.503460494989203e-05, "loss": 0.8988, "step": 33075 }, { "epoch": 1.65, "grad_norm": 2.222621202468872, "learning_rate": 2.5011534983297346e-05, "loss": 0.835, "step": 33100 }, { "epoch": 1.65, "grad_norm": 2.5178072452545166, "learning_rate": 2.4988465016702657e-05, "loss": 0.809, "step": 33125 }, { "epoch": 1.65, "grad_norm": 3.1115121841430664, "learning_rate": 2.4965395050107968e-05, "loss": 0.7798, "step": 33150 }, { "epoch": 1.65, "grad_norm": 3.3300487995147705, "learning_rate": 2.494232508351328e-05, "loss": 0.8173, "step": 33175 }, { "epoch": 1.65, "grad_norm": 2.000523090362549, "learning_rate": 2.491925511691859e-05, "loss": 0.8308, "step": 33200 }, { "epoch": 1.66, "grad_norm": 10.636764526367188, "learning_rate": 2.4896185150323902e-05, "loss": 0.8941, "step": 
33225 }, { "epoch": 1.66, "grad_norm": 1.9837878942489624, "learning_rate": 2.4873115183729214e-05, "loss": 0.8287, "step": 33250 }, { "epoch": 1.66, "grad_norm": 3.8704280853271484, "learning_rate": 2.4850045217134525e-05, "loss": 0.79, "step": 33275 }, { "epoch": 1.66, "grad_norm": 5.006420612335205, "learning_rate": 2.482697525053984e-05, "loss": 0.838, "step": 33300 }, { "epoch": 1.66, "grad_norm": 5.825026035308838, "learning_rate": 2.480390528394515e-05, "loss": 0.8216, "step": 33325 }, { "epoch": 1.66, "grad_norm": 2.980112075805664, "learning_rate": 2.4780835317350463e-05, "loss": 0.8636, "step": 33350 }, { "epoch": 1.66, "grad_norm": 3.990055799484253, "learning_rate": 2.4757765350755774e-05, "loss": 0.8725, "step": 33375 }, { "epoch": 1.66, "grad_norm": 2.0065150260925293, "learning_rate": 2.4734695384161085e-05, "loss": 0.8054, "step": 33400 }, { "epoch": 1.67, "grad_norm": 8.030442237854004, "learning_rate": 2.4711625417566397e-05, "loss": 0.8314, "step": 33425 }, { "epoch": 1.67, "grad_norm": 2.8273167610168457, "learning_rate": 2.4688555450971708e-05, "loss": 0.8094, "step": 33450 }, { "epoch": 1.67, "grad_norm": 2.2414333820343018, "learning_rate": 2.4665485484377023e-05, "loss": 0.889, "step": 33475 }, { "epoch": 1.67, "grad_norm": 3.5151126384735107, "learning_rate": 2.464241551778233e-05, "loss": 0.8246, "step": 33500 }, { "epoch": 1.67, "grad_norm": 9.038124084472656, "learning_rate": 2.4619345551187642e-05, "loss": 0.811, "step": 33525 }, { "epoch": 1.67, "grad_norm": 2.8050408363342285, "learning_rate": 2.4596275584592953e-05, "loss": 0.822, "step": 33550 }, { "epoch": 1.67, "grad_norm": 4.081467151641846, "learning_rate": 2.4573205617998265e-05, "loss": 0.7503, "step": 33575 }, { "epoch": 1.67, "grad_norm": 2.958592414855957, "learning_rate": 2.4550135651403576e-05, "loss": 0.8151, "step": 33600 }, { "epoch": 1.68, "grad_norm": 5.844561576843262, "learning_rate": 2.452706568480889e-05, "loss": 0.7702, "step": 33625 }, { "epoch": 1.68, "grad_norm": 2.1601386070251465, "learning_rate": 2.4503995718214202e-05, "loss": 0.7862, "step": 33650 }, { "epoch": 1.68, "grad_norm": 2.165982246398926, "learning_rate": 2.4480925751619514e-05, "loss": 0.8787, "step": 33675 }, { "epoch": 1.68, "grad_norm": 5.302867889404297, "learning_rate": 2.4457855785024825e-05, "loss": 0.8076, "step": 33700 }, { "epoch": 1.68, "grad_norm": 2.251314163208008, "learning_rate": 2.4434785818430136e-05, "loss": 0.8352, "step": 33725 }, { "epoch": 1.68, "grad_norm": 2.4113807678222656, "learning_rate": 2.4411715851835448e-05, "loss": 0.8318, "step": 33750 }, { "epoch": 1.68, "grad_norm": 3.341135263442993, "learning_rate": 2.438864588524076e-05, "loss": 0.7658, "step": 33775 }, { "epoch": 1.68, "grad_norm": 5.227036952972412, "learning_rate": 2.436557591864607e-05, "loss": 0.8928, "step": 33800 }, { "epoch": 1.69, "grad_norm": 7.890278339385986, "learning_rate": 2.4342505952051385e-05, "loss": 0.8021, "step": 33825 }, { "epoch": 1.69, "grad_norm": 2.11730694770813, "learning_rate": 2.4319435985456696e-05, "loss": 0.8418, "step": 33850 }, { "epoch": 1.69, "grad_norm": 4.105464935302734, "learning_rate": 2.4296366018862004e-05, "loss": 0.7946, "step": 33875 }, { "epoch": 1.69, "grad_norm": 4.777904033660889, "learning_rate": 2.4273296052267316e-05, "loss": 0.7699, "step": 33900 }, { "epoch": 1.69, "grad_norm": 8.123181343078613, "learning_rate": 2.4250226085672627e-05, "loss": 0.8535, "step": 33925 }, { "epoch": 1.69, "grad_norm": 5.432234764099121, "learning_rate": 2.422715611907794e-05, "loss": 0.84, 
"step": 33950 }, { "epoch": 1.69, "grad_norm": 5.834948539733887, "learning_rate": 2.4204086152483253e-05, "loss": 0.8552, "step": 33975 }, { "epoch": 1.69, "grad_norm": 3.680751323699951, "learning_rate": 2.4181016185888565e-05, "loss": 0.8243, "step": 34000 }, { "epoch": 1.7, "grad_norm": 9.424110412597656, "learning_rate": 2.4157946219293876e-05, "loss": 0.7783, "step": 34025 }, { "epoch": 1.7, "grad_norm": 4.688609600067139, "learning_rate": 2.4134876252699187e-05, "loss": 0.8046, "step": 34050 }, { "epoch": 1.7, "grad_norm": 4.4300432205200195, "learning_rate": 2.41118062861045e-05, "loss": 0.854, "step": 34075 }, { "epoch": 1.7, "grad_norm": 5.35884428024292, "learning_rate": 2.408873631950981e-05, "loss": 0.7425, "step": 34100 }, { "epoch": 1.7, "grad_norm": 2.651151657104492, "learning_rate": 2.406566635291512e-05, "loss": 0.847, "step": 34125 }, { "epoch": 1.7, "grad_norm": 9.372386932373047, "learning_rate": 2.4042596386320436e-05, "loss": 0.8982, "step": 34150 }, { "epoch": 1.7, "grad_norm": 2.5536534786224365, "learning_rate": 2.4019526419725748e-05, "loss": 0.8317, "step": 34175 }, { "epoch": 1.7, "grad_norm": 4.036647796630859, "learning_rate": 2.399645645313106e-05, "loss": 0.8515, "step": 34200 }, { "epoch": 1.71, "grad_norm": 3.1766724586486816, "learning_rate": 2.397338648653637e-05, "loss": 0.8698, "step": 34225 }, { "epoch": 1.71, "grad_norm": 9.911266326904297, "learning_rate": 2.3950316519941678e-05, "loss": 0.8694, "step": 34250 }, { "epoch": 1.71, "grad_norm": 8.645268440246582, "learning_rate": 2.392724655334699e-05, "loss": 0.8196, "step": 34275 }, { "epoch": 1.71, "grad_norm": 5.725925445556641, "learning_rate": 2.3904176586752304e-05, "loss": 0.839, "step": 34300 }, { "epoch": 1.71, "grad_norm": 1.862838625907898, "learning_rate": 2.3881106620157616e-05, "loss": 0.8578, "step": 34325 }, { "epoch": 1.71, "grad_norm": 2.1054186820983887, "learning_rate": 2.3858036653562927e-05, "loss": 0.8456, "step": 34350 }, { "epoch": 1.71, "grad_norm": 1.862760066986084, "learning_rate": 2.383496668696824e-05, "loss": 0.7978, "step": 34375 }, { "epoch": 1.71, "grad_norm": 5.457796096801758, "learning_rate": 2.381189672037355e-05, "loss": 0.8132, "step": 34400 }, { "epoch": 1.72, "grad_norm": 4.211350917816162, "learning_rate": 2.378882675377886e-05, "loss": 0.86, "step": 34425 }, { "epoch": 1.72, "grad_norm": 4.961258411407471, "learning_rate": 2.3765756787184172e-05, "loss": 0.8715, "step": 34450 }, { "epoch": 1.72, "grad_norm": 1.8565328121185303, "learning_rate": 2.3742686820589484e-05, "loss": 0.8418, "step": 34475 }, { "epoch": 1.72, "grad_norm": 2.928229570388794, "learning_rate": 2.37196168539948e-05, "loss": 0.8232, "step": 34500 }, { "epoch": 1.72, "grad_norm": 7.252219200134277, "learning_rate": 2.369654688740011e-05, "loss": 0.8422, "step": 34525 }, { "epoch": 1.72, "grad_norm": 7.312279224395752, "learning_rate": 2.367347692080542e-05, "loss": 0.8442, "step": 34550 }, { "epoch": 1.72, "grad_norm": 3.1411311626434326, "learning_rate": 2.3650406954210733e-05, "loss": 0.7978, "step": 34575 }, { "epoch": 1.72, "grad_norm": 2.388169050216675, "learning_rate": 2.3627336987616044e-05, "loss": 0.8161, "step": 34600 }, { "epoch": 1.73, "grad_norm": 2.1744918823242188, "learning_rate": 2.3604267021021352e-05, "loss": 0.8981, "step": 34625 }, { "epoch": 1.73, "grad_norm": 4.800972938537598, "learning_rate": 2.3581197054426667e-05, "loss": 0.831, "step": 34650 }, { "epoch": 1.73, "grad_norm": 2.453704357147217, "learning_rate": 2.3558127087831978e-05, "loss": 0.7941, "step": 
34675 }, { "epoch": 1.73, "grad_norm": 2.5256292819976807, "learning_rate": 2.353505712123729e-05, "loss": 0.8007, "step": 34700 }, { "epoch": 1.73, "grad_norm": 4.391716003417969, "learning_rate": 2.35119871546426e-05, "loss": 0.8477, "step": 34725 }, { "epoch": 1.73, "grad_norm": 2.577244997024536, "learning_rate": 2.3488917188047912e-05, "loss": 0.8366, "step": 34750 }, { "epoch": 1.73, "grad_norm": 3.9222066402435303, "learning_rate": 2.3465847221453224e-05, "loss": 0.8394, "step": 34775 }, { "epoch": 1.73, "grad_norm": 4.5864715576171875, "learning_rate": 2.3442777254858535e-05, "loss": 0.827, "step": 34800 }, { "epoch": 1.74, "grad_norm": 2.346217155456543, "learning_rate": 2.3419707288263846e-05, "loss": 0.8577, "step": 34825 }, { "epoch": 1.74, "grad_norm": 1.9931409358978271, "learning_rate": 2.339663732166916e-05, "loss": 0.8782, "step": 34850 }, { "epoch": 1.74, "grad_norm": 8.162564277648926, "learning_rate": 2.3373567355074472e-05, "loss": 0.781, "step": 34875 }, { "epoch": 1.74, "grad_norm": 7.050436973571777, "learning_rate": 2.3350497388479784e-05, "loss": 0.797, "step": 34900 }, { "epoch": 1.74, "grad_norm": 5.392847537994385, "learning_rate": 2.3327427421885095e-05, "loss": 0.7826, "step": 34925 }, { "epoch": 1.74, "grad_norm": 3.3648784160614014, "learning_rate": 2.3304357455290406e-05, "loss": 0.9437, "step": 34950 }, { "epoch": 1.74, "grad_norm": 7.639132022857666, "learning_rate": 2.3281287488695718e-05, "loss": 0.8861, "step": 34975 }, { "epoch": 1.74, "grad_norm": 5.233222007751465, "learning_rate": 2.325821752210103e-05, "loss": 0.8305, "step": 35000 }, { "epoch": 1.75, "grad_norm": 2.399179220199585, "learning_rate": 2.323514755550634e-05, "loss": 0.8388, "step": 35025 }, { "epoch": 1.75, "grad_norm": 2.207289218902588, "learning_rate": 2.3212077588911652e-05, "loss": 0.7974, "step": 35050 }, { "epoch": 1.75, "grad_norm": 3.358884572982788, "learning_rate": 2.3189007622316963e-05, "loss": 0.8339, "step": 35075 }, { "epoch": 1.75, "grad_norm": 6.761455059051514, "learning_rate": 2.3165937655722275e-05, "loss": 0.8042, "step": 35100 }, { "epoch": 1.75, "grad_norm": 1.8530350923538208, "learning_rate": 2.3142867689127586e-05, "loss": 0.8953, "step": 35125 }, { "epoch": 1.75, "grad_norm": 2.0336318016052246, "learning_rate": 2.3119797722532897e-05, "loss": 0.8082, "step": 35150 }, { "epoch": 1.75, "grad_norm": 1.8673982620239258, "learning_rate": 2.3096727755938212e-05, "loss": 0.7853, "step": 35175 }, { "epoch": 1.75, "grad_norm": 5.2938761711120605, "learning_rate": 2.3073657789343523e-05, "loss": 0.7803, "step": 35200 }, { "epoch": 1.76, "grad_norm": 2.7945752143859863, "learning_rate": 2.3050587822748835e-05, "loss": 0.8236, "step": 35225 }, { "epoch": 1.76, "grad_norm": 8.166348457336426, "learning_rate": 2.3027517856154146e-05, "loss": 0.8583, "step": 35250 }, { "epoch": 1.76, "grad_norm": 7.2717790603637695, "learning_rate": 2.3004447889559458e-05, "loss": 0.8103, "step": 35275 }, { "epoch": 1.76, "grad_norm": 4.064415454864502, "learning_rate": 2.298137792296477e-05, "loss": 0.8061, "step": 35300 }, { "epoch": 1.76, "grad_norm": 6.309088706970215, "learning_rate": 2.295830795637008e-05, "loss": 0.8702, "step": 35325 }, { "epoch": 1.76, "grad_norm": 4.678400993347168, "learning_rate": 2.293523798977539e-05, "loss": 0.7731, "step": 35350 }, { "epoch": 1.76, "grad_norm": 1.9187320470809937, "learning_rate": 2.2912168023180703e-05, "loss": 0.8482, "step": 35375 }, { "epoch": 1.76, "grad_norm": 4.612795352935791, "learning_rate": 2.2889098056586014e-05, "loss": 
0.8311, "step": 35400 }, { "epoch": 1.77, "grad_norm": 5.448324203491211, "learning_rate": 2.2866028089991326e-05, "loss": 0.854, "step": 35425 }, { "epoch": 1.77, "grad_norm": 7.444492816925049, "learning_rate": 2.2842958123396637e-05, "loss": 0.8343, "step": 35450 }, { "epoch": 1.77, "grad_norm": 3.258119821548462, "learning_rate": 2.281988815680195e-05, "loss": 0.8172, "step": 35475 }, { "epoch": 1.77, "grad_norm": 2.3989317417144775, "learning_rate": 2.279681819020726e-05, "loss": 0.8792, "step": 35500 }, { "epoch": 1.77, "grad_norm": 6.7266693115234375, "learning_rate": 2.2773748223612575e-05, "loss": 0.8326, "step": 35525 }, { "epoch": 1.77, "grad_norm": 8.093158721923828, "learning_rate": 2.2750678257017886e-05, "loss": 0.7305, "step": 35550 }, { "epoch": 1.77, "grad_norm": 6.749348163604736, "learning_rate": 2.2727608290423197e-05, "loss": 0.9024, "step": 35575 }, { "epoch": 1.77, "grad_norm": 1.8120150566101074, "learning_rate": 2.270453832382851e-05, "loss": 0.8003, "step": 35600 }, { "epoch": 1.78, "grad_norm": 5.8295793533325195, "learning_rate": 2.268146835723382e-05, "loss": 0.8163, "step": 35625 }, { "epoch": 1.78, "grad_norm": 5.463387489318848, "learning_rate": 2.265839839063913e-05, "loss": 0.8287, "step": 35650 }, { "epoch": 1.78, "grad_norm": 2.2227492332458496, "learning_rate": 2.2635328424044443e-05, "loss": 0.808, "step": 35675 }, { "epoch": 1.78, "grad_norm": 7.72734260559082, "learning_rate": 2.2612258457449757e-05, "loss": 0.7966, "step": 35700 }, { "epoch": 1.78, "grad_norm": 3.9670233726501465, "learning_rate": 2.2589188490855065e-05, "loss": 0.7873, "step": 35725 }, { "epoch": 1.78, "grad_norm": 4.4433207511901855, "learning_rate": 2.2566118524260377e-05, "loss": 0.9153, "step": 35750 }, { "epoch": 1.78, "grad_norm": 2.567361831665039, "learning_rate": 2.2543048557665688e-05, "loss": 0.816, "step": 35775 }, { "epoch": 1.78, "grad_norm": 2.7919533252716064, "learning_rate": 2.2519978591071e-05, "loss": 0.7996, "step": 35800 }, { "epoch": 1.79, "grad_norm": 1.8410005569458008, "learning_rate": 2.249690862447631e-05, "loss": 0.8555, "step": 35825 }, { "epoch": 1.79, "grad_norm": 3.8902974128723145, "learning_rate": 2.2473838657881626e-05, "loss": 0.8363, "step": 35850 }, { "epoch": 1.79, "grad_norm": 2.6485462188720703, "learning_rate": 2.2450768691286937e-05, "loss": 0.8928, "step": 35875 }, { "epoch": 1.79, "grad_norm": 3.155224323272705, "learning_rate": 2.2427698724692248e-05, "loss": 0.8476, "step": 35900 }, { "epoch": 1.79, "grad_norm": 2.0526411533355713, "learning_rate": 2.240462875809756e-05, "loss": 0.8015, "step": 35925 }, { "epoch": 1.79, "grad_norm": 2.686580181121826, "learning_rate": 2.238155879150287e-05, "loss": 0.7542, "step": 35950 }, { "epoch": 1.79, "grad_norm": 4.2547688484191895, "learning_rate": 2.2358488824908182e-05, "loss": 0.8228, "step": 35975 }, { "epoch": 1.79, "grad_norm": 5.983592987060547, "learning_rate": 2.2335418858313494e-05, "loss": 0.8093, "step": 36000 }, { "epoch": 1.8, "grad_norm": 7.171198844909668, "learning_rate": 2.2312348891718805e-05, "loss": 0.7551, "step": 36025 }, { "epoch": 1.8, "grad_norm": 5.765264511108398, "learning_rate": 2.228927892512412e-05, "loss": 0.9014, "step": 36050 }, { "epoch": 1.8, "grad_norm": 2.6858668327331543, "learning_rate": 2.226620895852943e-05, "loss": 0.8089, "step": 36075 }, { "epoch": 1.8, "grad_norm": 7.885138034820557, "learning_rate": 2.224313899193474e-05, "loss": 0.8527, "step": 36100 }, { "epoch": 1.8, "grad_norm": 2.0656027793884277, "learning_rate": 2.222006902534005e-05, 
"loss": 0.7826, "step": 36125 }, { "epoch": 1.8, "grad_norm": 5.330872535705566, "learning_rate": 2.2196999058745362e-05, "loss": 0.7946, "step": 36150 }, { "epoch": 1.8, "grad_norm": 6.922634124755859, "learning_rate": 2.2173929092150673e-05, "loss": 0.8434, "step": 36175 }, { "epoch": 1.8, "grad_norm": 2.980771064758301, "learning_rate": 2.2150859125555988e-05, "loss": 0.7595, "step": 36200 }, { "epoch": 1.81, "grad_norm": 2.326483726501465, "learning_rate": 2.21277891589613e-05, "loss": 0.8722, "step": 36225 }, { "epoch": 1.81, "grad_norm": 4.783517360687256, "learning_rate": 2.210471919236661e-05, "loss": 0.7853, "step": 36250 }, { "epoch": 1.81, "grad_norm": 5.524138450622559, "learning_rate": 2.2081649225771922e-05, "loss": 0.75, "step": 36275 }, { "epoch": 1.81, "grad_norm": 2.14884352684021, "learning_rate": 2.2058579259177233e-05, "loss": 0.9266, "step": 36300 }, { "epoch": 1.81, "grad_norm": 4.895349502563477, "learning_rate": 2.2035509292582545e-05, "loss": 0.8576, "step": 36325 }, { "epoch": 1.81, "grad_norm": 5.256768226623535, "learning_rate": 2.2012439325987856e-05, "loss": 0.8131, "step": 36350 }, { "epoch": 1.81, "grad_norm": 2.264431953430176, "learning_rate": 2.198936935939317e-05, "loss": 0.8111, "step": 36375 }, { "epoch": 1.81, "grad_norm": 2.4203009605407715, "learning_rate": 2.1966299392798482e-05, "loss": 0.8239, "step": 36400 }, { "epoch": 1.82, "grad_norm": 2.2608046531677246, "learning_rate": 2.1943229426203794e-05, "loss": 0.845, "step": 36425 }, { "epoch": 1.82, "grad_norm": 2.1424031257629395, "learning_rate": 2.1920159459609105e-05, "loss": 0.7855, "step": 36450 }, { "epoch": 1.82, "grad_norm": 4.636570453643799, "learning_rate": 2.1897089493014413e-05, "loss": 0.8291, "step": 36475 }, { "epoch": 1.82, "grad_norm": 4.953878402709961, "learning_rate": 2.1874019526419724e-05, "loss": 0.8101, "step": 36500 }, { "epoch": 1.82, "grad_norm": 6.59492826461792, "learning_rate": 2.185094955982504e-05, "loss": 0.798, "step": 36525 }, { "epoch": 1.82, "grad_norm": 3.5249340534210205, "learning_rate": 2.182787959323035e-05, "loss": 0.7401, "step": 36550 }, { "epoch": 1.82, "grad_norm": 4.184223651885986, "learning_rate": 2.1804809626635662e-05, "loss": 0.8175, "step": 36575 }, { "epoch": 1.82, "grad_norm": 2.470923900604248, "learning_rate": 2.1781739660040973e-05, "loss": 0.8137, "step": 36600 }, { "epoch": 1.83, "grad_norm": 2.4369516372680664, "learning_rate": 2.1758669693446285e-05, "loss": 0.7938, "step": 36625 }, { "epoch": 1.83, "grad_norm": 3.2020232677459717, "learning_rate": 2.1735599726851596e-05, "loss": 0.7854, "step": 36650 }, { "epoch": 1.83, "grad_norm": 4.032663822174072, "learning_rate": 2.1712529760256907e-05, "loss": 0.8469, "step": 36675 }, { "epoch": 1.83, "grad_norm": 12.03586196899414, "learning_rate": 2.168945979366222e-05, "loss": 0.8206, "step": 36700 }, { "epoch": 1.83, "grad_norm": 4.0775556564331055, "learning_rate": 2.1666389827067533e-05, "loss": 0.7563, "step": 36725 }, { "epoch": 1.83, "grad_norm": 2.1432387828826904, "learning_rate": 2.1643319860472845e-05, "loss": 0.869, "step": 36750 }, { "epoch": 1.83, "grad_norm": 11.407499313354492, "learning_rate": 2.1620249893878156e-05, "loss": 0.7877, "step": 36775 }, { "epoch": 1.83, "grad_norm": 5.67279052734375, "learning_rate": 2.1597179927283467e-05, "loss": 0.828, "step": 36800 }, { "epoch": 1.84, "grad_norm": 2.022274971008301, "learning_rate": 2.157410996068878e-05, "loss": 0.82, "step": 36825 }, { "epoch": 1.84, "grad_norm": 2.832973003387451, "learning_rate": 2.1551039994094087e-05, 
"loss": 0.813, "step": 36850 }, { "epoch": 1.84, "grad_norm": 2.9639811515808105, "learning_rate": 2.15279700274994e-05, "loss": 0.8116, "step": 36875 }, { "epoch": 1.84, "grad_norm": 4.492002010345459, "learning_rate": 2.1504900060904713e-05, "loss": 0.8496, "step": 36900 }, { "epoch": 1.84, "grad_norm": 3.4977283477783203, "learning_rate": 2.1481830094310024e-05, "loss": 0.8079, "step": 36925 }, { "epoch": 1.84, "grad_norm": 6.909940719604492, "learning_rate": 2.1458760127715336e-05, "loss": 0.824, "step": 36950 }, { "epoch": 1.84, "grad_norm": 4.175833702087402, "learning_rate": 2.1435690161120647e-05, "loss": 0.888, "step": 36975 }, { "epoch": 1.84, "grad_norm": 4.707504749298096, "learning_rate": 2.1412620194525958e-05, "loss": 0.775, "step": 37000 }, { "epoch": 1.84, "grad_norm": 3.5683321952819824, "learning_rate": 2.138955022793127e-05, "loss": 0.8675, "step": 37025 }, { "epoch": 1.85, "grad_norm": 4.9137043952941895, "learning_rate": 2.1366480261336584e-05, "loss": 0.7563, "step": 37050 }, { "epoch": 1.85, "grad_norm": 5.058765888214111, "learning_rate": 2.1343410294741896e-05, "loss": 0.862, "step": 37075 }, { "epoch": 1.85, "grad_norm": 2.8901517391204834, "learning_rate": 2.1320340328147207e-05, "loss": 0.8354, "step": 37100 }, { "epoch": 1.85, "grad_norm": 2.0617527961730957, "learning_rate": 2.129727036155252e-05, "loss": 0.8047, "step": 37125 }, { "epoch": 1.85, "grad_norm": 7.36972713470459, "learning_rate": 2.127420039495783e-05, "loss": 0.7663, "step": 37150 }, { "epoch": 1.85, "grad_norm": 3.1017777919769287, "learning_rate": 2.125113042836314e-05, "loss": 0.8682, "step": 37175 }, { "epoch": 1.85, "grad_norm": 2.616933584213257, "learning_rate": 2.1228060461768453e-05, "loss": 0.8047, "step": 37200 }, { "epoch": 1.85, "grad_norm": 4.542732238769531, "learning_rate": 2.1204990495173764e-05, "loss": 0.8326, "step": 37225 }, { "epoch": 1.86, "grad_norm": 2.4547290802001953, "learning_rate": 2.1181920528579075e-05, "loss": 0.8496, "step": 37250 }, { "epoch": 1.86, "grad_norm": 6.471099376678467, "learning_rate": 2.1158850561984387e-05, "loss": 0.8131, "step": 37275 }, { "epoch": 1.86, "grad_norm": 2.56229567527771, "learning_rate": 2.1135780595389698e-05, "loss": 0.8289, "step": 37300 }, { "epoch": 1.86, "grad_norm": 4.7177300453186035, "learning_rate": 2.111271062879501e-05, "loss": 0.8691, "step": 37325 }, { "epoch": 1.86, "grad_norm": 3.8973240852355957, "learning_rate": 2.108964066220032e-05, "loss": 0.7911, "step": 37350 }, { "epoch": 1.86, "grad_norm": 5.638448715209961, "learning_rate": 2.1066570695605632e-05, "loss": 0.774, "step": 37375 }, { "epoch": 1.86, "grad_norm": 2.739816427230835, "learning_rate": 2.1043500729010947e-05, "loss": 0.7736, "step": 37400 }, { "epoch": 1.86, "grad_norm": 2.312668561935425, "learning_rate": 2.1020430762416258e-05, "loss": 0.8059, "step": 37425 }, { "epoch": 1.87, "grad_norm": 8.428336143493652, "learning_rate": 2.099736079582157e-05, "loss": 0.7917, "step": 37450 }, { "epoch": 1.87, "grad_norm": 2.120487928390503, "learning_rate": 2.097429082922688e-05, "loss": 0.8561, "step": 37475 }, { "epoch": 1.87, "grad_norm": 2.118880271911621, "learning_rate": 2.0951220862632192e-05, "loss": 0.8347, "step": 37500 }, { "epoch": 1.87, "grad_norm": 2.0460243225097656, "learning_rate": 2.0928150896037504e-05, "loss": 0.7991, "step": 37525 }, { "epoch": 1.87, "grad_norm": 1.8884197473526, "learning_rate": 2.0905080929442815e-05, "loss": 0.7781, "step": 37550 }, { "epoch": 1.87, "grad_norm": 1.735609769821167, "learning_rate": 
2.088201096284813e-05, "loss": 0.7974, "step": 37575 }, { "epoch": 1.87, "grad_norm": 7.059275150299072, "learning_rate": 2.0858940996253438e-05, "loss": 0.7835, "step": 37600 }, { "epoch": 1.87, "grad_norm": 4.521622657775879, "learning_rate": 2.083587102965875e-05, "loss": 0.8168, "step": 37625 }, { "epoch": 1.88, "grad_norm": 1.8211438655853271, "learning_rate": 2.081280106306406e-05, "loss": 0.772, "step": 37650 }, { "epoch": 1.88, "grad_norm": 6.678451061248779, "learning_rate": 2.0789731096469372e-05, "loss": 0.8403, "step": 37675 }, { "epoch": 1.88, "grad_norm": 7.757253170013428, "learning_rate": 2.0766661129874683e-05, "loss": 0.8458, "step": 37700 }, { "epoch": 1.88, "grad_norm": 2.1668477058410645, "learning_rate": 2.0743591163279995e-05, "loss": 0.8513, "step": 37725 }, { "epoch": 1.88, "grad_norm": 3.1951613426208496, "learning_rate": 2.072052119668531e-05, "loss": 0.8782, "step": 37750 }, { "epoch": 1.88, "grad_norm": 3.6583151817321777, "learning_rate": 2.069745123009062e-05, "loss": 0.8233, "step": 37775 }, { "epoch": 1.88, "grad_norm": 4.582195281982422, "learning_rate": 2.0674381263495932e-05, "loss": 0.8616, "step": 37800 }, { "epoch": 1.88, "grad_norm": 4.346683025360107, "learning_rate": 2.0651311296901243e-05, "loss": 0.7653, "step": 37825 }, { "epoch": 1.89, "grad_norm": 6.112051486968994, "learning_rate": 2.0628241330306555e-05, "loss": 0.8561, "step": 37850 }, { "epoch": 1.89, "grad_norm": 3.2946393489837646, "learning_rate": 2.0605171363711866e-05, "loss": 0.775, "step": 37875 }, { "epoch": 1.89, "grad_norm": 1.5814847946166992, "learning_rate": 2.0582101397117177e-05, "loss": 0.506, "step": 37900 }, { "epoch": 1.89, "grad_norm": 3.392300844192505, "learning_rate": 2.0559031430522492e-05, "loss": 0.5086, "step": 37925 }, { "epoch": 1.89, "grad_norm": 5.850569725036621, "learning_rate": 2.0535961463927804e-05, "loss": 0.55, "step": 37950 }, { "epoch": 1.89, "grad_norm": 2.7387137413024902, "learning_rate": 2.051289149733311e-05, "loss": 0.4841, "step": 37975 }, { "epoch": 1.89, "grad_norm": 2.8632755279541016, "learning_rate": 2.0489821530738423e-05, "loss": 0.4663, "step": 38000 }, { "epoch": 1.89, "grad_norm": 4.828920841217041, "learning_rate": 2.0466751564143734e-05, "loss": 0.8887, "step": 38025 }, { "epoch": 1.9, "grad_norm": 3.6731338500976562, "learning_rate": 2.0443681597549046e-05, "loss": 0.5948, "step": 38050 }, { "epoch": 1.9, "grad_norm": 5.887766361236572, "learning_rate": 2.042061163095436e-05, "loss": 0.4842, "step": 38075 }, { "epoch": 1.9, "grad_norm": 3.1257717609405518, "learning_rate": 2.039754166435967e-05, "loss": 0.5482, "step": 38100 }, { "epoch": 1.9, "grad_norm": 1.9796582460403442, "learning_rate": 2.0374471697764983e-05, "loss": 0.4274, "step": 38125 }, { "epoch": 1.9, "grad_norm": 2.342222213745117, "learning_rate": 2.0351401731170294e-05, "loss": 0.4282, "step": 38150 }, { "epoch": 1.9, "grad_norm": 3.084747791290283, "learning_rate": 2.0328331764575606e-05, "loss": 0.517, "step": 38175 }, { "epoch": 1.9, "grad_norm": 4.009477615356445, "learning_rate": 2.0305261797980917e-05, "loss": 0.3951, "step": 38200 }, { "epoch": 1.9, "grad_norm": 5.501810073852539, "learning_rate": 2.028219183138623e-05, "loss": 0.4186, "step": 38225 }, { "epoch": 1.91, "grad_norm": 1.4934760332107544, "learning_rate": 2.025912186479154e-05, "loss": 0.4899, "step": 38250 }, { "epoch": 1.91, "grad_norm": 4.006719589233398, "learning_rate": 2.0236051898196855e-05, "loss": 0.6189, "step": 38275 }, { "epoch": 1.91, "grad_norm": 2.458674907684326, "learning_rate": 
2.0212981931602166e-05, "loss": 0.4577, "step": 38300 }, { "epoch": 1.91, "grad_norm": 2.8802576065063477, "learning_rate": 2.0189911965007477e-05, "loss": 0.5431, "step": 38325 }, { "epoch": 1.91, "grad_norm": 6.646295070648193, "learning_rate": 2.0166841998412785e-05, "loss": 0.4843, "step": 38350 }, { "epoch": 1.91, "grad_norm": 3.5825748443603516, "learning_rate": 2.0143772031818097e-05, "loss": 0.5397, "step": 38375 }, { "epoch": 1.91, "grad_norm": 3.379711389541626, "learning_rate": 2.0120702065223408e-05, "loss": 0.6215, "step": 38400 }, { "epoch": 1.91, "grad_norm": 4.383761882781982, "learning_rate": 2.0097632098628723e-05, "loss": 0.6691, "step": 38425 }, { "epoch": 1.92, "grad_norm": 2.3637311458587646, "learning_rate": 2.0074562132034034e-05, "loss": 0.509, "step": 38450 }, { "epoch": 1.92, "grad_norm": 3.028691291809082, "learning_rate": 2.0051492165439345e-05, "loss": 0.4861, "step": 38475 }, { "epoch": 1.92, "grad_norm": 2.57072377204895, "learning_rate": 2.0028422198844657e-05, "loss": 0.4268, "step": 38500 }, { "epoch": 1.92, "grad_norm": 1.385330319404602, "learning_rate": 2.0005352232249968e-05, "loss": 0.541, "step": 38525 }, { "epoch": 1.92, "grad_norm": 4.9758758544921875, "learning_rate": 1.998228226565528e-05, "loss": 0.4891, "step": 38550 }, { "epoch": 1.92, "grad_norm": 1.3216552734375, "learning_rate": 1.995921229906059e-05, "loss": 0.7594, "step": 38575 }, { "epoch": 1.92, "grad_norm": 1.4653106927871704, "learning_rate": 1.9936142332465906e-05, "loss": 0.6146, "step": 38600 }, { "epoch": 1.92, "grad_norm": 2.2742831707000732, "learning_rate": 1.9913072365871217e-05, "loss": 0.6672, "step": 38625 }, { "epoch": 1.93, "grad_norm": 3.1644742488861084, "learning_rate": 1.989000239927653e-05, "loss": 0.5287, "step": 38650 }, { "epoch": 1.93, "grad_norm": 2.7336010932922363, "learning_rate": 1.986693243268184e-05, "loss": 0.5898, "step": 38675 }, { "epoch": 1.93, "grad_norm": 3.7895514965057373, "learning_rate": 1.984386246608715e-05, "loss": 0.45, "step": 38700 }, { "epoch": 1.93, "grad_norm": 2.277320384979248, "learning_rate": 1.982079249949246e-05, "loss": 0.4729, "step": 38725 }, { "epoch": 1.93, "grad_norm": 2.145059108734131, "learning_rate": 1.9797722532897774e-05, "loss": 0.5978, "step": 38750 }, { "epoch": 1.93, "grad_norm": 5.892246246337891, "learning_rate": 1.9774652566303085e-05, "loss": 0.4955, "step": 38775 }, { "epoch": 1.93, "grad_norm": 1.9532073736190796, "learning_rate": 1.9751582599708397e-05, "loss": 0.4587, "step": 38800 }, { "epoch": 1.93, "grad_norm": 3.237112283706665, "learning_rate": 1.9728512633113708e-05, "loss": 0.5907, "step": 38825 }, { "epoch": 1.94, "grad_norm": 2.5157840251922607, "learning_rate": 1.970544266651902e-05, "loss": 0.6024, "step": 38850 }, { "epoch": 1.94, "grad_norm": 2.959118127822876, "learning_rate": 1.968237269992433e-05, "loss": 0.4482, "step": 38875 }, { "epoch": 1.94, "grad_norm": 1.199279546737671, "learning_rate": 1.9659302733329642e-05, "loss": 0.6254, "step": 38900 }, { "epoch": 1.94, "grad_norm": 2.68166184425354, "learning_rate": 1.9636232766734953e-05, "loss": 0.5216, "step": 38925 }, { "epoch": 1.94, "grad_norm": 3.7433533668518066, "learning_rate": 1.9613162800140268e-05, "loss": 0.5939, "step": 38950 }, { "epoch": 1.94, "grad_norm": 2.2293860912323, "learning_rate": 1.959009283354558e-05, "loss": 0.4903, "step": 38975 }, { "epoch": 1.94, "grad_norm": 4.912618637084961, "learning_rate": 1.956702286695089e-05, "loss": 0.4401, "step": 39000 }, { "epoch": 1.94, "grad_norm": 10.129170417785645, 
"learning_rate": 1.9543952900356202e-05, "loss": 0.5514, "step": 39025 }, { "epoch": 1.95, "grad_norm": 2.102691173553467, "learning_rate": 1.9520882933761514e-05, "loss": 0.4956, "step": 39050 }, { "epoch": 1.95, "grad_norm": 2.0237252712249756, "learning_rate": 1.9497812967166825e-05, "loss": 0.52, "step": 39075 }, { "epoch": 1.95, "grad_norm": 2.302222490310669, "learning_rate": 1.9474743000572136e-05, "loss": 0.3286, "step": 39100 }, { "epoch": 1.95, "grad_norm": 1.6583091020584106, "learning_rate": 1.9451673033977448e-05, "loss": 0.4163, "step": 39125 }, { "epoch": 1.95, "grad_norm": 2.9822421073913574, "learning_rate": 1.942860306738276e-05, "loss": 0.4914, "step": 39150 }, { "epoch": 1.95, "grad_norm": 3.1285948753356934, "learning_rate": 1.940553310078807e-05, "loss": 0.5983, "step": 39175 }, { "epoch": 1.95, "grad_norm": 3.0795788764953613, "learning_rate": 1.938246313419338e-05, "loss": 0.4094, "step": 39200 }, { "epoch": 1.95, "grad_norm": 20.059349060058594, "learning_rate": 1.9359393167598693e-05, "loss": 0.4936, "step": 39225 }, { "epoch": 1.96, "grad_norm": 2.8071374893188477, "learning_rate": 1.9336323201004004e-05, "loss": 0.5021, "step": 39250 }, { "epoch": 1.96, "grad_norm": 27.87400245666504, "learning_rate": 1.931325323440932e-05, "loss": 0.4801, "step": 39275 }, { "epoch": 1.96, "grad_norm": 3.6269407272338867, "learning_rate": 1.929018326781463e-05, "loss": 0.505, "step": 39300 }, { "epoch": 1.96, "grad_norm": 3.781467914581299, "learning_rate": 1.9267113301219942e-05, "loss": 0.4676, "step": 39325 }, { "epoch": 1.96, "grad_norm": 2.3654417991638184, "learning_rate": 1.9244043334625253e-05, "loss": 0.5953, "step": 39350 }, { "epoch": 1.96, "grad_norm": 3.031672954559326, "learning_rate": 1.9220973368030565e-05, "loss": 0.5119, "step": 39375 }, { "epoch": 1.96, "grad_norm": 3.2161550521850586, "learning_rate": 1.9197903401435876e-05, "loss": 0.4759, "step": 39400 }, { "epoch": 1.96, "grad_norm": 1.5116912126541138, "learning_rate": 1.9174833434841187e-05, "loss": 0.4879, "step": 39425 }, { "epoch": 1.97, "grad_norm": 4.8071770668029785, "learning_rate": 1.91517634682465e-05, "loss": 0.5768, "step": 39450 }, { "epoch": 1.97, "grad_norm": 2.3467979431152344, "learning_rate": 1.912869350165181e-05, "loss": 0.4743, "step": 39475 }, { "epoch": 1.97, "grad_norm": 3.0653600692749023, "learning_rate": 1.910562353505712e-05, "loss": 0.5656, "step": 39500 }, { "epoch": 1.97, "grad_norm": 3.336280584335327, "learning_rate": 1.9082553568462433e-05, "loss": 0.4947, "step": 39525 }, { "epoch": 1.97, "grad_norm": 3.2678709030151367, "learning_rate": 1.9059483601867744e-05, "loss": 0.4589, "step": 39550 }, { "epoch": 1.97, "grad_norm": 2.3263626098632812, "learning_rate": 1.9036413635273055e-05, "loss": 0.446, "step": 39575 }, { "epoch": 1.97, "grad_norm": 2.6241798400878906, "learning_rate": 1.9013343668678367e-05, "loss": 0.5622, "step": 39600 }, { "epoch": 1.97, "grad_norm": 41.76066970825195, "learning_rate": 1.899027370208368e-05, "loss": 0.4721, "step": 39625 }, { "epoch": 1.98, "grad_norm": 5.291028022766113, "learning_rate": 1.8967203735488993e-05, "loss": 0.4655, "step": 39650 }, { "epoch": 1.98, "grad_norm": 1.498937964439392, "learning_rate": 1.8944133768894304e-05, "loss": 0.4562, "step": 39675 }, { "epoch": 1.98, "grad_norm": 5.547112464904785, "learning_rate": 1.8921063802299616e-05, "loss": 0.4972, "step": 39700 }, { "epoch": 1.98, "grad_norm": 3.455609083175659, "learning_rate": 1.8897993835704927e-05, "loss": 0.5478, "step": 39725 }, { "epoch": 1.98, "grad_norm": 
1.4441814422607422, "learning_rate": 1.887492386911024e-05, "loss": 0.5511, "step": 39750 }, { "epoch": 1.98, "grad_norm": 3.192058563232422, "learning_rate": 1.885185390251555e-05, "loss": 0.4308, "step": 39775 }, { "epoch": 1.98, "grad_norm": 1.615493655204773, "learning_rate": 1.8828783935920864e-05, "loss": 0.3588, "step": 39800 }, { "epoch": 1.98, "grad_norm": 1.5515351295471191, "learning_rate": 1.8805713969326176e-05, "loss": 0.4596, "step": 39825 }, { "epoch": 1.99, "grad_norm": 13.214730262756348, "learning_rate": 1.8782644002731484e-05, "loss": 0.4913, "step": 39850 }, { "epoch": 1.99, "grad_norm": 1.2539523839950562, "learning_rate": 1.8759574036136795e-05, "loss": 0.3608, "step": 39875 }, { "epoch": 1.99, "grad_norm": 1.4073877334594727, "learning_rate": 1.8736504069542107e-05, "loss": 0.4483, "step": 39900 }, { "epoch": 1.99, "grad_norm": 1.7948969602584839, "learning_rate": 1.8713434102947418e-05, "loss": 0.4403, "step": 39925 }, { "epoch": 1.99, "grad_norm": 3.484506130218506, "learning_rate": 1.8690364136352733e-05, "loss": 0.4113, "step": 39950 }, { "epoch": 1.99, "grad_norm": 2.7683985233306885, "learning_rate": 1.8667294169758044e-05, "loss": 0.499, "step": 39975 }, { "epoch": 1.99, "grad_norm": 6.0013885498046875, "learning_rate": 1.8644224203163355e-05, "loss": 0.5183, "step": 40000 }, { "epoch": 1.99, "grad_norm": 2.2845098972320557, "learning_rate": 1.8621154236568667e-05, "loss": 0.4011, "step": 40025 }, { "epoch": 2.0, "grad_norm": 3.0258421897888184, "learning_rate": 1.8598084269973978e-05, "loss": 0.5238, "step": 40050 }, { "epoch": 2.0, "grad_norm": 5.795552730560303, "learning_rate": 1.857501430337929e-05, "loss": 0.5491, "step": 40075 }, { "epoch": 2.0, "grad_norm": 3.7295963764190674, "learning_rate": 1.85519443367846e-05, "loss": 0.417, "step": 40100 }, { "epoch": 2.0, "grad_norm": 9.04057788848877, "learning_rate": 1.8528874370189912e-05, "loss": 0.5175, "step": 40125 }, { "epoch": 2.0, "eval_accuracy": 0.8620156970225489, "eval_f1_macro": 0.752511699878362, "eval_f1_micro": 0.8620156970225489, "eval_f1_weighted": 0.8588178797835208, "eval_loss": 0.4766501784324646, "eval_precision_macro": 0.819008993064034, "eval_precision_micro": 0.8620156970225489, "eval_precision_weighted": 0.8602145721151697, "eval_recall_macro": 0.7172989796317722, "eval_recall_micro": 0.8620156970225489, "eval_recall_weighted": 0.8620156970225489, "eval_runtime": 7058.5264, "eval_samples_per_second": 5.686, "eval_steps_per_second": 0.355, "step": 40136 }, { "epoch": 2.0, "grad_norm": 1.2662018537521362, "learning_rate": 1.8505804403595227e-05, "loss": 0.5362, "step": 40150 }, { "epoch": 2.0, "grad_norm": 6.242440700531006, "learning_rate": 1.8482734437000538e-05, "loss": 0.4781, "step": 40175 }, { "epoch": 2.0, "grad_norm": 2.6231377124786377, "learning_rate": 1.845966447040585e-05, "loss": 0.4311, "step": 40200 }, { "epoch": 2.0, "grad_norm": 2.520632028579712, "learning_rate": 1.8436594503811158e-05, "loss": 0.4914, "step": 40225 }, { "epoch": 2.01, "grad_norm": 3.3773555755615234, "learning_rate": 1.841352453721647e-05, "loss": 0.4345, "step": 40250 }, { "epoch": 2.01, "grad_norm": 5.317331790924072, "learning_rate": 1.839045457062178e-05, "loss": 0.4221, "step": 40275 }, { "epoch": 2.01, "grad_norm": 2.9365406036376953, "learning_rate": 1.8367384604027095e-05, "loss": 0.5238, "step": 40300 }, { "epoch": 2.01, "grad_norm": 6.145638465881348, "learning_rate": 1.8344314637432406e-05, "loss": 0.5701, "step": 40325 }, { "epoch": 2.01, "grad_norm": 4.878846168518066, "learning_rate": 
1.8321244670837718e-05, "loss": 0.5091, "step": 40350 }, { "epoch": 2.01, "grad_norm": 5.4882493019104, "learning_rate": 1.829817470424303e-05, "loss": 0.5533, "step": 40375 }, { "epoch": 2.01, "grad_norm": 1.296852469444275, "learning_rate": 1.827510473764834e-05, "loss": 0.5923, "step": 40400 }, { "epoch": 2.01, "grad_norm": 9.215230941772461, "learning_rate": 1.8252034771053652e-05, "loss": 0.4332, "step": 40425 }, { "epoch": 2.02, "grad_norm": 3.852431058883667, "learning_rate": 1.8228964804458963e-05, "loss": 0.5316, "step": 40450 }, { "epoch": 2.02, "grad_norm": 1.412146806716919, "learning_rate": 1.8205894837864278e-05, "loss": 0.4826, "step": 40475 }, { "epoch": 2.02, "grad_norm": 1.4537949562072754, "learning_rate": 1.818282487126959e-05, "loss": 0.5232, "step": 40500 }, { "epoch": 2.02, "grad_norm": 4.489310264587402, "learning_rate": 1.81597549046749e-05, "loss": 0.5984, "step": 40525 }, { "epoch": 2.02, "grad_norm": 51.95314025878906, "learning_rate": 1.8136684938080212e-05, "loss": 0.5185, "step": 40550 }, { "epoch": 2.02, "grad_norm": 3.217198371887207, "learning_rate": 1.811361497148552e-05, "loss": 0.4918, "step": 40575 }, { "epoch": 2.02, "grad_norm": 4.193581581115723, "learning_rate": 1.809054500489083e-05, "loss": 0.5057, "step": 40600 }, { "epoch": 2.02, "grad_norm": 11.709174156188965, "learning_rate": 1.8067475038296143e-05, "loss": 0.4267, "step": 40625 }, { "epoch": 2.03, "grad_norm": 1.6062325239181519, "learning_rate": 1.8044405071701457e-05, "loss": 0.4338, "step": 40650 }, { "epoch": 2.03, "grad_norm": 3.2066144943237305, "learning_rate": 1.802133510510677e-05, "loss": 0.4786, "step": 40675 }, { "epoch": 2.03, "grad_norm": 72.72689819335938, "learning_rate": 1.799826513851208e-05, "loss": 0.4422, "step": 40700 }, { "epoch": 2.03, "grad_norm": 8.56615161895752, "learning_rate": 1.797519517191739e-05, "loss": 0.5187, "step": 40725 }, { "epoch": 2.03, "grad_norm": 2.4742374420166016, "learning_rate": 1.7952125205322703e-05, "loss": 0.4785, "step": 40750 }, { "epoch": 2.03, "grad_norm": 0.9956190586090088, "learning_rate": 1.7929055238728014e-05, "loss": 0.3973, "step": 40775 }, { "epoch": 2.03, "grad_norm": 1.506685495376587, "learning_rate": 1.7905985272133326e-05, "loss": 0.4557, "step": 40800 }, { "epoch": 2.03, "grad_norm": 2.4764609336853027, "learning_rate": 1.788291530553864e-05, "loss": 0.5184, "step": 40825 }, { "epoch": 2.04, "grad_norm": 6.196167469024658, "learning_rate": 1.7859845338943952e-05, "loss": 0.3534, "step": 40850 }, { "epoch": 2.04, "grad_norm": 3.4392249584198, "learning_rate": 1.7836775372349263e-05, "loss": 0.4914, "step": 40875 }, { "epoch": 2.04, "grad_norm": 3.8605923652648926, "learning_rate": 1.7813705405754574e-05, "loss": 0.4917, "step": 40900 }, { "epoch": 2.04, "grad_norm": 2.812488317489624, "learning_rate": 1.7790635439159886e-05, "loss": 0.4589, "step": 40925 }, { "epoch": 2.04, "grad_norm": 1.0951435565948486, "learning_rate": 1.7767565472565194e-05, "loss": 0.5361, "step": 40950 }, { "epoch": 2.04, "grad_norm": 3.0446274280548096, "learning_rate": 1.774449550597051e-05, "loss": 0.4389, "step": 40975 }, { "epoch": 2.04, "grad_norm": 1.0899313688278198, "learning_rate": 1.772142553937582e-05, "loss": 0.3562, "step": 41000 }, { "epoch": 2.04, "grad_norm": 3.682464122772217, "learning_rate": 1.769835557278113e-05, "loss": 0.4362, "step": 41025 }, { "epoch": 2.05, "grad_norm": 5.872410774230957, "learning_rate": 1.7675285606186443e-05, "loss": 0.3763, "step": 41050 }, { "epoch": 2.05, "grad_norm": 2.9771783351898193, 
"learning_rate": 1.7652215639591754e-05, "loss": 0.4498, "step": 41075 }, { "epoch": 2.05, "grad_norm": 1.2145291566848755, "learning_rate": 1.7629145672997065e-05, "loss": 0.5256, "step": 41100 }, { "epoch": 2.05, "grad_norm": 23.237903594970703, "learning_rate": 1.7606075706402377e-05, "loss": 0.4201, "step": 41125 }, { "epoch": 2.05, "grad_norm": 35.532711029052734, "learning_rate": 1.7583005739807688e-05, "loss": 0.4249, "step": 41150 }, { "epoch": 2.05, "grad_norm": 1.2682620286941528, "learning_rate": 1.7559935773213003e-05, "loss": 0.4446, "step": 41175 }, { "epoch": 2.05, "grad_norm": 3.1225569248199463, "learning_rate": 1.7536865806618314e-05, "loss": 0.4662, "step": 41200 }, { "epoch": 2.05, "grad_norm": 33.2777099609375, "learning_rate": 1.7513795840023626e-05, "loss": 0.4484, "step": 41225 }, { "epoch": 2.06, "grad_norm": 2.847723960876465, "learning_rate": 1.7490725873428937e-05, "loss": 0.52, "step": 41250 }, { "epoch": 2.06, "grad_norm": 56.711578369140625, "learning_rate": 1.7467655906834248e-05, "loss": 0.5024, "step": 41275 }, { "epoch": 2.06, "grad_norm": 6.478834629058838, "learning_rate": 1.744458594023956e-05, "loss": 0.4259, "step": 41300 }, { "epoch": 2.06, "grad_norm": 44.3673095703125, "learning_rate": 1.742151597364487e-05, "loss": 0.5075, "step": 41325 }, { "epoch": 2.06, "grad_norm": 8.986730575561523, "learning_rate": 1.7398446007050182e-05, "loss": 0.5224, "step": 41350 }, { "epoch": 2.06, "grad_norm": 1.1015548706054688, "learning_rate": 1.7375376040455494e-05, "loss": 0.519, "step": 41375 }, { "epoch": 2.06, "grad_norm": 4.729287147521973, "learning_rate": 1.7352306073860805e-05, "loss": 0.3976, "step": 41400 }, { "epoch": 2.06, "grad_norm": 12.000980377197266, "learning_rate": 1.7329236107266116e-05, "loss": 0.4831, "step": 41425 }, { "epoch": 2.07, "grad_norm": 3.545671224594116, "learning_rate": 1.7306166140671428e-05, "loss": 0.5621, "step": 41450 }, { "epoch": 2.07, "grad_norm": 2.6213879585266113, "learning_rate": 1.728309617407674e-05, "loss": 0.3901, "step": 41475 }, { "epoch": 2.07, "grad_norm": 3.5966148376464844, "learning_rate": 1.7260026207482054e-05, "loss": 0.4114, "step": 41500 }, { "epoch": 2.07, "grad_norm": 1.0668089389801025, "learning_rate": 1.7236956240887365e-05, "loss": 0.4352, "step": 41525 }, { "epoch": 2.07, "grad_norm": 2.247910976409912, "learning_rate": 1.7213886274292677e-05, "loss": 0.352, "step": 41550 }, { "epoch": 2.07, "grad_norm": 40.24725341796875, "learning_rate": 1.7190816307697988e-05, "loss": 0.4396, "step": 41575 }, { "epoch": 2.07, "grad_norm": 3.1879642009735107, "learning_rate": 1.71677463411033e-05, "loss": 0.4884, "step": 41600 }, { "epoch": 2.07, "grad_norm": 0.8823301196098328, "learning_rate": 1.714467637450861e-05, "loss": 0.5067, "step": 41625 }, { "epoch": 2.08, "grad_norm": 0.8447086811065674, "learning_rate": 1.7121606407913922e-05, "loss": 0.2941, "step": 41650 }, { "epoch": 2.08, "grad_norm": 0.8696421980857849, "learning_rate": 1.7098536441319233e-05, "loss": 0.4713, "step": 41675 }, { "epoch": 2.08, "grad_norm": 3.203528642654419, "learning_rate": 1.7075466474724545e-05, "loss": 0.5117, "step": 41700 }, { "epoch": 2.08, "grad_norm": 2.9464070796966553, "learning_rate": 1.7052396508129856e-05, "loss": 0.5354, "step": 41725 }, { "epoch": 2.08, "grad_norm": 3.2064943313598633, "learning_rate": 1.7029326541535167e-05, "loss": 0.5292, "step": 41750 }, { "epoch": 2.08, "grad_norm": 2.7801599502563477, "learning_rate": 1.700625657494048e-05, "loss": 0.4284, "step": 41775 }, { "epoch": 2.08, "grad_norm": 
1.081207513809204, "learning_rate": 1.698318660834579e-05, "loss": 0.3817, "step": 41800 }, { "epoch": 2.08, "grad_norm": 3.171081066131592, "learning_rate": 1.69601166417511e-05, "loss": 0.6747, "step": 41825 }, { "epoch": 2.09, "grad_norm": 2.7076244354248047, "learning_rate": 1.6937046675156416e-05, "loss": 0.4907, "step": 41850 }, { "epoch": 2.09, "grad_norm": 5.660066604614258, "learning_rate": 1.6913976708561728e-05, "loss": 0.6036, "step": 41875 }, { "epoch": 2.09, "grad_norm": 5.710346698760986, "learning_rate": 1.689090674196704e-05, "loss": 0.5117, "step": 41900 }, { "epoch": 2.09, "grad_norm": 98.65436553955078, "learning_rate": 1.686783677537235e-05, "loss": 0.4043, "step": 41925 }, { "epoch": 2.09, "grad_norm": 20.60503387451172, "learning_rate": 1.6844766808777662e-05, "loss": 0.5613, "step": 41950 }, { "epoch": 2.09, "grad_norm": 1.6520097255706787, "learning_rate": 1.6821696842182973e-05, "loss": 0.3941, "step": 41975 }, { "epoch": 2.09, "grad_norm": 3.609546184539795, "learning_rate": 1.6798626875588284e-05, "loss": 0.5384, "step": 42000 }, { "epoch": 2.09, "grad_norm": 6.308892250061035, "learning_rate": 1.67755569089936e-05, "loss": 0.4076, "step": 42025 }, { "epoch": 2.1, "grad_norm": 5.283653736114502, "learning_rate": 1.675248694239891e-05, "loss": 0.605, "step": 42050 }, { "epoch": 2.1, "grad_norm": 1.1643218994140625, "learning_rate": 1.672941697580422e-05, "loss": 0.4295, "step": 42075 }, { "epoch": 2.1, "grad_norm": 6.132506847381592, "learning_rate": 1.670634700920953e-05, "loss": 0.4605, "step": 42100 }, { "epoch": 2.1, "grad_norm": 2.9161691665649414, "learning_rate": 1.668327704261484e-05, "loss": 0.3778, "step": 42125 }, { "epoch": 2.1, "grad_norm": 0.9792761206626892, "learning_rate": 1.6660207076020153e-05, "loss": 0.4585, "step": 42150 }, { "epoch": 2.1, "grad_norm": 1.2763252258300781, "learning_rate": 1.6637137109425467e-05, "loss": 0.5073, "step": 42175 }, { "epoch": 2.1, "grad_norm": 1.0612924098968506, "learning_rate": 1.661406714283078e-05, "loss": 0.5103, "step": 42200 }, { "epoch": 2.1, "grad_norm": 14.235264778137207, "learning_rate": 1.659099717623609e-05, "loss": 0.4173, "step": 42225 }, { "epoch": 2.11, "grad_norm": 3.783046007156372, "learning_rate": 1.65679272096414e-05, "loss": 0.4379, "step": 42250 }, { "epoch": 2.11, "grad_norm": 6.655981063842773, "learning_rate": 1.6544857243046713e-05, "loss": 0.624, "step": 42275 }, { "epoch": 2.11, "grad_norm": 3.5701019763946533, "learning_rate": 1.6521787276452024e-05, "loss": 0.4916, "step": 42300 }, { "epoch": 2.11, "grad_norm": 6.2817769050598145, "learning_rate": 1.6498717309857336e-05, "loss": 0.4803, "step": 42325 }, { "epoch": 2.11, "grad_norm": 3.3220369815826416, "learning_rate": 1.6475647343262647e-05, "loss": 0.4698, "step": 42350 }, { "epoch": 2.11, "grad_norm": 2.752890110015869, "learning_rate": 1.645257737666796e-05, "loss": 0.4165, "step": 42375 }, { "epoch": 2.11, "grad_norm": 1.393794298171997, "learning_rate": 1.6429507410073273e-05, "loss": 0.4023, "step": 42400 }, { "epoch": 2.11, "grad_norm": 5.880159854888916, "learning_rate": 1.6406437443478584e-05, "loss": 0.5125, "step": 42425 }, { "epoch": 2.12, "grad_norm": 3.1372454166412354, "learning_rate": 1.6383367476883892e-05, "loss": 0.4362, "step": 42450 }, { "epoch": 2.12, "grad_norm": 2.6483569145202637, "learning_rate": 1.6360297510289204e-05, "loss": 0.42, "step": 42475 }, { "epoch": 2.12, "grad_norm": 0.8303941488265991, "learning_rate": 1.6337227543694515e-05, "loss": 0.5208, "step": 42500 }, { "epoch": 2.12, "grad_norm": 
2.7142579555511475, "learning_rate": 1.631415757709983e-05, "loss": 0.5796, "step": 42525 }, { "epoch": 2.12, "grad_norm": 3.3541243076324463, "learning_rate": 1.629108761050514e-05, "loss": 0.4091, "step": 42550 }, { "epoch": 2.12, "grad_norm": 42.22705078125, "learning_rate": 1.6268017643910453e-05, "loss": 0.4733, "step": 42575 }, { "epoch": 2.12, "grad_norm": 1.1842784881591797, "learning_rate": 1.6244947677315764e-05, "loss": 0.3802, "step": 42600 }, { "epoch": 2.12, "grad_norm": 4.394431114196777, "learning_rate": 1.6221877710721075e-05, "loss": 0.5847, "step": 42625 }, { "epoch": 2.13, "grad_norm": 0.9206206798553467, "learning_rate": 1.6198807744126387e-05, "loss": 0.5002, "step": 42650 }, { "epoch": 2.13, "grad_norm": 6.831622123718262, "learning_rate": 1.6175737777531698e-05, "loss": 0.6661, "step": 42675 }, { "epoch": 2.13, "grad_norm": 1.1463747024536133, "learning_rate": 1.6152667810937013e-05, "loss": 0.5326, "step": 42700 }, { "epoch": 2.13, "grad_norm": 0.8590452671051025, "learning_rate": 1.6129597844342324e-05, "loss": 0.2823, "step": 42725 }, { "epoch": 2.13, "grad_norm": 4.416738510131836, "learning_rate": 1.6106527877747635e-05, "loss": 0.3593, "step": 42750 }, { "epoch": 2.13, "grad_norm": 53.58966064453125, "learning_rate": 1.6083457911152947e-05, "loss": 0.4593, "step": 42775 }, { "epoch": 2.13, "grad_norm": 3.2743759155273438, "learning_rate": 1.6060387944558258e-05, "loss": 0.4814, "step": 42800 }, { "epoch": 2.13, "grad_norm": 37.638797760009766, "learning_rate": 1.6037317977963566e-05, "loss": 0.5561, "step": 42825 }, { "epoch": 2.14, "grad_norm": 3.340258836746216, "learning_rate": 1.601424801136888e-05, "loss": 0.4613, "step": 42850 }, { "epoch": 2.14, "grad_norm": 3.070276975631714, "learning_rate": 1.5991178044774192e-05, "loss": 0.3615, "step": 42875 }, { "epoch": 2.14, "grad_norm": 2.999905824661255, "learning_rate": 1.5968108078179504e-05, "loss": 0.4379, "step": 42900 }, { "epoch": 2.14, "grad_norm": 2.243607759475708, "learning_rate": 1.5945038111584815e-05, "loss": 0.5919, "step": 42925 }, { "epoch": 2.14, "grad_norm": 0.8025704026222229, "learning_rate": 1.5921968144990126e-05, "loss": 0.3943, "step": 42950 }, { "epoch": 2.14, "grad_norm": 3.583141803741455, "learning_rate": 1.5898898178395438e-05, "loss": 0.4952, "step": 42975 }, { "epoch": 2.14, "grad_norm": 5.429011821746826, "learning_rate": 1.587582821180075e-05, "loss": 0.5703, "step": 43000 }, { "epoch": 2.14, "grad_norm": 4.520952224731445, "learning_rate": 1.585275824520606e-05, "loss": 0.6043, "step": 43025 }, { "epoch": 2.15, "grad_norm": 1.0744482278823853, "learning_rate": 1.5829688278611375e-05, "loss": 0.4658, "step": 43050 }, { "epoch": 2.15, "grad_norm": 3.3810923099517822, "learning_rate": 1.5806618312016686e-05, "loss": 0.3332, "step": 43075 }, { "epoch": 2.15, "grad_norm": 3.215128183364868, "learning_rate": 1.5783548345421998e-05, "loss": 0.4496, "step": 43100 }, { "epoch": 2.15, "grad_norm": 4.319766998291016, "learning_rate": 1.576047837882731e-05, "loss": 0.4954, "step": 43125 }, { "epoch": 2.15, "grad_norm": 0.8192872405052185, "learning_rate": 1.573740841223262e-05, "loss": 0.4935, "step": 43150 }, { "epoch": 2.15, "grad_norm": 8.027621269226074, "learning_rate": 1.5714338445637932e-05, "loss": 0.566, "step": 43175 }, { "epoch": 2.15, "grad_norm": 3.3970248699188232, "learning_rate": 1.5691268479043243e-05, "loss": 0.3207, "step": 43200 }, { "epoch": 2.15, "grad_norm": 0.9151510000228882, "learning_rate": 1.5668198512448555e-05, "loss": 0.4512, "step": 43225 }, { "epoch": 
2.16, "grad_norm": 0.9420716762542725, "learning_rate": 1.5645128545853866e-05, "loss": 0.4624, "step": 43250 }, { "epoch": 2.16, "grad_norm": 3.626981258392334, "learning_rate": 1.5622058579259177e-05, "loss": 0.4893, "step": 43275 }, { "epoch": 2.16, "grad_norm": 6.8483686447143555, "learning_rate": 1.559898861266449e-05, "loss": 0.425, "step": 43300 }, { "epoch": 2.16, "grad_norm": 1.0762109756469727, "learning_rate": 1.55759186460698e-05, "loss": 0.4628, "step": 43325 }, { "epoch": 2.16, "grad_norm": 5.211805820465088, "learning_rate": 1.555284867947511e-05, "loss": 0.4319, "step": 43350 }, { "epoch": 2.16, "grad_norm": 1.066054344177246, "learning_rate": 1.5529778712880426e-05, "loss": 0.4175, "step": 43375 }, { "epoch": 2.16, "grad_norm": 2.201251983642578, "learning_rate": 1.5506708746285738e-05, "loss": 0.4262, "step": 43400 }, { "epoch": 2.16, "grad_norm": 4.746454238891602, "learning_rate": 1.548363877969105e-05, "loss": 0.5822, "step": 43425 }, { "epoch": 2.17, "grad_norm": 0.9321187734603882, "learning_rate": 1.546056881309636e-05, "loss": 0.4756, "step": 43450 }, { "epoch": 2.17, "grad_norm": 2.94303035736084, "learning_rate": 1.543749884650167e-05, "loss": 0.3573, "step": 43475 }, { "epoch": 2.17, "grad_norm": 6.546296119689941, "learning_rate": 1.5414428879906983e-05, "loss": 0.4012, "step": 43500 }, { "epoch": 2.17, "grad_norm": 3.930738687515259, "learning_rate": 1.5391358913312294e-05, "loss": 0.4391, "step": 43525 }, { "epoch": 2.17, "grad_norm": 3.519620895385742, "learning_rate": 1.5368288946717606e-05, "loss": 0.5147, "step": 43550 }, { "epoch": 2.17, "grad_norm": 23.809885025024414, "learning_rate": 1.5345218980122917e-05, "loss": 0.387, "step": 43575 }, { "epoch": 2.17, "grad_norm": 2.7929399013519287, "learning_rate": 1.532214901352823e-05, "loss": 0.4934, "step": 43600 }, { "epoch": 2.17, "grad_norm": 0.9034756422042847, "learning_rate": 1.529907904693354e-05, "loss": 0.3842, "step": 43625 }, { "epoch": 2.18, "grad_norm": 7.339956760406494, "learning_rate": 1.527600908033885e-05, "loss": 0.4575, "step": 43650 }, { "epoch": 2.18, "grad_norm": 3.778557777404785, "learning_rate": 1.5252939113744164e-05, "loss": 0.4229, "step": 43675 }, { "epoch": 2.18, "grad_norm": 1.220953345298767, "learning_rate": 1.5229869147149476e-05, "loss": 0.3305, "step": 43700 }, { "epoch": 2.18, "grad_norm": 3.4920401573181152, "learning_rate": 1.5206799180554787e-05, "loss": 0.536, "step": 43725 }, { "epoch": 2.18, "grad_norm": 1.343904972076416, "learning_rate": 1.5183729213960098e-05, "loss": 0.3731, "step": 43750 }, { "epoch": 2.18, "grad_norm": 18.415124893188477, "learning_rate": 1.5160659247365411e-05, "loss": 0.4315, "step": 43775 }, { "epoch": 2.18, "grad_norm": 128.90646362304688, "learning_rate": 1.5137589280770723e-05, "loss": 0.4811, "step": 43800 }, { "epoch": 2.18, "grad_norm": 2.1207547187805176, "learning_rate": 1.5114519314176034e-05, "loss": 0.4449, "step": 43825 }, { "epoch": 2.19, "grad_norm": 28.949186325073242, "learning_rate": 1.5091449347581345e-05, "loss": 0.398, "step": 43850 }, { "epoch": 2.19, "grad_norm": 3.406864881515503, "learning_rate": 1.5068379380986658e-05, "loss": 0.3874, "step": 43875 }, { "epoch": 2.19, "grad_norm": 4.393004417419434, "learning_rate": 1.504530941439197e-05, "loss": 0.4774, "step": 43900 }, { "epoch": 2.19, "grad_norm": 1.2055267095565796, "learning_rate": 1.5022239447797281e-05, "loss": 0.418, "step": 43925 }, { "epoch": 2.19, "grad_norm": 8.036547660827637, "learning_rate": 1.499916948120259e-05, "loss": 0.4491, "step": 43950 }, { 
"epoch": 2.19, "grad_norm": 3.114225387573242, "learning_rate": 1.4976099514607902e-05, "loss": 0.338, "step": 43975 }, { "epoch": 2.19, "grad_norm": 3.2484042644500732, "learning_rate": 1.4953029548013214e-05, "loss": 0.3471, "step": 44000 }, { "epoch": 2.19, "grad_norm": 3.2204830646514893, "learning_rate": 1.4929959581418527e-05, "loss": 0.4569, "step": 44025 }, { "epoch": 2.2, "grad_norm": 25.30776023864746, "learning_rate": 1.4906889614823838e-05, "loss": 0.4698, "step": 44050 }, { "epoch": 2.2, "grad_norm": 2.0941309928894043, "learning_rate": 1.488381964822915e-05, "loss": 0.4937, "step": 44075 }, { "epoch": 2.2, "grad_norm": 0.8043497204780579, "learning_rate": 1.4860749681634462e-05, "loss": 0.4964, "step": 44100 }, { "epoch": 2.2, "grad_norm": 1.6714893579483032, "learning_rate": 1.4837679715039774e-05, "loss": 0.3856, "step": 44125 }, { "epoch": 2.2, "grad_norm": 4.106298446655273, "learning_rate": 1.4814609748445085e-05, "loss": 0.6539, "step": 44150 }, { "epoch": 2.2, "grad_norm": 5.138443946838379, "learning_rate": 1.4791539781850396e-05, "loss": 0.4228, "step": 44175 }, { "epoch": 2.2, "grad_norm": 0.8958789110183716, "learning_rate": 1.476846981525571e-05, "loss": 0.4342, "step": 44200 }, { "epoch": 2.2, "grad_norm": 4.751018524169922, "learning_rate": 1.4745399848661021e-05, "loss": 0.4827, "step": 44225 }, { "epoch": 2.21, "grad_norm": 76.34806060791016, "learning_rate": 1.4722329882066332e-05, "loss": 0.3532, "step": 44250 }, { "epoch": 2.21, "grad_norm": 40.389320373535156, "learning_rate": 1.4699259915471644e-05, "loss": 0.5208, "step": 44275 }, { "epoch": 2.21, "grad_norm": 12.967884063720703, "learning_rate": 1.4676189948876957e-05, "loss": 0.4516, "step": 44300 }, { "epoch": 2.21, "grad_norm": 1.6726328134536743, "learning_rate": 1.4653119982282265e-05, "loss": 0.4281, "step": 44325 }, { "epoch": 2.21, "grad_norm": 5.279252052307129, "learning_rate": 1.4630050015687578e-05, "loss": 0.6983, "step": 44350 }, { "epoch": 2.21, "grad_norm": 1.5335670709609985, "learning_rate": 1.4606980049092889e-05, "loss": 0.4025, "step": 44375 }, { "epoch": 2.21, "grad_norm": 1.3843210935592651, "learning_rate": 1.45839100824982e-05, "loss": 0.4038, "step": 44400 }, { "epoch": 2.21, "grad_norm": 3.566404104232788, "learning_rate": 1.4560840115903512e-05, "loss": 0.5917, "step": 44425 }, { "epoch": 2.21, "grad_norm": 0.7694686055183411, "learning_rate": 1.4537770149308825e-05, "loss": 0.3287, "step": 44450 }, { "epoch": 2.22, "grad_norm": 11.228218078613281, "learning_rate": 1.4514700182714136e-05, "loss": 0.3775, "step": 44475 }, { "epoch": 2.22, "grad_norm": 1.0687758922576904, "learning_rate": 1.4491630216119448e-05, "loss": 0.5109, "step": 44500 }, { "epoch": 2.22, "grad_norm": 3.5314488410949707, "learning_rate": 1.4468560249524759e-05, "loss": 0.5578, "step": 44525 }, { "epoch": 2.22, "grad_norm": 1.0430461168289185, "learning_rate": 1.4445490282930072e-05, "loss": 0.383, "step": 44550 }, { "epoch": 2.22, "grad_norm": 0.908136785030365, "learning_rate": 1.4422420316335383e-05, "loss": 0.4595, "step": 44575 }, { "epoch": 2.22, "grad_norm": 5.896038055419922, "learning_rate": 1.4399350349740695e-05, "loss": 0.5647, "step": 44600 }, { "epoch": 2.22, "grad_norm": 1.4894803762435913, "learning_rate": 1.4376280383146008e-05, "loss": 0.3499, "step": 44625 }, { "epoch": 2.22, "grad_norm": 3.1445472240448, "learning_rate": 1.4353210416551319e-05, "loss": 0.4634, "step": 44650 }, { "epoch": 2.23, "grad_norm": 1.3199021816253662, "learning_rate": 1.433014044995663e-05, "loss": 0.4512, 
"step": 44675 }, { "epoch": 2.23, "grad_norm": 3.4122154712677, "learning_rate": 1.430707048336194e-05, "loss": 0.43, "step": 44700 }, { "epoch": 2.23, "grad_norm": 2.9166758060455322, "learning_rate": 1.4284000516767251e-05, "loss": 0.3729, "step": 44725 }, { "epoch": 2.23, "grad_norm": 5.693099021911621, "learning_rate": 1.4260930550172563e-05, "loss": 0.5361, "step": 44750 }, { "epoch": 2.23, "grad_norm": 3.788499116897583, "learning_rate": 1.4237860583577874e-05, "loss": 0.4689, "step": 44775 }, { "epoch": 2.23, "grad_norm": 5.346036434173584, "learning_rate": 1.4214790616983187e-05, "loss": 0.4982, "step": 44800 }, { "epoch": 2.23, "grad_norm": 12.89140510559082, "learning_rate": 1.4191720650388499e-05, "loss": 0.3943, "step": 44825 }, { "epoch": 2.23, "grad_norm": 7.905126094818115, "learning_rate": 1.416865068379381e-05, "loss": 0.4794, "step": 44850 }, { "epoch": 2.24, "grad_norm": 6.943532466888428, "learning_rate": 1.4145580717199123e-05, "loss": 0.4801, "step": 44875 }, { "epoch": 2.24, "grad_norm": 75.09638977050781, "learning_rate": 1.4122510750604434e-05, "loss": 0.6133, "step": 44900 }, { "epoch": 2.24, "grad_norm": 6.250883102416992, "learning_rate": 1.4099440784009746e-05, "loss": 0.425, "step": 44925 }, { "epoch": 2.24, "grad_norm": 5.3356709480285645, "learning_rate": 1.4076370817415057e-05, "loss": 0.5111, "step": 44950 }, { "epoch": 2.24, "grad_norm": 5.9915242195129395, "learning_rate": 1.405330085082037e-05, "loss": 0.6025, "step": 44975 }, { "epoch": 2.24, "grad_norm": 1.2483477592468262, "learning_rate": 1.4030230884225682e-05, "loss": 0.4851, "step": 45000 }, { "epoch": 2.24, "grad_norm": 4.203946590423584, "learning_rate": 1.4007160917630993e-05, "loss": 0.5118, "step": 45025 }, { "epoch": 2.24, "grad_norm": 2042.261962890625, "learning_rate": 1.3984090951036304e-05, "loss": 0.3955, "step": 45050 }, { "epoch": 2.25, "grad_norm": 1.6488062143325806, "learning_rate": 1.3961020984441614e-05, "loss": 0.4482, "step": 45075 }, { "epoch": 2.25, "grad_norm": 2.0874555110931396, "learning_rate": 1.3937951017846925e-05, "loss": 0.4248, "step": 45100 }, { "epoch": 2.25, "grad_norm": 4.379459857940674, "learning_rate": 1.3914881051252238e-05, "loss": 0.4551, "step": 45125 }, { "epoch": 2.25, "grad_norm": 1.5643028020858765, "learning_rate": 1.389181108465755e-05, "loss": 0.4521, "step": 45150 }, { "epoch": 2.25, "grad_norm": 2.6934609413146973, "learning_rate": 1.3868741118062861e-05, "loss": 0.4903, "step": 45175 }, { "epoch": 2.25, "grad_norm": 1.576201319694519, "learning_rate": 1.3845671151468172e-05, "loss": 0.5084, "step": 45200 }, { "epoch": 2.25, "grad_norm": 3.358137845993042, "learning_rate": 1.3822601184873485e-05, "loss": 0.4601, "step": 45225 }, { "epoch": 2.25, "grad_norm": 4.92896842956543, "learning_rate": 1.3799531218278797e-05, "loss": 0.4735, "step": 45250 }, { "epoch": 2.26, "grad_norm": 1.0355300903320312, "learning_rate": 1.3776461251684108e-05, "loss": 0.3459, "step": 45275 }, { "epoch": 2.26, "grad_norm": 1.0891985893249512, "learning_rate": 1.375339128508942e-05, "loss": 0.4327, "step": 45300 }, { "epoch": 2.26, "grad_norm": 4.514690399169922, "learning_rate": 1.3730321318494733e-05, "loss": 0.3761, "step": 45325 }, { "epoch": 2.26, "grad_norm": 3.550197124481201, "learning_rate": 1.3707251351900044e-05, "loss": 0.498, "step": 45350 }, { "epoch": 2.26, "grad_norm": 4.564043045043945, "learning_rate": 1.3684181385305355e-05, "loss": 0.5021, "step": 45375 }, { "epoch": 2.26, "grad_norm": 0.9587386846542358, "learning_rate": 1.3661111418710668e-05, 
"loss": 0.3791, "step": 45400 }, { "epoch": 2.26, "grad_norm": 3.4362361431121826, "learning_rate": 1.3638041452115976e-05, "loss": 0.4855, "step": 45425 }, { "epoch": 2.26, "grad_norm": 1.1819294691085815, "learning_rate": 1.3614971485521288e-05, "loss": 0.3571, "step": 45450 }, { "epoch": 2.27, "grad_norm": 3.3091654777526855, "learning_rate": 1.35919015189266e-05, "loss": 0.5136, "step": 45475 }, { "epoch": 2.27, "grad_norm": 0.8691762089729309, "learning_rate": 1.3568831552331912e-05, "loss": 0.5167, "step": 45500 }, { "epoch": 2.27, "grad_norm": 3.3368494510650635, "learning_rate": 1.3545761585737223e-05, "loss": 0.5208, "step": 45525 }, { "epoch": 2.27, "grad_norm": 11.81055736541748, "learning_rate": 1.3522691619142537e-05, "loss": 0.4869, "step": 45550 }, { "epoch": 2.27, "grad_norm": 1.5365840196609497, "learning_rate": 1.3499621652547848e-05, "loss": 0.4494, "step": 45575 }, { "epoch": 2.27, "grad_norm": 1.9530702829360962, "learning_rate": 1.347655168595316e-05, "loss": 0.5195, "step": 45600 }, { "epoch": 2.27, "grad_norm": 11.941510200500488, "learning_rate": 1.345348171935847e-05, "loss": 0.3484, "step": 45625 }, { "epoch": 2.27, "grad_norm": 1.299529790878296, "learning_rate": 1.3430411752763784e-05, "loss": 0.4795, "step": 45650 }, { "epoch": 2.28, "grad_norm": 3.4457457065582275, "learning_rate": 1.3407341786169095e-05, "loss": 0.4536, "step": 45675 }, { "epoch": 2.28, "grad_norm": 3.7312796115875244, "learning_rate": 1.3384271819574406e-05, "loss": 0.346, "step": 45700 }, { "epoch": 2.28, "grad_norm": 1.1145527362823486, "learning_rate": 1.3361201852979718e-05, "loss": 0.4283, "step": 45725 }, { "epoch": 2.28, "grad_norm": 3.5182554721832275, "learning_rate": 1.333813188638503e-05, "loss": 0.4792, "step": 45750 }, { "epoch": 2.28, "grad_norm": 19.439828872680664, "learning_rate": 1.3315061919790342e-05, "loss": 0.434, "step": 45775 }, { "epoch": 2.28, "grad_norm": 8.909052848815918, "learning_rate": 1.3291991953195652e-05, "loss": 0.4421, "step": 45800 }, { "epoch": 2.28, "grad_norm": 3.3405191898345947, "learning_rate": 1.3268921986600963e-05, "loss": 0.4626, "step": 45825 }, { "epoch": 2.28, "grad_norm": 1.1706880331039429, "learning_rate": 1.3245852020006275e-05, "loss": 0.445, "step": 45850 }, { "epoch": 2.29, "grad_norm": 7.387587547302246, "learning_rate": 1.3222782053411586e-05, "loss": 0.361, "step": 45875 }, { "epoch": 2.29, "grad_norm": 3.0818896293640137, "learning_rate": 1.3199712086816899e-05, "loss": 0.5711, "step": 45900 }, { "epoch": 2.29, "grad_norm": 2.2934417724609375, "learning_rate": 1.317664212022221e-05, "loss": 0.5208, "step": 45925 }, { "epoch": 2.29, "grad_norm": 2.78090238571167, "learning_rate": 1.3153572153627522e-05, "loss": 0.4421, "step": 45950 }, { "epoch": 2.29, "grad_norm": 5.722506046295166, "learning_rate": 1.3130502187032833e-05, "loss": 0.4551, "step": 45975 }, { "epoch": 2.29, "grad_norm": 7.568353176116943, "learning_rate": 1.3107432220438146e-05, "loss": 0.3366, "step": 46000 }, { "epoch": 2.29, "grad_norm": 5.417125225067139, "learning_rate": 1.3084362253843457e-05, "loss": 0.4595, "step": 46025 }, { "epoch": 2.29, "grad_norm": 12.71193790435791, "learning_rate": 1.3061292287248769e-05, "loss": 0.4978, "step": 46050 }, { "epoch": 2.3, "grad_norm": 6.872551441192627, "learning_rate": 1.3038222320654082e-05, "loss": 0.3987, "step": 46075 }, { "epoch": 2.3, "grad_norm": 14.637410163879395, "learning_rate": 1.3015152354059393e-05, "loss": 0.5225, "step": 46100 }, { "epoch": 2.3, "grad_norm": 3.4603843688964844, "learning_rate": 
1.2992082387464705e-05, "loss": 0.4371, "step": 46125 }, { "epoch": 2.3, "grad_norm": 3.496575355529785, "learning_rate": 1.2969012420870016e-05, "loss": 0.3872, "step": 46150 }, { "epoch": 2.3, "grad_norm": 0.7497350573539734, "learning_rate": 1.2945942454275326e-05, "loss": 0.3906, "step": 46175 }, { "epoch": 2.3, "grad_norm": 2.979525566101074, "learning_rate": 1.2922872487680637e-05, "loss": 0.4707, "step": 46200 }, { "epoch": 2.3, "grad_norm": 0.837188184261322, "learning_rate": 1.2899802521085948e-05, "loss": 0.3688, "step": 46225 }, { "epoch": 2.3, "grad_norm": 0.7865057587623596, "learning_rate": 1.2876732554491261e-05, "loss": 0.425, "step": 46250 }, { "epoch": 2.31, "grad_norm": 1.184089183807373, "learning_rate": 1.2853662587896573e-05, "loss": 0.4349, "step": 46275 }, { "epoch": 2.31, "grad_norm": 6.87714958190918, "learning_rate": 1.2830592621301884e-05, "loss": 0.4748, "step": 46300 }, { "epoch": 2.31, "grad_norm": 0.8082959055900574, "learning_rate": 1.2807522654707197e-05, "loss": 0.4739, "step": 46325 }, { "epoch": 2.31, "grad_norm": 2.2679901123046875, "learning_rate": 1.2784452688112508e-05, "loss": 0.4529, "step": 46350 }, { "epoch": 2.31, "grad_norm": 0.8802863955497742, "learning_rate": 1.276138272151782e-05, "loss": 0.4996, "step": 46375 }, { "epoch": 2.31, "grad_norm": 34.556427001953125, "learning_rate": 1.2738312754923131e-05, "loss": 0.4807, "step": 46400 }, { "epoch": 2.31, "grad_norm": 3.192739486694336, "learning_rate": 1.2715242788328444e-05, "loss": 0.6016, "step": 46425 }, { "epoch": 2.31, "grad_norm": 1.1725696325302124, "learning_rate": 1.2692172821733756e-05, "loss": 0.3707, "step": 46450 }, { "epoch": 2.32, "grad_norm": 4.39725399017334, "learning_rate": 1.2669102855139067e-05, "loss": 0.4125, "step": 46475 }, { "epoch": 2.32, "grad_norm": 1.110650658607483, "learning_rate": 1.2646032888544378e-05, "loss": 0.4464, "step": 46500 }, { "epoch": 2.32, "grad_norm": 3.1061861515045166, "learning_rate": 1.2622962921949691e-05, "loss": 0.64, "step": 46525 }, { "epoch": 2.32, "grad_norm": 17.628704071044922, "learning_rate": 1.2599892955355e-05, "loss": 0.4659, "step": 46550 }, { "epoch": 2.32, "grad_norm": 0.8352084755897522, "learning_rate": 1.2576822988760312e-05, "loss": 0.4472, "step": 46575 }, { "epoch": 2.32, "grad_norm": 0.9835976958274841, "learning_rate": 1.2553753022165624e-05, "loss": 0.4686, "step": 46600 }, { "epoch": 2.32, "grad_norm": 31.921051025390625, "learning_rate": 1.2530683055570935e-05, "loss": 0.6688, "step": 46625 }, { "epoch": 2.32, "grad_norm": 3.089951992034912, "learning_rate": 1.2507613088976246e-05, "loss": 0.4112, "step": 46650 }, { "epoch": 2.33, "grad_norm": 6.3877410888671875, "learning_rate": 1.248454312238156e-05, "loss": 0.5201, "step": 46675 }, { "epoch": 2.33, "grad_norm": 0.9439681172370911, "learning_rate": 1.2461473155786871e-05, "loss": 0.3882, "step": 46700 }, { "epoch": 2.33, "grad_norm": 0.9483340382575989, "learning_rate": 1.2438403189192182e-05, "loss": 0.5469, "step": 46725 }, { "epoch": 2.33, "grad_norm": 4.738260269165039, "learning_rate": 1.2415333222597494e-05, "loss": 0.4783, "step": 46750 }, { "epoch": 2.33, "grad_norm": 3.267001152038574, "learning_rate": 1.2392263256002807e-05, "loss": 0.435, "step": 46775 }, { "epoch": 2.33, "grad_norm": 43.56253433227539, "learning_rate": 1.2369193289408118e-05, "loss": 0.4588, "step": 46800 }, { "epoch": 2.33, "grad_norm": 2.040438652038574, "learning_rate": 1.2346123322813428e-05, "loss": 0.3251, "step": 46825 }, { "epoch": 2.33, "grad_norm": 2.8884494304656982, 
"learning_rate": 1.232305335621874e-05, "loss": 0.3172, "step": 46850 }, { "epoch": 2.34, "grad_norm": 0.7062200903892517, "learning_rate": 1.2299983389624052e-05, "loss": 0.4494, "step": 46875 }, { "epoch": 2.34, "grad_norm": 15.709783554077148, "learning_rate": 1.2276913423029363e-05, "loss": 0.4507, "step": 46900 }, { "epoch": 2.34, "grad_norm": 3.04421067237854, "learning_rate": 1.2253843456434677e-05, "loss": 0.6037, "step": 46925 }, { "epoch": 2.34, "grad_norm": 2.8988988399505615, "learning_rate": 1.2230773489839988e-05, "loss": 0.5066, "step": 46950 }, { "epoch": 2.34, "grad_norm": 4.624483585357666, "learning_rate": 1.22077035232453e-05, "loss": 0.6149, "step": 46975 }, { "epoch": 2.34, "grad_norm": 0.8602836728096008, "learning_rate": 1.218463355665061e-05, "loss": 0.4331, "step": 47000 }, { "epoch": 2.34, "grad_norm": 1.0193194150924683, "learning_rate": 1.2161563590055922e-05, "loss": 0.3927, "step": 47025 }, { "epoch": 2.34, "grad_norm": 3.1821088790893555, "learning_rate": 1.2138493623461233e-05, "loss": 0.4709, "step": 47050 }, { "epoch": 2.35, "grad_norm": 0.8067628741264343, "learning_rate": 1.2115423656866545e-05, "loss": 0.4642, "step": 47075 }, { "epoch": 2.35, "grad_norm": 6.950100898742676, "learning_rate": 1.2092353690271858e-05, "loss": 0.4884, "step": 47100 }, { "epoch": 2.35, "grad_norm": 15.024262428283691, "learning_rate": 1.2069283723677169e-05, "loss": 0.2618, "step": 47125 }, { "epoch": 2.35, "grad_norm": 6.122317790985107, "learning_rate": 1.204621375708248e-05, "loss": 0.6606, "step": 47150 }, { "epoch": 2.35, "grad_norm": 4.26874303817749, "learning_rate": 1.2023143790487792e-05, "loss": 0.4987, "step": 47175 }, { "epoch": 2.35, "grad_norm": 3.464191198348999, "learning_rate": 1.2000073823893103e-05, "loss": 0.4721, "step": 47200 }, { "epoch": 2.35, "grad_norm": 3.5283408164978027, "learning_rate": 1.1977003857298415e-05, "loss": 0.3801, "step": 47225 }, { "epoch": 2.35, "grad_norm": 5.1887311935424805, "learning_rate": 1.1953933890703726e-05, "loss": 0.4973, "step": 47250 }, { "epoch": 2.36, "grad_norm": 6.538787364959717, "learning_rate": 1.1930863924109039e-05, "loss": 0.4962, "step": 47275 }, { "epoch": 2.36, "grad_norm": 4.12895393371582, "learning_rate": 1.190779395751435e-05, "loss": 0.4363, "step": 47300 }, { "epoch": 2.36, "grad_norm": 3.3740382194519043, "learning_rate": 1.1884723990919662e-05, "loss": 0.5633, "step": 47325 }, { "epoch": 2.36, "grad_norm": 58.57019805908203, "learning_rate": 1.1861654024324973e-05, "loss": 0.362, "step": 47350 }, { "epoch": 2.36, "grad_norm": 3.191279888153076, "learning_rate": 1.1838584057730286e-05, "loss": 0.3877, "step": 47375 }, { "epoch": 2.36, "grad_norm": 3.0479769706726074, "learning_rate": 1.1815514091135596e-05, "loss": 0.6029, "step": 47400 }, { "epoch": 2.36, "grad_norm": 2.293454170227051, "learning_rate": 1.1792444124540907e-05, "loss": 0.4918, "step": 47425 }, { "epoch": 2.36, "grad_norm": 3.393441677093506, "learning_rate": 1.176937415794622e-05, "loss": 0.4612, "step": 47450 }, { "epoch": 2.37, "grad_norm": 0.7810852527618408, "learning_rate": 1.1746304191351532e-05, "loss": 0.4292, "step": 47475 }, { "epoch": 2.37, "grad_norm": 3.1317076683044434, "learning_rate": 1.1723234224756843e-05, "loss": 0.5239, "step": 47500 }, { "epoch": 2.37, "grad_norm": 55.293575286865234, "learning_rate": 1.1700164258162156e-05, "loss": 0.5202, "step": 47525 }, { "epoch": 2.37, "grad_norm": 32.76906967163086, "learning_rate": 1.1677094291567467e-05, "loss": 0.4267, "step": 47550 }, { "epoch": 2.37, "grad_norm": 
3.1955676078796387, "learning_rate": 1.1654024324972777e-05, "loss": 0.3918, "step": 47575 }, { "epoch": 2.37, "grad_norm": 0.926774799823761, "learning_rate": 1.1630954358378088e-05, "loss": 0.6901, "step": 47600 }, { "epoch": 2.37, "grad_norm": 6.414040565490723, "learning_rate": 1.1607884391783401e-05, "loss": 0.5149, "step": 47625 }, { "epoch": 2.37, "grad_norm": 1.0682789087295532, "learning_rate": 1.1584814425188713e-05, "loss": 0.4258, "step": 47650 }, { "epoch": 2.38, "grad_norm": 1.077889323234558, "learning_rate": 1.1561744458594024e-05, "loss": 0.3138, "step": 47675 }, { "epoch": 2.38, "grad_norm": 9.041423797607422, "learning_rate": 1.1538674491999337e-05, "loss": 0.346, "step": 47700 }, { "epoch": 2.38, "grad_norm": 4.021137237548828, "learning_rate": 1.1515604525404649e-05, "loss": 0.4743, "step": 47725 }, { "epoch": 2.38, "grad_norm": 6.2052836418151855, "learning_rate": 1.149253455880996e-05, "loss": 0.4666, "step": 47750 }, { "epoch": 2.38, "grad_norm": 17.59967041015625, "learning_rate": 1.1469464592215271e-05, "loss": 0.4114, "step": 47775 }, { "epoch": 2.38, "grad_norm": 3.0670289993286133, "learning_rate": 1.1446394625620583e-05, "loss": 0.4274, "step": 47800 }, { "epoch": 2.38, "grad_norm": 3.6514852046966553, "learning_rate": 1.1423324659025894e-05, "loss": 0.524, "step": 47825 }, { "epoch": 2.38, "grad_norm": 2.6334304809570312, "learning_rate": 1.1400254692431205e-05, "loss": 0.3774, "step": 47850 }, { "epoch": 2.39, "grad_norm": 4.937973976135254, "learning_rate": 1.1377184725836518e-05, "loss": 0.3975, "step": 47875 }, { "epoch": 2.39, "grad_norm": 4.668248653411865, "learning_rate": 1.135411475924183e-05, "loss": 0.4265, "step": 47900 }, { "epoch": 2.39, "grad_norm": 4.245274543762207, "learning_rate": 1.1331044792647141e-05, "loss": 0.3939, "step": 47925 }, { "epoch": 2.39, "grad_norm": 1.0817639827728271, "learning_rate": 1.1307974826052452e-05, "loss": 0.4652, "step": 47950 }, { "epoch": 2.39, "grad_norm": 1.1002476215362549, "learning_rate": 1.1284904859457764e-05, "loss": 0.498, "step": 47975 }, { "epoch": 2.39, "grad_norm": 0.6932094097137451, "learning_rate": 1.1261834892863075e-05, "loss": 0.3954, "step": 48000 }, { "epoch": 2.39, "grad_norm": 0.7378042936325073, "learning_rate": 1.1238764926268387e-05, "loss": 0.451, "step": 48025 }, { "epoch": 2.39, "grad_norm": 4.199863433837891, "learning_rate": 1.12156949596737e-05, "loss": 0.4264, "step": 48050 }, { "epoch": 2.4, "grad_norm": 1.039255142211914, "learning_rate": 1.1192624993079011e-05, "loss": 0.434, "step": 48075 }, { "epoch": 2.4, "grad_norm": 53.35725784301758, "learning_rate": 1.1169555026484322e-05, "loss": 0.5869, "step": 48100 }, { "epoch": 2.4, "grad_norm": 0.8894838690757751, "learning_rate": 1.1146485059889634e-05, "loss": 0.4002, "step": 48125 }, { "epoch": 2.4, "grad_norm": 3.4555165767669678, "learning_rate": 1.1123415093294945e-05, "loss": 0.3874, "step": 48150 }, { "epoch": 2.4, "grad_norm": 35.925559997558594, "learning_rate": 1.1100345126700256e-05, "loss": 0.3686, "step": 48175 }, { "epoch": 2.4, "grad_norm": 4.092686176300049, "learning_rate": 1.1077275160105568e-05, "loss": 0.3766, "step": 48200 }, { "epoch": 2.4, "grad_norm": 0.7535989880561829, "learning_rate": 1.105420519351088e-05, "loss": 0.3985, "step": 48225 }, { "epoch": 2.4, "grad_norm": 1.280040979385376, "learning_rate": 1.1031135226916192e-05, "loss": 0.5734, "step": 48250 }, { "epoch": 2.41, "grad_norm": 3.505943775177002, "learning_rate": 1.1008065260321504e-05, "loss": 0.3752, "step": 48275 }, { "epoch": 2.41, 
"grad_norm": 0.8014325499534607, "learning_rate": 1.0984995293726817e-05, "loss": 0.5073, "step": 48300 }, { "epoch": 2.41, "grad_norm": 3.3095195293426514, "learning_rate": 1.0961925327132126e-05, "loss": 0.5718, "step": 48325 }, { "epoch": 2.41, "grad_norm": 14.79623031616211, "learning_rate": 1.0938855360537438e-05, "loss": 0.4989, "step": 48350 }, { "epoch": 2.41, "grad_norm": 15.96692180633545, "learning_rate": 1.091578539394275e-05, "loss": 0.5859, "step": 48375 }, { "epoch": 2.41, "grad_norm": 3.386098861694336, "learning_rate": 1.0892715427348062e-05, "loss": 0.377, "step": 48400 }, { "epoch": 2.41, "grad_norm": 4.3916754722595215, "learning_rate": 1.0869645460753373e-05, "loss": 0.3962, "step": 48425 }, { "epoch": 2.41, "grad_norm": 21.46055793762207, "learning_rate": 1.0846575494158685e-05, "loss": 0.4741, "step": 48450 }, { "epoch": 2.42, "grad_norm": 0.8601337671279907, "learning_rate": 1.0823505527563998e-05, "loss": 0.4751, "step": 48475 }, { "epoch": 2.42, "grad_norm": 8.975151062011719, "learning_rate": 1.0800435560969309e-05, "loss": 0.4602, "step": 48500 }, { "epoch": 2.42, "grad_norm": 0.9004390835762024, "learning_rate": 1.0777365594374619e-05, "loss": 0.4327, "step": 48525 }, { "epoch": 2.42, "grad_norm": 0.8999045491218567, "learning_rate": 1.0754295627779932e-05, "loss": 0.4216, "step": 48550 }, { "epoch": 2.42, "grad_norm": 5.836853981018066, "learning_rate": 1.0731225661185243e-05, "loss": 0.402, "step": 48575 }, { "epoch": 2.42, "grad_norm": 1169.8812255859375, "learning_rate": 1.0708155694590555e-05, "loss": 0.2952, "step": 48600 }, { "epoch": 2.42, "grad_norm": 6.735498905181885, "learning_rate": 1.0685085727995866e-05, "loss": 0.3534, "step": 48625 }, { "epoch": 2.42, "grad_norm": 4.032488822937012, "learning_rate": 1.0662015761401179e-05, "loss": 0.4946, "step": 48650 }, { "epoch": 2.43, "grad_norm": 6.46728515625, "learning_rate": 1.063894579480649e-05, "loss": 0.4081, "step": 48675 }, { "epoch": 2.43, "grad_norm": 0.7925447821617126, "learning_rate": 1.06158758282118e-05, "loss": 0.424, "step": 48700 }, { "epoch": 2.43, "grad_norm": 0.7554740905761719, "learning_rate": 1.0592805861617113e-05, "loss": 0.3596, "step": 48725 }, { "epoch": 2.43, "grad_norm": 88.7288818359375, "learning_rate": 1.0569735895022424e-05, "loss": 0.4129, "step": 48750 }, { "epoch": 2.43, "grad_norm": 0.6437362432479858, "learning_rate": 1.0546665928427736e-05, "loss": 0.4925, "step": 48775 }, { "epoch": 2.43, "grad_norm": 1.6111449003219604, "learning_rate": 1.0523595961833047e-05, "loss": 0.5669, "step": 48800 }, { "epoch": 2.43, "grad_norm": 4.0401740074157715, "learning_rate": 1.050052599523836e-05, "loss": 0.4289, "step": 48825 }, { "epoch": 2.43, "grad_norm": 5.602783679962158, "learning_rate": 1.0477456028643672e-05, "loss": 0.4176, "step": 48850 }, { "epoch": 2.44, "grad_norm": 3.130173683166504, "learning_rate": 1.0454386062048981e-05, "loss": 0.3674, "step": 48875 }, { "epoch": 2.44, "grad_norm": 3.0914132595062256, "learning_rate": 1.0431316095454294e-05, "loss": 0.4028, "step": 48900 }, { "epoch": 2.44, "grad_norm": 0.9653416275978088, "learning_rate": 1.0408246128859606e-05, "loss": 0.4655, "step": 48925 }, { "epoch": 2.44, "grad_norm": 1.5674058198928833, "learning_rate": 1.0385176162264917e-05, "loss": 0.3246, "step": 48950 }, { "epoch": 2.44, "grad_norm": 2.738037109375, "learning_rate": 1.036210619567023e-05, "loss": 0.3521, "step": 48975 }, { "epoch": 2.44, "grad_norm": 3.1687607765197754, "learning_rate": 1.0339036229075541e-05, "loss": 0.5494, "step": 49000 }, { 
"epoch": 2.44, "grad_norm": 6.417214870452881, "learning_rate": 1.0315966262480853e-05, "loss": 0.3474, "step": 49025 }, { "epoch": 2.44, "grad_norm": 0.7319241762161255, "learning_rate": 1.0292896295886164e-05, "loss": 0.4822, "step": 49050 }, { "epoch": 2.45, "grad_norm": 3.495887517929077, "learning_rate": 1.0269826329291475e-05, "loss": 0.3905, "step": 49075 }, { "epoch": 2.45, "grad_norm": 3.4438161849975586, "learning_rate": 1.0246756362696787e-05, "loss": 0.5075, "step": 49100 }, { "epoch": 2.45, "grad_norm": 6.572171688079834, "learning_rate": 1.0223686396102098e-05, "loss": 0.4268, "step": 49125 }, { "epoch": 2.45, "grad_norm": 52.90046691894531, "learning_rate": 1.0200616429507411e-05, "loss": 0.6094, "step": 49150 }, { "epoch": 2.45, "grad_norm": 5.936100006103516, "learning_rate": 1.0177546462912723e-05, "loss": 0.3771, "step": 49175 }, { "epoch": 2.45, "grad_norm": 0.8982616662979126, "learning_rate": 1.0154476496318034e-05, "loss": 0.3057, "step": 49200 }, { "epoch": 2.45, "grad_norm": 2.5868983268737793, "learning_rate": 1.0131406529723345e-05, "loss": 0.4864, "step": 49225 }, { "epoch": 2.45, "grad_norm": 11.693443298339844, "learning_rate": 1.0108336563128657e-05, "loss": 0.5302, "step": 49250 }, { "epoch": 2.46, "grad_norm": 1.0244178771972656, "learning_rate": 1.0085266596533968e-05, "loss": 0.4169, "step": 49275 }, { "epoch": 2.46, "grad_norm": 4.7818603515625, "learning_rate": 1.006219662993928e-05, "loss": 0.409, "step": 49300 }, { "epoch": 2.46, "grad_norm": 0.8569893836975098, "learning_rate": 1.0039126663344592e-05, "loss": 0.3737, "step": 49325 }, { "epoch": 2.46, "grad_norm": 0.7575650811195374, "learning_rate": 1.0016056696749904e-05, "loss": 0.4904, "step": 49350 }, { "epoch": 2.46, "grad_norm": 0.7171018123626709, "learning_rate": 9.992986730155215e-06, "loss": 0.3005, "step": 49375 }, { "epoch": 2.46, "grad_norm": 86.97425842285156, "learning_rate": 9.969916763560527e-06, "loss": 0.4328, "step": 49400 }, { "epoch": 2.46, "grad_norm": 5.976800441741943, "learning_rate": 9.94684679696584e-06, "loss": 0.4754, "step": 49425 }, { "epoch": 2.46, "grad_norm": 6.821709156036377, "learning_rate": 9.92377683037115e-06, "loss": 0.3567, "step": 49450 }, { "epoch": 2.47, "grad_norm": 21.48541831970215, "learning_rate": 9.90070686377646e-06, "loss": 0.5048, "step": 49475 }, { "epoch": 2.47, "grad_norm": 0.8857642412185669, "learning_rate": 9.877636897181774e-06, "loss": 0.3556, "step": 49500 }, { "epoch": 2.47, "grad_norm": 2.9118235111236572, "learning_rate": 9.854566930587085e-06, "loss": 0.4667, "step": 49525 }, { "epoch": 2.47, "grad_norm": 30.48115348815918, "learning_rate": 9.831496963992396e-06, "loss": 0.3564, "step": 49550 }, { "epoch": 2.47, "grad_norm": 3.914069652557373, "learning_rate": 9.808426997397708e-06, "loss": 0.3922, "step": 49575 }, { "epoch": 2.47, "grad_norm": 3.106795072555542, "learning_rate": 9.78535703080302e-06, "loss": 0.46, "step": 49600 }, { "epoch": 2.47, "grad_norm": 1.6387038230895996, "learning_rate": 9.76228706420833e-06, "loss": 0.4054, "step": 49625 }, { "epoch": 2.47, "grad_norm": 65.76454162597656, "learning_rate": 9.739217097613642e-06, "loss": 0.4163, "step": 49650 }, { "epoch": 2.48, "grad_norm": 6.016766548156738, "learning_rate": 9.716147131018955e-06, "loss": 0.3834, "step": 49675 }, { "epoch": 2.48, "grad_norm": 4.530858993530273, "learning_rate": 9.693077164424266e-06, "loss": 0.3758, "step": 49700 }, { "epoch": 2.48, "grad_norm": 14.341697692871094, "learning_rate": 9.670007197829578e-06, "loss": 0.3814, "step": 49725 }, { 
"epoch": 2.48, "grad_norm": 3.512382745742798, "learning_rate": 9.64693723123489e-06, "loss": 0.4102, "step": 49750 }, { "epoch": 2.48, "grad_norm": 67.73419189453125, "learning_rate": 9.623867264640202e-06, "loss": 0.5958, "step": 49775 }, { "epoch": 2.48, "grad_norm": 0.7057674527168274, "learning_rate": 9.600797298045513e-06, "loss": 0.3469, "step": 49800 }, { "epoch": 2.48, "grad_norm": 1.1521495580673218, "learning_rate": 9.577727331450825e-06, "loss": 0.4476, "step": 49825 }, { "epoch": 2.48, "grad_norm": 0.952574610710144, "learning_rate": 9.554657364856136e-06, "loss": 0.5608, "step": 49850 }, { "epoch": 2.49, "grad_norm": 4.289364814758301, "learning_rate": 9.531587398261447e-06, "loss": 0.4397, "step": 49875 }, { "epoch": 2.49, "grad_norm": 4.029369831085205, "learning_rate": 9.508517431666759e-06, "loss": 0.5829, "step": 49900 }, { "epoch": 2.49, "grad_norm": 67.86260986328125, "learning_rate": 9.485447465072072e-06, "loss": 0.4823, "step": 49925 }, { "epoch": 2.49, "grad_norm": 0.8601451516151428, "learning_rate": 9.462377498477383e-06, "loss": 0.3746, "step": 49950 }, { "epoch": 2.49, "grad_norm": 0.7222571969032288, "learning_rate": 9.439307531882695e-06, "loss": 0.5386, "step": 49975 }, { "epoch": 2.49, "grad_norm": 3.494142770767212, "learning_rate": 9.416237565288006e-06, "loss": 0.5036, "step": 50000 }, { "epoch": 2.49, "grad_norm": 0.7426605820655823, "learning_rate": 9.393167598693317e-06, "loss": 0.3962, "step": 50025 }, { "epoch": 2.49, "grad_norm": 18.78670883178711, "learning_rate": 9.370097632098629e-06, "loss": 0.4831, "step": 50050 }, { "epoch": 2.5, "grad_norm": 3.5823943614959717, "learning_rate": 9.34702766550394e-06, "loss": 0.4929, "step": 50075 }, { "epoch": 2.5, "grad_norm": 3.229736566543579, "learning_rate": 9.323957698909253e-06, "loss": 0.4353, "step": 50100 }, { "epoch": 2.5, "grad_norm": 3.537930965423584, "learning_rate": 9.300887732314564e-06, "loss": 0.4397, "step": 50125 }, { "epoch": 2.5, "grad_norm": 2.34350323677063, "learning_rate": 9.277817765719876e-06, "loss": 0.4432, "step": 50150 }, { "epoch": 2.5, "grad_norm": 5.109536170959473, "learning_rate": 9.254747799125187e-06, "loss": 0.4513, "step": 50175 }, { "epoch": 2.5, "grad_norm": 0.7203817367553711, "learning_rate": 9.231677832530499e-06, "loss": 0.3611, "step": 50200 }, { "epoch": 2.5, "grad_norm": 1.0782755613327026, "learning_rate": 9.20860786593581e-06, "loss": 0.4285, "step": 50225 }, { "epoch": 2.5, "grad_norm": 0.7718564867973328, "learning_rate": 9.185537899341121e-06, "loss": 0.3597, "step": 50250 }, { "epoch": 2.51, "grad_norm": 4.9814133644104, "learning_rate": 9.162467932746434e-06, "loss": 0.4823, "step": 50275 }, { "epoch": 2.51, "grad_norm": 3.0942840576171875, "learning_rate": 9.139397966151746e-06, "loss": 0.3619, "step": 50300 }, { "epoch": 2.51, "grad_norm": 3.4611849784851074, "learning_rate": 9.116327999557057e-06, "loss": 0.4649, "step": 50325 }, { "epoch": 2.51, "grad_norm": 0.7275696992874146, "learning_rate": 9.09325803296237e-06, "loss": 0.3838, "step": 50350 }, { "epoch": 2.51, "grad_norm": 5.559508800506592, "learning_rate": 9.07018806636768e-06, "loss": 0.4082, "step": 50375 }, { "epoch": 2.51, "grad_norm": 2.714812994003296, "learning_rate": 9.047118099772991e-06, "loss": 0.4573, "step": 50400 }, { "epoch": 2.51, "grad_norm": 1.6366543769836426, "learning_rate": 9.024048133178304e-06, "loss": 0.6329, "step": 50425 }, { "epoch": 2.51, "grad_norm": 0.8482502698898315, "learning_rate": 9.000978166583616e-06, "loss": 0.3166, "step": 50450 }, { "epoch": 2.52, 
"grad_norm": 3.0310323238372803, "learning_rate": 8.977908199988927e-06, "loss": 0.4704, "step": 50475 }, { "epoch": 2.52, "grad_norm": 37.44198989868164, "learning_rate": 8.954838233394238e-06, "loss": 0.592, "step": 50500 }, { "epoch": 2.52, "grad_norm": 8.434538841247559, "learning_rate": 8.931768266799551e-06, "loss": 0.4933, "step": 50525 }, { "epoch": 2.52, "grad_norm": 18.220142364501953, "learning_rate": 8.908698300204863e-06, "loss": 0.5272, "step": 50550 }, { "epoch": 2.52, "grad_norm": 10.412059783935547, "learning_rate": 8.885628333610172e-06, "loss": 0.3622, "step": 50575 }, { "epoch": 2.52, "grad_norm": 2.166194438934326, "learning_rate": 8.862558367015485e-06, "loss": 0.4643, "step": 50600 }, { "epoch": 2.52, "grad_norm": 6.727598190307617, "learning_rate": 8.839488400420797e-06, "loss": 0.5054, "step": 50625 }, { "epoch": 2.52, "grad_norm": 5.515819549560547, "learning_rate": 8.816418433826108e-06, "loss": 0.4443, "step": 50650 }, { "epoch": 2.53, "grad_norm": 0.8282546401023865, "learning_rate": 8.79334846723142e-06, "loss": 0.4671, "step": 50675 }, { "epoch": 2.53, "grad_norm": 1.9409846067428589, "learning_rate": 8.770278500636733e-06, "loss": 0.4, "step": 50700 }, { "epoch": 2.53, "grad_norm": 6.412890911102295, "learning_rate": 8.747208534042044e-06, "loss": 0.4176, "step": 50725 }, { "epoch": 2.53, "grad_norm": 7.00205659866333, "learning_rate": 8.724138567447354e-06, "loss": 0.3337, "step": 50750 }, { "epoch": 2.53, "grad_norm": 14.442793846130371, "learning_rate": 8.701068600852667e-06, "loss": 0.4154, "step": 50775 }, { "epoch": 2.53, "grad_norm": 1.0041389465332031, "learning_rate": 8.677998634257978e-06, "loss": 0.4439, "step": 50800 }, { "epoch": 2.53, "grad_norm": 3.049905776977539, "learning_rate": 8.65492866766329e-06, "loss": 0.4632, "step": 50825 }, { "epoch": 2.53, "grad_norm": 2.8461859226226807, "learning_rate": 8.6318587010686e-06, "loss": 0.5343, "step": 50850 }, { "epoch": 2.54, "grad_norm": 4.451079368591309, "learning_rate": 8.608788734473914e-06, "loss": 0.5663, "step": 50875 }, { "epoch": 2.54, "grad_norm": 42.84324645996094, "learning_rate": 8.585718767879225e-06, "loss": 0.4916, "step": 50900 }, { "epoch": 2.54, "grad_norm": 5.204446315765381, "learning_rate": 8.562648801284536e-06, "loss": 0.3823, "step": 50925 }, { "epoch": 2.54, "grad_norm": 87.77778625488281, "learning_rate": 8.539578834689848e-06, "loss": 0.2921, "step": 50950 }, { "epoch": 2.54, "grad_norm": 0.7765432000160217, "learning_rate": 8.51650886809516e-06, "loss": 0.4655, "step": 50975 }, { "epoch": 2.54, "grad_norm": 8.35226058959961, "learning_rate": 8.49343890150047e-06, "loss": 0.4879, "step": 51000 }, { "epoch": 2.54, "grad_norm": 6.179222106933594, "learning_rate": 8.470368934905782e-06, "loss": 0.3122, "step": 51025 }, { "epoch": 2.54, "grad_norm": 6.96940803527832, "learning_rate": 8.447298968311095e-06, "loss": 0.5331, "step": 51050 }, { "epoch": 2.55, "grad_norm": 3.286719799041748, "learning_rate": 8.424229001716406e-06, "loss": 0.5512, "step": 51075 }, { "epoch": 2.55, "grad_norm": 1.991182565689087, "learning_rate": 8.401159035121718e-06, "loss": 0.582, "step": 51100 }, { "epoch": 2.55, "grad_norm": 2.020848035812378, "learning_rate": 8.378089068527029e-06, "loss": 0.6097, "step": 51125 }, { "epoch": 2.55, "grad_norm": 3.684962511062622, "learning_rate": 8.35501910193234e-06, "loss": 0.5262, "step": 51150 }, { "epoch": 2.55, "grad_norm": 2.4446282386779785, "learning_rate": 8.331949135337652e-06, "loss": 0.3397, "step": 51175 }, { "epoch": 2.55, "grad_norm": 
6.0752482414245605, "learning_rate": 8.308879168742965e-06, "loss": 0.4074, "step": 51200 }, { "epoch": 2.55, "grad_norm": 0.973638117313385, "learning_rate": 8.285809202148276e-06, "loss": 0.4706, "step": 51225 }, { "epoch": 2.55, "grad_norm": 0.7517675757408142, "learning_rate": 8.262739235553588e-06, "loss": 0.4092, "step": 51250 }, { "epoch": 2.56, "grad_norm": 0.8337798714637756, "learning_rate": 8.239669268958899e-06, "loss": 0.5036, "step": 51275 }, { "epoch": 2.56, "grad_norm": 5.119658946990967, "learning_rate": 8.21659930236421e-06, "loss": 0.3791, "step": 51300 }, { "epoch": 2.56, "grad_norm": 0.7112604379653931, "learning_rate": 8.193529335769522e-06, "loss": 0.3078, "step": 51325 }, { "epoch": 2.56, "grad_norm": 2.3811254501342773, "learning_rate": 8.170459369174833e-06, "loss": 0.3596, "step": 51350 }, { "epoch": 2.56, "grad_norm": 6.41787052154541, "learning_rate": 8.147389402580146e-06, "loss": 0.4551, "step": 51375 }, { "epoch": 2.56, "grad_norm": 7.111554145812988, "learning_rate": 8.124319435985457e-06, "loss": 0.3701, "step": 51400 }, { "epoch": 2.56, "grad_norm": 0.8633186221122742, "learning_rate": 8.101249469390769e-06, "loss": 0.3227, "step": 51425 }, { "epoch": 2.56, "grad_norm": 3.249757766723633, "learning_rate": 8.07817950279608e-06, "loss": 0.369, "step": 51450 }, { "epoch": 2.57, "grad_norm": 0.7638012170791626, "learning_rate": 8.055109536201393e-06, "loss": 0.5205, "step": 51475 }, { "epoch": 2.57, "grad_norm": 2.608543634414673, "learning_rate": 8.032039569606703e-06, "loss": 0.3419, "step": 51500 }, { "epoch": 2.57, "grad_norm": 3.6765499114990234, "learning_rate": 8.008969603012014e-06, "loss": 0.3323, "step": 51525 }, { "epoch": 2.57, "grad_norm": 3.2431704998016357, "learning_rate": 7.985899636417327e-06, "loss": 0.4416, "step": 51550 }, { "epoch": 2.57, "grad_norm": 4.2475104331970215, "learning_rate": 7.962829669822639e-06, "loss": 0.467, "step": 51575 }, { "epoch": 2.57, "grad_norm": 0.6722660660743713, "learning_rate": 7.93975970322795e-06, "loss": 0.4526, "step": 51600 }, { "epoch": 2.57, "grad_norm": 4.108065128326416, "learning_rate": 7.916689736633261e-06, "loss": 0.4785, "step": 51625 }, { "epoch": 2.57, "grad_norm": 27.554222106933594, "learning_rate": 7.893619770038574e-06, "loss": 0.3775, "step": 51650 }, { "epoch": 2.57, "grad_norm": 5.31900691986084, "learning_rate": 7.870549803443884e-06, "loss": 0.4917, "step": 51675 }, { "epoch": 2.58, "grad_norm": 6.323947429656982, "learning_rate": 7.847479836849195e-06, "loss": 0.4157, "step": 51700 }, { "epoch": 2.58, "grad_norm": 0.7022064328193665, "learning_rate": 7.824409870254508e-06, "loss": 0.371, "step": 51725 }, { "epoch": 2.58, "grad_norm": 6.228670120239258, "learning_rate": 7.80133990365982e-06, "loss": 0.4627, "step": 51750 }, { "epoch": 2.58, "grad_norm": 0.704119861125946, "learning_rate": 7.778269937065131e-06, "loss": 0.4452, "step": 51775 }, { "epoch": 2.58, "grad_norm": 2.9643666744232178, "learning_rate": 7.755199970470444e-06, "loss": 0.3993, "step": 51800 }, { "epoch": 2.58, "grad_norm": 0.7044605016708374, "learning_rate": 7.732130003875756e-06, "loss": 0.481, "step": 51825 }, { "epoch": 2.58, "grad_norm": 22.700096130371094, "learning_rate": 7.709060037281067e-06, "loss": 0.3396, "step": 51850 }, { "epoch": 2.58, "grad_norm": 0.7432065010070801, "learning_rate": 7.685990070686378e-06, "loss": 0.3764, "step": 51875 }, { "epoch": 2.59, "grad_norm": 1.9928373098373413, "learning_rate": 7.66292010409169e-06, "loss": 0.3288, "step": 51900 }, { "epoch": 2.59, "grad_norm": 
3.4592535495758057, "learning_rate": 7.639850137497001e-06, "loss": 0.3718, "step": 51925 }, { "epoch": 2.59, "grad_norm": 0.9576864242553711, "learning_rate": 7.616780170902313e-06, "loss": 0.445, "step": 51950 }, { "epoch": 2.59, "grad_norm": 20.564001083374023, "learning_rate": 7.5937102043076246e-06, "loss": 0.3948, "step": 51975 }, { "epoch": 2.59, "grad_norm": 0.7595285177230835, "learning_rate": 7.570640237712937e-06, "loss": 0.345, "step": 52000 }, { "epoch": 2.59, "grad_norm": 13.459945678710938, "learning_rate": 7.547570271118248e-06, "loss": 0.5181, "step": 52025 }, { "epoch": 2.59, "grad_norm": 0.6884210705757141, "learning_rate": 7.524500304523559e-06, "loss": 0.4252, "step": 52050 }, { "epoch": 2.59, "grad_norm": 2.0429818630218506, "learning_rate": 7.501430337928871e-06, "loss": 0.3594, "step": 52075 }, { "epoch": 2.6, "grad_norm": 5.49559211730957, "learning_rate": 7.478360371334182e-06, "loss": 0.401, "step": 52100 }, { "epoch": 2.6, "grad_norm": 8.92778205871582, "learning_rate": 7.455290404739494e-06, "loss": 0.5722, "step": 52125 }, { "epoch": 2.6, "grad_norm": 3.305980920791626, "learning_rate": 7.432220438144806e-06, "loss": 0.3295, "step": 52150 }, { "epoch": 2.6, "grad_norm": 0.6217209696769714, "learning_rate": 7.409150471550118e-06, "loss": 0.4052, "step": 52175 }, { "epoch": 2.6, "grad_norm": 0.6542679667472839, "learning_rate": 7.386080504955429e-06, "loss": 0.2494, "step": 52200 }, { "epoch": 2.6, "grad_norm": 0.6969010233879089, "learning_rate": 7.3630105383607416e-06, "loss": 0.5526, "step": 52225 }, { "epoch": 2.6, "grad_norm": 2.3818559646606445, "learning_rate": 7.339940571766052e-06, "loss": 0.4938, "step": 52250 }, { "epoch": 2.6, "grad_norm": 15.166160583496094, "learning_rate": 7.3168706051713634e-06, "loss": 0.3946, "step": 52275 }, { "epoch": 2.61, "grad_norm": 16.992691040039062, "learning_rate": 7.293800638576676e-06, "loss": 0.4292, "step": 52300 }, { "epoch": 2.61, "grad_norm": 1.524154782295227, "learning_rate": 7.270730671981987e-06, "loss": 0.5369, "step": 52325 }, { "epoch": 2.61, "grad_norm": 3.3624045848846436, "learning_rate": 7.247660705387299e-06, "loss": 0.3681, "step": 52350 }, { "epoch": 2.61, "grad_norm": 15.962396621704102, "learning_rate": 7.2245907387926106e-06, "loss": 0.366, "step": 52375 }, { "epoch": 2.61, "grad_norm": 1.18215811252594, "learning_rate": 7.201520772197923e-06, "loss": 0.3614, "step": 52400 }, { "epoch": 2.61, "grad_norm": 2.024662494659424, "learning_rate": 7.178450805603233e-06, "loss": 0.3949, "step": 52425 }, { "epoch": 2.61, "grad_norm": 4.9505228996276855, "learning_rate": 7.155380839008545e-06, "loss": 0.4286, "step": 52450 }, { "epoch": 2.61, "grad_norm": 0.6292359232902527, "learning_rate": 7.132310872413857e-06, "loss": 0.3199, "step": 52475 }, { "epoch": 2.62, "grad_norm": 0.6973364949226379, "learning_rate": 7.109240905819169e-06, "loss": 0.4549, "step": 52500 }, { "epoch": 2.62, "grad_norm": 22.525678634643555, "learning_rate": 7.08617093922448e-06, "loss": 0.4789, "step": 52525 }, { "epoch": 2.62, "grad_norm": 0.7233843207359314, "learning_rate": 7.063100972629793e-06, "loss": 0.4184, "step": 52550 }, { "epoch": 2.62, "grad_norm": 3.2459542751312256, "learning_rate": 7.040031006035104e-06, "loss": 0.3048, "step": 52575 }, { "epoch": 2.62, "grad_norm": 5.569832801818848, "learning_rate": 7.016961039440416e-06, "loss": 0.3551, "step": 52600 }, { "epoch": 2.62, "grad_norm": 0.7612528204917908, "learning_rate": 6.993891072845727e-06, "loss": 0.4557, "step": 52625 }, { "epoch": 2.62, "grad_norm": 
11.590838432312012, "learning_rate": 6.970821106251038e-06, "loss": 0.4279, "step": 52650 }, { "epoch": 2.62, "grad_norm": 0.667598307132721, "learning_rate": 6.94775113965635e-06, "loss": 0.3403, "step": 52675 }, { "epoch": 2.63, "grad_norm": 3.3274714946746826, "learning_rate": 6.924681173061662e-06, "loss": 0.4698, "step": 52700 }, { "epoch": 2.63, "grad_norm": 3.191425323486328, "learning_rate": 6.901611206466974e-06, "loss": 0.48, "step": 52725 }, { "epoch": 2.63, "grad_norm": 0.7707619071006775, "learning_rate": 6.878541239872285e-06, "loss": 0.3145, "step": 52750 }, { "epoch": 2.63, "grad_norm": 0.6428590416908264, "learning_rate": 6.855471273277597e-06, "loss": 0.4764, "step": 52775 }, { "epoch": 2.63, "grad_norm": 3.274758815765381, "learning_rate": 6.832401306682908e-06, "loss": 0.4921, "step": 52800 }, { "epoch": 2.63, "grad_norm": 0.732591450214386, "learning_rate": 6.809331340088219e-06, "loss": 0.3035, "step": 52825 }, { "epoch": 2.63, "grad_norm": 3.3230175971984863, "learning_rate": 6.7862613734935315e-06, "loss": 0.5051, "step": 52850 }, { "epoch": 2.63, "grad_norm": 4.391756534576416, "learning_rate": 6.763191406898843e-06, "loss": 0.3918, "step": 52875 }, { "epoch": 2.64, "grad_norm": 9.392721176147461, "learning_rate": 6.740121440304155e-06, "loss": 0.4598, "step": 52900 }, { "epoch": 2.64, "grad_norm": 0.6300041675567627, "learning_rate": 6.717051473709466e-06, "loss": 0.3568, "step": 52925 }, { "epoch": 2.64, "grad_norm": 3.346933126449585, "learning_rate": 6.693981507114779e-06, "loss": 0.4281, "step": 52950 }, { "epoch": 2.64, "grad_norm": 15.497859954833984, "learning_rate": 6.67091154052009e-06, "loss": 0.4197, "step": 52975 }, { "epoch": 2.64, "grad_norm": 6.255995750427246, "learning_rate": 6.6478415739254005e-06, "loss": 0.4091, "step": 53000 }, { "epoch": 2.64, "grad_norm": 1.5552594661712646, "learning_rate": 6.624771607330713e-06, "loss": 0.4507, "step": 53025 }, { "epoch": 2.64, "grad_norm": 35.074859619140625, "learning_rate": 6.601701640736024e-06, "loss": 0.3937, "step": 53050 }, { "epoch": 2.64, "grad_norm": 3.185941696166992, "learning_rate": 6.578631674141336e-06, "loss": 0.4116, "step": 53075 }, { "epoch": 2.65, "grad_norm": 0.6813209056854248, "learning_rate": 6.555561707546648e-06, "loss": 0.3293, "step": 53100 }, { "epoch": 2.65, "grad_norm": 25.33660888671875, "learning_rate": 6.53249174095196e-06, "loss": 0.4416, "step": 53125 }, { "epoch": 2.65, "grad_norm": 1.5817681550979614, "learning_rate": 6.509421774357272e-06, "loss": 0.5166, "step": 53150 }, { "epoch": 2.65, "grad_norm": 3.9538614749908447, "learning_rate": 6.486351807762582e-06, "loss": 0.5216, "step": 53175 }, { "epoch": 2.65, "grad_norm": 6.135497570037842, "learning_rate": 6.463281841167894e-06, "loss": 0.4999, "step": 53200 }, { "epoch": 2.65, "grad_norm": 1.1646127700805664, "learning_rate": 6.440211874573206e-06, "loss": 0.4203, "step": 53225 }, { "epoch": 2.65, "grad_norm": 7.302920341491699, "learning_rate": 6.4171419079785175e-06, "loss": 0.3305, "step": 53250 }, { "epoch": 2.65, "grad_norm": 2.83675479888916, "learning_rate": 6.39407194138383e-06, "loss": 0.3606, "step": 53275 }, { "epoch": 2.66, "grad_norm": 1.0463924407958984, "learning_rate": 6.371001974789141e-06, "loss": 0.417, "step": 53300 }, { "epoch": 2.66, "grad_norm": 0.7047767043113708, "learning_rate": 6.347932008194453e-06, "loss": 0.3989, "step": 53325 }, { "epoch": 2.66, "grad_norm": 20.336246490478516, "learning_rate": 6.324862041599765e-06, "loss": 0.4479, "step": 53350 }, { "epoch": 2.66, "grad_norm": 
3.4728808403015137, "learning_rate": 6.301792075005075e-06, "loss": 0.3529, "step": 53375 }, { "epoch": 2.66, "grad_norm": 22.320112228393555, "learning_rate": 6.278722108410387e-06, "loss": 0.3883, "step": 53400 }, { "epoch": 2.66, "grad_norm": 6.709505558013916, "learning_rate": 6.255652141815699e-06, "loss": 0.4964, "step": 53425 }, { "epoch": 2.66, "grad_norm": 7.791197776794434, "learning_rate": 6.232582175221011e-06, "loss": 0.3674, "step": 53450 }, { "epoch": 2.66, "grad_norm": 0.6814228296279907, "learning_rate": 6.209512208626322e-06, "loss": 0.3593, "step": 53475 }, { "epoch": 2.67, "grad_norm": 4.702853679656982, "learning_rate": 6.186442242031634e-06, "loss": 0.4375, "step": 53500 }, { "epoch": 2.67, "grad_norm": 3.231199264526367, "learning_rate": 6.163372275436945e-06, "loss": 0.3586, "step": 53525 }, { "epoch": 2.67, "grad_norm": 3.124828815460205, "learning_rate": 6.140302308842257e-06, "loss": 0.3698, "step": 53550 }, { "epoch": 2.67, "grad_norm": 0.6851596832275391, "learning_rate": 6.117232342247569e-06, "loss": 0.4728, "step": 53575 }, { "epoch": 2.67, "grad_norm": 32.70745849609375, "learning_rate": 6.09416237565288e-06, "loss": 0.4506, "step": 53600 }, { "epoch": 2.67, "grad_norm": 5.827382564544678, "learning_rate": 6.071092409058192e-06, "loss": 0.5484, "step": 53625 }, { "epoch": 2.67, "grad_norm": 1.0193488597869873, "learning_rate": 6.0480224424635035e-06, "loss": 0.4257, "step": 53650 }, { "epoch": 2.67, "grad_norm": 6.486295223236084, "learning_rate": 6.024952475868816e-06, "loss": 0.5501, "step": 53675 }, { "epoch": 2.68, "grad_norm": 0.740279495716095, "learning_rate": 6.001882509274127e-06, "loss": 0.4992, "step": 53700 }, { "epoch": 2.68, "grad_norm": 3.4536116123199463, "learning_rate": 5.978812542679438e-06, "loss": 0.4032, "step": 53725 }, { "epoch": 2.68, "grad_norm": 2.0655429363250732, "learning_rate": 5.955742576084751e-06, "loss": 0.5314, "step": 53750 }, { "epoch": 2.68, "grad_norm": 3.477311134338379, "learning_rate": 5.932672609490061e-06, "loss": 0.4435, "step": 53775 }, { "epoch": 2.68, "grad_norm": 0.6453719735145569, "learning_rate": 5.909602642895373e-06, "loss": 0.311, "step": 53800 }, { "epoch": 2.68, "grad_norm": 6.679521083831787, "learning_rate": 5.886532676300685e-06, "loss": 0.3425, "step": 53825 }, { "epoch": 2.68, "grad_norm": 0.7992690801620483, "learning_rate": 5.863462709705997e-06, "loss": 0.4486, "step": 53850 }, { "epoch": 2.68, "grad_norm": 3.493523120880127, "learning_rate": 5.840392743111308e-06, "loss": 0.3975, "step": 53875 }, { "epoch": 2.69, "grad_norm": 1.6282230615615845, "learning_rate": 5.81732277651662e-06, "loss": 0.3937, "step": 53900 }, { "epoch": 2.69, "grad_norm": 3.9338877201080322, "learning_rate": 5.794252809921932e-06, "loss": 0.5131, "step": 53925 }, { "epoch": 2.69, "grad_norm": 2.9956166744232178, "learning_rate": 5.771182843327243e-06, "loss": 0.5215, "step": 53950 }, { "epoch": 2.69, "grad_norm": 1.5365872383117676, "learning_rate": 5.7481128767325545e-06, "loss": 0.4076, "step": 53975 }, { "epoch": 2.69, "grad_norm": 6.710165023803711, "learning_rate": 5.725042910137867e-06, "loss": 0.4186, "step": 54000 }, { "epoch": 2.69, "grad_norm": 3.2934117317199707, "learning_rate": 5.701972943543178e-06, "loss": 0.3445, "step": 54025 }, { "epoch": 2.69, "grad_norm": 1.4656898975372314, "learning_rate": 5.67890297694849e-06, "loss": 0.3595, "step": 54050 }, { "epoch": 2.69, "grad_norm": 6.39401388168335, "learning_rate": 5.655833010353801e-06, "loss": 0.2611, "step": 54075 }, { "epoch": 2.7, "grad_norm": 
0.6337581276893616, "learning_rate": 5.632763043759113e-06, "loss": 0.2903, "step": 54100 }, { "epoch": 2.7, "grad_norm": 27.258468627929688, "learning_rate": 5.609693077164424e-06, "loss": 0.3028, "step": 54125 }, { "epoch": 2.7, "grad_norm": 3.1868808269500732, "learning_rate": 5.586623110569736e-06, "loss": 0.4316, "step": 54150 }, { "epoch": 2.7, "grad_norm": 3.674356698989868, "learning_rate": 5.563553143975048e-06, "loss": 0.4515, "step": 54175 }, { "epoch": 2.7, "grad_norm": 3.406693696975708, "learning_rate": 5.540483177380359e-06, "loss": 0.4075, "step": 54200 }, { "epoch": 2.7, "grad_norm": 3.3194386959075928, "learning_rate": 5.5174132107856715e-06, "loss": 0.3291, "step": 54225 }, { "epoch": 2.7, "grad_norm": 19.359365463256836, "learning_rate": 5.494343244190982e-06, "loss": 0.4357, "step": 54250 }, { "epoch": 2.7, "grad_norm": 0.826265275478363, "learning_rate": 5.471273277596294e-06, "loss": 0.4412, "step": 54275 }, { "epoch": 2.71, "grad_norm": 7.969943523406982, "learning_rate": 5.4482033110016064e-06, "loss": 0.4301, "step": 54300 }, { "epoch": 2.71, "grad_norm": 73.21637725830078, "learning_rate": 5.425133344406918e-06, "loss": 0.4003, "step": 54325 }, { "epoch": 2.71, "grad_norm": 1.427546739578247, "learning_rate": 5.402063377812229e-06, "loss": 0.4922, "step": 54350 }, { "epoch": 2.71, "grad_norm": 3.304819345474243, "learning_rate": 5.3789934112175405e-06, "loss": 0.3643, "step": 54375 }, { "epoch": 2.71, "grad_norm": 4.45997428894043, "learning_rate": 5.355923444622853e-06, "loss": 0.5532, "step": 54400 }, { "epoch": 2.71, "grad_norm": 3.3385884761810303, "learning_rate": 5.332853478028164e-06, "loss": 0.4333, "step": 54425 }, { "epoch": 2.71, "grad_norm": 0.6692785620689392, "learning_rate": 5.3097835114334754e-06, "loss": 0.568, "step": 54450 }, { "epoch": 2.71, "grad_norm": 2.820305347442627, "learning_rate": 5.286713544838788e-06, "loss": 0.3056, "step": 54475 }, { "epoch": 2.72, "grad_norm": 1.4250597953796387, "learning_rate": 5.263643578244099e-06, "loss": 0.4482, "step": 54500 }, { "epoch": 2.72, "grad_norm": 3.537055730819702, "learning_rate": 5.24057361164941e-06, "loss": 0.4922, "step": 54525 }, { "epoch": 2.72, "grad_norm": 0.8892665505409241, "learning_rate": 5.217503645054722e-06, "loss": 0.4592, "step": 54550 }, { "epoch": 2.72, "grad_norm": 0.7274606227874756, "learning_rate": 5.194433678460034e-06, "loss": 0.3322, "step": 54575 }, { "epoch": 2.72, "grad_norm": 2.687718391418457, "learning_rate": 5.171363711865346e-06, "loss": 0.4036, "step": 54600 }, { "epoch": 2.72, "grad_norm": 1.226678490638733, "learning_rate": 5.148293745270657e-06, "loss": 0.4232, "step": 54625 }, { "epoch": 2.72, "grad_norm": 4.129399299621582, "learning_rate": 5.125223778675969e-06, "loss": 0.4251, "step": 54650 }, { "epoch": 2.72, "grad_norm": 3.4940590858459473, "learning_rate": 5.10215381208128e-06, "loss": 0.525, "step": 54675 }, { "epoch": 2.73, "grad_norm": 3.4788758754730225, "learning_rate": 5.0790838454865924e-06, "loss": 0.4932, "step": 54700 }, { "epoch": 2.73, "grad_norm": 0.8910707235336304, "learning_rate": 5.056013878891904e-06, "loss": 0.3805, "step": 54725 }, { "epoch": 2.73, "grad_norm": 8.056558609008789, "learning_rate": 5.032943912297215e-06, "loss": 0.4761, "step": 54750 }, { "epoch": 2.73, "grad_norm": 0.702038586139679, "learning_rate": 5.009873945702527e-06, "loss": 0.3327, "step": 54775 }, { "epoch": 2.73, "grad_norm": 0.9609004259109497, "learning_rate": 4.986803979107838e-06, "loss": 0.3144, "step": 54800 }, { "epoch": 2.73, "grad_norm": 
8.136700630187988, "learning_rate": 4.96373401251315e-06, "loss": 0.2805, "step": 54825 }, { "epoch": 2.73, "grad_norm": 1.3470325469970703, "learning_rate": 4.9406640459184614e-06, "loss": 0.4093, "step": 54850 }, { "epoch": 2.73, "grad_norm": 4.504135608673096, "learning_rate": 4.917594079323774e-06, "loss": 0.612, "step": 54875 }, { "epoch": 2.74, "grad_norm": 0.6652967929840088, "learning_rate": 4.894524112729085e-06, "loss": 0.3323, "step": 54900 }, { "epoch": 2.74, "grad_norm": 0.6020931005477905, "learning_rate": 4.871454146134396e-06, "loss": 0.3657, "step": 54925 }, { "epoch": 2.74, "grad_norm": 7.894162178039551, "learning_rate": 4.8483841795397086e-06, "loss": 0.2989, "step": 54950 }, { "epoch": 2.74, "grad_norm": 2.817714214324951, "learning_rate": 4.82531421294502e-06, "loss": 0.53, "step": 54975 }, { "epoch": 2.74, "grad_norm": 5.955691337585449, "learning_rate": 4.802244246350331e-06, "loss": 0.4958, "step": 55000 }, { "epoch": 2.74, "grad_norm": 0.6819628477096558, "learning_rate": 4.7791742797556435e-06, "loss": 0.3742, "step": 55025 }, { "epoch": 2.74, "grad_norm": 3.3516225814819336, "learning_rate": 4.756104313160955e-06, "loss": 0.4367, "step": 55050 }, { "epoch": 2.74, "grad_norm": 0.8088192939758301, "learning_rate": 4.733034346566267e-06, "loss": 0.4869, "step": 55075 }, { "epoch": 2.75, "grad_norm": 11.700592994689941, "learning_rate": 4.7099643799715776e-06, "loss": 0.4097, "step": 55100 }, { "epoch": 2.75, "grad_norm": 0.9019712805747986, "learning_rate": 4.68689441337689e-06, "loss": 0.2985, "step": 55125 }, { "epoch": 2.75, "grad_norm": 0.7735490202903748, "learning_rate": 4.663824446782201e-06, "loss": 0.4336, "step": 55150 }, { "epoch": 2.75, "grad_norm": 8.80534839630127, "learning_rate": 4.6407544801875125e-06, "loss": 0.4035, "step": 55175 }, { "epoch": 2.75, "grad_norm": 2.1064610481262207, "learning_rate": 4.617684513592825e-06, "loss": 0.2458, "step": 55200 }, { "epoch": 2.75, "grad_norm": 2.9238836765289307, "learning_rate": 4.594614546998136e-06, "loss": 0.4663, "step": 55225 }, { "epoch": 2.75, "grad_norm": 10.220343589782715, "learning_rate": 4.571544580403448e-06, "loss": 0.5069, "step": 55250 }, { "epoch": 2.75, "grad_norm": 0.9700506925582886, "learning_rate": 4.548474613808759e-06, "loss": 0.4545, "step": 55275 }, { "epoch": 2.76, "grad_norm": 0.8662756085395813, "learning_rate": 4.525404647214071e-06, "loss": 0.5237, "step": 55300 }, { "epoch": 2.76, "grad_norm": 3.5456159114837646, "learning_rate": 4.502334680619383e-06, "loss": 0.2305, "step": 55325 }, { "epoch": 2.76, "grad_norm": 50.77360916137695, "learning_rate": 4.4792647140246946e-06, "loss": 0.4274, "step": 55350 }, { "epoch": 2.76, "grad_norm": 0.8012394905090332, "learning_rate": 4.456194747430006e-06, "loss": 0.3622, "step": 55375 }, { "epoch": 2.76, "grad_norm": 14.076640129089355, "learning_rate": 4.433124780835317e-06, "loss": 0.352, "step": 55400 }, { "epoch": 2.76, "grad_norm": 3.324470281600952, "learning_rate": 4.4100548142406295e-06, "loss": 0.4269, "step": 55425 }, { "epoch": 2.76, "grad_norm": 0.8007457256317139, "learning_rate": 4.386984847645941e-06, "loss": 0.4752, "step": 55450 }, { "epoch": 2.76, "grad_norm": 92.04176330566406, "learning_rate": 4.363914881051252e-06, "loss": 0.4391, "step": 55475 }, { "epoch": 2.77, "grad_norm": 0.700799822807312, "learning_rate": 4.340844914456564e-06, "loss": 0.6196, "step": 55500 }, { "epoch": 2.77, "grad_norm": 1.1383848190307617, "learning_rate": 4.317774947861876e-06, "loss": 0.5262, "step": 55525 }, { "epoch": 2.77, 
"grad_norm": 41.94340133666992, "learning_rate": 4.294704981267187e-06, "loss": 0.4941, "step": 55550 }, { "epoch": 2.77, "grad_norm": 4.038702487945557, "learning_rate": 4.2716350146724985e-06, "loss": 0.4238, "step": 55575 }, { "epoch": 2.77, "grad_norm": 3.343867301940918, "learning_rate": 4.248565048077811e-06, "loss": 0.3817, "step": 55600 }, { "epoch": 2.77, "grad_norm": 15.859959602355957, "learning_rate": 4.225495081483123e-06, "loss": 0.4272, "step": 55625 }, { "epoch": 2.77, "grad_norm": 3.166660785675049, "learning_rate": 4.202425114888433e-06, "loss": 0.3375, "step": 55650 }, { "epoch": 2.77, "grad_norm": 0.8695279359817505, "learning_rate": 4.179355148293746e-06, "loss": 0.3752, "step": 55675 }, { "epoch": 2.78, "grad_norm": 3.280230760574341, "learning_rate": 4.156285181699057e-06, "loss": 0.4374, "step": 55700 }, { "epoch": 2.78, "grad_norm": 6.026671886444092, "learning_rate": 4.133215215104369e-06, "loss": 0.3545, "step": 55725 }, { "epoch": 2.78, "grad_norm": 3.5654361248016357, "learning_rate": 4.1101452485096805e-06, "loss": 0.465, "step": 55750 }, { "epoch": 2.78, "grad_norm": 0.7267552614212036, "learning_rate": 4.087075281914992e-06, "loss": 0.3821, "step": 55775 }, { "epoch": 2.78, "grad_norm": 4.42562198638916, "learning_rate": 4.064005315320304e-06, "loss": 0.4465, "step": 55800 }, { "epoch": 2.78, "grad_norm": 3.448962926864624, "learning_rate": 4.040935348725615e-06, "loss": 0.4483, "step": 55825 }, { "epoch": 2.78, "grad_norm": 12.654199600219727, "learning_rate": 4.017865382130927e-06, "loss": 0.4897, "step": 55850 }, { "epoch": 2.78, "grad_norm": 6.2945027351379395, "learning_rate": 3.994795415536238e-06, "loss": 0.3699, "step": 55875 }, { "epoch": 2.79, "grad_norm": 4.625159740447998, "learning_rate": 3.97172544894155e-06, "loss": 0.3586, "step": 55900 }, { "epoch": 2.79, "grad_norm": 6.696944236755371, "learning_rate": 3.948655482346862e-06, "loss": 0.4257, "step": 55925 }, { "epoch": 2.79, "grad_norm": 3.5296878814697266, "learning_rate": 3.925585515752173e-06, "loss": 0.3581, "step": 55950 }, { "epoch": 2.79, "grad_norm": 9.328879356384277, "learning_rate": 3.902515549157485e-06, "loss": 0.3942, "step": 55975 }, { "epoch": 2.79, "grad_norm": 3.365654230117798, "learning_rate": 3.879445582562797e-06, "loss": 0.3671, "step": 56000 }, { "epoch": 2.79, "grad_norm": 9.659181594848633, "learning_rate": 3.856375615968108e-06, "loss": 0.4164, "step": 56025 }, { "epoch": 2.79, "grad_norm": 0.6306387782096863, "learning_rate": 3.83330564937342e-06, "loss": 0.2838, "step": 56050 }, { "epoch": 2.79, "grad_norm": 175.56747436523438, "learning_rate": 3.8102356827787316e-06, "loss": 0.5275, "step": 56075 }, { "epoch": 2.8, "grad_norm": 16.832124710083008, "learning_rate": 3.7871657161840434e-06, "loss": 0.5355, "step": 56100 }, { "epoch": 2.8, "grad_norm": 2.7186450958251953, "learning_rate": 3.7640957495893548e-06, "loss": 0.3458, "step": 56125 }, { "epoch": 2.8, "grad_norm": 3.481710433959961, "learning_rate": 3.7410257829946665e-06, "loss": 0.4307, "step": 56150 }, { "epoch": 2.8, "grad_norm": 0.9718573093414307, "learning_rate": 3.7179558163999783e-06, "loss": 0.4509, "step": 56175 }, { "epoch": 2.8, "grad_norm": 7.378538131713867, "learning_rate": 3.6948858498052893e-06, "loss": 0.3863, "step": 56200 }, { "epoch": 2.8, "grad_norm": 0.7621744275093079, "learning_rate": 3.671815883210601e-06, "loss": 0.3617, "step": 56225 }, { "epoch": 2.8, "grad_norm": 0.687117338180542, "learning_rate": 3.648745916615913e-06, "loss": 0.2841, "step": 56250 }, { "epoch": 2.8, 
"grad_norm": 3.461027145385742, "learning_rate": 3.6256759500212246e-06, "loss": 0.3593, "step": 56275 }, { "epoch": 2.81, "grad_norm": 6.926644325256348, "learning_rate": 3.602605983426536e-06, "loss": 0.4395, "step": 56300 }, { "epoch": 2.81, "grad_norm": 4.752397060394287, "learning_rate": 3.5795360168318477e-06, "loss": 0.4244, "step": 56325 }, { "epoch": 2.81, "grad_norm": 0.7236278057098389, "learning_rate": 3.5564660502371595e-06, "loss": 0.368, "step": 56350 }, { "epoch": 2.81, "grad_norm": 0.6548141837120056, "learning_rate": 3.5333960836424713e-06, "loss": 0.3861, "step": 56375 }, { "epoch": 2.81, "grad_norm": 7.0432209968566895, "learning_rate": 3.5103261170477827e-06, "loss": 0.4131, "step": 56400 }, { "epoch": 2.81, "grad_norm": 14.461806297302246, "learning_rate": 3.4872561504530945e-06, "loss": 0.5109, "step": 56425 }, { "epoch": 2.81, "grad_norm": 2.674800395965576, "learning_rate": 3.4641861838584062e-06, "loss": 0.4644, "step": 56450 }, { "epoch": 2.81, "grad_norm": 6.315777778625488, "learning_rate": 3.441116217263718e-06, "loss": 0.3623, "step": 56475 }, { "epoch": 2.82, "grad_norm": 0.7652971148490906, "learning_rate": 3.418046250669029e-06, "loss": 0.4106, "step": 56500 }, { "epoch": 2.82, "grad_norm": 3.053166627883911, "learning_rate": 3.3949762840743407e-06, "loss": 0.4133, "step": 56525 }, { "epoch": 2.82, "grad_norm": 1.1593868732452393, "learning_rate": 3.3719063174796525e-06, "loss": 0.4392, "step": 56550 }, { "epoch": 2.82, "grad_norm": 0.6778175234794617, "learning_rate": 3.348836350884964e-06, "loss": 0.3329, "step": 56575 }, { "epoch": 2.82, "grad_norm": 3.0964550971984863, "learning_rate": 3.3257663842902757e-06, "loss": 0.3984, "step": 56600 }, { "epoch": 2.82, "grad_norm": 1.697141408920288, "learning_rate": 3.3026964176955875e-06, "loss": 0.308, "step": 56625 }, { "epoch": 2.82, "grad_norm": 8.337875366210938, "learning_rate": 3.2796264511008992e-06, "loss": 0.3159, "step": 56650 }, { "epoch": 2.82, "grad_norm": 34.76048278808594, "learning_rate": 3.25655648450621e-06, "loss": 0.3352, "step": 56675 }, { "epoch": 2.83, "grad_norm": 7.32834529876709, "learning_rate": 3.233486517911522e-06, "loss": 0.4327, "step": 56700 }, { "epoch": 2.83, "grad_norm": 3.3736586570739746, "learning_rate": 3.210416551316834e-06, "loss": 0.4657, "step": 56725 }, { "epoch": 2.83, "grad_norm": 20.95182228088379, "learning_rate": 3.187346584722146e-06, "loss": 0.5894, "step": 56750 }, { "epoch": 2.83, "grad_norm": 0.7072112560272217, "learning_rate": 3.164276618127457e-06, "loss": 0.4317, "step": 56775 }, { "epoch": 2.83, "grad_norm": 43.19917297363281, "learning_rate": 3.1412066515327687e-06, "loss": 0.3036, "step": 56800 }, { "epoch": 2.83, "grad_norm": 0.7587588429450989, "learning_rate": 3.1181366849380804e-06, "loss": 0.3638, "step": 56825 }, { "epoch": 2.83, "grad_norm": 2.467799186706543, "learning_rate": 3.095066718343392e-06, "loss": 0.4133, "step": 56850 }, { "epoch": 2.83, "grad_norm": 9.070191383361816, "learning_rate": 3.0719967517487036e-06, "loss": 0.432, "step": 56875 }, { "epoch": 2.84, "grad_norm": 3.5591251850128174, "learning_rate": 3.0489267851540154e-06, "loss": 0.4507, "step": 56900 }, { "epoch": 2.84, "grad_norm": 0.7420728802680969, "learning_rate": 3.0258568185593267e-06, "loss": 0.356, "step": 56925 }, { "epoch": 2.84, "grad_norm": 60.67326354980469, "learning_rate": 3.0027868519646385e-06, "loss": 0.4632, "step": 56950 }, { "epoch": 2.84, "grad_norm": 0.8600183129310608, "learning_rate": 2.97971688536995e-06, "loss": 0.4973, "step": 56975 }, { 
"epoch": 2.84, "grad_norm": 0.7386710047721863, "learning_rate": 2.9566469187752617e-06, "loss": 0.308, "step": 57000 }, { "epoch": 2.84, "grad_norm": 3.9108200073242188, "learning_rate": 2.9335769521805734e-06, "loss": 0.3888, "step": 57025 }, { "epoch": 2.84, "grad_norm": 8.574189186096191, "learning_rate": 2.9105069855858852e-06, "loss": 0.3844, "step": 57050 }, { "epoch": 2.84, "grad_norm": 0.6366267800331116, "learning_rate": 2.8874370189911966e-06, "loss": 0.3923, "step": 57075 }, { "epoch": 2.85, "grad_norm": 19.721044540405273, "learning_rate": 2.8643670523965084e-06, "loss": 0.5066, "step": 57100 }, { "epoch": 2.85, "grad_norm": 3.4444007873535156, "learning_rate": 2.8412970858018197e-06, "loss": 0.4136, "step": 57125 }, { "epoch": 2.85, "grad_norm": 8.910185813903809, "learning_rate": 2.8182271192071315e-06, "loss": 0.5175, "step": 57150 }, { "epoch": 2.85, "grad_norm": 0.9261099100112915, "learning_rate": 2.7951571526124433e-06, "loss": 0.462, "step": 57175 }, { "epoch": 2.85, "grad_norm": 8.69874095916748, "learning_rate": 2.7720871860177547e-06, "loss": 0.3286, "step": 57200 }, { "epoch": 2.85, "grad_norm": 0.8175128698348999, "learning_rate": 2.7490172194230664e-06, "loss": 0.5005, "step": 57225 }, { "epoch": 2.85, "grad_norm": 6.2586750984191895, "learning_rate": 2.725947252828378e-06, "loss": 0.4317, "step": 57250 }, { "epoch": 2.85, "grad_norm": 0.8921399116516113, "learning_rate": 2.7028772862336896e-06, "loss": 0.2917, "step": 57275 }, { "epoch": 2.86, "grad_norm": 3.185152053833008, "learning_rate": 2.6798073196390014e-06, "loss": 0.366, "step": 57300 }, { "epoch": 2.86, "grad_norm": 1.9014008045196533, "learning_rate": 2.656737353044313e-06, "loss": 0.3909, "step": 57325 }, { "epoch": 2.86, "grad_norm": 1.7264939546585083, "learning_rate": 2.6336673864496245e-06, "loss": 0.3432, "step": 57350 }, { "epoch": 2.86, "grad_norm": 22.747976303100586, "learning_rate": 2.6105974198549363e-06, "loss": 0.4633, "step": 57375 }, { "epoch": 2.86, "grad_norm": 0.8758083581924438, "learning_rate": 2.5875274532602477e-06, "loss": 0.4581, "step": 57400 }, { "epoch": 2.86, "grad_norm": 3.4689791202545166, "learning_rate": 2.5644574866655594e-06, "loss": 0.3463, "step": 57425 }, { "epoch": 2.86, "grad_norm": 3.972005844116211, "learning_rate": 2.5413875200708712e-06, "loss": 0.4681, "step": 57450 }, { "epoch": 2.86, "grad_norm": 50.45981979370117, "learning_rate": 2.518317553476183e-06, "loss": 0.5288, "step": 57475 }, { "epoch": 2.87, "grad_norm": 1.1776808500289917, "learning_rate": 2.4952475868814944e-06, "loss": 0.5307, "step": 57500 }, { "epoch": 2.87, "grad_norm": 3.626554250717163, "learning_rate": 2.4721776202868057e-06, "loss": 0.4914, "step": 57525 }, { "epoch": 2.87, "grad_norm": 10.887569427490234, "learning_rate": 2.4491076536921175e-06, "loss": 0.3871, "step": 57550 }, { "epoch": 2.87, "grad_norm": 0.7736265063285828, "learning_rate": 2.426037687097429e-06, "loss": 0.3467, "step": 57575 }, { "epoch": 2.87, "grad_norm": 11.513165473937988, "learning_rate": 2.402967720502741e-06, "loss": 0.3595, "step": 57600 }, { "epoch": 2.87, "grad_norm": 0.938291072845459, "learning_rate": 2.3798977539080524e-06, "loss": 0.3298, "step": 57625 }, { "epoch": 2.87, "grad_norm": 1.0008108615875244, "learning_rate": 2.3568277873133642e-06, "loss": 0.3453, "step": 57650 }, { "epoch": 2.87, "grad_norm": 2.2188680171966553, "learning_rate": 2.3337578207186756e-06, "loss": 0.374, "step": 57675 }, { "epoch": 2.88, "grad_norm": 3.403674602508545, "learning_rate": 2.3106878541239874e-06, "loss": 
0.4336, "step": 57700 }, { "epoch": 2.88, "grad_norm": 0.7060783505439758, "learning_rate": 2.2876178875292987e-06, "loss": 0.3556, "step": 57725 }, { "epoch": 2.88, "grad_norm": 0.691160261631012, "learning_rate": 2.2645479209346105e-06, "loss": 0.4475, "step": 57750 }, { "epoch": 2.88, "grad_norm": 0.6420150399208069, "learning_rate": 2.2414779543399223e-06, "loss": 0.246, "step": 57775 }, { "epoch": 2.88, "grad_norm": 3.1971852779388428, "learning_rate": 2.218407987745234e-06, "loss": 0.2914, "step": 57800 }, { "epoch": 2.88, "grad_norm": 0.6994123458862305, "learning_rate": 2.1953380211505454e-06, "loss": 0.4257, "step": 57825 }, { "epoch": 2.88, "grad_norm": 0.673072338104248, "learning_rate": 2.172268054555857e-06, "loss": 0.4714, "step": 57850 }, { "epoch": 2.88, "grad_norm": 7.408416271209717, "learning_rate": 2.1491980879611686e-06, "loss": 0.5478, "step": 57875 }, { "epoch": 2.89, "grad_norm": 4.219334125518799, "learning_rate": 2.1261281213664804e-06, "loss": 0.3488, "step": 57900 }, { "epoch": 2.89, "grad_norm": 7.087649345397949, "learning_rate": 2.103058154771792e-06, "loss": 0.4356, "step": 57925 }, { "epoch": 2.89, "grad_norm": 22.720373153686523, "learning_rate": 2.0799881881771035e-06, "loss": 0.3209, "step": 57950 }, { "epoch": 2.89, "grad_norm": 21.636537551879883, "learning_rate": 2.0569182215824153e-06, "loss": 0.4794, "step": 57975 }, { "epoch": 2.89, "grad_norm": 4.3353447914123535, "learning_rate": 2.0338482549877266e-06, "loss": 0.3213, "step": 58000 }, { "epoch": 2.89, "grad_norm": 3.337759017944336, "learning_rate": 2.0107782883930384e-06, "loss": 0.4506, "step": 58025 }, { "epoch": 2.89, "grad_norm": 0.6747495532035828, "learning_rate": 1.98770832179835e-06, "loss": 0.5877, "step": 58050 }, { "epoch": 2.89, "grad_norm": 0.7362604737281799, "learning_rate": 1.964638355203662e-06, "loss": 0.3278, "step": 58075 }, { "epoch": 2.9, "grad_norm": 0.698776125907898, "learning_rate": 1.9415683886089733e-06, "loss": 0.4506, "step": 58100 }, { "epoch": 2.9, "grad_norm": 7.648797988891602, "learning_rate": 1.918498422014285e-06, "loss": 0.4466, "step": 58125 }, { "epoch": 2.9, "grad_norm": 3.523207664489746, "learning_rate": 1.8954284554195967e-06, "loss": 0.3555, "step": 58150 }, { "epoch": 2.9, "grad_norm": 2.721613883972168, "learning_rate": 1.8723584888249085e-06, "loss": 0.3494, "step": 58175 }, { "epoch": 2.9, "grad_norm": 0.7379086017608643, "learning_rate": 1.8492885222302198e-06, "loss": 0.3489, "step": 58200 }, { "epoch": 2.9, "grad_norm": 7.868014335632324, "learning_rate": 1.8262185556355314e-06, "loss": 0.5111, "step": 58225 }, { "epoch": 2.9, "grad_norm": 3.1979591846466064, "learning_rate": 1.8031485890408432e-06, "loss": 0.3398, "step": 58250 }, { "epoch": 2.9, "grad_norm": 3.2477834224700928, "learning_rate": 1.7800786224461546e-06, "loss": 0.4812, "step": 58275 }, { "epoch": 2.91, "grad_norm": 0.7919636368751526, "learning_rate": 1.7570086558514666e-06, "loss": 0.3957, "step": 58300 }, { "epoch": 2.91, "grad_norm": 9.772295951843262, "learning_rate": 1.733938689256778e-06, "loss": 0.3896, "step": 58325 }, { "epoch": 2.91, "grad_norm": 52.001590728759766, "learning_rate": 1.7108687226620897e-06, "loss": 0.4273, "step": 58350 }, { "epoch": 2.91, "grad_norm": 4.567387580871582, "learning_rate": 1.6877987560674013e-06, "loss": 0.5927, "step": 58375 }, { "epoch": 2.91, "grad_norm": 0.7052479982376099, "learning_rate": 1.664728789472713e-06, "loss": 0.4157, "step": 58400 }, { "epoch": 2.91, "grad_norm": 12.553930282592773, "learning_rate": 
1.6416588228780244e-06, "loss": 0.3005, "step": 58425 }, { "epoch": 2.91, "grad_norm": 1.8058936595916748, "learning_rate": 1.6185888562833362e-06, "loss": 0.4036, "step": 58450 }, { "epoch": 2.91, "grad_norm": 0.6824276447296143, "learning_rate": 1.5955188896886478e-06, "loss": 0.4308, "step": 58475 }, { "epoch": 2.92, "grad_norm": 4.887246131896973, "learning_rate": 1.5724489230939596e-06, "loss": 0.5227, "step": 58500 }, { "epoch": 2.92, "grad_norm": 3.1629316806793213, "learning_rate": 1.5493789564992711e-06, "loss": 0.4616, "step": 58525 }, { "epoch": 2.92, "grad_norm": 0.7038543224334717, "learning_rate": 1.5263089899045827e-06, "loss": 0.4053, "step": 58550 }, { "epoch": 2.92, "grad_norm": 0.8581077456474304, "learning_rate": 1.5032390233098943e-06, "loss": 0.3863, "step": 58575 }, { "epoch": 2.92, "grad_norm": 3.157999038696289, "learning_rate": 1.480169056715206e-06, "loss": 0.3802, "step": 58600 }, { "epoch": 2.92, "grad_norm": 8.047822952270508, "learning_rate": 1.4570990901205176e-06, "loss": 0.4406, "step": 58625 }, { "epoch": 2.92, "grad_norm": 3.3583245277404785, "learning_rate": 1.4340291235258292e-06, "loss": 0.4823, "step": 58650 }, { "epoch": 2.92, "grad_norm": 0.6569061875343323, "learning_rate": 1.410959156931141e-06, "loss": 0.426, "step": 58675 }, { "epoch": 2.93, "grad_norm": 9.12439250946045, "learning_rate": 1.3878891903364525e-06, "loss": 0.3746, "step": 58700 }, { "epoch": 2.93, "grad_norm": 435.5043640136719, "learning_rate": 1.3648192237417641e-06, "loss": 0.2853, "step": 58725 }, { "epoch": 2.93, "grad_norm": 6.289416313171387, "learning_rate": 1.3417492571470757e-06, "loss": 0.296, "step": 58750 }, { "epoch": 2.93, "grad_norm": 3.6272895336151123, "learning_rate": 1.3186792905523873e-06, "loss": 0.4109, "step": 58775 }, { "epoch": 2.93, "grad_norm": 0.7682371139526367, "learning_rate": 1.2956093239576988e-06, "loss": 0.4651, "step": 58800 }, { "epoch": 2.93, "grad_norm": 3.5857977867126465, "learning_rate": 1.2725393573630106e-06, "loss": 0.4428, "step": 58825 }, { "epoch": 2.93, "grad_norm": 3.7168776988983154, "learning_rate": 1.2494693907683222e-06, "loss": 0.4103, "step": 58850 }, { "epoch": 2.93, "grad_norm": 3.0777294635772705, "learning_rate": 1.2263994241736338e-06, "loss": 0.3149, "step": 58875 }, { "epoch": 2.94, "grad_norm": 3.4248714447021484, "learning_rate": 1.2033294575789455e-06, "loss": 0.3501, "step": 58900 }, { "epoch": 2.94, "grad_norm": 3.277421236038208, "learning_rate": 1.1802594909842571e-06, "loss": 0.4754, "step": 58925 }, { "epoch": 2.94, "grad_norm": 0.7078830003738403, "learning_rate": 1.1571895243895687e-06, "loss": 0.416, "step": 58950 }, { "epoch": 2.94, "grad_norm": 4.206014633178711, "learning_rate": 1.1341195577948805e-06, "loss": 0.3185, "step": 58975 }, { "epoch": 2.94, "grad_norm": 14.83720588684082, "learning_rate": 1.111049591200192e-06, "loss": 0.3692, "step": 59000 }, { "epoch": 2.94, "grad_norm": 0.7804746627807617, "learning_rate": 1.0879796246055036e-06, "loss": 0.304, "step": 59025 }, { "epoch": 2.94, "grad_norm": 3.3699700832366943, "learning_rate": 1.0649096580108154e-06, "loss": 0.3233, "step": 59050 }, { "epoch": 2.94, "grad_norm": 3.395963668823242, "learning_rate": 1.041839691416127e-06, "loss": 0.4438, "step": 59075 }, { "epoch": 2.94, "grad_norm": 3.5054526329040527, "learning_rate": 1.0187697248214385e-06, "loss": 0.4384, "step": 59100 }, { "epoch": 2.95, "grad_norm": 3.579428195953369, "learning_rate": 9.956997582267501e-07, "loss": 0.436, "step": 59125 }, { "epoch": 2.95, "grad_norm": 
3.2751588821411133, "learning_rate": 9.726297916320617e-07, "loss": 0.4432, "step": 59150 }, { "epoch": 2.95, "grad_norm": 3.472015857696533, "learning_rate": 9.495598250373734e-07, "loss": 0.5857, "step": 59175 }, { "epoch": 2.95, "grad_norm": 10.637346267700195, "learning_rate": 9.26489858442685e-07, "loss": 0.3446, "step": 59200 }, { "epoch": 2.95, "grad_norm": 63.770668029785156, "learning_rate": 9.034198918479966e-07, "loss": 0.3659, "step": 59225 }, { "epoch": 2.95, "grad_norm": 5.983547687530518, "learning_rate": 8.803499252533083e-07, "loss": 0.447, "step": 59250 }, { "epoch": 2.95, "grad_norm": 5.243884086608887, "learning_rate": 8.5727995865862e-07, "loss": 0.4767, "step": 59275 }, { "epoch": 2.95, "grad_norm": 3.3908684253692627, "learning_rate": 8.342099920639315e-07, "loss": 0.4583, "step": 59300 }, { "epoch": 2.96, "grad_norm": 3.267824172973633, "learning_rate": 8.111400254692432e-07, "loss": 0.3493, "step": 59325 }, { "epoch": 2.96, "grad_norm": 8.377107620239258, "learning_rate": 7.880700588745549e-07, "loss": 0.4306, "step": 59350 }, { "epoch": 2.96, "grad_norm": 0.8545198440551758, "learning_rate": 7.650000922798664e-07, "loss": 0.3806, "step": 59375 }, { "epoch": 2.96, "grad_norm": 5.678221225738525, "learning_rate": 7.41930125685178e-07, "loss": 0.3354, "step": 59400 }, { "epoch": 2.96, "grad_norm": 0.791506826877594, "learning_rate": 7.188601590904897e-07, "loss": 0.3562, "step": 59425 }, { "epoch": 2.96, "grad_norm": 3.0934579372406006, "learning_rate": 6.957901924958013e-07, "loss": 0.396, "step": 59450 }, { "epoch": 2.96, "grad_norm": 6.885002136230469, "learning_rate": 6.72720225901113e-07, "loss": 0.4913, "step": 59475 }, { "epoch": 2.96, "grad_norm": 6.017404556274414, "learning_rate": 6.496502593064246e-07, "loss": 0.3693, "step": 59500 }, { "epoch": 2.97, "grad_norm": 7.71619987487793, "learning_rate": 6.265802927117361e-07, "loss": 0.3407, "step": 59525 }, { "epoch": 2.97, "grad_norm": 0.6652907133102417, "learning_rate": 6.035103261170478e-07, "loss": 0.2536, "step": 59550 }, { "epoch": 2.97, "grad_norm": 0.734829843044281, "learning_rate": 5.804403595223595e-07, "loss": 0.4627, "step": 59575 }, { "epoch": 2.97, "grad_norm": 0.6811460852622986, "learning_rate": 5.57370392927671e-07, "loss": 0.4573, "step": 59600 }, { "epoch": 2.97, "grad_norm": 2.0909130573272705, "learning_rate": 5.343004263329827e-07, "loss": 0.4136, "step": 59625 }, { "epoch": 2.97, "grad_norm": 7.259815216064453, "learning_rate": 5.112304597382944e-07, "loss": 0.3893, "step": 59650 }, { "epoch": 2.97, "grad_norm": 0.6684561371803284, "learning_rate": 4.88160493143606e-07, "loss": 0.4502, "step": 59675 }, { "epoch": 2.97, "grad_norm": 3.2914390563964844, "learning_rate": 4.650905265489175e-07, "loss": 0.272, "step": 59700 }, { "epoch": 2.98, "grad_norm": 4.816464900970459, "learning_rate": 4.420205599542292e-07, "loss": 0.3046, "step": 59725 }, { "epoch": 2.98, "grad_norm": 1.9208866357803345, "learning_rate": 4.189505933595408e-07, "loss": 0.4547, "step": 59750 }, { "epoch": 2.98, "grad_norm": 18.676660537719727, "learning_rate": 3.9588062676485245e-07, "loss": 0.4184, "step": 59775 }, { "epoch": 2.98, "grad_norm": 0.7250289916992188, "learning_rate": 3.7281066017016413e-07, "loss": 0.471, "step": 59800 }, { "epoch": 2.98, "grad_norm": 7.761043548583984, "learning_rate": 3.497406935754757e-07, "loss": 0.3157, "step": 59825 }, { "epoch": 2.98, "grad_norm": 1.860449194908142, "learning_rate": 3.266707269807873e-07, "loss": 0.3256, "step": 59850 }, { "epoch": 2.98, "grad_norm": 
23.85589027404785, "learning_rate": 3.03600760386099e-07, "loss": 0.3953, "step": 59875 }, { "epoch": 2.98, "grad_norm": 0.6777181029319763, "learning_rate": 2.8053079379141057e-07, "loss": 0.2922, "step": 59900 }, { "epoch": 2.99, "grad_norm": 8.837489128112793, "learning_rate": 2.574608271967222e-07, "loss": 0.4033, "step": 59925 }, { "epoch": 2.99, "grad_norm": 3.261415719985962, "learning_rate": 2.3439086060203387e-07, "loss": 0.3313, "step": 59950 }, { "epoch": 2.99, "grad_norm": 21.563766479492188, "learning_rate": 2.113208940073455e-07, "loss": 0.4912, "step": 59975 }, { "epoch": 2.99, "grad_norm": 7.959074974060059, "learning_rate": 1.8825092741265712e-07, "loss": 0.4481, "step": 60000 }, { "epoch": 2.99, "grad_norm": 33.16374969482422, "learning_rate": 1.6518096081796875e-07, "loss": 0.4365, "step": 60025 }, { "epoch": 2.99, "grad_norm": 1.0097713470458984, "learning_rate": 1.4211099422328037e-07, "loss": 0.4598, "step": 60050 }, { "epoch": 2.99, "grad_norm": 13.992691993713379, "learning_rate": 1.19041027628592e-07, "loss": 0.4552, "step": 60075 }, { "epoch": 2.99, "grad_norm": 3.1975209712982178, "learning_rate": 9.597106103390363e-08, "loss": 0.5176, "step": 60100 }, { "epoch": 3.0, "grad_norm": 0.6553680896759033, "learning_rate": 7.290109443921525e-08, "loss": 0.4653, "step": 60125 }, { "epoch": 3.0, "grad_norm": 31.342302322387695, "learning_rate": 4.9831127844526877e-08, "loss": 0.5265, "step": 60150 }, { "epoch": 3.0, "grad_norm": 8.892498016357422, "learning_rate": 2.676116124983851e-08, "loss": 0.5413, "step": 60175 }, { "epoch": 3.0, "grad_norm": 6.391012191772461, "learning_rate": 3.691194655150139e-09, "loss": 0.4561, "step": 60200 }, { "epoch": 3.0, "eval_accuracy": 0.8848137535816619, "eval_f1_macro": 0.7960248281288549, "eval_f1_micro": 0.8848137535816619, "eval_f1_weighted": 0.8831232981645943, "eval_loss": 0.41398245096206665, "eval_precision_macro": 0.8400378915863742, "eval_precision_micro": 0.8848137535816619, "eval_precision_weighted": 0.8831401394863536, "eval_recall_macro": 0.7672428418298921, "eval_recall_micro": 0.8848137535816619, "eval_recall_weighted": 0.8848137535816619, "eval_runtime": 6833.944, "eval_samples_per_second": 5.873, "eval_steps_per_second": 0.367, "step": 60204 } ], "logging_steps": 25, "max_steps": 60204, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 3.167977639141325e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }
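
The JSON above is a complete Trainer state: each `log_history` entry with a "loss" key is a 25-step training log point, and the final entry carries the "eval_*" metrics for the run. A minimal sketch of how such a file could be summarized follows; it assumes only the keys that actually appear above ("log_history", "loss", "step", "eval_*", "best_metric"), assumes the file is saved under the conventional name trainer_state.json, and uses nothing beyond the Python standard library.

# inspect_trainer_state.py
# Sketch: summarize a Trainer state file shaped like the JSON above.
# Assumption: the JSON is stored as "trainer_state.json" in the working directory.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Training points carry a plain "loss" key; the evaluation entry carries "eval_*" keys instead.
train_log = [e for e in state["log_history"] if "loss" in e]
eval_log = [e for e in state["log_history"] if "eval_loss" in e]

print(f"logged training points: {len(train_log)}")
print(f"last logged step: {train_log[-1]['step']}, last training loss: {train_log[-1]['loss']}")
print(f"best_metric reported by the trainer: {state['best_metric']}")

if eval_log:
    final_eval = eval_log[-1]
    for key in ("eval_loss", "eval_accuracy", "eval_f1_macro", "eval_f1_weighted"):
        if key in final_eval:
            print(f"{key}: {final_eval[key]}")

Run against this file, the script would report the final evaluation at step 60204 (eval_loss ≈ 0.414, eval_accuracy ≈ 0.885), matching the "best_metric" and "best_model_checkpoint" recorded in the state.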