| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.436871996505024, | |
| "eval_steps": 500, | |
| "global_step": 500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.00436871996505024, | |
| "grad_norm": 9.839941024780273, | |
| "learning_rate": 8e-05, | |
| "loss": 2.5246, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.00873743993010048, | |
| "grad_norm": 13.773455619812012, | |
| "learning_rate": 0.00018, | |
| "loss": 1.1343, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.01310615989515072, | |
| "grad_norm": 5.6580424308776855, | |
| "learning_rate": 0.0001999997582552296, | |
| "loss": 0.7712, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.01747487986020096, | |
| "grad_norm": 5.294467926025391, | |
| "learning_rate": 0.0001999987761691029, | |
| "loss": 0.73, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.021843599825251202, | |
| "grad_norm": 2.8633503913879395, | |
| "learning_rate": 0.00019999703863998527, | |
| "loss": 0.7289, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.02621231979030144, | |
| "grad_norm": 3.2836177349090576, | |
| "learning_rate": 0.00019999454568100293, | |
| "loss": 0.4686, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.03058103975535168, | |
| "grad_norm": 4.878258228302002, | |
| "learning_rate": 0.00019999129731098898, | |
| "loss": 0.6629, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.03494975972040192, | |
| "grad_norm": 2.899914026260376, | |
| "learning_rate": 0.00019998729355448326, | |
| "loss": 0.6038, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.039318479685452164, | |
| "grad_norm": 3.289844274520874, | |
| "learning_rate": 0.00019998253444173235, | |
| "loss": 0.4573, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.043687199650502405, | |
| "grad_norm": 2.957254648208618, | |
| "learning_rate": 0.00019997702000868896, | |
| "loss": 0.594, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.048055919615552646, | |
| "grad_norm": 3.171276807785034, | |
| "learning_rate": 0.00019997075029701207, | |
| "loss": 0.5719, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.05242463958060288, | |
| "grad_norm": 2.55605149269104, | |
| "learning_rate": 0.0001999637253540663, | |
| "loss": 0.5971, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.05679335954565312, | |
| "grad_norm": 2.127289295196533, | |
| "learning_rate": 0.00019995594523292178, | |
| "loss": 0.5712, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.06116207951070336, | |
| "grad_norm": 3.3928685188293457, | |
| "learning_rate": 0.00019994740999235359, | |
| "loss": 0.5712, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.0655307994757536, | |
| "grad_norm": 2.6700279712677, | |
| "learning_rate": 0.00019993811969684142, | |
| "loss": 0.427, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.06989951944080385, | |
| "grad_norm": 2.6936633586883545, | |
| "learning_rate": 0.00019992807441656898, | |
| "loss": 0.5321, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.07426823940585409, | |
| "grad_norm": 3.9897687435150146, | |
| "learning_rate": 0.00019991727422742362, | |
| "loss": 0.6025, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.07863695937090433, | |
| "grad_norm": 2.3496663570404053, | |
| "learning_rate": 0.00019990571921099553, | |
| "loss": 0.5975, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.08300567933595457, | |
| "grad_norm": 3.3796467781066895, | |
| "learning_rate": 0.0001998934094545774, | |
| "loss": 0.5255, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.08737439930100481, | |
| "grad_norm": 3.1103007793426514, | |
| "learning_rate": 0.00019988034505116352, | |
| "loss": 0.4946, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.09174311926605505, | |
| "grad_norm": 2.002304792404175, | |
| "learning_rate": 0.00019986652609944926, | |
| "loss": 0.425, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.09611183923110529, | |
| "grad_norm": 1.7572168111801147, | |
| "learning_rate": 0.00019985195270383018, | |
| "loss": 0.6073, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.10048055919615553, | |
| "grad_norm": 2.745215654373169, | |
| "learning_rate": 0.00019983662497440133, | |
| "loss": 0.586, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.10484927916120576, | |
| "grad_norm": 1.8170915842056274, | |
| "learning_rate": 0.0001998205430269564, | |
| "loss": 0.5255, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.109217999126256, | |
| "grad_norm": 1.4944056272506714, | |
| "learning_rate": 0.00019980370698298677, | |
| "loss": 0.4219, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.11358671909130624, | |
| "grad_norm": 1.6616989374160767, | |
| "learning_rate": 0.00019978611696968074, | |
| "loss": 0.4231, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.11795543905635648, | |
| "grad_norm": 2.0523645877838135, | |
| "learning_rate": 0.00019976777311992247, | |
| "loss": 0.5298, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.12232415902140673, | |
| "grad_norm": 2.065765619277954, | |
| "learning_rate": 0.00019974867557229098, | |
| "loss": 0.5228, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.12669287898645698, | |
| "grad_norm": 1.7283438444137573, | |
| "learning_rate": 0.00019972882447105912, | |
| "loss": 0.3452, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.1310615989515072, | |
| "grad_norm": 2.655750274658203, | |
| "learning_rate": 0.00019970821996619244, | |
| "loss": 0.508, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.13543031891655744, | |
| "grad_norm": 2.67799973487854, | |
| "learning_rate": 0.0001996868622133482, | |
| "loss": 0.4359, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.1397990388816077, | |
| "grad_norm": 1.6298809051513672, | |
| "learning_rate": 0.00019966475137387396, | |
| "loss": 0.5447, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.14416775884665792, | |
| "grad_norm": 1.4772286415100098, | |
| "learning_rate": 0.00019964188761480657, | |
| "loss": 0.4105, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.14853647881170817, | |
| "grad_norm": 2.2986271381378174, | |
| "learning_rate": 0.00019961827110887083, | |
| "loss": 0.603, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.1529051987767584, | |
| "grad_norm": 2.8261911869049072, | |
| "learning_rate": 0.00019959390203447817, | |
| "loss": 0.4649, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.15727391874180865, | |
| "grad_norm": 1.7771011590957642, | |
| "learning_rate": 0.00019956878057572524, | |
| "loss": 0.4394, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.16164263870685888, | |
| "grad_norm": 1.7315421104431152, | |
| "learning_rate": 0.00019954290692239274, | |
| "loss": 0.5289, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.16601135867190914, | |
| "grad_norm": 1.6124423742294312, | |
| "learning_rate": 0.00019951628126994373, | |
| "loss": 0.4173, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.17038007863695936, | |
| "grad_norm": 1.792577862739563, | |
| "learning_rate": 0.00019948890381952232, | |
| "loss": 0.4331, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.17474879860200962, | |
| "grad_norm": 1.9038774967193604, | |
| "learning_rate": 0.000199460774777952, | |
| "loss": 0.4247, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.17911751856705985, | |
| "grad_norm": 2.457122802734375, | |
| "learning_rate": 0.00019943189435773432, | |
| "loss": 0.4519, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.1834862385321101, | |
| "grad_norm": 1.97683584690094, | |
| "learning_rate": 0.00019940226277704706, | |
| "loss": 0.4761, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.18785495849716033, | |
| "grad_norm": 2.1646862030029297, | |
| "learning_rate": 0.0001993718802597426, | |
| "loss": 0.5294, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.19222367846221058, | |
| "grad_norm": 1.565412998199463, | |
| "learning_rate": 0.00019934074703534637, | |
| "loss": 0.3999, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.1965923984272608, | |
| "grad_norm": 2.4315876960754395, | |
| "learning_rate": 0.00019930886333905504, | |
| "loss": 0.378, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.20096111839231107, | |
| "grad_norm": 2.7567529678344727, | |
| "learning_rate": 0.00019927622941173467, | |
| "loss": 0.5075, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.2053298383573613, | |
| "grad_norm": 1.8640387058258057, | |
| "learning_rate": 0.00019924284549991902, | |
| "loss": 0.4749, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.20969855832241152, | |
| "grad_norm": 2.090924024581909, | |
| "learning_rate": 0.00019920871185580757, | |
| "loss": 0.4353, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.21406727828746178, | |
| "grad_norm": 1.9691081047058105, | |
| "learning_rate": 0.00019917382873726376, | |
| "loss": 0.4051, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.218435998252512, | |
| "grad_norm": 1.8130213022232056, | |
| "learning_rate": 0.0001991381964078128, | |
| "loss": 0.526, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.22280471821756226, | |
| "grad_norm": 2.078805923461914, | |
| "learning_rate": 0.00019910181513664, | |
| "loss": 0.5654, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.22717343818261249, | |
| "grad_norm": 2.0686287879943848, | |
| "learning_rate": 0.0001990646851985884, | |
| "loss": 0.43, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.23154215814766274, | |
| "grad_norm": 1.475821614265442, | |
| "learning_rate": 0.00019902680687415705, | |
| "loss": 0.355, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.23591087811271297, | |
| "grad_norm": 1.901236891746521, | |
| "learning_rate": 0.0001989881804494985, | |
| "loss": 0.4522, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.24027959807776322, | |
| "grad_norm": 1.2583553791046143, | |
| "learning_rate": 0.00019894880621641704, | |
| "loss": 0.3869, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.24464831804281345, | |
| "grad_norm": 1.712336540222168, | |
| "learning_rate": 0.00019890868447236613, | |
| "loss": 0.454, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.2490170380078637, | |
| "grad_norm": 2.3967206478118896, | |
| "learning_rate": 0.00019886781552044634, | |
| "loss": 0.4074, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.25338575797291396, | |
| "grad_norm": 2.0578925609588623, | |
| "learning_rate": 0.0001988261996694032, | |
| "loss": 0.4268, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.2577544779379642, | |
| "grad_norm": 1.7411088943481445, | |
| "learning_rate": 0.0001987838372336245, | |
| "loss": 0.334, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.2621231979030144, | |
| "grad_norm": 1.8145533800125122, | |
| "learning_rate": 0.0001987407285331382, | |
| "loss": 0.4019, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.26649191786806464, | |
| "grad_norm": 1.3501653671264648, | |
| "learning_rate": 0.00019869687389361, | |
| "loss": 0.32, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.27086063783311487, | |
| "grad_norm": 1.208422303199768, | |
| "learning_rate": 0.00019865227364634073, | |
| "loss": 0.4548, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.27522935779816515, | |
| "grad_norm": 1.521690011024475, | |
| "learning_rate": 0.00019860692812826396, | |
| "loss": 0.3572, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.2795980777632154, | |
| "grad_norm": 2.2849714756011963, | |
| "learning_rate": 0.0001985608376819434, | |
| "loss": 0.4555, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.2839667977282656, | |
| "grad_norm": 2.7733798027038574, | |
| "learning_rate": 0.00019851400265557037, | |
| "loss": 0.4726, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.28833551769331583, | |
| "grad_norm": 1.973522424697876, | |
| "learning_rate": 0.00019846642340296114, | |
| "loss": 0.4585, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.2927042376583661, | |
| "grad_norm": 1.7133642435073853, | |
| "learning_rate": 0.0001984181002835542, | |
| "loss": 0.4679, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.29707295762341634, | |
| "grad_norm": 2.8383235931396484, | |
| "learning_rate": 0.00019836903366240768, | |
| "loss": 0.4119, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.30144167758846657, | |
| "grad_norm": 2.798276901245117, | |
| "learning_rate": 0.00019831922391019645, | |
| "loss": 0.3665, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.3058103975535168, | |
| "grad_norm": 2.171276569366455, | |
| "learning_rate": 0.00019826867140320938, | |
| "loss": 0.5691, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.3101791175185671, | |
| "grad_norm": 2.0866177082061768, | |
| "learning_rate": 0.00019821737652334653, | |
| "loss": 0.4074, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.3145478374836173, | |
| "grad_norm": 1.3713918924331665, | |
| "learning_rate": 0.0001981653396581162, | |
| "loss": 0.3379, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.31891655744866754, | |
| "grad_norm": 1.6086684465408325, | |
| "learning_rate": 0.0001981125612006321, | |
| "loss": 0.3563, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.32328527741371776, | |
| "grad_norm": 2.655686378479004, | |
| "learning_rate": 0.0001980590415496102, | |
| "loss": 0.3988, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.32765399737876805, | |
| "grad_norm": 1.5271559953689575, | |
| "learning_rate": 0.00019800478110936596, | |
| "loss": 0.5784, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.3320227173438183, | |
| "grad_norm": 1.3043195009231567, | |
| "learning_rate": 0.00019794978028981106, | |
| "loss": 0.2637, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.3363914373088685, | |
| "grad_norm": 2.539109706878662, | |
| "learning_rate": 0.0001978940395064504, | |
| "loss": 0.4658, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.34076015727391873, | |
| "grad_norm": 1.7521268129348755, | |
| "learning_rate": 0.00019783755918037903, | |
| "loss": 0.4253, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.34512887723896896, | |
| "grad_norm": 1.5679692029953003, | |
| "learning_rate": 0.00019778033973827882, | |
| "loss": 0.4528, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.34949759720401924, | |
| "grad_norm": 1.670640468597412, | |
| "learning_rate": 0.00019772238161241528, | |
| "loss": 0.3724, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.35386631716906947, | |
| "grad_norm": 1.520856261253357, | |
| "learning_rate": 0.00019766368524063438, | |
| "loss": 0.4141, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.3582350371341197, | |
| "grad_norm": 1.0802158117294312, | |
| "learning_rate": 0.00019760425106635926, | |
| "loss": 0.3268, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.3626037570991699, | |
| "grad_norm": 1.7306379079818726, | |
| "learning_rate": 0.0001975440795385866, | |
| "loss": 0.3654, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.3669724770642202, | |
| "grad_norm": 1.5037274360656738, | |
| "learning_rate": 0.0001974831711118836, | |
| "loss": 0.4285, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.37134119702927043, | |
| "grad_norm": 1.4654844999313354, | |
| "learning_rate": 0.00019742152624638437, | |
| "loss": 0.2548, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.37570991699432066, | |
| "grad_norm": 2.6770753860473633, | |
| "learning_rate": 0.00019735914540778638, | |
| "loss": 0.4238, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.3800786369593709, | |
| "grad_norm": 1.1864055395126343, | |
| "learning_rate": 0.00019729602906734704, | |
| "loss": 0.3959, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.38444735692442117, | |
| "grad_norm": 1.904876708984375, | |
| "learning_rate": 0.00019723217770188024, | |
| "loss": 0.3603, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.3888160768894714, | |
| "grad_norm": 1.7086598873138428, | |
| "learning_rate": 0.0001971675917937525, | |
| "loss": 0.551, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.3931847968545216, | |
| "grad_norm": 1.4635995626449585, | |
| "learning_rate": 0.00019710227183087947, | |
| "loss": 0.3738, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.39755351681957185, | |
| "grad_norm": 1.6047295331954956, | |
| "learning_rate": 0.00019703621830672238, | |
| "loss": 0.475, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.40192223678462213, | |
| "grad_norm": 1.4741933345794678, | |
| "learning_rate": 0.00019696943172028394, | |
| "loss": 0.4021, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.40629095674967236, | |
| "grad_norm": 2.8138020038604736, | |
| "learning_rate": 0.00019690191257610497, | |
| "loss": 0.3665, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.4106596767147226, | |
| "grad_norm": 1.6264874935150146, | |
| "learning_rate": 0.00019683366138426034, | |
| "loss": 0.3598, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.4150283966797728, | |
| "grad_norm": 1.6185061931610107, | |
| "learning_rate": 0.00019676467866035525, | |
| "loss": 0.5003, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.41939711664482304, | |
| "grad_norm": 1.8654040098190308, | |
| "learning_rate": 0.00019669496492552113, | |
| "loss": 0.397, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.4237658366098733, | |
| "grad_norm": 1.2525237798690796, | |
| "learning_rate": 0.00019662452070641205, | |
| "loss": 0.3235, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.42813455657492355, | |
| "grad_norm": 1.7755401134490967, | |
| "learning_rate": 0.00019655334653520036, | |
| "loss": 0.2978, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.4325032765399738, | |
| "grad_norm": 1.6025470495224, | |
| "learning_rate": 0.00019648144294957297, | |
| "loss": 0.4436, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.436871996505024, | |
| "grad_norm": 1.085461974143982, | |
| "learning_rate": 0.00019640881049272713, | |
| "loss": 0.22, | |
| "step": 500 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 5725, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 243832462098432.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |