ale-bay committed
Commit c56e3a0
1 Parent(s): 9971a84

Model save

README.md CHANGED
@@ -1,15 +1,10 @@
 ---
-base_model: data/gemma-2b
 tags:
-- alignment-handbook
-- trl
-- sft
-- generated_from_trainer
 - trl
 - sft
 - generated_from_trainer
 datasets:
-- argilla/dpo-mix-7k
+- generator
 model-index:
 - name: zephyr-2b-gemma-dft-debug
   results: []
@@ -20,9 +15,9 @@ should probably proofread and complete it, then remove this comment. -->
 
 # zephyr-2b-gemma-dft-debug
 
-This model is a fine-tuned version of [data/gemma-2b](https://huggingface.co/data/gemma-2b) on the argilla/dpo-mix-7k dataset.
+This model was trained from scratch on the generator dataset.
 It achieves the following results on the evaluation set:
-- Loss: 5.7530
+- Loss: 5.7648
 
 ## Model description
 
@@ -56,7 +51,7 @@ The following hyperparameters were used during training:
 
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:------:|:----:|:---------------:|
-| 5.7909 | 0.9982 | 270 | 5.7530 |
+| 5.8055 | 0.9982 | 270 | 5.7648 |
 
 
 ### Framework versions
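
The updated card reports a validation loss of 5.7648 at step 270. A minimal sketch for sanity-checking the committed checkpoint locally, assuming the files from this commit are available under a local path or hub repo id (the path below is a placeholder, not taken from the commit):

```python
# Sketch: load the committed checkpoint and run a short generation.
# Assumption: "zephyr-2b-gemma-dft-debug" stands in for wherever this
# commit's files live; substitute the real path or repo id.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

path = "zephyr-2b-gemma-dft-debug"  # hypothetical local path / repo id
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.bfloat16)

inputs = tokenizer("Hello, world", return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```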
all_results.json CHANGED
@@ -1,14 +1,9 @@
 {
     "epoch": 0.9981515711645101,
-    "eval_loss": 5.75302791595459,
-    "eval_runtime": 422.4972,
-    "eval_samples": 750,
-    "eval_samples_per_second": 2.298,
-    "eval_steps_per_second": 0.289,
     "total_flos": 5.260333472022528e+16,
-    "train_loss": 7.3803352285314485,
-    "train_runtime": 2516.7229,
+    "train_loss": 7.369897298459653,
+    "train_runtime": 2195.5957,
     "train_samples": 6750,
-    "train_samples_per_second": 3.438,
-    "train_steps_per_second": 0.107
+    "train_samples_per_second": 3.941,
+    "train_steps_per_second": 0.123
 }
config.json CHANGED
@@ -24,6 +24,6 @@
   "rope_theta": 10000.0,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.43.3",
-  "use_cache": true,
+  "use_cache": false,
   "vocab_size": 256000
 }
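
The only config change is `use_cache` flipping to `false`, which the `Trainer` commonly does when gradient checkpointing is enabled during training. It does not affect the weights and can be turned back on for inference; a hedged sketch:

```python
# Sketch: override the committed "use_cache": false at load time so
# generation reuses past key/values. The path is a placeholder.
from transformers import AutoConfig, AutoModelForCausalLM

path = "zephyr-2b-gemma-dft-debug"  # hypothetical local path / repo id
config = AutoConfig.from_pretrained(path)
config.use_cache = True             # re-enable the KV cache for inference
model = AutoModelForCausalLM.from_pretrained(path, config=config)
```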
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3cc245fefab97d950d4759fe13ab557dd93524fd58102bb040e57fbe13709f50
+oid sha256:302478d12c555b4bbed9c86317b38cad6aee12e98984b44a35510ea75c5df6d7
 size 4945242264
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4c9ca3299801e9176ca770f65c661ccc72a5b519bf063d407a2606c996c2efcd
+oid sha256:a5919bd0e78fe39414c69b98ae088404ebc6c7007c7c4d18b927edbe5d5c217f
 size 67121608
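
Both safetensors shards are stored through Git LFS, so the diff only shows pointer files: `oid sha256:` is the SHA-256 of the actual shard and `size` is its byte count. A small sketch for verifying a downloaded shard against the pointer (the file name and oid come from this commit; the path assumes the shard sits in the current directory):

```python
# Sketch: check a downloaded shard against the SHA-256 recorded in its
# Git LFS pointer file.
import hashlib

def file_sha256(path, chunk_size=1 << 20):
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

# oid from the new pointer for model-00001-of-00002.safetensors above
expected = "302478d12c555b4bbed9c86317b38cad6aee12e98984b44a35510ea75c5df6d7"
print(file_sha256("model-00001-of-00002.safetensors") == expected)
```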
runs/Aug02_11-52-50_ale-distillm-8-0-0/events.out.tfevents.1722595982.ale-distillm-8-0-0.9078.0 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:69e554244db879b5ab4de86d2113d3b1ae9168e943b825257008aff373713372
+size 7183
runs/Aug02_11-59-21_ale-distillm-8-0-0/events.out.tfevents.1722596374.ale-distillm-8-0-0.10851.0 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c059071fa523a9a278afcf726b9909363182d2461df2f6cd359627e03a57f072
+size 17239
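
The two added `events.out.tfevents.*` files are TensorBoard logs from the two runs started on Aug 02 (11:52 and 11:59). A hedged sketch for inspecting one of them once downloaded; the scalar tag name used below is an assumption, so list the available tags first:

```python
# Sketch: read the committed TensorBoard event file and print its scalars.
# "train/loss" is an assumed tag name, not taken from the commit.
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

acc = EventAccumulator("runs/Aug02_11-59-21_ale-distillm-8-0-0")
acc.Reload()
print(acc.Tags()["scalars"])            # discover what was actually logged
for event in acc.Scalars("train/loss"):  # assumed tag name
    print(event.step, event.value)
```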
train_results.json CHANGED
@@ -1,9 +1,9 @@
 {
     "epoch": 0.9981515711645101,
     "total_flos": 5.260333472022528e+16,
-    "train_loss": 7.3803352285314485,
-    "train_runtime": 2516.7229,
+    "train_loss": 7.369897298459653,
+    "train_runtime": 2195.5957,
     "train_samples": 6750,
-    "train_samples_per_second": 3.438,
-    "train_steps_per_second": 0.107
+    "train_samples_per_second": 3.941,
+    "train_steps_per_second": 0.123
 }
trainer_state.json CHANGED
@@ -10,398 +10,398 @@
   "log_history": [
     {
       "epoch": 0.018484288354898338,
-      "grad_norm": 422.0,
+      "grad_norm": 314.0,
       "learning_rate": 3.7037037037037037e-06,
-      "loss": 20.7086,
+      "loss": 20.7105,
       "step": 5
     },
     {
       "epoch": 0.036968576709796676,
-      "grad_norm": 185.0,
+      "grad_norm": 177.0,
       "learning_rate": 7.4074074074074075e-06,
-      "loss": 19.253,
+      "loss": 19.245,
       "step": 10
     },
     {
       "epoch": 0.05545286506469501,
-      "grad_norm": 147.0,
+      "grad_norm": 189.0,
       "learning_rate": 1.1111111111111113e-05,
-      "loss": 16.7138,
+      "loss": 16.7313,
       "step": 15
     },
     {
       "epoch": 0.07393715341959335,
-      "grad_norm": 62.75,
+      "grad_norm": 85.0,
       "learning_rate": 1.4814814814814815e-05,
-      "loss": 14.0999,
+      "loss": 14.1818,
       "step": 20
     },
     {
       "epoch": 0.09242144177449169,
-      "grad_norm": 46.25,
+      "grad_norm": 54.25,
       "learning_rate": 1.851851851851852e-05,
-      "loss": 11.9456,
+      "loss": 11.905,
       "step": 25
     },
     {
       "epoch": 0.11090573012939002,
-      "grad_norm": 49.75,
+      "grad_norm": 91.0,
       "learning_rate": 1.9992479525042305e-05,
-      "loss": 10.3044,
+      "loss": 10.2358,
       "step": 30
     },
     {
       "epoch": 0.12939001848428835,
-      "grad_norm": 88.5,
+      "grad_norm": 60.0,
       "learning_rate": 1.9946562024066018e-05,
-      "loss": 9.602,
+      "loss": 9.522,
       "step": 35
     },
     {
       "epoch": 0.1478743068391867,
-      "grad_norm": 49.25,
+      "grad_norm": 22.125,
       "learning_rate": 1.9859096633447965e-05,
-      "loss": 9.1612,
+      "loss": 9.0956,
       "step": 40
     },
     {
       "epoch": 0.16635859519408502,
-      "grad_norm": 114.0,
+      "grad_norm": 98.5,
       "learning_rate": 1.973044870579824e-05,
-      "loss": 8.5549,
+      "loss": 8.5045,
       "step": 45
     },
     {
       "epoch": 0.18484288354898337,
-      "grad_norm": 54.75,
+      "grad_norm": 22.5,
       "learning_rate": 1.95611556177388e-05,
-      "loss": 8.3168,
+      "loss": 8.2548,
       "step": 50
     },
     {
       "epoch": 0.2033271719038817,
-      "grad_norm": 16.875,
+      "grad_norm": 46.5,
       "learning_rate": 1.93519245252219e-05,
-      "loss": 7.8042,
+      "loss": 7.7123,
       "step": 55
     },
     {
       "epoch": 0.22181146025878004,
-      "grad_norm": 55.75,
+      "grad_norm": 34.75,
       "learning_rate": 1.9103629409661468e-05,
-      "loss": 7.4335,
+      "loss": 7.3372,
       "step": 60
     },
     {
       "epoch": 0.24029574861367836,
-      "grad_norm": 17.5,
+      "grad_norm": 45.75,
       "learning_rate": 1.881730742721608e-05,
-      "loss": 7.4047,
+      "loss": 7.2639,
       "step": 65
     },
     {
       "epoch": 0.2587800369685767,
-      "grad_norm": 52.5,
+      "grad_norm": 99.5,
       "learning_rate": 1.8494154576472976e-05,
-      "loss": 7.0682,
+      "loss": 6.9563,
       "step": 70
     },
     {
       "epoch": 0.27726432532347506,
-      "grad_norm": 69.5,
+      "grad_norm": 58.75,
       "learning_rate": 1.8135520702629677e-05,
-      "loss": 7.1519,
+      "loss": 6.8706,
       "step": 75
     },
     {
       "epoch": 0.2957486136783734,
-      "grad_norm": 23.125,
+      "grad_norm": 189.0,
       "learning_rate": 1.7742903859041324e-05,
-      "loss": 6.9284,
+      "loss": 6.8386,
       "step": 80
     },
     {
       "epoch": 0.3142329020332717,
-      "grad_norm": 17.25,
+      "grad_norm": 31.0,
       "learning_rate": 1.7317944049686125e-05,
-      "loss": 6.723,
+      "loss": 6.7403,
       "step": 85
     },
     {
       "epoch": 0.33271719038817005,
-      "grad_norm": 103.5,
+      "grad_norm": 55.25,
       "learning_rate": 1.686241637868734e-05,
-      "loss": 6.6468,
+      "loss": 6.554,
       "step": 90
     },
     {
       "epoch": 0.3512014787430684,
-      "grad_norm": 17.625,
+      "grad_norm": 58.5,
       "learning_rate": 1.637822363550706e-05,
-      "loss": 6.5427,
+      "loss": 6.5162,
       "step": 95
     },
     {
       "epoch": 0.36968576709796674,
-      "grad_norm": 15.6875,
+      "grad_norm": 18.875,
       "learning_rate": 1.586738834678418e-05,
-      "loss": 6.3795,
+      "loss": 6.3662,
       "step": 100
     },
     {
       "epoch": 0.38817005545286504,
-      "grad_norm": 17.875,
+      "grad_norm": 25.125,
       "learning_rate": 1.5332044328016916e-05,
-      "loss": 6.4565,
+      "loss": 6.4888,
       "step": 105
     },
     {
       "epoch": 0.4066543438077634,
-      "grad_norm": 68.0,
+      "grad_norm": 17.0,
       "learning_rate": 1.4774427770379492e-05,
-      "loss": 6.416,
+      "loss": 6.5204,
       "step": 110
     },
     {
       "epoch": 0.42513863216266173,
-      "grad_norm": 71.5,
+      "grad_norm": 125.5,
       "learning_rate": 1.4196867899904292e-05,
-      "loss": 6.2522,
+      "loss": 6.3338,
       "step": 115
     },
     {
       "epoch": 0.4436229205175601,
-      "grad_norm": 28.875,
+      "grad_norm": 17.5,
       "learning_rate": 1.3601777248047105e-05,
-      "loss": 6.0918,
+      "loss": 6.0951,
       "step": 120
     },
     {
       "epoch": 0.46210720887245843,
-      "grad_norm": 25.75,
+      "grad_norm": 10.0,
       "learning_rate": 1.2991641574276419e-05,
-      "loss": 6.0943,
+      "loss": 6.1339,
       "step": 125
     },
     {
       "epoch": 0.4805914972273567,
-      "grad_norm": 24.375,
+      "grad_norm": 24.0,
       "learning_rate": 1.2369009482781191e-05,
-      "loss": 6.0615,
+      "loss": 6.1337,
       "step": 130
     },
     {
       "epoch": 0.49907578558225507,
-      "grad_norm": 53.0,
+      "grad_norm": 40.75,
       "learning_rate": 1.1736481776669307e-05,
-      "loss": 5.9591,
+      "loss": 5.997,
       "step": 135
     },
     {
       "epoch": 0.5175600739371534,
-      "grad_norm": 93.5,
+      "grad_norm": 61.5,
       "learning_rate": 1.1096700594125318e-05,
-      "loss": 6.0178,
+      "loss": 6.0472,
       "step": 140
     },
     {
       "epoch": 0.5360443622920518,
-      "grad_norm": 14.0625,
+      "grad_norm": 18.125,
       "learning_rate": 1.0452338371907065e-05,
-      "loss": 5.8597,
+      "loss": 5.8697,
       "step": 145
     },
     {
       "epoch": 0.5545286506469501,
-      "grad_norm": 102.0,
+      "grad_norm": 85.0,
       "learning_rate": 9.806086682281759e-06,
-      "loss": 5.8942,
+      "loss": 5.9026,
       "step": 150
     },
     {
       "epoch": 0.5730129390018485,
-      "grad_norm": 30.125,
+      "grad_norm": 28.0,
       "learning_rate": 9.160644990030932e-06,
-      "loss": 5.9999,
+      "loss": 6.016,
       "step": 155
     },
     {
       "epoch": 0.5914972273567468,
-      "grad_norm": 23.25,
+      "grad_norm": 12.75,
       "learning_rate": 8.518709376487515e-06,
-      "loss": 5.9019,
+      "loss": 5.9233,
       "step": 160
     },
     {
       "epoch": 0.609981515711645,
-      "grad_norm": 56.0,
+      "grad_norm": 77.5,
       "learning_rate": 7.882961277705897e-06,
-      "loss": 5.8274,
+      "loss": 5.8483,
       "step": 165
     },
     {
       "epoch": 0.6284658040665434,
-      "grad_norm": 10.1875,
+      "grad_norm": 27.25,
       "learning_rate": 7.256056283806987e-06,
-      "loss": 5.9167,
+      "loss": 5.9308,
       "step": 170
     },
     {
       "epoch": 0.6469500924214417,
-      "grad_norm": 26.75,
+      "grad_norm": 58.0,
       "learning_rate": 6.640613046284581e-06,
-      "loss": 5.9097,
+      "loss": 5.9355,
       "step": 175
     },
     {
       "epoch": 0.6654343807763401,
-      "grad_norm": 11.5625,
+      "grad_norm": 31.125,
       "learning_rate": 6.039202339608432e-06,
-      "loss": 5.6327,
+      "loss": 5.6434,
       "step": 180
     },
     {
       "epoch": 0.6839186691312384,
-      "grad_norm": 26.75,
+      "grad_norm": 25.0,
       "learning_rate": 5.454336322814995e-06,
-      "loss": 5.6804,
+      "loss": 5.6888,
       "step": 185
     },
     {
       "epoch": 0.7024029574861368,
-      "grad_norm": 30.375,
+      "grad_norm": 34.75,
       "learning_rate": 4.888458045941269e-06,
-      "loss": 5.7316,
+      "loss": 5.7455,
       "step": 190
     },
     {
       "epoch": 0.7208872458410351,
-      "grad_norm": 39.5,
+      "grad_norm": 23.375,
       "learning_rate": 4.343931245134616e-06,
-      "loss": 5.7299,
+      "loss": 5.7362,
       "step": 195
     },
     {
       "epoch": 0.7393715341959335,
-      "grad_norm": 11.75,
+      "grad_norm": 12.8125,
       "learning_rate": 3.823030469065431e-06,
-      "loss": 5.6982,
+      "loss": 5.6849,
       "step": 200
     },
     {
       "epoch": 0.7578558225508318,
-      "grad_norm": 20.625,
+      "grad_norm": 24.875,
       "learning_rate": 3.3279315778858034e-06,
       "loss": 5.862,
       "step": 205
     },
     {
       "epoch": 0.7763401109057301,
-      "grad_norm": 50.75,
+      "grad_norm": 36.0,
       "learning_rate": 2.8607026544210115e-06,
-      "loss": 5.7507,
+      "loss": 5.7541,
       "step": 210
     },
     {
       "epoch": 0.7948243992606284,
-      "grad_norm": 49.75,
+      "grad_norm": 38.25,
       "learning_rate": 2.423295365558821e-06,
-      "loss": 5.7101,
+      "loss": 5.7035,
       "step": 215
     },
     {
       "epoch": 0.8133086876155268,
-      "grad_norm": 14.4375,
+      "grad_norm": 19.5,
       "learning_rate": 2.01753680992107e-06,
-      "loss": 5.8428,
+      "loss": 5.8542,
       "step": 220
     },
     {
       "epoch": 0.8317929759704251,
-      "grad_norm": 34.75,
+      "grad_norm": 33.25,
       "learning_rate": 1.6451218858706374e-06,
-      "loss": 5.7162,
+      "loss": 5.7262,
       "step": 225
     },
     {
       "epoch": 0.8502772643253235,
       "grad_norm": 47.0,
       "learning_rate": 1.307606211733522e-06,
-      "loss": 5.6621,
+      "loss": 5.6701,
       "step": 230
     },
     {
       "epoch": 0.8687615526802218,
-      "grad_norm": 12.875,
+      "grad_norm": 15.3125,
       "learning_rate": 1.0063996278090704e-06,
-      "loss": 5.6616,
+      "loss": 5.6769,
       "step": 235
     },
     {
       "epoch": 0.8872458410351202,
-      "grad_norm": 13.1875,
+      "grad_norm": 16.0,
       "learning_rate": 7.427603073110967e-07,
-      "loss": 5.8559,
+      "loss": 5.868,
       "step": 240
     },
     {
       "epoch": 0.9057301293900185,
-      "grad_norm": 7.96875,
+      "grad_norm": 7.71875,
       "learning_rate": 5.177895008392353e-07,
-      "loss": 5.7149,
+      "loss": 5.7182,
       "step": 245
     },
     {
       "epoch": 0.9242144177449169,
-      "grad_norm": 61.75,
+      "grad_norm": 62.5,
       "learning_rate": 3.3242693633337986e-07,
-      "loss": 5.9011,
+      "loss": 5.9037,
       "step": 250
     },
     {
       "epoch": 0.9426987060998152,
-      "grad_norm": 13.625,
+      "grad_norm": 12.875,
       "learning_rate": 1.874468937261531e-07,
-      "loss": 5.685,
+      "loss": 5.6952,
       "step": 255
     },
     {
       "epoch": 0.9611829944547134,
-      "grad_norm": 15.6875,
+      "grad_norm": 16.875,
       "learning_rate": 8.345497068998897e-08,
-      "loss": 5.7765,
+      "loss": 5.7802,
       "step": 260
     },
     {
       "epoch": 0.9796672828096118,
-      "grad_norm": 18.5,
+      "grad_norm": 13.75,
       "learning_rate": 2.088555298867978e-08,
-      "loss": 5.734,
+      "loss": 5.7397,
       "step": 265
     },
     {
       "epoch": 0.9981515711645101,
-      "grad_norm": 28.25,
+      "grad_norm": 26.625,
       "learning_rate": 0.0,
-      "loss": 5.7909,
+      "loss": 5.8055,
       "step": 270
     },
     {
       "epoch": 0.9981515711645101,
-      "eval_loss": 5.75302791595459,
-      "eval_runtime": 376.2306,
-      "eval_samples_per_second": 2.581,
-      "eval_steps_per_second": 0.324,
+      "eval_loss": 5.764816761016846,
+      "eval_runtime": 319.0279,
+      "eval_samples_per_second": 3.044,
+      "eval_steps_per_second": 0.382,
       "step": 270
     },
     {
       "epoch": 0.9981515711645101,
       "step": 270,
       "total_flos": 5.260333472022528e+16,
-      "train_loss": 7.3803352285314485,
-      "train_runtime": 2516.7229,
-      "train_samples_per_second": 3.438,
-      "train_steps_per_second": 0.107
+      "train_loss": 7.369897298459653,
+      "train_runtime": 2195.5957,
+      "train_samples_per_second": 3.941,
+      "train_steps_per_second": 0.123
     }
   ],
   "logging_steps": 5,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:aa1e4059799b3fd2bc19d9b9bdbb3a0911a6dc0077207bdb4e02f055fcee2618
-size 5368
+oid sha256:94f28c170a957c643092ce3557ce1fd1db0ab541b074c9aab5a4e5efa0f58622
+size 5496