ray committed on
Commit
a51abff
1 Parent(s): 83679c8

[init] bloom model 4k

Browse files
config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "ckip-joint/bloom-3b-zh",
3
+ "apply_residual_connection_post_layernorm": false,
4
+ "architectures": [
5
+ "BloomForCausalLM"
6
+ ],
7
+ "attention_dropout": 0.0,
8
+ "attention_softmax_in_fp32": true,
9
+ "bias_dropout_fusion": true,
10
+ "bos_token_id": 1,
11
+ "eos_token_id": 2,
12
+ "hidden_dropout": 0.0,
13
+ "hidden_size": 2560,
14
+ "initializer_range": 0.02,
15
+ "layer_norm_epsilon": 1e-05,
16
+ "masked_softmax_fusion": true,
17
+ "model_type": "bloom",
18
+ "n_head": 32,
19
+ "n_inner": null,
20
+ "n_layer": 30,
21
+ "offset_alibi": 100,
22
+ "pad_token_id": 3,
23
+ "pretraining_tp": 4,
24
+ "seq_length": 2048,
25
+ "skip_bias_add": true,
26
+ "skip_bias_add_qkv": false,
27
+ "slow_but_exact": false,
28
+ "torch_dtype": "float16",
29
+ "transformers_version": "4.28.0.dev0",
30
+ "unk_token_id": 0,
31
+ "use_cache": true,
32
+ "vocab_size": 250688
33
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 3,
6
+ "transformers_version": "4.28.0.dev0"
7
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:588cf83c73c21566511f0f4ff584abf3e3075a863d97d88b5defac51ca1b42d7
3
+ size 6004251933
special_tokens_map.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|system|>",
4
+ "<|prompter|>",
5
+ "<|prefix_end|>",
6
+ "<|prefix_begin|>",
7
+ "<|assistant|>"
8
+ ],
9
+ "bos_token": "<s>",
10
+ "eos_token": "</s>",
11
+ "pad_token": "<pad>",
12
+ "sep_token": "<s>",
13
+ "unk_token": "<unk>"
14
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": "<s>",
4
+ "clean_up_tokenization_spaces": false,
5
+ "eos_token": "</s>",
6
+ "model_max_length": 1000000000000000019884624838656,
7
+ "pad_token": "<pad>",
8
+ "special_tokens_map_file": null,
9
+ "tokenizer_class": "BloomTokenizer",
10
+ "unk_token": "<unk>"
11
+ }
trainer_state.json ADDED
@@ -0,0 +1,1468 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.3639164690398064,
5
+ "global_step": 2000,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.0,
12
+ "learning_rate": 1.6e-08,
13
+ "loss": 2.8922,
14
+ "step": 10
15
+ },
16
+ {
17
+ "epoch": 0.0,
18
+ "learning_rate": 5.6e-08,
19
+ "loss": 2.6572,
20
+ "step": 20
21
+ },
22
+ {
23
+ "epoch": 0.01,
24
+ "learning_rate": 9.199999999999999e-08,
25
+ "loss": 2.6816,
26
+ "step": 30
27
+ },
28
+ {
29
+ "epoch": 0.01,
30
+ "learning_rate": 1.32e-07,
31
+ "loss": 2.6259,
32
+ "step": 40
33
+ },
34
+ {
35
+ "epoch": 0.01,
36
+ "learning_rate": 1.7199999999999998e-07,
37
+ "loss": 2.4734,
38
+ "step": 50
39
+ },
40
+ {
41
+ "epoch": 0.01,
42
+ "learning_rate": 2.12e-07,
43
+ "loss": 2.4194,
44
+ "step": 60
45
+ },
46
+ {
47
+ "epoch": 0.01,
48
+ "learning_rate": 2.52e-07,
49
+ "loss": 2.3645,
50
+ "step": 70
51
+ },
52
+ {
53
+ "epoch": 0.01,
54
+ "learning_rate": 2.9199999999999997e-07,
55
+ "loss": 2.287,
56
+ "step": 80
57
+ },
58
+ {
59
+ "epoch": 0.02,
60
+ "learning_rate": 3.32e-07,
61
+ "loss": 2.2451,
62
+ "step": 90
63
+ },
64
+ {
65
+ "epoch": 0.02,
66
+ "learning_rate": 3.72e-07,
67
+ "loss": 2.1339,
68
+ "step": 100
69
+ },
70
+ {
71
+ "epoch": 0.02,
72
+ "learning_rate": 4.12e-07,
73
+ "loss": 2.0578,
74
+ "step": 110
75
+ },
76
+ {
77
+ "epoch": 0.02,
78
+ "learning_rate": 4.5199999999999997e-07,
79
+ "loss": 2.0854,
80
+ "step": 120
81
+ },
82
+ {
83
+ "epoch": 0.02,
84
+ "learning_rate": 4.92e-07,
85
+ "loss": 1.9981,
86
+ "step": 130
87
+ },
88
+ {
89
+ "epoch": 0.03,
90
+ "learning_rate": 5.32e-07,
91
+ "loss": 1.927,
92
+ "step": 140
93
+ },
94
+ {
95
+ "epoch": 0.03,
96
+ "learning_rate": 5.719999999999999e-07,
97
+ "loss": 1.8661,
98
+ "step": 150
99
+ },
100
+ {
101
+ "epoch": 0.03,
102
+ "learning_rate": 6.119999999999999e-07,
103
+ "loss": 1.8546,
104
+ "step": 160
105
+ },
106
+ {
107
+ "epoch": 0.03,
108
+ "learning_rate": 6.52e-07,
109
+ "loss": 1.7949,
110
+ "step": 170
111
+ },
112
+ {
113
+ "epoch": 0.03,
114
+ "learning_rate": 6.919999999999999e-07,
115
+ "loss": 1.7926,
116
+ "step": 180
117
+ },
118
+ {
119
+ "epoch": 0.03,
120
+ "learning_rate": 7.319999999999999e-07,
121
+ "loss": 1.7737,
122
+ "step": 190
123
+ },
124
+ {
125
+ "epoch": 0.04,
126
+ "learning_rate": 7.72e-07,
127
+ "loss": 1.7178,
128
+ "step": 200
129
+ },
130
+ {
131
+ "epoch": 0.04,
132
+ "learning_rate": 8.12e-07,
133
+ "loss": 1.6872,
134
+ "step": 210
135
+ },
136
+ {
137
+ "epoch": 0.04,
138
+ "learning_rate": 8.52e-07,
139
+ "loss": 1.6504,
140
+ "step": 220
141
+ },
142
+ {
143
+ "epoch": 0.04,
144
+ "learning_rate": 8.92e-07,
145
+ "loss": 1.6955,
146
+ "step": 230
147
+ },
148
+ {
149
+ "epoch": 0.04,
150
+ "learning_rate": 9.32e-07,
151
+ "loss": 1.6822,
152
+ "step": 240
153
+ },
154
+ {
155
+ "epoch": 0.05,
156
+ "learning_rate": 9.72e-07,
157
+ "loss": 1.647,
158
+ "step": 250
159
+ },
160
+ {
161
+ "epoch": 0.05,
162
+ "learning_rate": 1.012e-06,
163
+ "loss": 1.6131,
164
+ "step": 260
165
+ },
166
+ {
167
+ "epoch": 0.05,
168
+ "learning_rate": 1.052e-06,
169
+ "loss": 1.6433,
170
+ "step": 270
171
+ },
172
+ {
173
+ "epoch": 0.05,
174
+ "learning_rate": 1.092e-06,
175
+ "loss": 1.596,
176
+ "step": 280
177
+ },
178
+ {
179
+ "epoch": 0.05,
180
+ "learning_rate": 1.132e-06,
181
+ "loss": 1.5824,
182
+ "step": 290
183
+ },
184
+ {
185
+ "epoch": 0.05,
186
+ "learning_rate": 1.172e-06,
187
+ "loss": 1.5606,
188
+ "step": 300
189
+ },
190
+ {
191
+ "epoch": 0.06,
192
+ "learning_rate": 1.212e-06,
193
+ "loss": 1.5886,
194
+ "step": 310
195
+ },
196
+ {
197
+ "epoch": 0.06,
198
+ "learning_rate": 1.252e-06,
199
+ "loss": 1.5905,
200
+ "step": 320
201
+ },
202
+ {
203
+ "epoch": 0.06,
204
+ "learning_rate": 1.292e-06,
205
+ "loss": 1.5493,
206
+ "step": 330
207
+ },
208
+ {
209
+ "epoch": 0.06,
210
+ "learning_rate": 1.332e-06,
211
+ "loss": 1.5734,
212
+ "step": 340
213
+ },
214
+ {
215
+ "epoch": 0.06,
216
+ "learning_rate": 1.372e-06,
217
+ "loss": 1.5435,
218
+ "step": 350
219
+ },
220
+ {
221
+ "epoch": 0.07,
222
+ "learning_rate": 1.4119999999999998e-06,
223
+ "loss": 1.5359,
224
+ "step": 360
225
+ },
226
+ {
227
+ "epoch": 0.07,
228
+ "learning_rate": 1.4519999999999998e-06,
229
+ "loss": 1.5579,
230
+ "step": 370
231
+ },
232
+ {
233
+ "epoch": 0.07,
234
+ "learning_rate": 1.4919999999999999e-06,
235
+ "loss": 1.5358,
236
+ "step": 380
237
+ },
238
+ {
239
+ "epoch": 0.07,
240
+ "learning_rate": 1.532e-06,
241
+ "loss": 1.5509,
242
+ "step": 390
243
+ },
244
+ {
245
+ "epoch": 0.07,
246
+ "learning_rate": 1.572e-06,
247
+ "loss": 1.5116,
248
+ "step": 400
249
+ },
250
+ {
251
+ "epoch": 0.07,
252
+ "learning_rate": 1.612e-06,
253
+ "loss": 1.5251,
254
+ "step": 410
255
+ },
256
+ {
257
+ "epoch": 0.08,
258
+ "learning_rate": 1.6519999999999998e-06,
259
+ "loss": 1.5107,
260
+ "step": 420
261
+ },
262
+ {
263
+ "epoch": 0.08,
264
+ "learning_rate": 1.6919999999999999e-06,
265
+ "loss": 1.4805,
266
+ "step": 430
267
+ },
268
+ {
269
+ "epoch": 0.08,
270
+ "learning_rate": 1.7319999999999999e-06,
271
+ "loss": 1.4819,
272
+ "step": 440
273
+ },
274
+ {
275
+ "epoch": 0.08,
276
+ "learning_rate": 1.772e-06,
277
+ "loss": 1.4897,
278
+ "step": 450
279
+ },
280
+ {
281
+ "epoch": 0.08,
282
+ "learning_rate": 1.812e-06,
283
+ "loss": 1.4824,
284
+ "step": 460
285
+ },
286
+ {
287
+ "epoch": 0.09,
288
+ "learning_rate": 1.852e-06,
289
+ "loss": 1.4822,
290
+ "step": 470
291
+ },
292
+ {
293
+ "epoch": 0.09,
294
+ "learning_rate": 1.8919999999999998e-06,
295
+ "loss": 1.5,
296
+ "step": 480
297
+ },
298
+ {
299
+ "epoch": 0.09,
300
+ "learning_rate": 1.932e-06,
301
+ "loss": 1.4595,
302
+ "step": 490
303
+ },
304
+ {
305
+ "epoch": 0.09,
306
+ "learning_rate": 1.972e-06,
307
+ "loss": 1.4163,
308
+ "step": 500
309
+ },
310
+ {
311
+ "epoch": 0.09,
312
+ "eval_wmt2019_zh-en_accuracy": 0.5762586867111048,
313
+ "eval_wmt2019_zh-en_loss": 2.037109375,
314
+ "eval_wmt2019_zh-en_runtime": 77.5891,
315
+ "eval_wmt2019_zh-en_samples_per_second": 12.888,
316
+ "eval_wmt2019_zh-en_steps_per_second": 3.222,
317
+ "step": 500
318
+ },
319
+ {
320
+ "epoch": 0.09,
321
+ "eval_ted_trans_en-ja_accuracy": 0.49833809864188705,
322
+ "eval_ted_trans_en-ja_loss": 2.359375,
323
+ "eval_ted_trans_en-ja_runtime": 67.0162,
324
+ "eval_ted_trans_en-ja_samples_per_second": 11.952,
325
+ "eval_ted_trans_en-ja_steps_per_second": 2.999,
326
+ "step": 500
327
+ },
328
+ {
329
+ "epoch": 0.09,
330
+ "eval_ted_trans_zh-ja_accuracy": 0.41924741924741926,
331
+ "eval_ted_trans_zh-ja_loss": 3.099609375,
332
+ "eval_ted_trans_zh-ja_runtime": 4.122,
333
+ "eval_ted_trans_zh-ja_samples_per_second": 10.189,
334
+ "eval_ted_trans_zh-ja_steps_per_second": 2.669,
335
+ "step": 500
336
+ },
337
+ {
338
+ "epoch": 0.09,
339
+ "eval_sharegpt_accuracy": 0.6638255086604158,
340
+ "eval_sharegpt_loss": 1.4677734375,
341
+ "eval_sharegpt_runtime": 754.4011,
342
+ "eval_sharegpt_samples_per_second": 4.438,
343
+ "eval_sharegpt_steps_per_second": 1.109,
344
+ "step": 500
345
+ },
346
+ {
347
+ "epoch": 0.09,
348
+ "eval_dolly15k_accuracy": 0.5514953610117659,
349
+ "eval_dolly15k_loss": 1.9169921875,
350
+ "eval_dolly15k_runtime": 63.6309,
351
+ "eval_dolly15k_samples_per_second": 11.802,
352
+ "eval_dolly15k_steps_per_second": 2.955,
353
+ "step": 500
354
+ },
355
+ {
356
+ "epoch": 0.09,
357
+ "eval_ikala_accuracy": 0.6825314578991759,
358
+ "eval_ikala_loss": 1.2626953125,
359
+ "eval_ikala_runtime": 1328.8914,
360
+ "eval_ikala_samples_per_second": 10.328,
361
+ "eval_ikala_steps_per_second": 2.583,
362
+ "step": 500
363
+ },
364
+ {
365
+ "epoch": 0.09,
366
+ "eval_oasst_export_accuracy": 0.5725428280687767,
367
+ "eval_oasst_export_loss": 2.181640625,
368
+ "eval_oasst_export_runtime": 206.1134,
369
+ "eval_oasst_export_samples_per_second": 10.184,
370
+ "eval_oasst_export_steps_per_second": 2.547,
371
+ "step": 500
372
+ },
373
+ {
374
+ "epoch": 0.09,
375
+ "learning_rate": 2.012e-06,
376
+ "loss": 1.4116,
377
+ "step": 510
378
+ },
379
+ {
380
+ "epoch": 0.09,
381
+ "learning_rate": 2.052e-06,
382
+ "loss": 1.4304,
383
+ "step": 520
384
+ },
385
+ {
386
+ "epoch": 0.1,
387
+ "learning_rate": 2.092e-06,
388
+ "loss": 1.4102,
389
+ "step": 530
390
+ },
391
+ {
392
+ "epoch": 0.1,
393
+ "learning_rate": 2.132e-06,
394
+ "loss": 1.3838,
395
+ "step": 540
396
+ },
397
+ {
398
+ "epoch": 0.1,
399
+ "learning_rate": 2.172e-06,
400
+ "loss": 1.4179,
401
+ "step": 550
402
+ },
403
+ {
404
+ "epoch": 0.1,
405
+ "learning_rate": 2.212e-06,
406
+ "loss": 1.4286,
407
+ "step": 560
408
+ },
409
+ {
410
+ "epoch": 0.1,
411
+ "learning_rate": 2.2519999999999998e-06,
412
+ "loss": 1.426,
413
+ "step": 570
414
+ },
415
+ {
416
+ "epoch": 0.11,
417
+ "learning_rate": 2.292e-06,
418
+ "loss": 1.4257,
419
+ "step": 580
420
+ },
421
+ {
422
+ "epoch": 0.11,
423
+ "learning_rate": 2.332e-06,
424
+ "loss": 1.4342,
425
+ "step": 590
426
+ },
427
+ {
428
+ "epoch": 0.11,
429
+ "learning_rate": 2.372e-06,
430
+ "loss": 1.4015,
431
+ "step": 600
432
+ },
433
+ {
434
+ "epoch": 0.11,
435
+ "learning_rate": 2.412e-06,
436
+ "loss": 1.3999,
437
+ "step": 610
438
+ },
439
+ {
440
+ "epoch": 0.11,
441
+ "learning_rate": 2.452e-06,
442
+ "loss": 1.4062,
443
+ "step": 620
444
+ },
445
+ {
446
+ "epoch": 0.11,
447
+ "learning_rate": 2.492e-06,
448
+ "loss": 1.427,
449
+ "step": 630
450
+ },
451
+ {
452
+ "epoch": 0.12,
453
+ "learning_rate": 2.532e-06,
454
+ "loss": 1.4171,
455
+ "step": 640
456
+ },
457
+ {
458
+ "epoch": 0.12,
459
+ "learning_rate": 2.572e-06,
460
+ "loss": 1.4221,
461
+ "step": 650
462
+ },
463
+ {
464
+ "epoch": 0.12,
465
+ "learning_rate": 2.612e-06,
466
+ "loss": 1.3547,
467
+ "step": 660
468
+ },
469
+ {
470
+ "epoch": 0.12,
471
+ "learning_rate": 2.652e-06,
472
+ "loss": 1.3783,
473
+ "step": 670
474
+ },
475
+ {
476
+ "epoch": 0.12,
477
+ "learning_rate": 2.6920000000000002e-06,
478
+ "loss": 1.3574,
479
+ "step": 680
480
+ },
481
+ {
482
+ "epoch": 0.13,
483
+ "learning_rate": 2.7320000000000003e-06,
484
+ "loss": 1.3604,
485
+ "step": 690
486
+ },
487
+ {
488
+ "epoch": 0.13,
489
+ "learning_rate": 2.7719999999999995e-06,
490
+ "loss": 1.3978,
491
+ "step": 700
492
+ },
493
+ {
494
+ "epoch": 0.13,
495
+ "learning_rate": 2.8119999999999995e-06,
496
+ "loss": 1.3879,
497
+ "step": 710
498
+ },
499
+ {
500
+ "epoch": 0.13,
501
+ "learning_rate": 2.8519999999999995e-06,
502
+ "loss": 1.3731,
503
+ "step": 720
504
+ },
505
+ {
506
+ "epoch": 0.13,
507
+ "learning_rate": 2.8919999999999996e-06,
508
+ "loss": 1.3732,
509
+ "step": 730
510
+ },
511
+ {
512
+ "epoch": 0.13,
513
+ "learning_rate": 2.9319999999999996e-06,
514
+ "loss": 1.3787,
515
+ "step": 740
516
+ },
517
+ {
518
+ "epoch": 0.14,
519
+ "learning_rate": 2.9719999999999997e-06,
520
+ "loss": 1.3842,
521
+ "step": 750
522
+ },
523
+ {
524
+ "epoch": 0.14,
525
+ "learning_rate": 3.0119999999999997e-06,
526
+ "loss": 1.3952,
527
+ "step": 760
528
+ },
529
+ {
530
+ "epoch": 0.14,
531
+ "learning_rate": 3.0519999999999997e-06,
532
+ "loss": 1.3117,
533
+ "step": 770
534
+ },
535
+ {
536
+ "epoch": 0.14,
537
+ "learning_rate": 3.0919999999999998e-06,
538
+ "loss": 1.3531,
539
+ "step": 780
540
+ },
541
+ {
542
+ "epoch": 0.14,
543
+ "learning_rate": 3.132e-06,
544
+ "loss": 1.3287,
545
+ "step": 790
546
+ },
547
+ {
548
+ "epoch": 0.15,
549
+ "learning_rate": 3.172e-06,
550
+ "loss": 1.3417,
551
+ "step": 800
552
+ },
553
+ {
554
+ "epoch": 0.15,
555
+ "learning_rate": 3.212e-06,
556
+ "loss": 1.3341,
557
+ "step": 810
558
+ },
559
+ {
560
+ "epoch": 0.15,
561
+ "learning_rate": 3.2519999999999995e-06,
562
+ "loss": 1.364,
563
+ "step": 820
564
+ },
565
+ {
566
+ "epoch": 0.15,
567
+ "learning_rate": 3.2919999999999996e-06,
568
+ "loss": 1.3201,
569
+ "step": 830
570
+ },
571
+ {
572
+ "epoch": 0.15,
573
+ "learning_rate": 3.3319999999999996e-06,
574
+ "loss": 1.3714,
575
+ "step": 840
576
+ },
577
+ {
578
+ "epoch": 0.15,
579
+ "learning_rate": 3.3719999999999996e-06,
580
+ "loss": 1.3098,
581
+ "step": 850
582
+ },
583
+ {
584
+ "epoch": 0.16,
585
+ "learning_rate": 3.4119999999999997e-06,
586
+ "loss": 1.3222,
587
+ "step": 860
588
+ },
589
+ {
590
+ "epoch": 0.16,
591
+ "learning_rate": 3.4519999999999997e-06,
592
+ "loss": 1.3451,
593
+ "step": 870
594
+ },
595
+ {
596
+ "epoch": 0.16,
597
+ "learning_rate": 3.4919999999999998e-06,
598
+ "loss": 1.3502,
599
+ "step": 880
600
+ },
601
+ {
602
+ "epoch": 0.16,
603
+ "learning_rate": 3.532e-06,
604
+ "loss": 1.3445,
605
+ "step": 890
606
+ },
607
+ {
608
+ "epoch": 0.16,
609
+ "learning_rate": 3.572e-06,
610
+ "loss": 1.3304,
611
+ "step": 900
612
+ },
613
+ {
614
+ "epoch": 0.17,
615
+ "learning_rate": 3.612e-06,
616
+ "loss": 1.3081,
617
+ "step": 910
618
+ },
619
+ {
620
+ "epoch": 0.17,
621
+ "learning_rate": 3.652e-06,
622
+ "loss": 1.3106,
623
+ "step": 920
624
+ },
625
+ {
626
+ "epoch": 0.17,
627
+ "learning_rate": 3.692e-06,
628
+ "loss": 1.3357,
629
+ "step": 930
630
+ },
631
+ {
632
+ "epoch": 0.17,
633
+ "learning_rate": 3.732e-06,
634
+ "loss": 1.3243,
635
+ "step": 940
636
+ },
637
+ {
638
+ "epoch": 0.17,
639
+ "learning_rate": 3.7719999999999996e-06,
640
+ "loss": 1.3164,
641
+ "step": 950
642
+ },
643
+ {
644
+ "epoch": 0.17,
645
+ "learning_rate": 3.8119999999999997e-06,
646
+ "loss": 1.3124,
647
+ "step": 960
648
+ },
649
+ {
650
+ "epoch": 0.18,
651
+ "learning_rate": 3.852e-06,
652
+ "loss": 1.3162,
653
+ "step": 970
654
+ },
655
+ {
656
+ "epoch": 0.18,
657
+ "learning_rate": 3.891999999999999e-06,
658
+ "loss": 1.3124,
659
+ "step": 980
660
+ },
661
+ {
662
+ "epoch": 0.18,
663
+ "learning_rate": 3.932e-06,
664
+ "loss": 1.3347,
665
+ "step": 990
666
+ },
667
+ {
668
+ "epoch": 0.18,
669
+ "learning_rate": 3.971999999999999e-06,
670
+ "loss": 1.3174,
671
+ "step": 1000
672
+ },
673
+ {
674
+ "epoch": 0.18,
675
+ "eval_wmt2019_zh-en_accuracy": 0.5819691791089524,
676
+ "eval_wmt2019_zh-en_loss": 1.9326171875,
677
+ "eval_wmt2019_zh-en_runtime": 76.631,
678
+ "eval_wmt2019_zh-en_samples_per_second": 13.05,
679
+ "eval_wmt2019_zh-en_steps_per_second": 3.262,
680
+ "step": 1000
681
+ },
682
+ {
683
+ "epoch": 0.18,
684
+ "eval_ted_trans_en-ja_accuracy": 0.5248655214298109,
685
+ "eval_ted_trans_en-ja_loss": 2.15625,
686
+ "eval_ted_trans_en-ja_runtime": 67.1457,
687
+ "eval_ted_trans_en-ja_samples_per_second": 11.929,
688
+ "eval_ted_trans_en-ja_steps_per_second": 2.993,
689
+ "step": 1000
690
+ },
691
+ {
692
+ "epoch": 0.18,
693
+ "eval_ted_trans_zh-ja_accuracy": 0.46262002743484226,
694
+ "eval_ted_trans_zh-ja_loss": 2.8671875,
695
+ "eval_ted_trans_zh-ja_runtime": 3.9271,
696
+ "eval_ted_trans_zh-ja_samples_per_second": 10.695,
697
+ "eval_ted_trans_zh-ja_steps_per_second": 2.801,
698
+ "step": 1000
699
+ },
700
+ {
701
+ "epoch": 0.18,
702
+ "eval_sharegpt_accuracy": 0.6835767342429696,
703
+ "eval_sharegpt_loss": 1.326171875,
704
+ "eval_sharegpt_runtime": 751.6419,
705
+ "eval_sharegpt_samples_per_second": 4.454,
706
+ "eval_sharegpt_steps_per_second": 1.114,
707
+ "step": 1000
708
+ },
709
+ {
710
+ "epoch": 0.18,
711
+ "eval_dolly15k_accuracy": 0.5566094783945817,
712
+ "eval_dolly15k_loss": 1.84765625,
713
+ "eval_dolly15k_runtime": 63.1655,
714
+ "eval_dolly15k_samples_per_second": 11.889,
715
+ "eval_dolly15k_steps_per_second": 2.976,
716
+ "step": 1000
717
+ },
718
+ {
719
+ "epoch": 0.18,
720
+ "eval_ikala_accuracy": 0.6979014199391581,
721
+ "eval_ikala_loss": 1.1513671875,
722
+ "eval_ikala_runtime": 1328.524,
723
+ "eval_ikala_samples_per_second": 10.331,
724
+ "eval_ikala_steps_per_second": 2.583,
725
+ "step": 1000
726
+ },
727
+ {
728
+ "epoch": 0.18,
729
+ "eval_oasst_export_accuracy": 0.5836786504830943,
730
+ "eval_oasst_export_loss": 2.068359375,
731
+ "eval_oasst_export_runtime": 206.6429,
732
+ "eval_oasst_export_samples_per_second": 10.158,
733
+ "eval_oasst_export_steps_per_second": 2.541,
734
+ "step": 1000
735
+ },
736
+ {
737
+ "epoch": 0.18,
738
+ "learning_rate": 4.011999999999999e-06,
739
+ "loss": 1.2867,
740
+ "step": 1010
741
+ },
742
+ {
743
+ "epoch": 0.19,
744
+ "learning_rate": 4.0519999999999995e-06,
745
+ "loss": 1.2933,
746
+ "step": 1020
747
+ },
748
+ {
749
+ "epoch": 0.19,
750
+ "learning_rate": 4.091999999999999e-06,
751
+ "loss": 1.2778,
752
+ "step": 1030
753
+ },
754
+ {
755
+ "epoch": 0.19,
756
+ "learning_rate": 4.1319999999999996e-06,
757
+ "loss": 1.3085,
758
+ "step": 1040
759
+ },
760
+ {
761
+ "epoch": 0.19,
762
+ "learning_rate": 4.171999999999999e-06,
763
+ "loss": 1.2772,
764
+ "step": 1050
765
+ },
766
+ {
767
+ "epoch": 0.19,
768
+ "learning_rate": 4.212e-06,
769
+ "loss": 1.3461,
770
+ "step": 1060
771
+ },
772
+ {
773
+ "epoch": 0.19,
774
+ "learning_rate": 4.251999999999999e-06,
775
+ "loss": 1.3247,
776
+ "step": 1070
777
+ },
778
+ {
779
+ "epoch": 0.2,
780
+ "learning_rate": 4.292e-06,
781
+ "loss": 1.2988,
782
+ "step": 1080
783
+ },
784
+ {
785
+ "epoch": 0.2,
786
+ "learning_rate": 4.331999999999999e-06,
787
+ "loss": 1.3175,
788
+ "step": 1090
789
+ },
790
+ {
791
+ "epoch": 0.2,
792
+ "learning_rate": 4.372e-06,
793
+ "loss": 1.3061,
794
+ "step": 1100
795
+ },
796
+ {
797
+ "epoch": 0.2,
798
+ "learning_rate": 4.4119999999999994e-06,
799
+ "loss": 1.3001,
800
+ "step": 1110
801
+ },
802
+ {
803
+ "epoch": 0.2,
804
+ "learning_rate": 4.452e-06,
805
+ "loss": 1.3325,
806
+ "step": 1120
807
+ },
808
+ {
809
+ "epoch": 0.21,
810
+ "learning_rate": 4.4919999999999995e-06,
811
+ "loss": 1.3177,
812
+ "step": 1130
813
+ },
814
+ {
815
+ "epoch": 0.21,
816
+ "learning_rate": 4.532e-06,
817
+ "loss": 1.3128,
818
+ "step": 1140
819
+ },
820
+ {
821
+ "epoch": 0.21,
822
+ "learning_rate": 4.572e-06,
823
+ "loss": 1.302,
824
+ "step": 1150
825
+ },
826
+ {
827
+ "epoch": 0.21,
828
+ "learning_rate": 4.612e-06,
829
+ "loss": 1.3233,
830
+ "step": 1160
831
+ },
832
+ {
833
+ "epoch": 0.21,
834
+ "learning_rate": 4.652e-06,
835
+ "loss": 1.3075,
836
+ "step": 1170
837
+ },
838
+ {
839
+ "epoch": 0.21,
840
+ "learning_rate": 4.692e-06,
841
+ "loss": 1.3044,
842
+ "step": 1180
843
+ },
844
+ {
845
+ "epoch": 0.22,
846
+ "learning_rate": 4.732e-06,
847
+ "loss": 1.2686,
848
+ "step": 1190
849
+ },
850
+ {
851
+ "epoch": 0.22,
852
+ "learning_rate": 4.772e-06,
853
+ "loss": 1.3169,
854
+ "step": 1200
855
+ },
856
+ {
857
+ "epoch": 0.22,
858
+ "learning_rate": 4.812e-06,
859
+ "loss": 1.3075,
860
+ "step": 1210
861
+ },
862
+ {
863
+ "epoch": 0.22,
864
+ "learning_rate": 4.852e-06,
865
+ "loss": 1.2911,
866
+ "step": 1220
867
+ },
868
+ {
869
+ "epoch": 0.22,
870
+ "learning_rate": 4.892e-06,
871
+ "loss": 1.289,
872
+ "step": 1230
873
+ },
874
+ {
875
+ "epoch": 0.23,
876
+ "learning_rate": 4.932e-06,
877
+ "loss": 1.2944,
878
+ "step": 1240
879
+ },
880
+ {
881
+ "epoch": 0.23,
882
+ "learning_rate": 4.972e-06,
883
+ "loss": 1.2753,
884
+ "step": 1250
885
+ },
886
+ {
887
+ "epoch": 0.23,
888
+ "learning_rate": 5.012e-06,
889
+ "loss": 1.2949,
890
+ "step": 1260
891
+ },
892
+ {
893
+ "epoch": 0.23,
894
+ "learning_rate": 5.051999999999999e-06,
895
+ "loss": 1.2816,
896
+ "step": 1270
897
+ },
898
+ {
899
+ "epoch": 0.23,
900
+ "learning_rate": 5.092e-06,
901
+ "loss": 1.3104,
902
+ "step": 1280
903
+ },
904
+ {
905
+ "epoch": 0.23,
906
+ "learning_rate": 5.131999999999999e-06,
907
+ "loss": 1.274,
908
+ "step": 1290
909
+ },
910
+ {
911
+ "epoch": 0.24,
912
+ "learning_rate": 5.172e-06,
913
+ "loss": 1.296,
914
+ "step": 1300
915
+ },
916
+ {
917
+ "epoch": 0.24,
918
+ "learning_rate": 5.211999999999999e-06,
919
+ "loss": 1.29,
920
+ "step": 1310
921
+ },
922
+ {
923
+ "epoch": 0.24,
924
+ "learning_rate": 5.252e-06,
925
+ "loss": 1.2643,
926
+ "step": 1320
927
+ },
928
+ {
929
+ "epoch": 0.24,
930
+ "learning_rate": 5.2919999999999995e-06,
931
+ "loss": 1.2882,
932
+ "step": 1330
933
+ },
934
+ {
935
+ "epoch": 0.24,
936
+ "learning_rate": 5.332e-06,
937
+ "loss": 1.295,
938
+ "step": 1340
939
+ },
940
+ {
941
+ "epoch": 0.25,
942
+ "learning_rate": 5.3719999999999996e-06,
943
+ "loss": 1.248,
944
+ "step": 1350
945
+ },
946
+ {
947
+ "epoch": 0.25,
948
+ "learning_rate": 5.412e-06,
949
+ "loss": 1.3236,
950
+ "step": 1360
951
+ },
952
+ {
953
+ "epoch": 0.25,
954
+ "learning_rate": 5.452e-06,
955
+ "loss": 1.2925,
956
+ "step": 1370
957
+ },
958
+ {
959
+ "epoch": 0.25,
960
+ "learning_rate": 5.492e-06,
961
+ "loss": 1.2991,
962
+ "step": 1380
963
+ },
964
+ {
965
+ "epoch": 0.25,
966
+ "learning_rate": 5.532e-06,
967
+ "loss": 1.2853,
968
+ "step": 1390
969
+ },
970
+ {
971
+ "epoch": 0.25,
972
+ "learning_rate": 5.572e-06,
973
+ "loss": 1.2835,
974
+ "step": 1400
975
+ },
976
+ {
977
+ "epoch": 0.26,
978
+ "learning_rate": 5.612e-06,
979
+ "loss": 1.2687,
980
+ "step": 1410
981
+ },
982
+ {
983
+ "epoch": 0.26,
984
+ "learning_rate": 5.652e-06,
985
+ "loss": 1.254,
986
+ "step": 1420
987
+ },
988
+ {
989
+ "epoch": 0.26,
990
+ "learning_rate": 5.692e-06,
991
+ "loss": 1.3045,
992
+ "step": 1430
993
+ },
994
+ {
995
+ "epoch": 0.26,
996
+ "learning_rate": 5.732e-06,
997
+ "loss": 1.2598,
998
+ "step": 1440
999
+ },
1000
+ {
1001
+ "epoch": 0.26,
1002
+ "learning_rate": 5.772e-06,
1003
+ "loss": 1.2628,
1004
+ "step": 1450
1005
+ },
1006
+ {
1007
+ "epoch": 0.27,
1008
+ "learning_rate": 5.8120000000000004e-06,
1009
+ "loss": 1.2519,
1010
+ "step": 1460
1011
+ },
1012
+ {
1013
+ "epoch": 0.27,
1014
+ "learning_rate": 5.852e-06,
1015
+ "loss": 1.2902,
1016
+ "step": 1470
1017
+ },
1018
+ {
1019
+ "epoch": 0.27,
1020
+ "learning_rate": 5.892e-06,
1021
+ "loss": 1.2999,
1022
+ "step": 1480
1023
+ },
1024
+ {
1025
+ "epoch": 0.27,
1026
+ "learning_rate": 5.932e-06,
1027
+ "loss": 1.283,
1028
+ "step": 1490
1029
+ },
1030
+ {
1031
+ "epoch": 0.27,
1032
+ "learning_rate": 5.972e-06,
1033
+ "loss": 1.2697,
1034
+ "step": 1500
1035
+ },
1036
+ {
1037
+ "epoch": 0.27,
1038
+ "eval_wmt2019_zh-en_accuracy": 0.587438249823571,
1039
+ "eval_wmt2019_zh-en_loss": 1.9453125,
1040
+ "eval_wmt2019_zh-en_runtime": 77.3466,
1041
+ "eval_wmt2019_zh-en_samples_per_second": 12.929,
1042
+ "eval_wmt2019_zh-en_steps_per_second": 3.232,
1043
+ "step": 1500
1044
+ },
1045
+ {
1046
+ "epoch": 0.27,
1047
+ "eval_ted_trans_en-ja_accuracy": 0.5393802273612374,
1048
+ "eval_ted_trans_en-ja_loss": 2.05078125,
1049
+ "eval_ted_trans_en-ja_runtime": 66.8272,
1050
+ "eval_ted_trans_en-ja_samples_per_second": 11.986,
1051
+ "eval_ted_trans_en-ja_steps_per_second": 3.008,
1052
+ "step": 1500
1053
+ },
1054
+ {
1055
+ "epoch": 0.27,
1056
+ "eval_ted_trans_zh-ja_accuracy": 0.46426092990978485,
1057
+ "eval_ted_trans_zh-ja_loss": 2.8203125,
1058
+ "eval_ted_trans_zh-ja_runtime": 3.6203,
1059
+ "eval_ted_trans_zh-ja_samples_per_second": 11.601,
1060
+ "eval_ted_trans_zh-ja_steps_per_second": 3.038,
1061
+ "step": 1500
1062
+ },
1063
+ {
1064
+ "epoch": 0.27,
1065
+ "eval_sharegpt_accuracy": 0.6912774549792565,
1066
+ "eval_sharegpt_loss": 1.2744140625,
1067
+ "eval_sharegpt_runtime": 755.8288,
1068
+ "eval_sharegpt_samples_per_second": 4.43,
1069
+ "eval_sharegpt_steps_per_second": 1.107,
1070
+ "step": 1500
1071
+ },
1072
+ {
1073
+ "epoch": 0.27,
1074
+ "eval_dolly15k_accuracy": 0.5594429758634393,
1075
+ "eval_dolly15k_loss": 1.8212890625,
1076
+ "eval_dolly15k_runtime": 63.0527,
1077
+ "eval_dolly15k_samples_per_second": 11.911,
1078
+ "eval_dolly15k_steps_per_second": 2.982,
1079
+ "step": 1500
1080
+ },
1081
+ {
1082
+ "epoch": 0.27,
1083
+ "eval_ikala_accuracy": 0.7042301907582296,
1084
+ "eval_ikala_loss": 1.115234375,
1085
+ "eval_ikala_runtime": 1329.6879,
1086
+ "eval_ikala_samples_per_second": 10.322,
1087
+ "eval_ikala_steps_per_second": 2.581,
1088
+ "step": 1500
1089
+ },
1090
+ {
1091
+ "epoch": 0.27,
1092
+ "eval_oasst_export_accuracy": 0.5868625620259539,
1093
+ "eval_oasst_export_loss": 2.0390625,
1094
+ "eval_oasst_export_runtime": 206.437,
1095
+ "eval_oasst_export_samples_per_second": 10.168,
1096
+ "eval_oasst_export_steps_per_second": 2.543,
1097
+ "step": 1500
1098
+ },
1099
+ {
1100
+ "epoch": 0.27,
1101
+ "learning_rate": 6.011999999999999e-06,
1102
+ "loss": 1.3179,
1103
+ "step": 1510
1104
+ },
1105
+ {
1106
+ "epoch": 0.28,
1107
+ "learning_rate": 6.051999999999999e-06,
1108
+ "loss": 1.2781,
1109
+ "step": 1520
1110
+ },
1111
+ {
1112
+ "epoch": 0.28,
1113
+ "learning_rate": 6.0919999999999994e-06,
1114
+ "loss": 1.2717,
1115
+ "step": 1530
1116
+ },
1117
+ {
1118
+ "epoch": 0.28,
1119
+ "learning_rate": 6.131999999999999e-06,
1120
+ "loss": 1.2661,
1121
+ "step": 1540
1122
+ },
1123
+ {
1124
+ "epoch": 0.28,
1125
+ "learning_rate": 6.1719999999999995e-06,
1126
+ "loss": 1.287,
1127
+ "step": 1550
1128
+ },
1129
+ {
1130
+ "epoch": 0.28,
1131
+ "learning_rate": 6.211999999999999e-06,
1132
+ "loss": 1.2784,
1133
+ "step": 1560
1134
+ },
1135
+ {
1136
+ "epoch": 0.29,
1137
+ "learning_rate": 6.252e-06,
1138
+ "loss": 1.2767,
1139
+ "step": 1570
1140
+ },
1141
+ {
1142
+ "epoch": 0.29,
1143
+ "learning_rate": 6.291999999999999e-06,
1144
+ "loss": 1.2657,
1145
+ "step": 1580
1146
+ },
1147
+ {
1148
+ "epoch": 0.29,
1149
+ "learning_rate": 6.332e-06,
1150
+ "loss": 1.2957,
1151
+ "step": 1590
1152
+ },
1153
+ {
1154
+ "epoch": 0.29,
1155
+ "learning_rate": 6.371999999999999e-06,
1156
+ "loss": 1.3181,
1157
+ "step": 1600
1158
+ },
1159
+ {
1160
+ "epoch": 0.29,
1161
+ "learning_rate": 6.412e-06,
1162
+ "loss": 1.2688,
1163
+ "step": 1610
1164
+ },
1165
+ {
1166
+ "epoch": 0.29,
1167
+ "learning_rate": 6.451999999999999e-06,
1168
+ "loss": 1.2598,
1169
+ "step": 1620
1170
+ },
1171
+ {
1172
+ "epoch": 0.3,
1173
+ "learning_rate": 6.492e-06,
1174
+ "loss": 1.2875,
1175
+ "step": 1630
1176
+ },
1177
+ {
1178
+ "epoch": 0.3,
1179
+ "learning_rate": 6.5319999999999995e-06,
1180
+ "loss": 1.2573,
1181
+ "step": 1640
1182
+ },
1183
+ {
1184
+ "epoch": 0.3,
1185
+ "learning_rate": 6.572e-06,
1186
+ "loss": 1.2698,
1187
+ "step": 1650
1188
+ },
1189
+ {
1190
+ "epoch": 0.3,
1191
+ "learning_rate": 6.6119999999999995e-06,
1192
+ "loss": 1.271,
1193
+ "step": 1660
1194
+ },
1195
+ {
1196
+ "epoch": 0.3,
1197
+ "learning_rate": 6.652e-06,
1198
+ "loss": 1.2724,
1199
+ "step": 1670
1200
+ },
1201
+ {
1202
+ "epoch": 0.31,
1203
+ "learning_rate": 6.692e-06,
1204
+ "loss": 1.2527,
1205
+ "step": 1680
1206
+ },
1207
+ {
1208
+ "epoch": 0.31,
1209
+ "learning_rate": 6.732e-06,
1210
+ "loss": 1.2659,
1211
+ "step": 1690
1212
+ },
1213
+ {
1214
+ "epoch": 0.31,
1215
+ "learning_rate": 6.772e-06,
1216
+ "loss": 1.2298,
1217
+ "step": 1700
1218
+ },
1219
+ {
1220
+ "epoch": 0.31,
1221
+ "learning_rate": 6.812e-06,
1222
+ "loss": 1.2217,
1223
+ "step": 1710
1224
+ },
1225
+ {
1226
+ "epoch": 0.31,
1227
+ "learning_rate": 6.852e-06,
1228
+ "loss": 1.2695,
1229
+ "step": 1720
1230
+ },
1231
+ {
1232
+ "epoch": 0.31,
1233
+ "learning_rate": 6.892e-06,
1234
+ "loss": 1.2339,
1235
+ "step": 1730
1236
+ },
1237
+ {
1238
+ "epoch": 0.32,
1239
+ "learning_rate": 6.932e-06,
1240
+ "loss": 1.2342,
1241
+ "step": 1740
1242
+ },
1243
+ {
1244
+ "epoch": 0.32,
1245
+ "learning_rate": 6.972e-06,
1246
+ "loss": 1.2652,
1247
+ "step": 1750
1248
+ },
1249
+ {
1250
+ "epoch": 0.32,
1251
+ "learning_rate": 7.011999999999999e-06,
1252
+ "loss": 1.2411,
1253
+ "step": 1760
1254
+ },
1255
+ {
1256
+ "epoch": 0.32,
1257
+ "learning_rate": 7.0519999999999996e-06,
1258
+ "loss": 1.2478,
1259
+ "step": 1770
1260
+ },
1261
+ {
1262
+ "epoch": 0.32,
1263
+ "learning_rate": 7.091999999999999e-06,
1264
+ "loss": 1.2379,
1265
+ "step": 1780
1266
+ },
1267
+ {
1268
+ "epoch": 0.33,
1269
+ "learning_rate": 7.132e-06,
1270
+ "loss": 1.2847,
1271
+ "step": 1790
1272
+ },
1273
+ {
1274
+ "epoch": 0.33,
1275
+ "learning_rate": 7.171999999999999e-06,
1276
+ "loss": 1.2378,
1277
+ "step": 1800
1278
+ },
1279
+ {
1280
+ "epoch": 0.33,
1281
+ "learning_rate": 7.212e-06,
1282
+ "loss": 1.2901,
1283
+ "step": 1810
1284
+ },
1285
+ {
1286
+ "epoch": 0.33,
1287
+ "learning_rate": 7.251999999999999e-06,
1288
+ "loss": 1.2662,
1289
+ "step": 1820
1290
+ },
1291
+ {
1292
+ "epoch": 0.33,
1293
+ "learning_rate": 7.292e-06,
1294
+ "loss": 1.2622,
1295
+ "step": 1830
1296
+ },
1297
+ {
1298
+ "epoch": 0.33,
1299
+ "learning_rate": 7.3319999999999994e-06,
1300
+ "loss": 1.2518,
1301
+ "step": 1840
1302
+ },
1303
+ {
1304
+ "epoch": 0.34,
1305
+ "learning_rate": 7.372e-06,
1306
+ "loss": 1.2648,
1307
+ "step": 1850
1308
+ },
1309
+ {
1310
+ "epoch": 0.34,
1311
+ "learning_rate": 7.4119999999999995e-06,
1312
+ "loss": 1.2582,
1313
+ "step": 1860
1314
+ },
1315
+ {
1316
+ "epoch": 0.34,
1317
+ "learning_rate": 7.452e-06,
1318
+ "loss": 1.2545,
1319
+ "step": 1870
1320
+ },
1321
+ {
1322
+ "epoch": 0.34,
1323
+ "learning_rate": 7.492e-06,
1324
+ "loss": 1.2372,
1325
+ "step": 1880
1326
+ },
1327
+ {
1328
+ "epoch": 0.34,
1329
+ "learning_rate": 7.532e-06,
1330
+ "loss": 1.2266,
1331
+ "step": 1890
1332
+ },
1333
+ {
1334
+ "epoch": 0.35,
1335
+ "learning_rate": 7.572e-06,
1336
+ "loss": 1.2509,
1337
+ "step": 1900
1338
+ },
1339
+ {
1340
+ "epoch": 0.35,
1341
+ "learning_rate": 7.612e-06,
1342
+ "loss": 1.2487,
1343
+ "step": 1910
1344
+ },
1345
+ {
1346
+ "epoch": 0.35,
1347
+ "learning_rate": 7.652e-06,
1348
+ "loss": 1.2968,
1349
+ "step": 1920
1350
+ },
1351
+ {
1352
+ "epoch": 0.35,
1353
+ "learning_rate": 7.692e-06,
1354
+ "loss": 1.2719,
1355
+ "step": 1930
1356
+ },
1357
+ {
1358
+ "epoch": 0.35,
1359
+ "learning_rate": 7.732e-06,
1360
+ "loss": 1.2537,
1361
+ "step": 1940
1362
+ },
1363
+ {
1364
+ "epoch": 0.35,
1365
+ "learning_rate": 7.772e-06,
1366
+ "loss": 1.2733,
1367
+ "step": 1950
1368
+ },
1369
+ {
1370
+ "epoch": 0.36,
1371
+ "learning_rate": 7.812e-06,
1372
+ "loss": 1.2663,
1373
+ "step": 1960
1374
+ },
1375
+ {
1376
+ "epoch": 0.36,
1377
+ "learning_rate": 7.852e-06,
1378
+ "loss": 1.2496,
1379
+ "step": 1970
1380
+ },
1381
+ {
1382
+ "epoch": 0.36,
1383
+ "learning_rate": 7.892e-06,
1384
+ "loss": 1.2542,
1385
+ "step": 1980
1386
+ },
1387
+ {
1388
+ "epoch": 0.36,
1389
+ "learning_rate": 7.932e-06,
1390
+ "loss": 1.2398,
1391
+ "step": 1990
1392
+ },
1393
+ {
1394
+ "epoch": 0.36,
1395
+ "learning_rate": 7.972e-06,
1396
+ "loss": 1.2564,
1397
+ "step": 2000
1398
+ },
1399
+ {
1400
+ "epoch": 0.36,
1401
+ "eval_wmt2019_zh-en_accuracy": 0.5861558940985061,
1402
+ "eval_wmt2019_zh-en_loss": 1.9638671875,
1403
+ "eval_wmt2019_zh-en_runtime": 76.6629,
1404
+ "eval_wmt2019_zh-en_samples_per_second": 13.044,
1405
+ "eval_wmt2019_zh-en_steps_per_second": 3.261,
1406
+ "step": 2000
1407
+ },
1408
+ {
1409
+ "epoch": 0.36,
1410
+ "eval_ted_trans_en-ja_accuracy": 0.5495979979766785,
1411
+ "eval_ted_trans_en-ja_loss": 2.001953125,
1412
+ "eval_ted_trans_en-ja_runtime": 66.9893,
1413
+ "eval_ted_trans_en-ja_samples_per_second": 11.957,
1414
+ "eval_ted_trans_en-ja_steps_per_second": 3.0,
1415
+ "step": 2000
1416
+ },
1417
+ {
1418
+ "epoch": 0.36,
1419
+ "eval_ted_trans_zh-ja_accuracy": 0.47411944869831546,
1420
+ "eval_ted_trans_zh-ja_loss": 2.7109375,
1421
+ "eval_ted_trans_zh-ja_runtime": 4.1683,
1422
+ "eval_ted_trans_zh-ja_samples_per_second": 10.076,
1423
+ "eval_ted_trans_zh-ja_steps_per_second": 2.639,
1424
+ "step": 2000
1425
+ },
1426
+ {
1427
+ "epoch": 0.36,
1428
+ "eval_sharegpt_accuracy": 0.6977260319800304,
1429
+ "eval_sharegpt_loss": 1.234375,
1430
+ "eval_sharegpt_runtime": 751.9973,
1431
+ "eval_sharegpt_samples_per_second": 4.452,
1432
+ "eval_sharegpt_steps_per_second": 1.113,
1433
+ "step": 2000
1434
+ },
1435
+ {
1436
+ "epoch": 0.36,
1437
+ "eval_dolly15k_accuracy": 0.5599267437239759,
1438
+ "eval_dolly15k_loss": 1.822265625,
1439
+ "eval_dolly15k_runtime": 63.1103,
1440
+ "eval_dolly15k_samples_per_second": 11.9,
1441
+ "eval_dolly15k_steps_per_second": 2.979,
1442
+ "step": 2000
1443
+ },
1444
+ {
1445
+ "epoch": 0.36,
1446
+ "eval_ikala_accuracy": 0.708770642571952,
1447
+ "eval_ikala_loss": 1.0927734375,
1448
+ "eval_ikala_runtime": 1329.0014,
1449
+ "eval_ikala_samples_per_second": 10.327,
1450
+ "eval_ikala_steps_per_second": 2.582,
1451
+ "step": 2000
1452
+ },
1453
+ {
1454
+ "epoch": 0.36,
1455
+ "eval_oasst_export_accuracy": 0.5890918246383349,
1456
+ "eval_oasst_export_loss": 2.033203125,
1457
+ "eval_oasst_export_runtime": 207.1646,
1458
+ "eval_oasst_export_samples_per_second": 10.132,
1459
+ "eval_oasst_export_steps_per_second": 2.534,
1460
+ "step": 2000
1461
+ }
1462
+ ],
1463
+ "max_steps": 43960,
1464
+ "num_train_epochs": 8,
1465
+ "total_flos": 952516817190912.0,
1466
+ "trial_name": null,
1467
+ "trial_params": null
1468
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35b62182d7892f980c48028a76cd6e9d78707555ee9860c7cec9e87cd0d15bef
3
+ size 4923