nlee-208 committed on
Commit
4c10953
1 Parent(s): 60ef4ea

Model save

Browse files
README.md ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ base_model: alignment-handbook/zephyr-7b-sft-full
4
+ tags:
5
+ - trl
6
+ - kto
7
+ - generated_from_trainer
8
+ model-index:
9
+ - name: zephyr-7b-sft-kto2
10
+ results: []
11
+ ---
12
+
13
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
14
+ should probably proofread and complete it, then remove this comment. -->
15
+
16
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/nlee28/huggingface/runs/m7ew3rpy)
17
+ # zephyr-7b-sft-kto2
18
+
19
+ This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-full](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) on an unspecified dataset (the training dataset name was not recorded by the Trainer).
20
+
21
+ ## Model description
22
+
23
+ More information needed
24
+
25
+ ## Intended uses & limitations
26
+
27
+ More information needed
28
+
29
+ ## Training and evaluation data
30
+
31
+ More information needed
32
+
33
+ ## Training procedure
34
+
35
+ ### Training hyperparameters
36
+
37
+ The following hyperparameters were used during training:
38
+ - learning_rate: 5e-07
39
+ - train_batch_size: 8
40
+ - eval_batch_size: 8
41
+ - seed: 42
42
+ - distributed_type: multi-GPU
43
+ - num_devices: 4
44
+ - total_train_batch_size: 32
45
+ - total_eval_batch_size: 32
46
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
47
+ - lr_scheduler_type: cosine
48
+ - num_epochs: 1
49
+
50
+ ### Training results
51
+
52
+
53
+
54
+ ### Framework versions
55
+
56
+ - Transformers 4.42.4
57
+ - PyTorch 2.1.2.post303
58
+ - Datasets 2.18.0
59
+ - Tokenizers 0.19.1
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.3297611388589154,
5
+ "train_runtime": 9235.4216,
6
+ "train_samples": 60917,
7
+ "train_samples_per_second": 6.596,
8
+ "train_steps_per_second": 0.206
9
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "transformers_version": "4.42.4"
6
+ }
model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cec96c3b58f53bcec7434705715a4902135cab39fcaabe586add185bb936026e
3
+ size 4943162336
model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4061e674fcf909c2cf8f98ff6c01ed06116f9274dc7c4e05af3e84a632010481
3
+ size 4999819336
model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ec300c77b863f3fa30c414cd49b86c48f01da0b4a3fd3a9b66549d6ffd43776
3
+ size 4540516344
model.safetensors.index.json ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 14483464192
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00003-of-00003.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
13
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
14
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
15
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
16
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
17
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
18
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
19
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
20
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
21
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
22
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
23
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
24
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
25
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
26
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors",
27
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
28
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
29
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
30
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
31
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
32
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
33
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
34
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
35
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors",
36
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
37
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
38
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
39
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
40
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
41
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
42
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
43
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
44
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
45
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
46
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
47
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
48
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
49
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
50
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
51
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
52
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
53
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
54
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
55
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
56
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
57
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
58
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
59
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
60
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
61
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
62
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
63
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
64
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
65
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
66
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
67
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
68
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
69
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
70
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
71
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
72
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
73
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
74
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
75
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
76
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
77
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
78
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
79
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
80
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
81
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
82
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
83
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
84
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
85
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
86
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
87
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
88
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
89
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
90
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
91
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
92
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
93
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
94
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
95
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
96
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
97
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
98
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
99
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
100
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
101
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
102
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
103
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
104
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
105
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
106
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
107
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
108
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
109
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
110
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
111
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
112
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
113
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
114
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
115
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
116
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
117
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
118
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
119
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
120
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
121
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
122
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
123
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
124
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
125
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
126
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
127
+ "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
128
+ "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
129
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
130
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
131
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
132
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
133
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
134
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
135
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
136
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
137
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
138
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
139
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
140
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
141
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
142
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
143
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00003.safetensors",
144
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
145
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
146
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
147
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
148
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
149
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
150
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
151
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
152
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00003.safetensors",
153
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
154
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
155
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
156
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
157
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
158
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
159
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
160
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
161
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors",
162
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
163
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
164
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
165
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
166
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
167
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
168
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
169
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
170
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors",
171
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
172
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
173
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
174
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
175
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
176
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
177
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
178
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
179
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors",
180
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
181
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
182
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
183
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
184
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
185
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
186
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
187
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
188
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors",
189
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
190
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
191
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
192
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
193
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
194
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
195
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
196
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
197
+ "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors",
198
+ "model.layers.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
199
+ "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
200
+ "model.layers.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
201
+ "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
202
+ "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
203
+ "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
204
+ "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
205
+ "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
206
+ "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors",
207
+ "model.layers.29.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
208
+ "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
209
+ "model.layers.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
210
+ "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
211
+ "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
212
+ "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
213
+ "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
214
+ "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
215
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
216
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
217
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
218
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
219
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
220
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
221
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
222
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
223
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
224
+ "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors",
225
+ "model.layers.30.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
226
+ "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
227
+ "model.layers.30.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
228
+ "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
229
+ "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
230
+ "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
231
+ "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
232
+ "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
233
+ "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors",
234
+ "model.layers.31.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
235
+ "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
236
+ "model.layers.31.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
237
+ "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
238
+ "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
239
+ "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
240
+ "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
241
+ "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
242
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
243
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
244
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
245
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
246
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
247
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
248
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
249
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
250
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
251
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
252
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
253
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
254
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
255
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
256
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
257
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
258
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
259
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
260
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
261
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
262
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
263
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
264
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
265
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
266
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
267
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
268
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
269
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
270
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
271
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
272
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
273
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
274
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
275
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
276
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
277
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
278
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
279
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
280
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
281
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
282
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
283
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
284
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
285
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
286
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
287
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
288
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
289
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
290
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
291
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
292
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
293
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
294
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
295
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
296
+ "model.norm.weight": "model-00003-of-00003.safetensors"
297
+ }
298
+ }
runs/Jul27_17-18-46_gpu-1/events.out.tfevents.1722068762.gpu-1.2291129.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5a6dccc026e56f2b950625f16df28066e290381b3cdbbeb362a4745b28336a91
3
- size 110616
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa9cf7cac131b1da4f7ff27ea8f1a7c4c35abdd6dd7fa28ab07ac6bf25a7f989
3
+ size 110970
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.3297611388589154,
5
+ "train_runtime": 9235.4216,
6
+ "train_samples": 60917,
7
+ "train_samples_per_second": 6.596,
8
+ "train_steps_per_second": 0.206
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,2512 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "eval_steps": 500,
6
+ "global_step": 1904,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.005252100840336135,
13
+ "grad_norm": 69.5775218105222,
14
+ "kl": 0.15183432400226593,
15
+ "learning_rate": 4.999659696812289e-07,
16
+ "logps/chosen": -298.5141715116279,
17
+ "logps/rejected": -259.0493032094595,
18
+ "loss": 0.4994,
19
+ "rewards/chosen": -0.04585062625796296,
20
+ "rewards/margins": 0.01403944421908452,
21
+ "rewards/rejected": -0.05989007047704748,
22
+ "step": 10
23
+ },
24
+ {
25
+ "epoch": 0.01050420168067227,
26
+ "grad_norm": 55.49240989732535,
27
+ "kl": 2.105862855911255,
28
+ "learning_rate": 4.998638879894165e-07,
29
+ "logps/chosen": -273.6553505345395,
30
+ "logps/rejected": -227.15778459821428,
31
+ "loss": 0.4776,
32
+ "rewards/chosen": 0.49514409115439967,
33
+ "rewards/margins": 0.21992081329039764,
34
+ "rewards/rejected": 0.275223277864002,
35
+ "step": 20
36
+ },
37
+ {
38
+ "epoch": 0.015756302521008403,
39
+ "grad_norm": 51.28980044595933,
40
+ "kl": 0.28644976019859314,
41
+ "learning_rate": 4.996937827155428e-07,
42
+ "logps/chosen": -311.9908854166667,
43
+ "logps/rejected": -247.46916946308724,
44
+ "loss": 0.4486,
45
+ "rewards/chosen": 0.30754662675467154,
46
+ "rewards/margins": 0.5134327379181245,
47
+ "rewards/rejected": -0.20588611116345296,
48
+ "step": 30
49
+ },
50
+ {
51
+ "epoch": 0.02100840336134454,
52
+ "grad_norm": 53.39172095489608,
53
+ "kl": 0.0,
54
+ "learning_rate": 4.994557001695013e-07,
55
+ "logps/chosen": -298.2471417682927,
56
+ "logps/rejected": -238.9893078926282,
57
+ "loss": 0.4172,
58
+ "rewards/chosen": 0.01273148815806319,
59
+ "rewards/margins": 0.8681685468269335,
60
+ "rewards/rejected": -0.8554370586688702,
61
+ "step": 40
62
+ },
63
+ {
64
+ "epoch": 0.026260504201680673,
65
+ "grad_norm": 48.3236105129346,
66
+ "kl": 0.0,
67
+ "learning_rate": 4.991497051674917e-07,
68
+ "logps/chosen": -289.8418828616352,
69
+ "logps/rejected": -264.62730493012424,
70
+ "loss": 0.3922,
71
+ "rewards/chosen": 0.3131725623172784,
72
+ "rewards/margins": 1.3170881950509257,
73
+ "rewards/rejected": -1.0039156327336471,
74
+ "step": 50
75
+ },
76
+ {
77
+ "epoch": 0.031512605042016806,
78
+ "grad_norm": 51.661072159206874,
79
+ "kl": 0.0,
80
+ "learning_rate": 4.987758810143735e-07,
81
+ "logps/chosen": -331.279736328125,
82
+ "logps/rejected": -255.901513671875,
83
+ "loss": 0.3954,
84
+ "rewards/chosen": 0.22145886421203614,
85
+ "rewards/margins": 1.2238109111785889,
86
+ "rewards/rejected": -1.0023520469665528,
87
+ "step": 60
88
+ },
89
+ {
90
+ "epoch": 0.03676470588235294,
91
+ "grad_norm": 48.98498102999305,
92
+ "kl": 0.0,
93
+ "learning_rate": 4.983343294809874e-07,
94
+ "logps/chosen": -309.5669285973837,
95
+ "logps/rejected": -255.4453125,
96
+ "loss": 0.3732,
97
+ "rewards/chosen": 0.5395132552745731,
98
+ "rewards/margins": 1.6830449514910533,
99
+ "rewards/rejected": -1.14353169621648,
100
+ "step": 70
101
+ },
102
+ {
103
+ "epoch": 0.04201680672268908,
104
+ "grad_norm": 49.52520073364695,
105
+ "kl": 0.0,
106
+ "learning_rate": 4.978251707764491e-07,
107
+ "logps/chosen": -298.83478282232704,
108
+ "logps/rejected": -262.3364712732919,
109
+ "loss": 0.3542,
110
+ "rewards/chosen": 0.41468177651459315,
111
+ "rewards/margins": 2.0195703134298686,
112
+ "rewards/rejected": -1.6048885369152757,
113
+ "step": 80
114
+ },
115
+ {
116
+ "epoch": 0.04726890756302521,
117
+ "grad_norm": 45.84438443510472,
118
+ "kl": 0.0,
119
+ "learning_rate": 4.972485435154228e-07,
120
+ "logps/chosen": -284.00230493012424,
121
+ "logps/rejected": -292.5305866745283,
122
+ "loss": 0.3424,
123
+ "rewards/chosen": 0.20849199472747235,
124
+ "rewards/margins": 2.4833206356472837,
125
+ "rewards/rejected": -2.2748286409198113,
126
+ "step": 90
127
+ },
128
+ {
129
+ "epoch": 0.052521008403361345,
130
+ "grad_norm": 51.90503609381182,
131
+ "kl": 0.0,
132
+ "learning_rate": 4.966046046803842e-07,
133
+ "logps/chosen": -305.3333748009554,
134
+ "logps/rejected": -265.70988305214723,
135
+ "loss": 0.3825,
136
+ "rewards/chosen": 0.518976345183743,
137
+ "rewards/margins": 2.160451905658724,
138
+ "rewards/rejected": -1.6414755604749809,
139
+ "step": 100
140
+ },
141
+ {
142
+ "epoch": 0.05777310924369748,
143
+ "grad_norm": 46.615982508495186,
144
+ "kl": 0.0,
145
+ "learning_rate": 4.958935295788841e-07,
146
+ "logps/chosen": -346.7928911423841,
147
+ "logps/rejected": -282.3527181952663,
148
+ "loss": 0.3829,
149
+ "rewards/chosen": 0.18162511358197952,
150
+ "rewards/margins": 1.8821555777091985,
151
+ "rewards/rejected": -1.700530464127219,
152
+ "step": 110
153
+ },
154
+ {
155
+ "epoch": 0.06302521008403361,
156
+ "grad_norm": 43.56170699177104,
157
+ "kl": 0.0,
158
+ "learning_rate": 4.951155117958216e-07,
159
+ "logps/chosen": -299.7170380015432,
160
+ "logps/rejected": -281.8764092167722,
161
+ "loss": 0.3787,
162
+ "rewards/chosen": -0.607381797131197,
163
+ "rewards/margins": 2.2764308295746973,
164
+ "rewards/rejected": -2.883812626705894,
165
+ "step": 120
166
+ },
167
+ {
168
+ "epoch": 0.06827731092436974,
169
+ "grad_norm": 52.35577489618577,
170
+ "kl": 0.0,
171
+ "learning_rate": 4.942707631407419e-07,
172
+ "logps/chosen": -335.1082099780702,
173
+ "logps/rejected": -281.61443477348996,
174
+ "loss": 0.3757,
175
+ "rewards/chosen": -0.22571238021404422,
176
+ "rewards/margins": 2.2212163225308172,
177
+ "rewards/rejected": -2.4469287027448616,
178
+ "step": 130
179
+ },
180
+ {
181
+ "epoch": 0.07352941176470588,
182
+ "grad_norm": 28.41591231440036,
183
+ "kl": 0.0,
184
+ "learning_rate": 4.933595135901732e-07,
185
+ "logps/chosen": -318.5261247783688,
186
+ "logps/rejected": -269.2608676675978,
187
+ "loss": 0.3369,
188
+ "rewards/chosen": 0.26553474588597076,
189
+ "rewards/margins": 2.99381714157382,
190
+ "rewards/rejected": -2.7282823956878492,
191
+ "step": 140
192
+ },
193
+ {
194
+ "epoch": 0.07878151260504201,
195
+ "grad_norm": 77.73773489117062,
196
+ "kl": 0.0,
197
+ "learning_rate": 4.923820112250169e-07,
198
+ "logps/chosen": -301.63650901845637,
199
+ "logps/rejected": -266.3384959795322,
200
+ "loss": 0.3375,
201
+ "rewards/chosen": 0.025881773673447985,
202
+ "rewards/margins": 3.2468575937575124,
203
+ "rewards/rejected": -3.2209758200840644,
204
+ "step": 150
205
+ },
206
+ {
207
+ "epoch": 0.08403361344537816,
208
+ "grad_norm": 37.575572316994865,
209
+ "kl": 0.0,
210
+ "learning_rate": 4.913385221630096e-07,
211
+ "logps/chosen": -250.38438482704402,
212
+ "logps/rejected": -265.8534064440994,
213
+ "loss": 0.3758,
214
+ "rewards/chosen": 0.7617762343688581,
215
+ "rewards/margins": 1.6022024720556787,
216
+ "rewards/rejected": -0.8404262376868207,
217
+ "step": 160
218
+ },
219
+ {
220
+ "epoch": 0.08928571428571429,
221
+ "grad_norm": 40.501974364358205,
222
+ "kl": 0.0,
223
+ "learning_rate": 4.902293304862749e-07,
224
+ "logps/chosen": -270.815234375,
225
+ "logps/rejected": -255.6860107421875,
226
+ "loss": 0.3672,
227
+ "rewards/chosen": 0.9947892189025879,
228
+ "rewards/margins": 1.629128646850586,
229
+ "rewards/rejected": -0.634339427947998,
230
+ "step": 170
231
+ },
232
+ {
233
+ "epoch": 0.09453781512605042,
234
+ "grad_norm": 36.96138019330735,
235
+ "kl": 0.0,
236
+ "learning_rate": 4.890547381639833e-07,
237
+ "logps/chosen": -299.6106373856707,
238
+ "logps/rejected": -230.88887219551282,
239
+ "loss": 0.3625,
240
+ "rewards/chosen": 0.3851209733544326,
241
+ "rewards/margins": 2.033356000961103,
242
+ "rewards/rejected": -1.6482350276066706,
243
+ "step": 180
244
+ },
245
+ {
246
+ "epoch": 0.09978991596638656,
247
+ "grad_norm": 44.46735174580667,
248
+ "kl": 0.0,
249
+ "learning_rate": 4.878150649701439e-07,
250
+ "logps/chosen": -344.7996875,
251
+ "logps/rejected": -265.18083639705884,
252
+ "loss": 0.3185,
253
+ "rewards/chosen": 0.36289586385091144,
254
+ "rewards/margins": 2.9504150719736133,
255
+ "rewards/rejected": -2.587519208122702,
256
+ "step": 190
257
+ },
258
+ {
259
+ "epoch": 0.10504201680672269,
260
+ "grad_norm": 52.28178000802996,
261
+ "kl": 0.0,
262
+ "learning_rate": 4.865106483965486e-07,
263
+ "logps/chosen": -292.6435797275641,
264
+ "logps/rejected": -240.90084317835365,
265
+ "loss": 0.3407,
266
+ "rewards/chosen": -0.03864570764394907,
267
+ "rewards/margins": 2.6535366936875104,
268
+ "rewards/rejected": -2.6921824013314595,
269
+ "step": 200
270
+ },
271
+ {
272
+ "epoch": 0.11029411764705882,
273
+ "grad_norm": 42.51233566429091,
274
+ "kl": 0.0,
275
+ "learning_rate": 4.851418435608919e-07,
276
+ "logps/chosen": -300.3898746468927,
277
+ "logps/rejected": -269.88991477272725,
278
+ "loss": 0.3554,
279
+ "rewards/chosen": 0.8636472023139565,
280
+ "rewards/margins": 2.70990092417803,
281
+ "rewards/rejected": -1.8462537218640733,
282
+ "step": 210
283
+ },
284
+ {
285
+ "epoch": 0.11554621848739496,
286
+ "grad_norm": 45.269264573206264,
287
+ "kl": 0.0,
288
+ "learning_rate": 4.837090231100927e-07,
289
+ "logps/chosen": -315.7438054733728,
290
+ "logps/rejected": -242.41240687086093,
291
+ "loss": 0.327,
292
+ "rewards/chosen": 1.163497202495146,
293
+ "rewards/margins": 2.8938327665084946,
294
+ "rewards/rejected": -1.7303355640133484,
295
+ "step": 220
296
+ },
297
+ {
298
+ "epoch": 0.1207983193277311,
299
+ "grad_norm": 46.30035764458271,
300
+ "kl": 0.0,
301
+ "learning_rate": 4.822125771188448e-07,
302
+ "logps/chosen": -285.65898944805195,
303
+ "logps/rejected": -271.8729762801205,
304
+ "loss": 0.3828,
305
+ "rewards/chosen": 0.5068141392299107,
306
+ "rewards/margins": 1.911893440811236,
307
+ "rewards/rejected": -1.4050793015813252,
308
+ "step": 230
309
+ },
310
+ {
311
+ "epoch": 0.12605042016806722,
312
+ "grad_norm": 44.08666008078037,
313
+ "kl": 0.0,
314
+ "learning_rate": 4.806529129834207e-07,
315
+ "logps/chosen": -280.8894210188356,
316
+ "logps/rejected": -270.21331716954023,
317
+ "loss": 0.329,
318
+ "rewards/chosen": 0.5693619087950824,
319
+ "rewards/margins": 2.748881386727015,
320
+ "rewards/rejected": -2.1795194779319327,
321
+ "step": 240
322
+ },
323
+ {
324
+ "epoch": 0.13130252100840337,
325
+ "grad_norm": 38.010572045662606,
326
+ "kl": 0.0,
327
+ "learning_rate": 4.790304553107622e-07,
328
+ "logps/chosen": -304.2262851331361,
329
+ "logps/rejected": -254.57090749172184,
330
+ "loss": 0.3588,
331
+ "rewards/chosen": 0.49599551590236685,
332
+ "rewards/margins": 2.326870937893778,
333
+ "rewards/rejected": -1.8308754219914114,
334
+ "step": 250
335
+ },
336
+ {
337
+ "epoch": 0.13655462184873948,
338
+ "grad_norm": 44.14238619732289,
339
+ "kl": 0.0,
340
+ "learning_rate": 4.773456458028837e-07,
341
+ "logps/chosen": -278.3834842754777,
342
+ "logps/rejected": -268.8938362730061,
343
+ "loss": 0.3378,
344
+ "rewards/chosen": 0.5815278071506768,
345
+ "rewards/margins": 2.6120607376769422,
346
+ "rewards/rejected": -2.0305329305262654,
347
+ "step": 260
348
+ },
349
+ {
350
+ "epoch": 0.14180672268907563,
351
+ "grad_norm": 39.61832047489729,
352
+ "kl": 1.3655751943588257,
353
+ "learning_rate": 4.755989431366221e-07,
354
+ "logps/chosen": -301.3936798878205,
355
+ "logps/rejected": -299.8501572027439,
356
+ "loss": 0.3287,
357
+ "rewards/chosen": 0.6345734229454627,
358
+ "rewards/margins": 2.3345022648852494,
359
+ "rewards/rejected": -1.6999288419397867,
360
+ "step": 270
361
+ },
362
+ {
363
+ "epoch": 0.14705882352941177,
364
+ "grad_norm": 43.29173967453139,
365
+ "kl": 0.0,
366
+ "learning_rate": 4.737908228387656e-07,
367
+ "logps/chosen": -294.7026832217262,
368
+ "logps/rejected": -264.49424342105266,
369
+ "loss": 0.3553,
370
+ "rewards/chosen": 0.5891142799740746,
371
+ "rewards/margins": 2.708873086704646,
372
+ "rewards/rejected": -2.1197588067305717,
373
+ "step": 280
374
+ },
375
+ {
376
+ "epoch": 0.15231092436974789,
377
+ "grad_norm": 49.48779153565427,
378
+ "kl": 0.0,
379
+ "learning_rate": 4.7192177715659516e-07,
380
+ "logps/chosen": -290.1478293413174,
381
+ "logps/rejected": -243.8566176470588,
382
+ "loss": 0.3763,
383
+ "rewards/chosen": 0.392075178865901,
384
+ "rewards/margins": 2.094171158045333,
385
+ "rewards/rejected": -1.7020959791794321,
386
+ "step": 290
387
+ },
388
+ {
389
+ "epoch": 0.15756302521008403,
390
+ "grad_norm": 44.86612305681867,
391
+ "kl": 0.0,
392
+ "learning_rate": 4.699923149238736e-07,
393
+ "logps/chosen": -283.2191996402878,
394
+ "logps/rejected": -272.7527192679558,
395
+ "loss": 0.3369,
396
+ "rewards/chosen": 0.6118597675570481,
397
+ "rewards/margins": 2.6730571222057016,
398
+ "rewards/rejected": -2.0611973546486535,
399
+ "step": 300
400
+ },
401
+ {
402
+ "epoch": 0.16281512605042017,
403
+ "grad_norm": 43.00617930560404,
404
+ "kl": 0.0,
405
+ "learning_rate": 4.680029614223198e-07,
406
+ "logps/chosen": -272.5205015923567,
407
+ "logps/rejected": -266.77614551380367,
408
+ "loss": 0.3531,
409
+ "rewards/chosen": 0.13305815921467581,
410
+ "rewards/margins": 2.627990556226275,
411
+ "rewards/rejected": -2.494932397011599,
412
+ "step": 310
413
+ },
414
+ {
415
+ "epoch": 0.16806722689075632,
416
+ "grad_norm": 39.410413100307075,
417
+ "kl": 0.0,
418
+ "learning_rate": 4.65954258238604e-07,
419
+ "logps/chosen": -288.9263426272455,
420
+ "logps/rejected": -254.3640216503268,
421
+ "loss": 0.3736,
422
+ "rewards/chosen": 0.09242550056137724,
423
+ "rewards/margins": 2.2175613337032276,
424
+ "rewards/rejected": -2.1251358331418504,
425
+ "step": 320
426
+ },
427
+ {
428
+ "epoch": 0.17331932773109243,
429
+ "grad_norm": 41.100451094010424,
430
+ "kl": 0.0,
431
+ "learning_rate": 4.638467631169056e-07,
432
+ "logps/chosen": -326.0165550595238,
433
+ "logps/rejected": -306.81566097861844,
434
+ "loss": 0.318,
435
+ "rewards/chosen": 0.5781731832595098,
436
+ "rewards/margins": 3.290003924740287,
437
+ "rewards/rejected": -2.711830741480777,
438
+ "step": 330
439
+ },
440
+ {
441
+ "epoch": 0.17857142857142858,
442
+ "grad_norm": 59.056271025186184,
443
+ "kl": 0.0,
444
+ "learning_rate": 4.6168104980707103e-07,
445
+ "logps/chosen": -285.7653391768293,
446
+ "logps/rejected": -274.3047876602564,
447
+ "loss": 0.347,
448
+ "rewards/chosen": -0.003299643353718083,
449
+ "rewards/margins": 3.289772396910705,
450
+ "rewards/rejected": -3.293072040264423,
451
+ "step": 340
452
+ },
453
+ {
454
+ "epoch": 0.18382352941176472,
455
+ "grad_norm": 40.5420682235125,
456
+ "kl": 0.0,
457
+ "learning_rate": 4.594577079084145e-07,
458
+ "logps/chosen": -290.82744140625,
459
+ "logps/rejected": -292.7601318359375,
460
+ "loss": 0.3356,
461
+ "rewards/chosen": 0.4554912567138672,
462
+ "rewards/margins": 3.3385635375976563,
463
+ "rewards/rejected": -2.883072280883789,
464
+ "step": 350
465
+ },
466
+ {
467
+ "epoch": 0.18907563025210083,
468
+ "grad_norm": 73.25856842000852,
469
+ "kl": 0.0,
470
+ "learning_rate": 4.5717734270920466e-07,
471
+ "logps/chosen": -281.4008819018405,
472
+ "logps/rejected": -228.66883957006368,
473
+ "loss": 0.3378,
474
+ "rewards/chosen": 0.49251336407807705,
475
+ "rewards/margins": 2.78153659942342,
476
+ "rewards/rejected": -2.2890232353453426,
477
+ "step": 360
478
+ },
479
+ {
480
+ "epoch": 0.19432773109243698,
481
+ "grad_norm": 36.80227868403581,
482
+ "kl": 0.0,
483
+ "learning_rate": 4.548405750218785e-07,
484
+ "logps/chosen": -288.27572185672517,
485
+ "logps/rejected": -269.6975146812081,
486
+ "loss": 0.3552,
487
+ "rewards/chosen": -0.02239043949640285,
488
+ "rewards/margins": 3.2017408950516257,
489
+ "rewards/rejected": -3.2241313345480287,
490
+ "step": 370
491
+ },
492
+ {
493
+ "epoch": 0.19957983193277312,
494
+ "grad_norm": 52.176625363111675,
495
+ "kl": 0.0,
496
+ "learning_rate": 4.5244804101403025e-07,
497
+ "logps/chosen": -262.20579637096773,
498
+ "logps/rejected": -253.31008522727274,
499
+ "loss": 0.3773,
500
+ "rewards/chosen": -0.5523546772618447,
501
+ "rewards/margins": 2.2483803339834205,
502
+ "rewards/rejected": -2.800735011245265,
503
+ "step": 380
504
+ },
505
+ {
506
+ "epoch": 0.20483193277310924,
507
+ "grad_norm": 40.91805541297444,
508
+ "kl": 0.0,
509
+ "learning_rate": 4.5000039203521976e-07,
510
+ "logps/chosen": -305.74300986842104,
511
+ "logps/rejected": -303.6615048363095,
512
+ "loss": 0.3428,
513
+ "rewards/chosen": 0.08902386615150853,
514
+ "rewards/margins": 3.217569718086032,
515
+ "rewards/rejected": -3.1285458519345237,
516
+ "step": 390
517
+ },
518
+ {
519
+ "epoch": 0.21008403361344538,
520
+ "grad_norm": 48.39763770048436,
521
+ "kl": 0.0,
522
+ "learning_rate": 4.47498294439647e-07,
523
+ "logps/chosen": -293.79254518072287,
524
+ "logps/rejected": -266.05382508116884,
525
+ "loss": 0.3273,
526
+ "rewards/chosen": 0.20805818488798947,
527
+ "rewards/margins": 2.784955744115871,
528
+ "rewards/rejected": -2.5768975592278816,
529
+ "step": 400
530
+ },
531
+ {
532
+ "epoch": 0.21533613445378152,
533
+ "grad_norm": 64.69719531344766,
534
+ "kl": 0.0,
535
+ "learning_rate": 4.449424294047419e-07,
536
+ "logps/chosen": -298.2714004984663,
537
+ "logps/rejected": -277.8911972531847,
538
+ "loss": 0.3478,
539
+ "rewards/chosen": 0.7857334628426955,
540
+ "rewards/margins": 2.803303423904598,
541
+ "rewards/rejected": -2.017569961061903,
542
+ "step": 410
543
+ },
544
+ {
545
+ "epoch": 0.22058823529411764,
546
+ "grad_norm": 41.5548039641909,
547
+ "kl": 0.0,
548
+ "learning_rate": 4.4233349274571974e-07,
549
+ "logps/chosen": -294.40205536912754,
550
+ "logps/rejected": -262.53922240497076,
551
+ "loss": 0.3273,
552
+ "rewards/chosen": 0.6274464754450241,
553
+ "rewards/margins": 3.27400759055376,
554
+ "rewards/rejected": -2.6465611151087356,
555
+ "step": 420
556
+ },
557
+ {
558
+ "epoch": 0.22584033613445378,
559
+ "grad_norm": 51.90314639757878,
560
+ "kl": 0.0,
561
+ "learning_rate": 4.396721947261496e-07,
562
+ "logps/chosen": -290.494140625,
563
+ "logps/rejected": -266.31001420454544,
564
+ "loss": 0.3513,
565
+ "rewards/chosen": 0.22074366190347328,
566
+ "rewards/margins": 2.942957208854569,
567
+ "rewards/rejected": -2.722213546951096,
568
+ "step": 430
569
+ },
570
+ {
571
+ "epoch": 0.23109243697478993,
572
+ "grad_norm": 45.08774574606674,
573
+ "kl": 0.0,
574
+ "learning_rate": 4.3695925986459107e-07,
575
+ "logps/chosen": -274.8046160442073,
576
+ "logps/rejected": -247.5850110176282,
577
+ "loss": 0.3506,
578
+ "rewards/chosen": 0.5506789044636052,
579
+ "rewards/margins": 3.319194662488946,
580
+ "rewards/rejected": -2.7685157580253406,
581
+ "step": 440
582
+ },
583
+ {
584
+ "epoch": 0.23634453781512604,
585
+ "grad_norm": 38.43146082353361,
586
+ "kl": 0.0,
587
+ "learning_rate": 4.341954267373494e-07,
588
+ "logps/chosen": -293.00198951863354,
589
+ "logps/rejected": -258.1283411949685,
590
+ "loss": 0.3646,
591
+ "rewards/chosen": 0.5552752477041683,
592
+ "rewards/margins": 2.609661794480898,
593
+ "rewards/rejected": -2.0543865467767297,
594
+ "step": 450
595
+ },
596
+ {
597
+ "epoch": 0.2415966386554622,
598
+ "grad_norm": 42.36936446062343,
599
+ "kl": 0.0,
600
+ "learning_rate": 4.313814477774035e-07,
601
+ "logps/chosen": -307.16541371855345,
602
+ "logps/rejected": -266.2528872282609,
603
+ "loss": 0.3188,
604
+ "rewards/chosen": 0.6136390638051543,
605
+ "rewards/margins": 3.2912566758975457,
606
+ "rewards/rejected": -2.677617612092391,
607
+ "step": 460
608
+ },
609
+ {
610
+ "epoch": 0.24684873949579833,
611
+ "grad_norm": 36.14877170239455,
612
+ "kl": 0.0,
613
+ "learning_rate": 4.2851808906956134e-07,
614
+ "logps/chosen": -299.0251017011834,
615
+ "logps/rejected": -283.08899006622516,
616
+ "loss": 0.3248,
617
+ "rewards/chosen": 0.8636552257650703,
618
+ "rewards/margins": 4.432308458769002,
619
+ "rewards/rejected": -3.568653233003932,
620
+ "step": 470
621
+ },
622
+ {
623
+ "epoch": 0.25210084033613445,
624
+ "grad_norm": 48.52314483845558,
625
+ "kl": 0.0,
626
+ "learning_rate": 4.256061301418996e-07,
627
+ "logps/chosen": -288.11904978197674,
628
+ "logps/rejected": -273.0204022381757,
629
+ "loss": 0.3566,
630
+ "rewards/chosen": 0.09835364097772642,
631
+ "rewards/margins": 3.3749882696860247,
632
+ "rewards/rejected": -3.2766346287082984,
633
+ "step": 480
634
+ },
635
+ {
636
+ "epoch": 0.25735294117647056,
637
+ "grad_norm": 40.328590442957,
638
+ "kl": 0.0,
639
+ "learning_rate": 4.2264636375354283e-07,
640
+ "logps/chosen": -290.32015931372547,
641
+ "logps/rejected": -256.6384496631737,
642
+ "loss": 0.3385,
643
+ "rewards/chosen": 0.6565687890146293,
644
+ "rewards/margins": 2.809185004337889,
645
+ "rewards/rejected": -2.1526162153232598,
646
+ "step": 490
647
+ },
648
+ {
649
+ "epoch": 0.26260504201680673,
650
+ "grad_norm": 51.22796460390333,
651
+ "kl": 0.0,
652
+ "learning_rate": 4.1963959567884045e-07,
653
+ "logps/chosen": -302.9179159628378,
654
+ "logps/rejected": -269.88603742732556,
655
+ "loss": 0.3213,
656
+ "rewards/chosen": 1.2393865843076963,
657
+ "rewards/margins": 3.752552790135276,
658
+ "rewards/rejected": -2.51316620582758,
659
+ "step": 500
660
+ },
661
+ {
662
+ "epoch": 0.26785714285714285,
663
+ "grad_norm": 40.763747675132,
664
+ "kl": 0.0,
665
+ "learning_rate": 4.1658664448800094e-07,
666
+ "logps/chosen": -278.07726258116884,
667
+ "logps/rejected": -242.03510918674698,
668
+ "loss": 0.3613,
669
+ "rewards/chosen": 1.3944342476981026,
670
+ "rewards/margins": 2.644719200987824,
671
+ "rewards/rejected": -1.2502849532897213,
672
+ "step": 510
673
+ },
674
+ {
675
+ "epoch": 0.27310924369747897,
676
+ "grad_norm": 41.37886905858638,
677
+ "kl": 0.0,
678
+ "learning_rate": 4.1348834132424204e-07,
679
+ "logps/chosen": -303.84667354559747,
680
+ "logps/rejected": -279.02011354813664,
681
+ "loss": 0.355,
682
+ "rewards/chosen": 0.7681676516742826,
683
+ "rewards/margins": 3.026760719293654,
684
+ "rewards/rejected": -2.2585930676193713,
685
+ "step": 520
686
+ },
687
+ {
688
+ "epoch": 0.27836134453781514,
689
+ "grad_norm": 37.134601685804,
690
+ "kl": 0.0,
691
+ "learning_rate": 4.103455296775181e-07,
692
+ "logps/chosen": -311.4682323042169,
693
+ "logps/rejected": -256.0505783279221,
694
+ "loss": 0.3388,
695
+ "rewards/chosen": 1.0605972473879894,
696
+ "rewards/margins": 2.909687229061589,
697
+ "rewards/rejected": -1.8490899816736,
698
+ "step": 530
699
+ },
700
+ {
701
+ "epoch": 0.28361344537815125,
702
+ "grad_norm": 50.21606157716232,
703
+ "kl": 0.0,
704
+ "learning_rate": 4.071590651548867e-07,
705
+ "logps/chosen": -282.52527573529414,
706
+ "logps/rejected": -283.90199288922156,
707
+ "loss": 0.3322,
708
+ "rewards/chosen": 0.6519086251851001,
709
+ "rewards/margins": 3.288977687743577,
710
+ "rewards/rejected": -2.637069062558477,
711
+ "step": 540
712
+ },
713
+ {
714
+ "epoch": 0.28886554621848737,
715
+ "grad_norm": 40.94800472377613,
716
+ "kl": 0.0,
717
+ "learning_rate": 4.039298152475754e-07,
718
+ "logps/chosen": -296.5549363057325,
719
+ "logps/rejected": -296.0996932515337,
720
+ "loss": 0.3261,
721
+ "rewards/chosen": 0.4167486786083051,
722
+ "rewards/margins": 3.556985000742124,
723
+ "rewards/rejected": -3.140236322133819,
724
+ "step": 550
725
+ },
726
+ {
727
+ "epoch": 0.29411764705882354,
728
+ "grad_norm": 47.7284966576563,
729
+ "kl": 0.0,
730
+ "learning_rate": 4.006586590948141e-07,
731
+ "logps/chosen": -287.6368415880503,
732
+ "logps/rejected": -276.68611704192546,
733
+ "loss": 0.3348,
734
+ "rewards/chosen": -0.10697765470300831,
735
+ "rewards/margins": 3.6730512630716423,
736
+ "rewards/rejected": -3.7800289177746507,
737
+ "step": 560
738
+ },
739
+ {
740
+ "epoch": 0.29936974789915966,
741
+ "grad_norm": 45.50328767344306,
742
+ "kl": 0.0,
743
+ "learning_rate": 3.973464872444958e-07,
744
+ "logps/chosen": -296.46134996118013,
745
+ "logps/rejected": -281.4715507075472,
746
+ "loss": 0.3322,
747
+ "rewards/chosen": 0.1290158692354001,
748
+ "rewards/margins": 3.9525375132998652,
749
+ "rewards/rejected": -3.8235216440644653,
750
+ "step": 570
751
+ },
752
+ {
753
+ "epoch": 0.30462184873949577,
754
+ "grad_norm": 38.47971512593867,
755
+ "kl": 0.0,
756
+ "learning_rate": 3.939942014107318e-07,
757
+ "logps/chosen": -262.15194610778445,
758
+ "logps/rejected": -292.0669424019608,
759
+ "loss": 0.3594,
760
+ "rewards/chosen": 0.3589071856287425,
761
+ "rewards/margins": 3.160983936739343,
762
+ "rewards/rejected": -2.8020767511106004,
763
+ "step": 580
764
+ },
765
+ {
766
+ "epoch": 0.30987394957983194,
767
+ "grad_norm": 34.207797203706725,
768
+ "kl": 0.0,
769
+ "learning_rate": 3.9060271422836624e-07,
770
+ "logps/chosen": -292.61703330592104,
771
+ "logps/rejected": -271.3793247767857,
772
+ "loss": 0.3159,
773
+ "rewards/chosen": 1.2860569201017682,
774
+ "rewards/margins": 4.367648571654968,
775
+ "rewards/rejected": -3.0815916515531994,
776
+ "step": 590
777
+ },
778
+ {
779
+ "epoch": 0.31512605042016806,
780
+ "grad_norm": 57.39149614591673,
781
+ "kl": 0.0,
782
+ "learning_rate": 3.871729490045185e-07,
783
+ "logps/chosen": -290.49703584558824,
784
+ "logps/rejected": -252.63739583333333,
785
+ "loss": 0.3164,
786
+ "rewards/chosen": 0.8580891328699448,
787
+ "rewards/margins": 3.7738939831303617,
788
+ "rewards/rejected": -2.915804850260417,
789
+ "step": 600
790
+ },
791
+ {
792
+ "epoch": 0.32037815126050423,
793
+ "grad_norm": 40.37544580915199,
794
+ "kl": 0.0,
795
+ "learning_rate": 3.837058394672196e-07,
796
+ "logps/chosen": -273.4779296875,
797
+ "logps/rejected": -287.0096923828125,
798
+ "loss": 0.3289,
799
+ "rewards/chosen": 0.3099039077758789,
800
+ "rewards/margins": 3.173082160949707,
801
+ "rewards/rejected": -2.863178253173828,
802
+ "step": 610
803
+ },
804
+ {
805
+ "epoch": 0.32563025210084034,
806
+ "grad_norm": 57.45015573264363,
807
+ "kl": 0.0,
808
+ "learning_rate": 3.8020232951121166e-07,
809
+ "logps/chosen": -290.50161637931035,
810
+ "logps/rejected": -294.233125,
811
+ "loss": 0.3188,
812
+ "rewards/chosen": 0.1405291984821188,
813
+ "rewards/margins": 3.6642485065178327,
814
+ "rewards/rejected": -3.523719308035714,
815
+ "step": 620
816
+ },
817
+ {
818
+ "epoch": 0.33088235294117646,
819
+ "grad_norm": 39.287989744489565,
820
+ "kl": 0.0,
821
+ "learning_rate": 3.7666337294097985e-07,
822
+ "logps/chosen": -302.1064724392361,
823
+ "logps/rejected": -260.4077814275568,
824
+ "loss": 0.3435,
825
+ "rewards/chosen": -0.013078765736685859,
826
+ "rewards/margins": 3.530285672406958,
827
+ "rewards/rejected": -3.5433644381436435,
828
+ "step": 630
829
+ },
830
+ {
831
+ "epoch": 0.33613445378151263,
832
+ "grad_norm": 44.59203672342995,
833
+ "kl": 0.0,
834
+ "learning_rate": 3.730899332110855e-07,
835
+ "logps/chosen": -281.5672392003676,
836
+ "logps/rejected": -294.1050866168478,
837
+ "loss": 0.2851,
838
+ "rewards/chosen": 0.49381469277774587,
839
+ "rewards/margins": 5.133490330727814,
840
+ "rewards/rejected": -4.639675637950068,
841
+ "step": 640
842
+ },
843
+ {
844
+ "epoch": 0.34138655462184875,
845
+ "grad_norm": 41.26338033254354,
846
+ "kl": 0.0,
847
+ "learning_rate": 3.694829831638738e-07,
848
+ "logps/chosen": -272.1574797453704,
849
+ "logps/rejected": -303.05520668512656,
850
+ "loss": 0.3318,
851
+ "rewards/chosen": -0.2948492309193552,
852
+ "rewards/margins": 3.4522689825893473,
853
+ "rewards/rejected": -3.7471182135087027,
854
+ "step": 650
855
+ },
856
+ {
857
+ "epoch": 0.34663865546218486,
858
+ "grad_norm": 53.39190760647335,
859
+ "kl": 0.0,
860
+ "learning_rate": 3.658435047646238e-07,
861
+ "logps/chosen": -281.7225378787879,
862
+ "logps/rejected": -267.5343245967742,
863
+ "loss": 0.3109,
864
+ "rewards/chosen": 0.5971468838778409,
865
+ "rewards/margins": 4.032071987834494,
866
+ "rewards/rejected": -3.4349251039566533,
867
+ "step": 660
868
+ },
869
+ {
870
+ "epoch": 0.35189075630252103,
871
+ "grad_norm": 50.14394406023911,
872
+ "kl": 0.0,
873
+ "learning_rate": 3.621724888342161e-07,
874
+ "logps/chosen": -307.17963248239437,
875
+ "logps/rejected": -244.93414238061797,
876
+ "loss": 0.3456,
877
+ "rewards/chosen": 0.5407970052369884,
878
+ "rewards/margins": 2.9319001711099695,
879
+ "rewards/rejected": -2.391103165872981,
880
+ "step": 670
881
+ },
882
+ {
883
+ "epoch": 0.35714285714285715,
884
+ "grad_norm": 44.42377974125605,
885
+ "kl": 0.0,
886
+ "learning_rate": 3.584709347793895e-07,
887
+ "logps/chosen": -328.5807938664596,
888
+ "logps/rejected": -273.77439563679246,
889
+ "loss": 0.3028,
890
+ "rewards/chosen": 0.6922078103012179,
891
+ "rewards/margins": 4.292771099535692,
892
+ "rewards/rejected": -3.6005632892344734,
893
+ "step": 680
894
+ },
895
+ {
896
+ "epoch": 0.36239495798319327,
897
+ "grad_norm": 64.33577601812173,
898
+ "kl": 0.0,
899
+ "learning_rate": 3.5473985032065946e-07,
900
+ "logps/chosen": -339.54715008802816,
901
+ "logps/rejected": -288.18354985955057,
902
+ "loss": 0.3051,
903
+ "rewards/chosen": 0.7634544372558594,
904
+ "rewards/margins": 4.167792459552208,
905
+ "rewards/rejected": -3.4043380222963484,
906
+ "step": 690
907
+ },
908
+ {
909
+ "epoch": 0.36764705882352944,
910
+ "grad_norm": 50.09740849242821,
911
+ "kl": 0.0,
912
+ "learning_rate": 3.509802512179737e-07,
913
+ "logps/chosen": -320.1032510080645,
914
+ "logps/rejected": -247.99145359848484,
915
+ "loss": 0.3144,
916
+ "rewards/chosen": -0.7038151402627268,
917
+ "rewards/margins": 4.308948235684243,
918
+ "rewards/rejected": -5.01276337594697,
919
+ "step": 700
920
+ },
921
+ {
922
+ "epoch": 0.37289915966386555,
923
+ "grad_norm": 48.91753923946882,
924
+ "kl": 0.0,
925
+ "learning_rate": 3.4719316099417983e-07,
926
+ "logps/chosen": -319.73927600931677,
927
+ "logps/rejected": -308.56738895440253,
928
+ "loss": 0.3676,
929
+ "rewards/chosen": -1.7911930439635093,
930
+ "rewards/margins": 3.521101202302214,
931
+ "rewards/rejected": -5.312294246265723,
932
+ "step": 710
933
+ },
934
+ {
935
+ "epoch": 0.37815126050420167,
936
+ "grad_norm": 58.85183781985146,
937
+ "kl": 0.0,
938
+ "learning_rate": 3.4337961065637786e-07,
939
+ "logps/chosen": -351.19349449685535,
940
+ "logps/rejected": -328.6528289984472,
941
+ "loss": 0.3281,
942
+ "rewards/chosen": -0.32832849850444673,
943
+ "rewards/margins": 4.381978936714885,
944
+ "rewards/rejected": -4.710307435219332,
945
+ "step": 720
946
+ },
947
+ {
948
+ "epoch": 0.38340336134453784,
949
+ "grad_norm": 34.12584073424692,
950
+ "kl": 0.0,
951
+ "learning_rate": 3.395406384152371e-07,
952
+ "logps/chosen": -318.7544778963415,
953
+ "logps/rejected": -281.5553635817308,
954
+ "loss": 0.3211,
955
+ "rewards/chosen": 0.22548980247683642,
956
+ "rewards/margins": 3.761472765843223,
957
+ "rewards/rejected": -3.5359829633663864,
958
+ "step": 730
959
+ },
960
+ {
961
+ "epoch": 0.38865546218487396,
962
+ "grad_norm": 66.4865558017911,
963
+ "kl": 0.0,
964
+ "learning_rate": 3.356772894023505e-07,
965
+ "logps/chosen": -274.7813126671123,
966
+ "logps/rejected": -285.46543115601503,
967
+ "loss": 0.376,
968
+ "rewards/chosen": -0.13362450013186206,
969
+ "rewards/margins": 3.3327875780371934,
970
+ "rewards/rejected": -3.4664120781690553,
971
+ "step": 740
972
+ },
973
+ {
974
+ "epoch": 0.3939075630252101,
975
+ "grad_norm": 23.48397277478164,
976
+ "kl": 0.0,
977
+ "learning_rate": 3.317906153857054e-07,
978
+ "logps/chosen": -302.44775079617835,
979
+ "logps/rejected": -287.7147958205521,
980
+ "loss": 0.3351,
981
+ "rewards/chosen": 0.14863140567852434,
982
+ "rewards/margins": 4.358767210773425,
983
+ "rewards/rejected": -4.2101358050949,
984
+ "step": 750
985
+ },
986
+ {
987
+ "epoch": 0.39915966386554624,
988
+ "grad_norm": 34.1827992343062,
989
+ "kl": 0.0,
990
+ "learning_rate": 3.2788167448334784e-07,
991
+ "logps/chosen": -262.08561812106916,
992
+ "logps/rejected": -274.4614227484472,
993
+ "loss": 0.374,
994
+ "rewards/chosen": -1.007453294670057,
995
+ "rewards/margins": 2.7304841113130562,
996
+ "rewards/rejected": -3.7379374059831134,
997
+ "step": 760
998
+ },
999
+ {
1000
+ "epoch": 0.40441176470588236,
1001
+ "grad_norm": 45.645905851061976,
1002
+ "kl": 0.0,
1003
+ "learning_rate": 3.2395153087531763e-07,
1004
+ "logps/chosen": -290.2492659395973,
1005
+ "logps/rejected": -247.69814510233917,
1006
+ "loss": 0.3295,
1007
+ "rewards/chosen": -0.16713363852276897,
1008
+ "rewards/margins": 3.4756729221497453,
1009
+ "rewards/rejected": -3.6428065606725144,
1010
+ "step": 770
1011
+ },
1012
+ {
1013
+ "epoch": 0.4096638655462185,
1014
+ "grad_norm": 52.083381199139474,
1015
+ "kl": 0.0,
1016
+ "learning_rate": 3.20001254513933e-07,
1017
+ "logps/chosen": -301.3512216862416,
1018
+ "logps/rejected": -320.7805875365497,
1019
+ "loss": 0.3564,
1020
+ "rewards/chosen": 0.2707460390641385,
1021
+ "rewards/margins": 3.1255901602721066,
1022
+ "rewards/rejected": -2.854844121207968,
1023
+ "step": 780
1024
+ },
1025
+ {
1026
+ "epoch": 0.41491596638655465,
1027
+ "grad_norm": 39.20450298136074,
1028
+ "kl": 0.0,
1029
+ "learning_rate": 3.160319208325044e-07,
1030
+ "logps/chosen": -320.55803206699346,
1031
+ "logps/rejected": -272.5804172904192,
1032
+ "loss": 0.3116,
1033
+ "rewards/chosen": 0.21087132871540543,
1034
+ "rewards/margins": 3.79839062605822,
1035
+ "rewards/rejected": -3.5875192973428143,
1036
+ "step": 790
1037
+ },
1038
+ {
1039
+ "epoch": 0.42016806722689076,
1040
+ "grad_norm": 49.5026281906496,
1041
+ "kl": 0.0,
1042
+ "learning_rate": 3.1204461045255597e-07,
1043
+ "logps/chosen": -305.87476748511904,
1044
+ "logps/rejected": -282.5663291529605,
1045
+ "loss": 0.3302,
1046
+ "rewards/chosen": 0.5268175942557198,
1047
+ "rewards/margins": 3.157528597609441,
1048
+ "rewards/rejected": -2.630711003353721,
1049
+ "step": 800
1050
+ },
1051
+ {
1052
+ "epoch": 0.4254201680672269,
1053
+ "grad_norm": 41.129555544878784,
1054
+ "kl": 0.0,
1055
+ "learning_rate": 3.0804040888963367e-07,
1056
+ "logps/chosen": -305.3494718309859,
1057
+ "logps/rejected": -257.2256846910112,
1058
+ "loss": 0.2981,
1059
+ "rewards/chosen": 0.24150611984897669,
1060
+ "rewards/margins": 4.293326681975907,
1061
+ "rewards/rejected": -4.051820562126931,
1062
+ "step": 810
1063
+ },
1064
+ {
1065
+ "epoch": 0.43067226890756305,
1066
+ "grad_norm": 42.46230974734401,
1067
+ "kl": 0.0,
1068
+ "learning_rate": 3.040204062577824e-07,
1069
+ "logps/chosen": -312.58470262096773,
1070
+ "logps/rejected": -261.4371448863636,
1071
+ "loss": 0.316,
1072
+ "rewards/chosen": 0.39130327778477825,
1073
+ "rewards/margins": 3.736164117628528,
1074
+ "rewards/rejected": -3.34486083984375,
1075
+ "step": 820
1076
+ },
1077
+ {
1078
+ "epoch": 0.43592436974789917,
1079
+ "grad_norm": 61.00539417491642,
1080
+ "kl": 0.0,
1081
+ "learning_rate": 2.999856969727704e-07,
1082
+ "logps/chosen": -305.4535361842105,
1083
+ "logps/rejected": -265.3578020134228,
1084
+ "loss": 0.3327,
1085
+ "rewards/chosen": 1.0649953920241686,
1086
+ "rewards/margins": 3.993097206857432,
1087
+ "rewards/rejected": -2.9281018148332634,
1088
+ "step": 830
1089
+ },
1090
+ {
1091
+ "epoch": 0.4411764705882353,
1092
+ "grad_norm": 37.0047213388179,
1093
+ "kl": 0.0,
1094
+ "learning_rate": 2.959373794541426e-07,
1095
+ "logps/chosen": -282.06252472310126,
1096
+ "logps/rejected": -256.6561053240741,
1097
+ "loss": 0.3265,
1098
+ "rewards/chosen": 0.3667142361025267,
1099
+ "rewards/margins": 3.1375505835176503,
1100
+ "rewards/rejected": -2.7708363474151234,
1101
+ "step": 840
1102
+ },
1103
+ {
1104
+ "epoch": 0.44642857142857145,
1105
+ "grad_norm": 36.25355732454002,
1106
+ "kl": 0.0,
1107
+ "learning_rate": 2.9187655582618407e-07,
1108
+ "logps/chosen": -313.9366851993865,
1109
+ "logps/rejected": -287.78662420382165,
1110
+ "loss": 0.3112,
1111
+ "rewards/chosen": 0.5094557686086081,
1112
+ "rewards/margins": 3.8714661655781546,
1113
+ "rewards/rejected": -3.3620103969695463,
1114
+ "step": 850
1115
+ },
1116
+ {
1117
+ "epoch": 0.45168067226890757,
1118
+ "grad_norm": 64.53574065510352,
1119
+ "kl": 0.0,
1120
+ "learning_rate": 2.878043316178753e-07,
1121
+ "logps/chosen": -283.32109866352204,
1122
+ "logps/rejected": -264.0886063664596,
1123
+ "loss": 0.3276,
1124
+ "rewards/chosen": 0.1551594764181653,
1125
+ "rewards/margins": 4.185146314054035,
1126
+ "rewards/rejected": -4.029986837635869,
1127
+ "step": 860
1128
+ },
1129
+ {
1130
+ "epoch": 0.4569327731092437,
1131
+ "grad_norm": 35.86505890578853,
1132
+ "kl": 0.0,
1133
+ "learning_rate": 2.837218154619193e-07,
1134
+ "logps/chosen": -312.15406436011904,
1135
+ "logps/rejected": -274.98959189967104,
1136
+ "loss": 0.3348,
1137
+ "rewards/chosen": -0.1483011245727539,
1138
+ "rewards/margins": 4.045587087932386,
1139
+ "rewards/rejected": -4.19388821250514,
1140
+ "step": 870
1141
+ },
1142
+ {
1143
+ "epoch": 0.46218487394957986,
1144
+ "grad_norm": 46.44246066253616,
1145
+ "kl": 0.0,
1146
+ "learning_rate": 2.796301187929257e-07,
1147
+ "logps/chosen": -316.67644557823127,
1148
+ "logps/rejected": -274.3472949783237,
1149
+ "loss": 0.331,
1150
+ "rewards/chosen": -0.024288294266681283,
1151
+ "rewards/margins": 3.5203153511903276,
1152
+ "rewards/rejected": -3.5446036454570087,
1153
+ "step": 880
1154
+ },
1155
+ {
1156
+ "epoch": 0.46743697478991597,
1157
+ "grad_norm": 67.03947638674309,
1158
+ "kl": 0.0,
1159
+ "learning_rate": 2.755303555448301e-07,
1160
+ "logps/chosen": -296.37250766871165,
1161
+ "logps/rejected": -287.87425358280257,
1162
+ "loss": 0.2935,
1163
+ "rewards/chosen": 0.2513600448889235,
1164
+ "rewards/margins": 4.716395453054728,
1165
+ "rewards/rejected": -4.465035408165805,
1166
+ "step": 890
1167
+ },
1168
+ {
1169
+ "epoch": 0.4726890756302521,
1170
+ "grad_norm": 44.0580333940613,
1171
+ "kl": 0.0,
1172
+ "learning_rate": 2.7142364184763424e-07,
1173
+ "logps/chosen": -275.6317777317881,
1174
+ "logps/rejected": -295.6349852071006,
1175
+ "loss": 0.3427,
1176
+ "rewards/chosen": -0.019468964330407956,
1177
+ "rewards/margins": 3.7003294679689636,
1178
+ "rewards/rejected": -3.7197984322993713,
1179
+ "step": 900
1180
+ },
1181
+ {
1182
+ "epoch": 0.47794117647058826,
1183
+ "grad_norm": 58.59520003164634,
1184
+ "kl": 0.0,
1185
+ "learning_rate": 2.673110957235479e-07,
1186
+ "logps/chosen": -305.7110357704403,
1187
+ "logps/rejected": -299.5851853649068,
1188
+ "loss": 0.2967,
1189
+ "rewards/chosen": 0.24971195436873525,
1190
+ "rewards/margins": 4.398466382260817,
1191
+ "rewards/rejected": -4.148754427892081,
1192
+ "step": 910
1193
+ },
1194
+ {
1195
+ "epoch": 0.4831932773109244,
1196
+ "grad_norm": 49.15023451078887,
1197
+ "kl": 0.0,
1198
+ "learning_rate": 2.6319383678261557e-07,
1199
+ "logps/chosen": -326.3589082154088,
1200
+ "logps/rejected": -291.8697350543478,
1201
+ "loss": 0.34,
1202
+ "rewards/chosen": -0.1070428524377211,
1203
+ "rewards/margins": 3.931509757592364,
1204
+ "rewards/rejected": -4.038552610030085,
1205
+ "step": 920
1206
+ },
1207
+ {
1208
+ "epoch": 0.4884453781512605,
1209
+ "grad_norm": 60.143948744622776,
1210
+ "kl": 0.0,
1211
+ "learning_rate": 2.5907298591791105e-07,
1212
+ "logps/chosen": -293.6828878012048,
1213
+ "logps/rejected": -316.3496601055195,
1214
+ "loss": 0.3121,
1215
+ "rewards/chosen": 0.37629242402961455,
1216
+ "rewards/margins": 4.533702884662731,
1217
+ "rewards/rejected": -4.157410460633117,
1218
+ "step": 930
1219
+ },
1220
+ {
1221
+ "epoch": 0.49369747899159666,
1222
+ "grad_norm": 41.39145780505376,
1223
+ "kl": 0.0,
1224
+ "learning_rate": 2.5494966500038264e-07,
1225
+ "logps/chosen": -298.37648703835225,
1226
+ "logps/rejected": -283.5568033854167,
1227
+ "loss": 0.3381,
1228
+ "rewards/chosen": 0.19053040851246228,
1229
+ "rewards/margins": 2.8964626018447106,
1230
+ "rewards/rejected": -2.705932193332248,
1231
+ "step": 940
1232
+ },
1233
+ {
1234
+ "epoch": 0.4989495798319328,
1235
+ "grad_norm": 44.85935706825639,
1236
+ "kl": 0.0,
1237
+ "learning_rate": 2.508249965734319e-07,
1238
+ "logps/chosen": -315.8209355828221,
1239
+ "logps/rejected": -277.33056329617835,
1240
+ "loss": 0.2815,
1241
+ "rewards/chosen": 0.5839554488293233,
1242
+ "rewards/margins": 4.811377494883264,
1243
+ "rewards/rejected": -4.227422046053941,
1244
+ "step": 950
1245
+ },
1246
+ {
1247
+ "epoch": 0.5042016806722689,
1248
+ "grad_norm": 86.97621065062417,
1249
+ "kl": 0.0,
1250
+ "learning_rate": 2.467001035473103e-07,
1251
+ "logps/chosen": -302.4603059505988,
1252
+ "logps/rejected": -287.89797794117646,
1253
+ "loss": 0.3293,
1254
+ "rewards/chosen": 0.40462347704493357,
1255
+ "rewards/margins": 3.534475173582924,
1256
+ "rewards/rejected": -3.1298516965379903,
1257
+ "step": 960
1258
+ },
1259
+ {
1260
+ "epoch": 0.509453781512605,
1261
+ "grad_norm": 32.22216772039752,
1262
+ "kl": 0.0,
1263
+ "learning_rate": 2.425761088934142e-07,
1264
+ "logps/chosen": -273.99361898291926,
1265
+ "logps/rejected": -258.7056308962264,
1266
+ "loss": 0.3326,
1267
+ "rewards/chosen": -0.09355034443162243,
1268
+ "rewards/margins": 4.204330392105327,
1269
+ "rewards/rejected": -4.29788073653695,
1270
+ "step": 970
1271
+ },
1272
+ {
1273
+ "epoch": 0.5147058823529411,
1274
+ "grad_norm": 53.215396805561575,
1275
+ "kl": 0.0,
1276
+ "learning_rate": 2.3845413533856514e-07,
1277
+ "logps/chosen": -299.4835397012579,
1278
+ "logps/rejected": -287.7150621118012,
1279
+ "loss": 0.2911,
1280
+ "rewards/chosen": 0.30934901207498033,
1281
+ "rewards/margins": 4.708151206611083,
1282
+ "rewards/rejected": -4.398802194536103,
1283
+ "step": 980
1284
+ },
1285
+ {
1286
+ "epoch": 0.5199579831932774,
1287
+ "grad_norm": 70.13460679739038,
1288
+ "kl": 0.0,
1289
+ "learning_rate": 2.343353050593553e-07,
1290
+ "logps/chosen": -318.12441770186336,
1291
+ "logps/rejected": -282.98125491352204,
1292
+ "loss": 0.3042,
1293
+ "rewards/chosen": 0.9980694314707881,
1294
+ "rewards/margins": 5.077612059345198,
1295
+ "rewards/rejected": -4.07954262787441,
1296
+ "step": 990
1297
+ },
1298
+ {
1299
+ "epoch": 0.5252100840336135,
1300
+ "grad_norm": 91.66490200361514,
1301
+ "kl": 0.0,
1302
+ "learning_rate": 2.3022073937664383e-07,
1303
+ "logps/chosen": -318.62791700487014,
1304
+ "logps/rejected": -299.1699924698795,
1305
+ "loss": 0.2987,
1306
+ "rewards/chosen": -0.4350905975738129,
1307
+ "rewards/margins": 3.7659268511272415,
1308
+ "rewards/rejected": -4.201017448701054,
1309
+ "step": 1000
1310
+ },
1311
+ {
1312
+ "epoch": 0.5304621848739496,
1313
+ "grad_norm": 47.014009528040546,
1314
+ "kl": 0.0,
1315
+ "learning_rate": 2.261115584502849e-07,
1316
+ "logps/chosen": -322.26196442610063,
1317
+ "logps/rejected": -273.58297748447205,
1318
+ "loss": 0.3504,
1319
+ "rewards/chosen": -0.4901385037404186,
1320
+ "rewards/margins": 2.980573340567617,
1321
+ "rewards/rejected": -3.4707118443080356,
1322
+ "step": 1010
1323
+ },
1324
+ {
1325
+ "epoch": 0.5357142857142857,
1326
+ "grad_norm": 65.73363572647297,
1327
+ "kl": 0.0,
1328
+ "learning_rate": 2.2200888097417302e-07,
1329
+ "logps/chosen": -265.984912109375,
1330
+ "logps/rejected": -273.97919921875,
1331
+ "loss": 0.3327,
1332
+ "rewards/chosen": 0.2538748264312744,
1333
+ "rewards/margins": 3.538307809829712,
1334
+ "rewards/rejected": -3.2844329833984376,
1335
+ "step": 1020
1336
+ },
1337
+ {
1338
+ "epoch": 0.5409663865546218,
1339
+ "grad_norm": 37.39061372036576,
1340
+ "kl": 0.0,
1341
+ "learning_rate": 2.1791382387168684e-07,
1342
+ "logps/chosen": -294.4272490530303,
1343
+ "logps/rejected": -283.10264616935484,
1344
+ "loss": 0.3218,
1345
+ "rewards/chosen": 0.5568286317767519,
1346
+ "rewards/margins": 4.048672366025744,
1347
+ "rewards/rejected": -3.491843734248992,
1348
+ "step": 1030
1349
+ },
1350
+ {
1351
+ "epoch": 0.5462184873949579,
1352
+ "grad_norm": 38.42991974376242,
1353
+ "kl": 0.0,
1354
+ "learning_rate": 2.1382750199161495e-07,
1355
+ "logps/chosen": -314.6301011029412,
1356
+ "logps/rejected": -277.451328125,
1357
+ "loss": 0.3087,
1358
+ "rewards/chosen": 0.9661560956169577,
1359
+ "rewards/margins": 3.718452645096124,
1360
+ "rewards/rejected": -2.7522965494791665,
1361
+ "step": 1040
1362
+ },
1363
+ {
1364
+ "epoch": 0.5514705882352942,
1365
+ "grad_norm": 58.32907293331433,
1366
+ "kl": 0.0,
1367
+ "learning_rate": 2.0975102780464673e-07,
1368
+ "logps/chosen": -306.8538490853659,
1369
+ "logps/rejected": -290.90675080128204,
1370
+ "loss": 0.3062,
1371
+ "rewards/chosen": 0.684349339182784,
1372
+ "rewards/margins": 3.878205602954819,
1373
+ "rewards/rejected": -3.193856263772035,
1374
+ "step": 1050
1375
+ },
1376
+ {
1377
+ "epoch": 0.5567226890756303,
1378
+ "grad_norm": 42.31783649576492,
1379
+ "kl": 0.0,
1380
+ "learning_rate": 2.0568551110051074e-07,
1381
+ "logps/chosen": -265.01874508647796,
1382
+ "logps/rejected": -256.457298136646,
1383
+ "loss": 0.3375,
1384
+ "rewards/chosen": 0.7610905485333137,
1385
+ "rewards/margins": 3.0508450962211047,
1386
+ "rewards/rejected": -2.289754547687791,
1387
+ "step": 1060
1388
+ },
1389
+ {
1390
+ "epoch": 0.5619747899159664,
1391
+ "grad_norm": 43.282189967649956,
1392
+ "kl": 0.0,
1393
+ "learning_rate": 2.016320586858422e-07,
1394
+ "logps/chosen": -270.2985276442308,
1395
+ "logps/rejected": -287.5562357088415,
1396
+ "loss": 0.3143,
1397
+ "rewards/chosen": 1.108019804343199,
1398
+ "rewards/margins": 3.3270524641064423,
1399
+ "rewards/rejected": -2.219032659763243,
1400
+ "step": 1070
1401
+ },
1402
+ {
1403
+ "epoch": 0.5672268907563025,
1404
+ "grad_norm": 59.248701734675386,
1405
+ "kl": 0.0,
1406
+ "learning_rate": 1.9759177408286337e-07,
1407
+ "logps/chosen": -312.609375,
1408
+ "logps/rejected": -267.16066028225805,
1409
+ "loss": 0.3365,
1410
+ "rewards/chosen": 1.2182994495738637,
1411
+ "rewards/margins": 3.07108659045088,
1412
+ "rewards/rejected": -1.8527871408770162,
1413
+ "step": 1080
1414
+ },
1415
+ {
1416
+ "epoch": 0.5724789915966386,
1417
+ "grad_norm": 45.19596654926437,
1418
+ "kl": 0.0,
1419
+ "learning_rate": 1.9356575722895808e-07,
1420
+ "logps/chosen": -299.373361013986,
1421
+ "logps/rejected": -271.5683042019774,
1422
+ "loss": 0.3178,
1423
+ "rewards/chosen": 1.0334772496790319,
1424
+ "rewards/margins": 3.244971479960194,
1425
+ "rewards/rejected": -2.2114942302811618,
1426
+ "step": 1090
1427
+ },
1428
+ {
1429
+ "epoch": 0.5777310924369747,
1430
+ "grad_norm": 44.21402337263303,
1431
+ "kl": 0.0,
1432
+ "learning_rate": 1.895551041772216e-07,
1433
+ "logps/chosen": -309.0861150568182,
1434
+ "logps/rejected": -255.60587565104166,
1435
+ "loss": 0.3402,
1436
+ "rewards/chosen": 0.8877213217995383,
1437
+ "rewards/margins": 3.0730268112336745,
1438
+ "rewards/rejected": -2.1853054894341364,
1439
+ "step": 1100
1440
+ },
1441
+ {
1442
+ "epoch": 0.582983193277311,
1443
+ "grad_norm": 61.69332868228022,
1444
+ "kl": 0.0,
1445
+ "learning_rate": 1.8556090679806847e-07,
1446
+ "logps/chosen": -298.82965455572287,
1447
+ "logps/rejected": -267.7830509334416,
1448
+ "loss": 0.2942,
1449
+ "rewards/chosen": 1.1525323706937123,
1450
+ "rewards/margins": 4.799233143065871,
1451
+ "rewards/rejected": -3.646700772372159,
1452
+ "step": 1110
1453
+ },
1454
+ {
1455
+ "epoch": 0.5882352941176471,
1456
+ "grad_norm": 31.77308050751697,
1457
+ "kl": 0.0,
1458
+ "learning_rate": 1.8158425248197928e-07,
1459
+ "logps/chosen": -330.5060499237805,
1460
+ "logps/rejected": -304.98880709134613,
1461
+ "loss": 0.2989,
1462
+ "rewards/chosen": 1.3816723707245617,
1463
+ "rewards/margins": 5.020397847111781,
1464
+ "rewards/rejected": -3.6387254763872194,
1465
+ "step": 1120
1466
+ },
1467
+ {
1468
+ "epoch": 0.5934873949579832,
1469
+ "grad_norm": 59.622620200950486,
1470
+ "kl": 0.0,
1471
+ "learning_rate": 1.7762622384346609e-07,
1472
+ "logps/chosen": -292.1063850308642,
1473
+ "logps/rejected": -255.3093848892405,
1474
+ "loss": 0.311,
1475
+ "rewards/chosen": 1.0882685155044367,
1476
+ "rewards/margins": 3.8796188681176687,
1477
+ "rewards/rejected": -2.791350352613232,
1478
+ "step": 1130
1479
+ },
1480
+ {
1481
+ "epoch": 0.5987394957983193,
1482
+ "grad_norm": 92.89241064630508,
1483
+ "kl": 0.0,
1484
+ "learning_rate": 1.7368789842633907e-07,
1485
+ "logps/chosen": -324.4837362421384,
1486
+ "logps/rejected": -303.8497670807453,
1487
+ "loss": 0.2984,
1488
+ "rewards/chosen": 0.8251325499336675,
1489
+ "rewards/margins": 4.589403024385337,
1490
+ "rewards/rejected": -3.7642704744516693,
1491
+ "step": 1140
1492
+ },
1493
+ {
1494
+ "epoch": 0.6039915966386554,
1495
+ "grad_norm": 42.994311331506765,
1496
+ "kl": 0.0,
1497
+ "learning_rate": 1.697703484103532e-07,
1498
+ "logps/chosen": -292.4465987042683,
1499
+ "logps/rejected": -259.40554887820514,
1500
+ "loss": 0.3131,
1501
+ "rewards/chosen": 0.9567713853789539,
1502
+ "rewards/margins": 4.075035216884363,
1503
+ "rewards/rejected": -3.1182638315054088,
1504
+ "step": 1150
1505
+ },
1506
+ {
1507
+ "epoch": 0.6092436974789915,
1508
+ "grad_norm": 63.999900745814266,
1509
+ "kl": 0.0,
1510
+ "learning_rate": 1.6587464031931526e-07,
1511
+ "logps/chosen": -303.69353428171644,
1512
+ "logps/rejected": -277.118405577957,
1513
+ "loss": 0.3398,
1514
+ "rewards/chosen": 0.867815444718546,
1515
+ "rewards/margins": 3.5977176506472253,
1516
+ "rewards/rejected": -2.7299022059286795,
1517
+ "step": 1160
1518
+ },
1519
+ {
1520
+ "epoch": 0.6144957983193278,
1521
+ "grad_norm": 33.78499266925509,
1522
+ "kl": 0.0,
1523
+ "learning_rate": 1.6200183473073048e-07,
1524
+ "logps/chosen": -299.6548768939394,
1525
+ "logps/rejected": -280.22903225806454,
1526
+ "loss": 0.2992,
1527
+ "rewards/chosen": 0.6621969512014678,
1528
+ "rewards/margins": 3.7750548928597336,
1529
+ "rewards/rejected": -3.112857941658266,
1530
+ "step": 1170
1531
+ },
1532
+ {
1533
+ "epoch": 0.6197478991596639,
1534
+ "grad_norm": 31.77803791723765,
1535
+ "kl": 0.0,
1536
+ "learning_rate": 1.5815298598706888e-07,
1537
+ "logps/chosen": -291.1797602872671,
1538
+ "logps/rejected": -286.78439465408803,
1539
+ "loss": 0.3176,
1540
+ "rewards/chosen": -0.010194837676812403,
1541
+ "rewards/margins": 4.250023752633722,
1542
+ "rewards/rejected": -4.260218590310535,
1543
+ "step": 1180
1544
+ },
1545
+ {
1546
+ "epoch": 0.625,
1547
+ "grad_norm": 37.859068866257594,
1548
+ "kl": 0.0,
1549
+ "learning_rate": 1.5432914190872756e-07,
1550
+ "logps/chosen": -283.2485907832278,
1551
+ "logps/rejected": -299.72665895061726,
1552
+ "loss": 0.3475,
1553
+ "rewards/chosen": 0.012598955178562599,
1554
+ "rewards/margins": 4.188036976614712,
1555
+ "rewards/rejected": -4.1754380214361495,
1556
+ "step": 1190
1557
+ },
1558
+ {
1559
+ "epoch": 0.6302521008403361,
1560
+ "grad_norm": 34.730160989586295,
1561
+ "kl": 0.0,
1562
+ "learning_rate": 1.505313435087698e-07,
1563
+ "logps/chosen": -300.31629922945206,
1564
+ "logps/rejected": -295.37841235632186,
1565
+ "loss": 0.3124,
1566
+ "rewards/chosen": 0.33270381248160585,
1567
+ "rewards/margins": 4.571263930005852,
1568
+ "rewards/rejected": -4.238560117524246,
1569
+ "step": 1200
1570
+ },
1571
+ {
1572
+ "epoch": 0.6355042016806722,
1573
+ "grad_norm": 73.6570576556008,
1574
+ "kl": 0.0,
1575
+ "learning_rate": 1.4676062470951705e-07,
1576
+ "logps/chosen": -316.37948140337426,
1577
+ "logps/rejected": -306.59623805732485,
1578
+ "loss": 0.318,
1579
+ "rewards/chosen": 0.23967742919921875,
1580
+ "rewards/margins": 4.2318552102252935,
1581
+ "rewards/rejected": -3.992177781026075,
1582
+ "step": 1210
1583
+ },
1584
+ {
1585
+ "epoch": 0.6407563025210085,
1586
+ "grad_norm": 86.8105694173325,
1587
+ "kl": 0.0,
1588
+ "learning_rate": 1.430180120610711e-07,
1589
+ "logps/chosen": -283.5435161226115,
1590
+ "logps/rejected": -276.21841449386505,
1591
+ "loss": 0.2916,
1592
+ "rewards/chosen": 0.35688832762894357,
1593
+ "rewards/margins": 4.4911308476394876,
1594
+ "rewards/rejected": -4.134242520010544,
1595
+ "step": 1220
1596
+ },
1597
+ {
1598
+ "epoch": 0.6460084033613446,
1599
+ "grad_norm": 65.59031614397247,
1600
+ "kl": 0.0,
1601
+ "learning_rate": 1.3930452446184385e-07,
1602
+ "logps/chosen": -324.24284699675326,
1603
+ "logps/rejected": -301.1185523343373,
1604
+ "loss": 0.3146,
1605
+ "rewards/chosen": 0.3668579745602298,
1606
+ "rewards/margins": 4.802705980843173,
1607
+ "rewards/rejected": -4.435848006282944,
1608
+ "step": 1230
1609
+ },
1610
+ {
1611
+ "epoch": 0.6512605042016807,
1612
+ "grad_norm": 82.83869882352816,
1613
+ "kl": 0.0,
1614
+ "learning_rate": 1.3562117288116923e-07,
1615
+ "logps/chosen": -315.7268702651515,
1616
+ "logps/rejected": -280.67207661290325,
1617
+ "loss": 0.3109,
1618
+ "rewards/chosen": 0.6113425514914773,
1619
+ "rewards/margins": 4.853238264067082,
1620
+ "rewards/rejected": -4.241895712575605,
1621
+ "step": 1240
1622
+ },
1623
+ {
1624
+ "epoch": 0.6565126050420168,
1625
+ "grad_norm": 53.82976674411678,
1626
+ "kl": 0.0,
1627
+ "learning_rate": 1.319689600840747e-07,
1628
+ "logps/chosen": -304.3841062898089,
1629
+ "logps/rejected": -285.0993577453988,
1630
+ "loss": 0.3324,
1631
+ "rewards/chosen": 0.08269594884981775,
1632
+ "rewards/margins": 3.7769891991853237,
1633
+ "rewards/rejected": -3.694293250335506,
1634
+ "step": 1250
1635
+ },
1636
+ {
1637
+ "epoch": 0.6617647058823529,
1638
+ "grad_norm": 34.58790215772178,
1639
+ "kl": 0.0,
1640
+ "learning_rate": 1.2834888035828596e-07,
1641
+ "logps/chosen": -301.4619855182927,
1642
+ "logps/rejected": -272.6404246794872,
1643
+ "loss": 0.2994,
1644
+ "rewards/chosen": 0.32138896569973086,
1645
+ "rewards/margins": 4.973683418073529,
1646
+ "rewards/rejected": -4.652294452373798,
1647
+ "step": 1260
1648
+ },
1649
+ {
1650
+ "epoch": 0.667016806722689,
1651
+ "grad_norm": 29.984315889657353,
1652
+ "kl": 0.0,
1653
+ "learning_rate": 1.2476191924353932e-07,
1654
+ "logps/chosen": -345.89991071428574,
1655
+ "logps/rejected": -282.63448275862066,
1656
+ "loss": 0.326,
1657
+ "rewards/chosen": 0.5500016566685267,
1658
+ "rewards/margins": 4.73898333770301,
1659
+ "rewards/rejected": -4.188981681034483,
1660
+ "step": 1270
1661
+ },
1662
+ {
1663
+ "epoch": 0.6722689075630253,
1664
+ "grad_norm": 61.179118361711794,
1665
+ "kl": 0.0,
1666
+ "learning_rate": 1.2120905326327596e-07,
1667
+ "logps/chosen": -298.07992788461536,
1668
+ "logps/rejected": -293.07983236754967,
1669
+ "loss": 0.3307,
1670
+ "rewards/chosen": 0.1393580013478296,
1671
+ "rewards/margins": 4.935787970175437,
1672
+ "rewards/rejected": -4.796429968827607,
1673
+ "step": 1280
1674
+ },
1675
+ {
1676
+ "epoch": 0.6775210084033614,
1677
+ "grad_norm": 35.426220175192356,
1678
+ "kl": 0.0,
1679
+ "learning_rate": 1.1769124965879091e-07,
1680
+ "logps/chosen": -309.7057636589404,
1681
+ "logps/rejected": -253.99468380177515,
1682
+ "loss": 0.3127,
1683
+ "rewards/chosen": 0.3561096696664166,
1684
+ "rewards/margins": 4.979999841056024,
1685
+ "rewards/rejected": -4.623890171389608,
1686
+ "step": 1290
1687
+ },
1688
+ {
1689
+ "epoch": 0.6827731092436975,
1690
+ "grad_norm": 43.346023169027426,
1691
+ "kl": 0.0,
1692
+ "learning_rate": 1.1420946612590837e-07,
1693
+ "logps/chosen": -270.4368489583333,
1694
+ "logps/rejected": -281.6362047697368,
1695
+ "loss": 0.3397,
1696
+ "rewards/chosen": 0.1588947886512393,
1697
+ "rewards/margins": 3.5548700019530486,
1698
+ "rewards/rejected": -3.395975213301809,
1699
+ "step": 1300
1700
+ },
1701
+ {
1702
+ "epoch": 0.6880252100840336,
1703
+ "grad_norm": 45.324757492893184,
1704
+ "kl": 0.0,
1705
+ "learning_rate": 1.1076465055425646e-07,
1706
+ "logps/chosen": -290.0300690406977,
1707
+ "logps/rejected": -258.5001319679054,
1708
+ "loss": 0.3082,
1709
+ "rewards/chosen": 0.7295433754144713,
1710
+ "rewards/margins": 4.570315849548762,
1711
+ "rewards/rejected": -3.8407724741342903,
1712
+ "step": 1310
1713
+ },
1714
+ {
1715
+ "epoch": 0.6932773109243697,
1716
+ "grad_norm": 263.89653772607954,
1717
+ "kl": 0.0,
1718
+ "learning_rate": 1.0735774076921128e-07,
1719
+ "logps/chosen": -287.7608642578125,
1720
+ "logps/rejected": -237.489306640625,
1721
+ "loss": 0.3446,
1722
+ "rewards/chosen": 0.27030837535858154,
1723
+ "rewards/margins": 3.555664134025574,
1724
+ "rewards/rejected": -3.2853557586669924,
1725
+ "step": 1320
1726
+ },
1727
+ {
1728
+ "epoch": 0.6985294117647058,
1729
+ "grad_norm": 61.837360014332575,
1730
+ "kl": 0.0,
1731
+ "learning_rate": 1.039896642765809e-07,
1732
+ "logps/chosen": -297.6917317708333,
1733
+ "logps/rejected": -314.11572265625,
1734
+ "loss": 0.2958,
1735
+ "rewards/chosen": 0.6960362328423394,
1736
+ "rewards/margins": 5.248802512583106,
1737
+ "rewards/rejected": -4.552766279740767,
1738
+ "step": 1330
1739
+ },
1740
+ {
1741
+ "epoch": 0.7037815126050421,
1742
+ "grad_norm": 51.294856185176336,
1743
+ "kl": 0.0,
1744
+ "learning_rate": 1.0066133801009871e-07,
1745
+ "logps/chosen": -328.30787295386904,
1746
+ "logps/rejected": -244.40604440789474,
1747
+ "loss": 0.3097,
1748
+ "rewards/chosen": 0.7595687593732562,
1749
+ "rewards/margins": 4.201100693609481,
1750
+ "rewards/rejected": -3.441531934236225,
1751
+ "step": 1340
1752
+ },
1753
+ {
1754
+ "epoch": 0.7090336134453782,
1755
+ "grad_norm": 43.77043748464739,
1756
+ "kl": 0.0,
1757
+ "learning_rate": 9.737366808179553e-08,
1758
+ "logps/chosen": -303.54956371753246,
1759
+ "logps/rejected": -250.67248682228916,
1760
+ "loss": 0.3096,
1761
+ "rewards/chosen": 0.9003205732865767,
1762
+ "rewards/margins": 4.412543414848136,
1763
+ "rewards/rejected": -3.5122228415615586,
1764
+ "step": 1350
1765
+ },
1766
+ {
1767
+ "epoch": 0.7142857142857143,
1768
+ "grad_norm": 72.17773141718298,
1769
+ "kl": 0.0,
1770
+ "learning_rate": 9.412754953531663e-08,
1771
+ "logps/chosen": -262.4382017215569,
1772
+ "logps/rejected": -292.5083997140523,
1773
+ "loss": 0.3377,
1774
+ "rewards/chosen": 0.434063574511134,
1775
+ "rewards/margins": 3.880319925358765,
1776
+ "rewards/rejected": -3.446256350847631,
1777
+ "step": 1360
1778
+ },
1779
+ {
1780
+ "epoch": 0.7195378151260504,
1781
+ "grad_norm": 61.94872379316782,
1782
+ "kl": 0.0,
1783
+ "learning_rate": 9.092386610225325e-08,
1784
+ "logps/chosen": -265.7097810444079,
1785
+ "logps/rejected": -279.9903273809524,
1786
+ "loss": 0.3299,
1787
+ "rewards/chosen": 0.8910425085770456,
1788
+ "rewards/margins": 3.7223031897293892,
1789
+ "rewards/rejected": -2.8312606811523438,
1790
+ "step": 1370
1791
+ },
1792
+ {
1793
+ "epoch": 0.7247899159663865,
1794
+ "grad_norm": 47.60678426366227,
1795
+ "kl": 0.0,
1796
+ "learning_rate": 8.776348996155317e-08,
1797
+ "logps/chosen": -290.0428740530303,
1798
+ "logps/rejected": -278.20430947580644,
1799
+ "loss": 0.3123,
1800
+ "rewards/chosen": 0.3940057927911932,
1801
+ "rewards/margins": 4.4418612930432095,
1802
+ "rewards/rejected": -4.047855500252016,
1803
+ "step": 1380
1804
+ },
1805
+ {
1806
+ "epoch": 0.7300420168067226,
1807
+ "grad_norm": 41.557754145037535,
1808
+ "kl": 0.0,
1809
+ "learning_rate": 8.464728150207636e-08,
1810
+ "logps/chosen": -295.76380699685535,
1811
+ "logps/rejected": -294.7458753881988,
1812
+ "loss": 0.3185,
1813
+ "rewards/chosen": 0.6997887953272406,
1814
+ "rewards/margins": 4.221460825000183,
1815
+ "rewards/rejected": -3.5216720296729425,
1816
+ "step": 1390
1817
+ },
1818
+ {
1819
+ "epoch": 0.7352941176470589,
1820
+ "grad_norm": 45.38199821794469,
1821
+ "kl": 0.0,
1822
+ "learning_rate": 8.15760890883607e-08,
1823
+ "logps/chosen": -275.74736265923565,
1824
+ "logps/rejected": -261.5596721625767,
1825
+ "loss": 0.3109,
1826
+ "rewards/chosen": 0.7350611109642466,
1827
+ "rewards/margins": 4.5443022829590705,
1828
+ "rewards/rejected": -3.809241171994824,
1829
+ "step": 1400
1830
+ },
1831
+ {
1832
+ "epoch": 0.740546218487395,
1833
+ "grad_norm": 22.290814745353508,
1834
+ "kl": 0.0,
1835
+ "learning_rate": 7.855074882966103e-08,
1836
+ "logps/chosen": -311.78690011160717,
1837
+ "logps/rejected": -279.9527652138158,
1838
+ "loss": 0.2985,
1839
+ "rewards/chosen": 0.7244267236618769,
1840
+ "rewards/margins": 5.052972545002337,
1841
+ "rewards/rejected": -4.328545821340461,
1842
+ "step": 1410
1843
+ },
1844
+ {
1845
+ "epoch": 0.7457983193277311,
1846
+ "grad_norm": 40.315164584155504,
1847
+ "kl": 0.0,
1848
+ "learning_rate": 7.557208435232449e-08,
1849
+ "logps/chosen": -274.712925295858,
1850
+ "logps/rejected": -278.3036268625828,
1851
+ "loss": 0.3147,
1852
+ "rewards/chosen": 1.077583267843935,
1853
+ "rewards/margins": 4.07225724310004,
1854
+ "rewards/rejected": -2.9946739752561053,
1855
+ "step": 1420
1856
+ },
1857
+ {
1858
+ "epoch": 0.7510504201680672,
1859
+ "grad_norm": 36.626625075926555,
1860
+ "kl": 0.0,
1861
+ "learning_rate": 7.264090657556443e-08,
1862
+ "logps/chosen": -292.9152413091716,
1863
+ "logps/rejected": -242.9820467715232,
1864
+ "loss": 0.3085,
1865
+ "rewards/chosen": 0.9502442489714312,
1866
+ "rewards/margins": 4.543455846136166,
1867
+ "rewards/rejected": -3.593211597164735,
1868
+ "step": 1430
1869
+ },
1870
+ {
1871
+ "epoch": 0.7563025210084033,
1872
+ "grad_norm": 43.54130112641965,
1873
+ "kl": 0.0,
1874
+ "learning_rate": 6.975801349069385e-08,
1875
+ "logps/chosen": -280.07919034090907,
1876
+ "logps/rejected": -276.84921875,
1877
+ "loss": 0.2987,
1878
+ "rewards/chosen": 0.817291444720644,
1879
+ "rewards/margins": 5.2739698721399995,
1880
+ "rewards/rejected": -4.456678427419355,
1881
+ "step": 1440
1882
+ },
1883
+ {
1884
+ "epoch": 0.7615546218487395,
1885
+ "grad_norm": 39.41222523190064,
1886
+ "kl": 0.0,
1887
+ "learning_rate": 6.692418994387799e-08,
1888
+ "logps/chosen": -288.40252246732024,
1889
+ "logps/rejected": -284.47291354790417,
1890
+ "loss": 0.2993,
1891
+ "rewards/chosen": 0.9826148538028493,
1892
+ "rewards/margins": 5.268814549138741,
1893
+ "rewards/rejected": -4.286199695335891,
1894
+ "step": 1450
1895
+ },
1896
+ {
1897
+ "epoch": 0.7668067226890757,
1898
+ "grad_norm": 42.76397567606349,
1899
+ "kl": 0.0,
1900
+ "learning_rate": 6.414020742246593e-08,
1901
+ "logps/chosen": -288.7798685213415,
1902
+ "logps/rejected": -293.82729867788464,
1903
+ "loss": 0.2617,
1904
+ "rewards/chosen": 0.9620797692275629,
1905
+ "rewards/margins": 5.808872528267026,
1906
+ "rewards/rejected": -4.846792759039463,
1907
+ "step": 1460
1908
+ },
1909
+ {
1910
+ "epoch": 0.7720588235294118,
1911
+ "grad_norm": 85.5452615105662,
1912
+ "kl": 0.0,
1913
+ "learning_rate": 6.140682384495902e-08,
1914
+ "logps/chosen": -311.87120301573424,
1915
+ "logps/rejected": -282.2307997881356,
1916
+ "loss": 0.2877,
1917
+ "rewards/chosen": 0.8217522681176246,
1918
+ "rewards/margins": 4.416604004412646,
1919
+ "rewards/rejected": -3.594851736295021,
1920
+ "step": 1470
1921
+ },
1922
+ {
1923
+ "epoch": 0.7773109243697479,
1924
+ "grad_norm": 47.14939912048749,
1925
+ "kl": 0.0,
1926
+ "learning_rate": 5.872478335467298e-08,
1927
+ "logps/chosen": -270.7433546112805,
1928
+ "logps/rejected": -274.62745392628204,
1929
+ "loss": 0.329,
1930
+ "rewards/chosen": -0.1099286428311976,
1931
+ "rewards/margins": 4.24396219187934,
1932
+ "rewards/rejected": -4.353890834710537,
1933
+ "step": 1480
1934
+ },
1935
+ {
1936
+ "epoch": 0.782563025210084,
1937
+ "grad_norm": 75.39266090840431,
1938
+ "kl": 0.0,
1939
+ "learning_rate": 5.60948161171505e-08,
1940
+ "logps/chosen": -298.3712257179054,
1941
+ "logps/rejected": -296.9724518531977,
1942
+ "loss": 0.3039,
1943
+ "rewards/chosen": 0.3439343168928817,
1944
+ "rewards/margins": 4.58285359718903,
1945
+ "rewards/rejected": -4.238919280296148,
1946
+ "step": 1490
1947
+ },
1948
+ {
1949
+ "epoch": 0.7878151260504201,
1950
+ "grad_norm": 101.47398109074602,
1951
+ "kl": 0.0,
1952
+ "learning_rate": 5.351763812137916e-08,
1953
+ "logps/chosen": -285.37012357026146,
1954
+ "logps/rejected": -305.706282747006,
1955
+ "loss": 0.3075,
1956
+ "rewards/chosen": -0.05964530371372996,
1957
+ "rewards/margins": 4.639130484553485,
1958
+ "rewards/rejected": -4.698775788267215,
1959
+ "step": 1500
1960
+ },
1961
+ {
1962
+ "epoch": 0.7930672268907563,
1963
+ "grad_norm": 51.64579106329186,
1964
+ "kl": 0.0,
1965
+ "learning_rate": 5.0993950984868836e-08,
1966
+ "logps/chosen": -273.9158336568323,
1967
+ "logps/rejected": -296.97481819968556,
1968
+ "loss": 0.3228,
1969
+ "rewards/chosen": -0.35097162471794935,
1970
+ "rewards/margins": 4.398951601500603,
1971
+ "rewards/rejected": -4.749923226218553,
1972
+ "step": 1510
1973
+ },
1974
+ {
1975
+ "epoch": 0.7983193277310925,
1976
+ "grad_norm": 164.1846468516791,
1977
+ "kl": 0.0,
1978
+ "learning_rate": 4.8524441762641284e-08,
1979
+ "logps/chosen": -323.8402122641509,
1980
+ "logps/rejected": -265.2092148680124,
1981
+ "loss": 0.3144,
1982
+ "rewards/chosen": 0.010938104593528892,
1983
+ "rewards/margins": 4.735114546994538,
1984
+ "rewards/rejected": -4.724176442401009,
1985
+ "step": 1520
1986
+ },
1987
+ {
1988
+ "epoch": 0.8035714285714286,
1989
+ "grad_norm": 57.97209020416802,
1990
+ "kl": 0.0,
1991
+ "learning_rate": 4.6109782760184956e-08,
1992
+ "logps/chosen": -283.25261976047904,
1993
+ "logps/rejected": -294.36603860294116,
1994
+ "loss": 0.3462,
1995
+ "rewards/chosen": 0.19619906305552956,
1996
+ "rewards/margins": 4.044921234215661,
1997
+ "rewards/rejected": -3.848722171160131,
1998
+ "step": 1530
1999
+ },
2000
+ {
2001
+ "epoch": 0.8088235294117647,
2002
+ "grad_norm": 41.10976267046813,
2003
+ "kl": 0.0,
2004
+ "learning_rate": 4.375063135042445e-08,
2005
+ "logps/chosen": -301.61559264053255,
2006
+ "logps/rejected": -321.35994412251654,
2007
+ "loss": 0.3079,
2008
+ "rewards/chosen": 0.2954674895698502,
2009
+ "rewards/margins": 5.432189877426845,
2010
+ "rewards/rejected": -5.136722387856995,
2011
+ "step": 1540
2012
+ },
2013
+ {
2014
+ "epoch": 0.8140756302521008,
2015
+ "grad_norm": 54.92221531592682,
2016
+ "kl": 0.0,
2017
+ "learning_rate": 4.144762979475575e-08,
2018
+ "logps/chosen": -306.94077797202794,
2019
+ "logps/rejected": -280.2052436440678,
2020
+ "loss": 0.315,
2021
+ "rewards/chosen": -0.0729095952494161,
2022
+ "rewards/margins": 4.834762834239461,
2023
+ "rewards/rejected": -4.907672429488877,
2024
+ "step": 1550
2025
+ },
2026
+ {
2027
+ "epoch": 0.819327731092437,
2028
+ "grad_norm": 48.43314600236666,
2029
+ "kl": 0.0,
2030
+ "learning_rate": 3.9201405068195385e-08,
2031
+ "logps/chosen": -312.2001696134868,
2032
+ "logps/rejected": -292.7070777529762,
2033
+ "loss": 0.3395,
2034
+ "rewards/chosen": 0.025735384539553995,
2035
+ "rewards/margins": 4.3085366947609085,
2036
+ "rewards/rejected": -4.2828013102213545,
2037
+ "step": 1560
2038
+ },
2039
+ {
2040
+ "epoch": 0.8245798319327731,
2041
+ "grad_norm": 82.79685734368942,
2042
+ "kl": 0.0,
2043
+ "learning_rate": 3.701256868869124e-08,
2044
+ "logps/chosen": -321.1130847091195,
2045
+ "logps/rejected": -254.03585986024845,
2046
+ "loss": 0.3356,
2047
+ "rewards/chosen": 0.22610101759808618,
2048
+ "rewards/margins": 4.19767455700026,
2049
+ "rewards/rejected": -3.971573539402174,
2050
+ "step": 1570
2051
+ },
2052
+ {
2053
+ "epoch": 0.8298319327731093,
2054
+ "grad_norm": 69.63465669746606,
2055
+ "kl": 0.0,
2056
+ "learning_rate": 3.488171655064107e-08,
2057
+ "logps/chosen": -289.7723396650327,
2058
+ "logps/rejected": -288.95059880239523,
2059
+ "loss": 0.3227,
2060
+ "rewards/chosen": -0.3906313079634523,
2061
+ "rewards/margins": 4.496159441592124,
2062
+ "rewards/rejected": -4.886790749555576,
2063
+ "step": 1580
2064
+ },
2065
+ {
2066
+ "epoch": 0.8350840336134454,
2067
+ "grad_norm": 66.42429414400927,
2068
+ "kl": 0.0,
2069
+ "learning_rate": 3.28094287626651e-08,
2070
+ "logps/chosen": -340.9739889705882,
2071
+ "logps/rejected": -299.48609375,
2072
+ "loss": 0.3308,
2073
+ "rewards/chosen": 0.048118759604061354,
2074
+ "rewards/margins": 4.789248316895728,
2075
+ "rewards/rejected": -4.741129557291667,
2076
+ "step": 1590
2077
+ },
2078
+ {
2079
+ "epoch": 0.8403361344537815,
2080
+ "grad_norm": 42.16586593578885,
2081
+ "kl": 0.0,
2082
+ "learning_rate": 3.079626948967534e-08,
2083
+ "logps/chosen": -312.65874787414964,
2084
+ "logps/rejected": -287.83103775289015,
2085
+ "loss": 0.3062,
2086
+ "rewards/chosen": -0.0730894049819635,
2087
+ "rewards/margins": 4.740564265312472,
2088
+ "rewards/rejected": -4.813653670294436,
2089
+ "step": 1600
2090
+ },
2091
+ {
2092
+ "epoch": 0.8455882352941176,
2093
+ "grad_norm": 42.50176028584879,
2094
+ "kl": 0.0,
2095
+ "learning_rate": 2.88427867992862e-08,
2096
+ "logps/chosen": -298.3803453947368,
2097
+ "logps/rejected": -256.55287856543623,
2098
+ "loss": 0.3216,
2099
+ "rewards/chosen": 0.292915611936335,
2100
+ "rewards/margins": 4.54555282224464,
2101
+ "rewards/rejected": -4.252637210308305,
2102
+ "step": 1610
2103
+ },
2104
+ {
2105
+ "epoch": 0.8508403361344538,
2106
+ "grad_norm": 83.97476475765178,
2107
+ "kl": 0.0,
2108
+ "learning_rate": 2.6949512512606965e-08,
2109
+ "logps/chosen": -293.518928433642,
2110
+ "logps/rejected": -296.5833910205696,
2111
+ "loss": 0.2953,
2112
+ "rewards/chosen": 0.5612210874204282,
2113
+ "rewards/margins": 5.312669320336765,
2114
+ "rewards/rejected": -4.751448232916337,
2115
+ "step": 1620
2116
+ },
2117
+ {
2118
+ "epoch": 0.8560924369747899,
2119
+ "grad_norm": 35.09955655518953,
2120
+ "kl": 0.0,
2121
+ "learning_rate": 2.5116962059457653e-08,
2122
+ "logps/chosen": -307.30205078125,
2123
+ "logps/rejected": -280.090478515625,
2124
+ "loss": 0.2881,
2125
+ "rewards/chosen": 0.4744985580444336,
2126
+ "rewards/margins": 5.155851554870606,
2127
+ "rewards/rejected": -4.681352996826172,
2128
+ "step": 1630
2129
+ },
2130
+ {
2131
+ "epoch": 0.8613445378151261,
2132
+ "grad_norm": 100.20739187851474,
2133
+ "kl": 0.0,
2134
+ "learning_rate": 2.334563433804687e-08,
2135
+ "logps/chosen": -319.6615634704969,
2136
+ "logps/rejected": -282.31458824685535,
2137
+ "loss": 0.3014,
2138
+ "rewards/chosen": 0.6534406768609278,
2139
+ "rewards/margins": 4.888445989606604,
2140
+ "rewards/rejected": -4.235005312745676,
2141
+ "step": 1640
2142
+ },
2143
+ {
2144
+ "epoch": 0.8665966386554622,
2145
+ "grad_norm": 88.81840072824282,
2146
+ "kl": 0.0,
2147
+ "learning_rate": 2.1636011579150793e-08,
2148
+ "logps/chosen": -295.85146484375,
2149
+ "logps/rejected": -290.18388671875,
2150
+ "loss": 0.2796,
2151
+ "rewards/chosen": 0.5988120079040528,
2152
+ "rewards/margins": 4.9494441032409675,
2153
+ "rewards/rejected": -4.350632095336914,
2154
+ "step": 1650
2155
+ },
2156
+ {
2157
+ "epoch": 0.8718487394957983,
2158
+ "grad_norm": 41.43708238784869,
2159
+ "kl": 0.0,
2160
+ "learning_rate": 1.998855921482906e-08,
2161
+ "logps/chosen": -276.827294921875,
2162
+ "logps/rejected": -240.9879638671875,
2163
+ "loss": 0.3411,
2164
+ "rewards/chosen": 0.4913971424102783,
2165
+ "rewards/margins": 4.203098344802856,
2166
+ "rewards/rejected": -3.711701202392578,
2167
+ "step": 1660
2168
+ },
2169
+ {
2170
+ "epoch": 0.8771008403361344,
2171
+ "grad_norm": 61.90087346557062,
2172
+ "kl": 0.0,
2173
+ "learning_rate": 1.8403725751714615e-08,
2174
+ "logps/chosen": -287.8762538580247,
2175
+ "logps/rejected": -314.7857001582278,
2176
+ "loss": 0.2888,
2177
+ "rewards/chosen": 0.6117756690508054,
2178
+ "rewards/margins": 5.216582303495031,
2179
+ "rewards/rejected": -4.604806634444225,
2180
+ "step": 1670
2181
+ },
2182
+ {
2183
+ "epoch": 0.8823529411764706,
2184
+ "grad_norm": 43.47883434595641,
2185
+ "kl": 0.0,
2186
+ "learning_rate": 1.6881942648911074e-08,
2187
+ "logps/chosen": -286.86601796407183,
2188
+ "logps/rejected": -269.488791870915,
2189
+ "loss": 0.3423,
2190
+ "rewards/chosen": 0.3007767500277765,
2191
+ "rewards/margins": 4.408972534852123,
2192
+ "rewards/rejected": -4.108195784824346,
2193
+ "step": 1680
2194
+ },
2195
+ {
2196
+ "epoch": 0.8876050420168067,
2197
+ "grad_norm": 43.77865447701488,
2198
+ "kl": 0.0,
2199
+ "learning_rate": 1.5423624200531115e-08,
2200
+ "logps/chosen": -300.1749822443182,
2201
+ "logps/rejected": -305.69072808159723,
2202
+ "loss": 0.3209,
2203
+ "rewards/chosen": 0.8302790901877664,
2204
+ "rewards/margins": 4.638904205476395,
2205
+ "rewards/rejected": -3.8086251152886286,
2206
+ "step": 1690
2207
+ },
2208
+ {
2209
+ "epoch": 0.8928571428571429,
2210
+ "grad_norm": 126.94155434263403,
2211
+ "kl": 0.0,
2212
+ "learning_rate": 1.4029167422908105e-08,
2213
+ "logps/chosen": -279.4980215097403,
2214
+ "logps/rejected": -270.3898013930723,
2215
+ "loss": 0.3016,
2216
+ "rewards/chosen": 1.0889219804243608,
2217
+ "rewards/margins": 5.310351644392734,
2218
+ "rewards/rejected": -4.221429663968373,
2219
+ "step": 1700
2220
+ },
2221
+ {
2222
+ "epoch": 0.898109243697479,
2223
+ "grad_norm": 61.01512892000863,
2224
+ "kl": 0.0,
2225
+ "learning_rate": 1.2698951946511327e-08,
2226
+ "logps/chosen": -302.3543386399371,
2227
+ "logps/rejected": -267.2125873447205,
2228
+ "loss": 0.3007,
2229
+ "rewards/chosen": 0.34754651147614485,
2230
+ "rewards/margins": 4.579919861631424,
2231
+ "rewards/rejected": -4.23237335015528,
2232
+ "step": 1710
2233
+ },
2234
+ {
2235
+ "epoch": 0.9033613445378151,
2236
+ "grad_norm": 41.136022643204946,
2237
+ "kl": 0.0,
2238
+ "learning_rate": 1.1433339912594265e-08,
2239
+ "logps/chosen": -308.92118158284023,
2240
+ "logps/rejected": -271.45762624172187,
2241
+ "loss": 0.3021,
2242
+ "rewards/chosen": 0.45501975336018397,
2243
+ "rewards/margins": 5.495058411653852,
2244
+ "rewards/rejected": -5.040038658293668,
2245
+ "step": 1720
2246
+ },
2247
+ {
2248
+ "epoch": 0.9086134453781513,
2249
+ "grad_norm": 110.83416055708793,
2250
+ "kl": 0.0,
2251
+ "learning_rate": 1.0232675874604608e-08,
2252
+ "logps/chosen": -312.7164007867133,
2253
+ "logps/rejected": -318.51147598870057,
2254
+ "loss": 0.2863,
2255
+ "rewards/chosen": 0.45250099022071677,
2256
+ "rewards/margins": 4.343536339576296,
2257
+ "rewards/rejected": -3.891035349355579,
2258
+ "step": 1730
2259
+ },
2260
+ {
2261
+ "epoch": 0.9138655462184874,
2262
+ "grad_norm": 26.122066134245365,
2263
+ "kl": 0.0,
2264
+ "learning_rate": 9.097286704381896e-09,
2265
+ "logps/chosen": -315.04382560483873,
2266
+ "logps/rejected": -272.66723484848484,
2267
+ "loss": 0.3122,
2268
+ "rewards/chosen": 0.7614468482232863,
2269
+ "rewards/margins": 4.353459750685407,
2270
+ "rewards/rejected": -3.592012902462121,
2271
+ "step": 1740
2272
+ },
2273
+ {
2274
+ "epoch": 0.9191176470588235,
2275
+ "grad_norm": 53.34556224286923,
2276
+ "kl": 0.0,
2277
+ "learning_rate": 8.02748150316937e-09,
2278
+ "logps/chosen": -283.3781151107595,
2279
+ "logps/rejected": -270.81105324074076,
2280
+ "loss": 0.3131,
2281
+ "rewards/chosen": 0.739828278746786,
2282
+ "rewards/margins": 4.5300670241653815,
2283
+ "rewards/rejected": -3.7902387454185957,
2284
+ "step": 1750
2285
+ },
2286
+ {
2287
+ "epoch": 0.9243697478991597,
2288
+ "grad_norm": 87.26559549436006,
2289
+ "kl": 0.0,
2290
+ "learning_rate": 7.023551517463089e-09,
2291
+ "logps/chosen": -271.55817018072287,
2292
+ "logps/rejected": -290.5874594155844,
2293
+ "loss": 0.3368,
2294
+ "rewards/chosen": 0.3112575117363987,
2295
+ "rewards/margins": 3.797579691753647,
2296
+ "rewards/rejected": -3.4863221800172486,
2297
+ "step": 1760
2298
+ },
2299
+ {
2300
+ "epoch": 0.9296218487394958,
2301
+ "grad_norm": 31.56919055846343,
2302
+ "kl": 0.0,
2303
+ "learning_rate": 6.085770059722634e-09,
2304
+ "logps/chosen": -272.63416371855345,
2305
+ "logps/rejected": -279.8696380046584,
2306
+ "loss": 0.2818,
2307
+ "rewards/chosen": 0.9944836118686124,
2308
+ "rewards/margins": 5.193192850999047,
2309
+ "rewards/rejected": -4.198709239130435,
2310
+ "step": 1770
2311
+ },
2312
+ {
2313
+ "epoch": 0.9348739495798319,
2314
+ "grad_norm": 37.83348753574193,
2315
+ "kl": 0.0,
2316
+ "learning_rate": 5.214392433963488e-09,
2317
+ "logps/chosen": -287.8191769622093,
2318
+ "logps/rejected": -277.8159311655405,
2319
+ "loss": 0.3277,
2320
+ "rewards/chosen": 0.5125378453454306,
2321
+ "rewards/margins": 3.781902073465152,
2322
+ "rewards/rejected": -3.2693642281197213,
2323
+ "step": 1780
2324
+ },
2325
+ {
2326
+ "epoch": 0.9401260504201681,
2327
+ "grad_norm": 45.4550208744577,
2328
+ "kl": 0.0,
2329
+ "learning_rate": 4.409655866252693e-09,
2330
+ "logps/chosen": -282.79800397398844,
2331
+ "logps/rejected": -302.68510841836735,
2332
+ "loss": 0.3304,
2333
+ "rewards/chosen": 0.5359358153591266,
2334
+ "rewards/margins": 4.274966925850198,
2335
+ "rewards/rejected": -3.7390311104910716,
2336
+ "step": 1790
2337
+ },
2338
+ {
2339
+ "epoch": 0.9453781512605042,
2340
+ "grad_norm": 47.92999315794789,
2341
+ "kl": 0.0,
2342
+ "learning_rate": 3.671779440125644e-09,
2343
+ "logps/chosen": -299.54941281847135,
2344
+ "logps/rejected": -283.9407831671779,
2345
+ "loss": 0.3024,
2346
+ "rewards/chosen": 0.9072784618207603,
2347
+ "rewards/margins": 4.687179322753467,
2348
+ "rewards/rejected": -3.779900860932707,
2349
+ "step": 1800
2350
+ },
2351
+ {
2352
+ "epoch": 0.9506302521008403,
2353
+ "grad_norm": 32.60296033727532,
2354
+ "kl": 0.0,
2355
+ "learning_rate": 3.000964036942305e-09,
2356
+ "logps/chosen": -278.7902070063694,
2357
+ "logps/rejected": -298.68158071319016,
2358
+ "loss": 0.2903,
2359
+ "rewards/chosen": 0.4591269523474821,
2360
+ "rewards/margins": 4.919044079336555,
2361
+ "rewards/rejected": -4.4599171269890725,
2362
+ "step": 1810
2363
+ },
2364
+ {
2365
+ "epoch": 0.9558823529411765,
2366
+ "grad_norm": 38.232447498754475,
2367
+ "kl": 0.0,
2368
+ "learning_rate": 2.397392281198729e-09,
2369
+ "logps/chosen": -302.90917338709676,
2370
+ "logps/rejected": -267.12088068181816,
2371
+ "loss": 0.301,
2372
+ "rewards/chosen": 0.69748047859438,
2373
+ "rewards/margins": 4.834925731908774,
2374
+ "rewards/rejected": -4.137445253314394,
2375
+ "step": 1820
2376
+ },
2377
+ {
2378
+ "epoch": 0.9611344537815126,
2379
+ "grad_norm": 22.326350506185886,
2380
+ "kl": 0.0,
2381
+ "learning_rate": 1.861228490808886e-09,
2382
+ "logps/chosen": -294.7616077769886,
2383
+ "logps/rejected": -289.75987413194446,
2384
+ "loss": 0.3229,
2385
+ "rewards/chosen": 0.6278800530867144,
2386
+ "rewards/margins": 4.9228116767575045,
2387
+ "rewards/rejected": -4.29493162367079,
2388
+ "step": 1830
2389
+ },
2390
+ {
2391
+ "epoch": 0.9663865546218487,
2392
+ "grad_norm": 40.257301749189914,
2393
+ "kl": 0.0,
2394
+ "learning_rate": 1.3926186323703903e-09,
2395
+ "logps/chosen": -274.1588462271341,
2396
+ "logps/rejected": -264.75538361378204,
2397
+ "loss": 0.3362,
2398
+ "rewards/chosen": 0.5637630369605088,
2399
+ "rewards/margins": 4.09795015822954,
2400
+ "rewards/rejected": -3.5341871212690306,
2401
+ "step": 1840
2402
+ },
2403
+ {
2404
+ "epoch": 0.9716386554621849,
2405
+ "grad_norm": 52.95763177365781,
2406
+ "kl": 0.0,
2407
+ "learning_rate": 9.916902814261774e-10,
2408
+ "logps/chosen": -288.03719758064517,
2409
+ "logps/rejected": -290.255634469697,
2410
+ "loss": 0.271,
2411
+ "rewards/chosen": 0.5408120432207661,
2412
+ "rewards/margins": 5.058847746937622,
2413
+ "rewards/rejected": -4.518035703716856,
2414
+ "step": 1850
2415
+ },
2416
+ {
2417
+ "epoch": 0.976890756302521,
2418
+ "grad_norm": 48.81020773750272,
2419
+ "kl": 0.0,
2420
+ "learning_rate": 6.585525877328968e-10,
2421
+ "logps/chosen": -290.9931640625,
2422
+ "logps/rejected": -323.75256631540697,
2423
+ "loss": 0.3325,
2424
+ "rewards/chosen": 0.10595728899981524,
2425
+ "rewards/margins": 4.301412795790331,
2426
+ "rewards/rejected": -4.195455506790516,
2427
+ "step": 1860
2428
+ },
2429
+ {
2430
+ "epoch": 0.9821428571428571,
2431
+ "grad_norm": 35.96392604697334,
2432
+ "kl": 0.0,
2433
+ "learning_rate": 3.9329624554584883e-10,
2434
+ "logps/chosen": -281.2289627259036,
2435
+ "logps/rejected": -280.72377232142856,
2436
+ "loss": 0.3147,
2437
+ "rewards/chosen": 0.46981898847832737,
2438
+ "rewards/margins": 4.369042811530275,
2439
+ "rewards/rejected": -3.899223823051948,
2440
+ "step": 1870
2441
+ },
2442
+ {
2443
+ "epoch": 0.9873949579831933,
2444
+ "grad_norm": 98.21290759884454,
2445
+ "kl": 0.0,
2446
+ "learning_rate": 1.959934689280962e-10,
2447
+ "logps/chosen": -292.91434151785717,
2448
+ "logps/rejected": -292.6137952302632,
2449
+ "loss": 0.3104,
2450
+ "rewards/chosen": 0.6014651343936012,
2451
+ "rewards/margins": 4.848261591784637,
2452
+ "rewards/rejected": -4.246796457391036,
2453
+ "step": 1880
2454
+ },
2455
+ {
2456
+ "epoch": 0.9926470588235294,
2457
+ "grad_norm": 33.92572486964675,
2458
+ "kl": 0.0,
2459
+ "learning_rate": 6.669797209069017e-11,
2460
+ "logps/chosen": -269.0030942766854,
2461
+ "logps/rejected": -273.2933263644366,
2462
+ "loss": 0.3204,
2463
+ "rewards/chosen": 0.08876853042773986,
2464
+ "rewards/margins": 4.26088122367708,
2465
+ "rewards/rejected": -4.17211269324934,
2466
+ "step": 1890
2467
+ },
2468
+ {
2469
+ "epoch": 0.9978991596638656,
2470
+ "grad_norm": 57.704679882719425,
2471
+ "kl": 0.0,
2472
+ "learning_rate": 5.444954769395771e-12,
2473
+ "logps/chosen": -258.240974378882,
2474
+ "logps/rejected": -283.904309158805,
2475
+ "loss": 0.3031,
2476
+ "rewards/chosen": 0.38151166453865004,
2477
+ "rewards/margins": 4.77392472590854,
2478
+ "rewards/rejected": -4.39241306136989,
2479
+ "step": 1900
2480
+ },
2481
+ {
2482
+ "epoch": 1.0,
2483
+ "step": 1904,
2484
+ "total_flos": 0.0,
2485
+ "train_loss": 0.3297611388589154,
2486
+ "train_runtime": 9235.4216,
2487
+ "train_samples_per_second": 6.596,
2488
+ "train_steps_per_second": 0.206
2489
+ }
2490
+ ],
2491
+ "logging_steps": 10,
2492
+ "max_steps": 1904,
2493
+ "num_input_tokens_seen": 0,
2494
+ "num_train_epochs": 1,
2495
+ "save_steps": 500,
2496
+ "stateful_callbacks": {
2497
+ "TrainerControl": {
2498
+ "args": {
2499
+ "should_epoch_stop": false,
2500
+ "should_evaluate": false,
2501
+ "should_log": false,
2502
+ "should_save": true,
2503
+ "should_training_stop": true
2504
+ },
2505
+ "attributes": {}
2506
+ }
2507
+ },
2508
+ "total_flos": 0.0,
2509
+ "train_batch_size": 8,
2510
+ "trial_name": null,
2511
+ "trial_params": null
2512
+ }