RaushanTurganbay committed
Commit: 57b4cf3
1 Parent(s): e2bd36a

Upload LlavaOnevisionForConditionalGeneration

Files changed (3):
  1. README.md +3 -3
  2. config.json +3 -7
  3. model.safetensors +1 -1
README.md CHANGED
@@ -2,15 +2,15 @@
 language:
 - en
 - zh
-pipeline_tag: image-text-to-text
-inference: false
-arxiv: 2408.03326
 license: apache-2.0
 tags:
 - vision
 - image-text-to-text
 datasets:
 - lmms-lab/LLaVA-OneVision-Data
+pipeline_tag: image-text-to-text
+inference: false
+arxiv: 2408.03326
 ---
 # LLaVA-Onevision Model Card

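The README diff only reorders the YAML front matter: pipeline_tag, inference and arxiv move below the datasets entry, with no change to the card text or the values themselves. If you want to read that metadata programmatically, here is a minimal sketch with huggingface_hub; the repo id below is a placeholder, since the repository name is not stated in this commit.

```python
from huggingface_hub import ModelCard

# Placeholder repo id -- substitute the repository this commit was pushed to.
card = ModelCard.load("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")

print(card.data.pipeline_tag)  # image-text-to-text
print(card.data.license)       # apache-2.0
print(card.data.datasets)      # ['lmms-lab/LLaVA-OneVision-Data']
```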
config.json CHANGED
@@ -1,7 +1,7 @@
 {
   "_name_or_path": "/raid/raushan/ov-500",
   "architectures": [
-    "LlavaNextForConditionalGeneration"
+    "LlavaOnevisionForConditionalGeneration"
   ],
   "ignore_index": -100,
   "image_grid_pinpoints": [
@@ -151,7 +151,7 @@
     ]
   ],
   "image_token_index": 151646,
-  "model_type": "llava_next",
+  "model_type": "llava_onevision",
   "projector_hidden_act": "gelu",
   "text_config": {
     "_name_or_path": "Qwen/Qwen2-0.5B-Instruct",
@@ -162,30 +162,26 @@
     "eos_token_id": 151645,
     "hidden_size": 896,
     "intermediate_size": 4864,
-    "max_position_embeddings": 32768,
     "max_window_layers": 24,
     "model_type": "qwen2",
     "num_attention_heads": 14,
     "num_hidden_layers": 24,
     "num_key_value_heads": 2,
     "rope_theta": 1000000.0,
-    "sliding_window": null,
     "tie_word_embeddings": true,
     "torch_dtype": "bfloat16",
-    "use_sliding_window": false,
     "vocab_size": 152000
   },
   "tie_word_embeddings": false,
   "torch_dtype": "float16",
   "transformers_version": "4.45.0.dev0",
   "use_image_newline_parameter": true,
+  "video_token_index": 151647,
   "vision_aspect_ratio": "anyres_max_9",
   "vision_config": {
-    "hidden_act": "gelu_pytorch_tanh",
     "hidden_size": 1152,
     "image_size": 384,
     "intermediate_size": 4304,
-    "layer_norm_eps": 1e-06,
     "model_type": "siglip_vision_model",
     "num_attention_heads": 16,
     "num_hidden_layers": 26,
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:735c36a3a8630727bae46fb4ca44cf21296df29361393a341592e90bb6392b01
+oid sha256:07b3362c3412de79baf2379e44e5b0b2a8f4b965ebebd11d7b5b3eb4450fe96e
 size 1787445680
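model.safetensors is tracked with Git LFS, so the diff only rewrites the pointer file: the oid (SHA-256 of the actual weight file) changes while the size stays identical, i.e. the tensors were re-serialized under the new class but the total byte count is unchanged. To check a locally downloaded copy against the new pointer, a small sketch (the local path is an assumption):

```python
import hashlib

# Path to the downloaded weight file; adjust to wherever the file lives locally.
path = "model.safetensors"

h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)

# Should match the oid recorded in the pointer file after this commit.
print(h.hexdigest())  # expected: 07b3362c3412de79baf2379e44e5b0b2a8f4b965ebebd11d7b5b3eb4450fe96e
```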