nm-testing
/

open_llama_3b_instruct_v_0.2-pruned50-quant-ds

Text Generation

text-generation-inference

Model card Files Files and versions Community

mwitiderrick committed on Dec 8, 2023

Commit

7bcf2bb

•

1 Parent(s): 1f82687

Create recipe.yaml

Files changed (1) hide show

recipe.yaml +38 -0

recipe.yaml ADDED Viewed

	@@ -0,0 +1,38 @@

+test_stage:  # sole top-level key of this recipe
+  obcq_modifiers:  # holds the SmoothQuant, Quantization and SparseGPT settings
+    SmoothQuantModifier:  # mappings entry: [[projection patterns], norm pattern]
+      smoothing_strength: 0.8
+      mappings: [
+        [["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"], "re:.*input_layernorm"],
+        [["re:.*gate_proj", "re:.*up_proj"], "re:.*post_attention_layernorm"]
+      ]
+    QuantizationModifier:
+      ignore:  # names excluded from quantization, grouped by reason below
+      # Operations that do not make sense to quantize
+      - LlamaRotaryEmbedding
+      - LlamaRMSNorm
+      - SiLUActivation
+      # Skip quantizing the BMMs (the QuantizableMatMul entry)
+      - QuantizableMatMul
+      # Skip the six mlp.down_proj layers with the most sensitive activations
+      - model.layers.5.mlp.down_proj
+      - model.layers.24.mlp.down_proj
+      - model.layers.23.mlp.down_proj
+      - model.layers.25.mlp.down_proj
+      - model.layers.2.mlp.down_proj
+      - model.layers.8.mlp.down_proj
+      post_oneshot_calibration: true
+      scheme_overrides:  # only the Embedding entry is overridden here
+        Embedding:
+          input_activations: null  # explicitly null for Embedding
+          weights:
+            num_bits: 8
+            symmetric: false  # i.e. asymmetric 8-bit weights
+    SparseGPTModifier:
+      sparsity: 0.5  # presumably the "pruned50" in the repo name; confirm
+      block_size: 128
+      sequential_update: true
+      quantize: true
+      percdamp: 0.01
+      mask_structure: "0:0"  # NOTE(review): presumably unstructured; confirm
+      targets: ["re:model.layers.\\d*$"]  # YAML "\\d" -> \d; \d* allows no digits