
Commit 391fc92

Merge branch 'main' into pr-1831-plus-fixes
2 parents: 201d06c + 45b399c · commit 391fc92

File tree

17 files changed: +360 −196 lines changed


src/llmcompressor/modifiers/autoround/base.py

Lines changed: 31 additions & 27 deletions

@@ -60,35 +60,39 @@ class AutoRoundModifier(Modifier, QuantizationMixin):
     This modifier leverages signed gradient descent (SignSGD) optimizer and
     block-wise loss to optimize rounding values and weight clipping in a few steps.
-        | Sample yaml:
-        | test_stage:
-        |    modifiers:
-        |        AutoRoundModifier:
-        |            iters: 200
-        |            config_groups:
-        |                group_0:
-        |                    targets:
-        |                        - "Linear"
-        |                    input_activations: null
-        |                    output_activations: null
-        |                    weights:
-        |                        num_bits: 4
-        |                        type: "int"
-        |                        symmetric: true
-        |                        strategy: group
-        |                        group_size: 128
+    Sample yaml:
+
+    ```yaml
+    test_stage:
+      modifiers:
+        AutoRoundModifier:
+          iters: 200
+          config_groups:
+            group_0:
+              targets:
+                - "Linear"
+              input_activations: null
+              output_activations: null
+              weights:
+                num_bits: 4
+                type: "int"
+                symmetric: true
+                strategy: group
+                group_size: 128
+    ```

     Lifecycle:
-        - on_initialize
-            - apply config to model
-        - on_start
-            - add input capture hooks to decoding layers
-        - on_sequential_epoch_end
-            - apply_autoround
-            - post_autoround_cleanup
-        - on_finalize
-            - remove_hooks()
-            - model.apply(freeze_module_quantization)
+
+    - on_initialize
+        - apply config to model
+    - on_start
+        - add input capture hooks to decoding layers
+    - on_sequential_epoch_end
+        - apply_autoround
+        - post_autoround_cleanup
+    - on_finalize
+        - remove_hooks()
+        - model.apply(freeze_module_quantization)

     :param config_groups: dictionary specifying quantization schemes to apply to target
         modules. Modules not matching a scheme target will NOT be quantized.
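For readers landing on this hunk, the fenced sample above is a complete recipe that can be handed to llm-compressor's one-shot entrypoint. A minimal sketch, assuming the public `oneshot` API; the model id and calibration dataset are placeholders, not anything referenced by this commit:

```python
from llmcompressor import oneshot

# Recipe string taken from the docstring sample above.
recipe = """
test_stage:
  modifiers:
    AutoRoundModifier:
      iters: 200
      config_groups:
        group_0:
          targets: ["Linear"]
          input_activations: null
          output_activations: null
          weights:
            num_bits: 4
            type: "int"
            symmetric: true
            strategy: group
            group_size: 128
"""

oneshot(
    model="Qwen/Qwen2.5-0.5B-Instruct",  # placeholder model id
    dataset="open_platypus",             # placeholder calibration dataset
    recipe=recipe,
    max_seq_length=2048,
    num_calibration_samples=128,
)
```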

src/llmcompressor/modifiers/awq/base.py

Lines changed: 20 additions & 20 deletions

@@ -58,7 +58,6 @@ class AWQModifier(Modifier, QuantizationMixin):
         balance_layers: ["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"]
       - smooth_layer: "re:.*final_layer_norm"
         balance_layers: ["re:.*fc1"]
-    ]
     ignore: ["lm_head"]
     config_groups:
       group_0:

@@ -75,25 +74,26 @@ class AWQModifier(Modifier, QuantizationMixin):
     ```

     Lifecycle:
-        - on_initialize
-            - resolve mappings
-            - capture kwargs needed for forward passes into modules
-        - on_start
-            - set up activation cache hooks to capture input activations
-              to balance layers
-        - on sequential epoch end
-            - apply smoothing to each smoothing layer
-            - consume cached activations across all batches
-            - clear cached activations as they are used
-            - find best smoothing scale for each smoothing layer
-            - apply to model weights
-            - raise error if any unused activations remain
-        - on_end
-            - re-run logic of sequential epoch end (in case of basic pipeline)
-            - set scales and zero points
-            - remove activation hooks
-        - on_finalize
-            - clear resolved mappings and captured activations
+
+    - on_initialize
+        - resolve mappings
+        - capture kwargs needed for forward passes into modules
+    - on_start
+        - set up activation cache hooks to capture input activations
+          to balance layers
+    - on sequential epoch end
+        - apply smoothing to each smoothing layer
+        - consume cached activations across all batches
+        - clear cached activations as they are used
+        - find best smoothing scale for each smoothing layer
+        - apply to model weights
+        - raise error if any unused activations remain
+    - on_end
+        - re-run logic of sequential epoch end (in case of basic pipeline)
+        - set scales and zero points
+        - remove activation hooks
+    - on_finalize
+        - clear resolved mappings and captured activations

     :param sequential_targets: list of module names to compress in
         the same calibration pass
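The same kind of configuration can be expressed programmatically. A hedged sketch, assuming `AWQModifier` accepts the mixin's `targets`/`scheme`/`ignore` keywords and that `oneshot` is the public entrypoint; the preset scheme name and the model/dataset ids are placeholders, not taken from this diff:

```python
from llmcompressor import oneshot
from llmcompressor.modifiers.awq.base import AWQModifier

# Illustrative: 4-bit weight-only AWQ on Linear layers, skipping lm_head.
# Smooth/balance mappings are left to the modifier's architecture defaults here.
recipe = AWQModifier(
    targets="Linear",
    scheme="W4A16",      # assumed preset shorthand
    ignore=["lm_head"],
)

oneshot(
    model="meta-llama/Llama-3.2-1B-Instruct",  # placeholder
    dataset="open_platypus",                   # placeholder
    recipe=recipe,
    num_calibration_samples=256,
)
```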

src/llmcompressor/modifiers/pruning/sparsegpt/base.py

Lines changed: 21 additions & 17 deletions

@@ -26,24 +26,28 @@ class SparseGPTModifier(SparsityModifierBase):
     """
     Modifier for applying the one-shot SparseGPT algorithm to a model

-        | Sample yaml:
-        | test_stage:
-        |    obcq_modifiers:
-        |        SparseGPTModifier:
-        |            sparsity: 0.5
-        |            mask_structure: "2:4"
-        |            dampening_frac: 0.001
-        |            block_size: 128
-        |            targets: ['Linear']
-        |            ignore: ['re:.*lm_head']
+    Sample yaml:
+
+    ```yaml
+    test_stage:
+      obcq_modifiers:
+        SparseGPTModifier:
+          sparsity: 0.5
+          mask_structure: "2:4"
+          dampening_frac: 0.001
+          block_size: 128
+          targets: ['Linear']
+          ignore: ['re:.*lm_head']
+    ```

     Lifecycle:
-        - on_initialize
-            - register_hook(module, calibrate_module, "forward")
-        - on_sequential_batch_end
-            - sparsify_weight
-        - on_finalize
-            - remove_hooks()
+
+    - on_initialize
+        - register_hook(module, calibrate_module, "forward")
+    - on_sequential_batch_end
+        - sparsify_weight
+    - on_finalize
+        - remove_hooks()

     :param sparsity: Sparsity to compress model to
     :param sparsity_profile: Can be set to 'owl' to use Outlier Weighed

@@ -92,7 +96,7 @@ def calibrate_module(

     :param module: module being calibrated
     :param args: inputs to the module, the first element of which is the
-        cannonical input
+        canonical input
     :param _output: uncompressed module output, unused
     """
     # Assume that the first argument is the input
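A Python equivalent of the sample above, sketched under the assumption that `SparseGPTModifier` exposes the same fields as the yaml and that `oneshot` is the public entrypoint; model and dataset ids are placeholders:

```python
from llmcompressor import oneshot
from llmcompressor.modifiers.pruning.sparsegpt.base import SparseGPTModifier

# Programmatic equivalent of the docstring's yaml sample: 2:4 semi-structured
# sparsity on every Linear layer except the lm_head.
recipe = SparseGPTModifier(
    sparsity=0.5,
    mask_structure="2:4",
    dampening_frac=0.001,
    block_size=128,
    targets=["Linear"],
    ignore=["re:.*lm_head"],
)

oneshot(
    model="meta-llama/Llama-3.2-1B-Instruct",  # placeholder
    dataset="open_platypus",                   # placeholder
    recipe=recipe,
    num_calibration_samples=512,
)
```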

src/llmcompressor/modifiers/pruning/wanda/base.py

Lines changed: 20 additions & 16 deletions

@@ -26,23 +26,27 @@ class WandaPruningModifier(SparsityModifierBase):
     Modifier for applying the one-shot WANDA algorithm to a model
     from the paper: https://arxiv.org/abs/2306.11695

-        | Sample yaml:
-        | test_stage:
-        |    sparsity_modifiers:
-        |        WandaPruningModifier:
-        |            sparsity: 0.5
-        |            mask_structure: "2:4"
+    Sample yaml:
+
+    ```yaml
+    test_stage:
+      sparsity_modifiers:
+        WandaPruningModifier:
+          sparsity: 0.5
+          mask_structure: "2:4"
+    ```

     Lifecycle:
-        - on_initialize
-            - register_hook(module, calibrate_module, "forward")
-        - run_sequential / run_basic
-            - make_empty_row_scalars
-            - accumulate_row_scalars
-        - on_sequential_batch_end
-            - sparsify_weight
-        - on_finalize
-            - remove_hooks()
+
+    - on_initialize
+        - register_hook(module, calibrate_module, "forward")
+    - run_sequential / run_basic
+        - make_empty_row_scalars
+        - accumulate_row_scalars
+    - on_sequential_batch_end
+        - sparsify_weight
+    - on_finalize
+        - remove_hooks()

     :param sparsity: Sparsity to compress model to
     :param sparsity_profile: Can be set to 'owl' to use Outlier Weighed

@@ -78,7 +82,7 @@ def calibrate_module(

     :param module: module being calibrated
     :param args: inputs to the module, the first element of which is the
-        cannonical input
+        canonical input
     :param _output: uncompressed module output, unused
     """
     # Assume that the first argument is the input
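To make the lifecycle above concrete, here is a sketch of the WANDA scoring-plus-2:4-masking step it describes, reconstructed from the linked paper rather than taken from this file; the function name is hypothetical:

```python
import torch

def wanda_2_4_prune_sketch(weight: torch.Tensor, act_sq_norms: torch.Tensor) -> torch.Tensor:
    """
    Illustrative sketch: score each weight by |W_ij| * ||X_j||_2, where act_sq_norms
    holds accumulated squared activation norms per input channel (the "row scalars"
    in the lifecycle above), then keep the top 2 of every 4 consecutive input
    channels (mask_structure "2:4"). Assumes in_features is divisible by 4.
    """
    scores = weight.abs() * act_sq_norms.sqrt()    # (out_features, in_features)
    grouped = scores.view(weight.shape[0], -1, 4)  # groups of 4 along the input dim
    keep = torch.zeros_like(grouped, dtype=torch.bool)
    keep.scatter_(-1, grouped.topk(2, dim=-1).indices, True)
    return weight * keep.view_as(weight)
```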

src/llmcompressor/modifiers/quantization/gptq/base.py

Lines changed: 38 additions & 34 deletions

@@ -36,40 +36,44 @@ class GPTQModifier(Modifier, QuantizationMixin):
     """
     Implements the GPTQ algorithm from https://arxiv.org/abs/2210.17323. This modifier
     uses activations to calibrate a hessian matrix, which is then used to determine
-    optimal quantizion values and orderings for the model weights.
-
-        | Sample yaml:
-        | test_stage:
-        |    obcq_modifiers:
-        |        GPTQModifier:
-        |            block_size: 128
-        |            dampening_frac: 0.001
-        |            offload_hessians: False
-        |            actorder: static
-        |            config_groups:
-        |                group_0:
-        |                    targets:
-        |                        - "Linear"
-        |                    input_activations: null
-        |                    output_activations: null
-        |                    weights:
-        |                        num_bits: 8
-        |                        type: "int"
-        |                        symmetric: true
-        |                        strategy: group
-        |                        group_size: 128
+    optimal quantization values and orderings for the model weights.
+
+    Sample yaml:
+
+    ```yaml
+    test_stage:
+      obcq_modifiers:
+        GPTQModifier:
+          block_size: 128
+          dampening_frac: 0.001
+          offload_hessians: False
+          actorder: static
+          config_groups:
+            group_0:
+              targets:
+                - "Linear"
+              input_activations: null
+              output_activations: null
+              weights:
+                num_bits: 8
+                type: "int"
+                symmetric: true
+                strategy: group
+                group_size: 128
+    ```

     Lifecycle:
-        - on_initialize
-            - apply config to model
-        - on_start
-            - add activation calibration hooks
-            - add gptq weight calibration hooks
-        - on_sequential_epoch_end
-            - quantize_weight
-        - on_finalize
-            - remove_hooks()
-            - model.apply(freeze_module_quantization)
+
+    - on_initialize
+        - apply config to model
+    - on_start
+        - add activation calibration hooks
+        - add gptq weight calibration hooks
+    - on_sequential_epoch_end
+        - quantize_weight
+    - on_finalize
+        - remove_hooks()
+        - model.apply(freeze_module_quantization)

     :param sequential_targets: list of layer names to compress during GPTQ, or
         '__ALL__' to compress every layer in the model

@@ -99,7 +103,7 @@ class GPTQModifier(Modifier, QuantizationMixin):
     the kv_cache_scheme gets converted into a QuantizationScheme that:
         - targets the `q_proj` and `k_proj` modules of the model. The outputs
           of those modules are the keys and values that might be cached
-        - quantizes the outputs of the aformentioned layers, so that
+        - quantizes the outputs of the aforementioned layers, so that
           keys and values are compressed before storing them in the cache
     There is an explicit assumption that the model contains modules with
     `k_proj` and `v_proj` in their names. If this is not the case

@@ -220,7 +224,7 @@ def calibrate_module(

     :param module: module being calibrated
     :param args: inputs to the module, the first element of which is the
-        cannonical input
+        canonical input
     :param _output: uncompressed module output, unused
     """
     # Assume that first argument is the input
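As a usage illustration, a GPTQ modifier with options like those in the sample can be built and applied as below. The preset `scheme` shorthand and the `oneshot` signature are assumptions about the library's public API; model and dataset ids are placeholders:

```python
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier

# Illustrative: GPTQ with static activation ordering and group-wise int weights,
# in the spirit of the docstring sample (which spells out an int8 group-128 scheme).
recipe = GPTQModifier(
    targets="Linear",
    scheme="W4A16",          # assumed preset shorthand
    ignore=["lm_head"],
    actorder="static",
    dampening_frac=0.001,
)

oneshot(
    model="meta-llama/Llama-3.2-1B-Instruct",  # placeholder
    dataset="open_platypus",                   # placeholder
    recipe=recipe,
    max_seq_length=2048,
    num_calibration_samples=512,
)
```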

src/llmcompressor/modifiers/quantization/gptq/gptq_quantize.py

Lines changed: 1 addition & 1 deletion

@@ -286,7 +286,7 @@ def _apply_activation_ordering(
     W: torch.Tensor, H: torch.Tensor
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     """
-    Permute weight and hessian in order of greatest outupt activations
+    Permute weight and hessian in order of greatest output activations

     :param W: weight to permute
     :param H: hessian used to determine activation ordering
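The corrected docstring refers to activation ordering; a minimal sketch of that idea follows, assuming the usual GPTQ act-order heuristic of sorting columns by the hessian diagonal. This is not the implementation in `gptq_quantize.py`, and the function name is hypothetical:

```python
import torch

def activation_ordering_sketch(W: torch.Tensor, H: torch.Tensor):
    # Columns with larger hessian diagonal entries correspond to inputs with larger
    # accumulated activations; quantizing them first tends to reduce GPTQ error.
    perm = torch.argsort(torch.diag(H), descending=True)
    W_perm = W[:, perm]          # permute weight columns
    H_perm = H[perm][:, perm]    # permute hessian rows and columns consistently
    return W_perm, H_perm, perm  # keep perm so the ordering can be inverted later
```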

src/llmcompressor/modifiers/quantization/quantization/base.py

Lines changed: 1 addition & 1 deletion

@@ -37,7 +37,7 @@ class QuantizationModifier(Modifier, QuantizationMixin):
     the kv_cache_scheme gets converted into a QuantizationScheme that:
         - targets the `q_proj` and `k_proj` modules of the model. The outputs
           of those modules are the keys and values that might be cached
-        - quantizes the outputs of the aformentioned layers, so that
+        - quantizes the outputs of the aforementioned layers, so that
          keys and values are compressed before storing them in the cache
     There is an explicit assumption that the model contains modules with
     `k_proj` and `v_proj` in their names. If this is not the case
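Since this docstring explains how `kv_cache_scheme` is consumed, a hedged configuration sketch follows. The scheme fields mirror the quantization args used in the samples above; the `oneshot` call and model id are assumptions, not part of this diff:

```python
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

# Illustrative: FP8 weights/activations plus an 8-bit float KV-cache scheme.
# Per the docstring above, the kv_cache_scheme becomes an output-activation scheme
# on the attention projections whose outputs feed the cache.
recipe = QuantizationModifier(
    targets="Linear",
    scheme="FP8",
    ignore=["lm_head"],
    kv_cache_scheme={
        "num_bits": 8,
        "type": "float",
        "strategy": "tensor",
        "dynamic": False,
        "symmetric": True,
    },
)

oneshot(
    model="meta-llama/Llama-3.2-1B-Instruct",  # placeholder
    recipe=recipe,
)
```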

src/llmcompressor/modifiers/quantization/quantization/mixin.py

Lines changed: 20 additions & 18 deletions

@@ -41,26 +41,28 @@

 class QuantizationMixin(HooksMixin):
     """
-    Mixin which enables a Modifier to act as a quantization config, attching observers,
+    Mixin which enables a Modifier to act as a quantization config, attaching observers,
     calibration hooks, and compression wrappers to modifiers

     Lifecycle:
-        - on_initialize: QuantizationMixin.initialize_quantization
-            - Attach schemes to modules
-            - Attach observers to modules
-            - Disable quantization until calibration starts/finishes
-        - on_start: QuantizationMixin.start_calibration
-            - Attach calibration hooks
-            - Apply calibration status
-            - Enable quantization during calibration
-        - on_end: QuantizationMixin.end_calibration
-            - Remove calibration hooks
-            - Apply freeze status
-            - Keep quantization enabled for future steps
-    NOTE: QuantizationMixin does not update scales and zero-points on its own,
-    as this is not desired for all Modifiers inheriting from it. Modifier must
-    explicitly call `update_weight_zp_scale`.
-    See QuantizationModifier.on_start method for example
+
+    - on_initialize: QuantizationMixin.initialize_quantization
+        - Attach schemes to modules
+        - Attach observers to modules
+        - Disable quantization until calibration starts/finishes
+    - on_start: QuantizationMixin.start_calibration
+        - Attach calibration hooks
+        - Apply calibration status
+        - Enable quantization during calibration
+    - on_end: QuantizationMixin.end_calibration
+        - Remove calibration hooks
+        - Apply freeze status
+        - Keep quantization enabled for future steps
+
+    NOTE: QuantizationMixin does not update scales and zero-points on its own,
+    as this is not desired for all Modifiers inheriting from it. Modifier must
+    explicitly call `update_weight_zp_scale`.
+    See QuantizationModifier.on_start method for example

     :param config_groups: dictionary specifying quantization schemes to apply to target
         modules. Modules not matching a scheme target will NOT be quantized.

@@ -83,7 +85,7 @@ class QuantizationMixin(HooksMixin):
     the kv_cache_scheme gets converted into a QuantizationScheme that:
         - targets the `q_proj` and `k_proj` modules of the model. The outputs
           of those modules are the keys and values that might be cached
-        - quantizes the outputs of the aformentioned layers, so that
+        - quantizes the outputs of the aforementioned layers, so that
          keys and values are compressed before storing them in the cache
     There is an explicit assumption that the model contains modules with
     `k_proj` and `v_proj` in their names. If this is not the case
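The NOTE kept in this hunk says inheriting modifiers must call `update_weight_zp_scale` themselves. A hypothetical sketch of what that can look like, with assumed import paths for `Modifier` and `update_weight_zp_scale` (only the mixin's path comes from this diff):

```python
# Hypothetical subclass illustrating the NOTE above; not code from this commit.
from llmcompressor.modifiers import Modifier
from llmcompressor.modifiers.quantization.calibration import update_weight_zp_scale
from llmcompressor.modifiers.quantization.quantization.mixin import QuantizationMixin


class ExampleQuantizingModifier(Modifier, QuantizationMixin):
    def on_start(self, state, event, **kwargs):
        # The mixin attaches observers and calibration hooks...
        QuantizationMixin.start_calibration(self, state.model)
        # ...but weight scales/zero-points must be updated explicitly by the modifier.
        for module in state.model.modules():
            update_weight_zp_scale(module)
```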
