activation ordering #2316

Open · wants to merge 15 commits into main
2 changes: 2 additions & 0 deletions src/sparseml/modifiers/quantization/gptq/base.py
@@ -50,6 +50,7 @@ class GPTQModifier(Modifier):
- LayerCompressor.revert_layer_wrappers()


:param actorder: Whether to apply activation reordering, quantizing weight columns in order of decreasing activation Hessian diagonal
:param sequential_update: Whether or not to update weights sequentially by layer,
True saves on GPU memory
:param targets: list of layer names to compress during GPTQ, or '__ALL__'
@@ -80,6 +81,7 @@ class GPTQModifier(Modifier):
and activation 8 bit quantization on the Linear layers.
"""

actorder: bool = False
sequential_update: Optional[bool] = False
targets: Union[str, List[str], None] = None
block_size: int = 128
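
For quick reference, a minimal sketch of enabling the new flag when constructing the modifier directly in Python; the import path is an assumption based on the file layout in this diff, and the other arguments just echo the fields shown above:

from sparseml.modifiers.quantization.gptq.base import GPTQModifier

# enable activation reordering alongside the existing GPTQ options
modifier = GPTQModifier(
    actorder=True,           # flag added by this PR
    sequential_update=False,
    block_size=128,
    targets="__ALL__",
)
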
2 changes: 1 addition & 1 deletion src/sparseml/modifiers/quantization/gptq/pytorch.py
@@ -156,7 +156,7 @@ def apply_compression(
layer_compressor.pre_compress()
_LOGGER.info(f"Calibrating {layer_compressor.name}...")
run_calibration_forward(self.model, dataloader, mask_padding=True)
layer_compressor.compress()
layer_compressor.compress(self.actorder)
layer_compressor.post_compress()
layer_compressor.revert_layer_wrappers()
torch.cuda.empty_cache()
11 changes: 11 additions & 0 deletions src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py
@@ -81,6 +81,7 @@ def add_batch(self, inp: torch.Tensor, out: torch.Tensor):

def fasterprune(
self,
actorder: bool = False,
blocksize: int = 128,
percdamp: float = 0.01,
):
@@ -109,6 +110,12 @@ def fasterprune(
self.H[dead, dead] = 1
W[:, dead] = 0

if actorder:
perm = torch.argsort(torch.diag(self.H), descending=True)
W = W[:, perm]
self.H = self.H[perm][:, perm]
invperm = torch.argsort(perm)

Losses = torch.zeros(self.rows, device=self.dev)

damp = percdamp * torch.mean(torch.diag(self.H))
@@ -153,6 +160,7 @@ for i in range(count):
for i in range(count):
w = W1[:, i]
d = Hinv1[i, i]

q = w.clone()
if sparsity >= SPARSITY_THRESHOLD:
q[mask1[:, i]] = 0
@@ -227,6 +235,9 @@ def fasterprune(
_LOGGER.info("time %.2f" % (time.time() - tick))
_LOGGER.info("error %.2f" % torch.sum(Losses).item())

if actorder:
W = W[:, invperm]

if isinstance(self.layer, transformers.Conv1D):
W = W.t()
W = W.reshape(final_shape).to(final_dtype)
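
The bookkeeping above is the core of the feature: columns are permuted by decreasing Hessian diagonal before quantization and restored afterwards. A small standalone sketch of just that permutation logic, with stand-in tensors rather than the wrapper's real buffers:

import torch

torch.manual_seed(0)
rows, cols = 4, 6
W = torch.randn(rows, cols)            # stand-in weight matrix (rows x columns)
X = torch.randn(32, cols)              # stand-in calibration activations
H = X.t() @ X                          # Hessian proxy: larger diagonal = more salient column

perm = torch.argsort(torch.diag(H), descending=True)  # most salient columns first
W_perm = W[:, perm]
H_perm = H[perm][:, perm]
invperm = torch.argsort(perm)

# ... quantize W_perm column by column against H_perm, as fasterprune does ...

# undo the permutation so the stored weight keeps its original column order
assert torch.equal(W_perm[:, invperm], W)
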
4 changes: 2 additions & 2 deletions src/sparseml/modifiers/utils/layer_compressor.py
@@ -131,7 +131,7 @@ def revert_layer_wrappers(self):
module_wrapper.free()
self.modules = None

def compress(self):
def compress(self, actorder: bool = False):
"""
Apply compression to each wrapped submodule in the layer
"""
@@ -141,7 +141,7 @@ def prune(module):
if isinstance(module, self.module_compressor_class):
full_name = self._get_full_submodule_name(module.name)
_LOGGER.info(f"Compressing {full_name}...")
module.fasterprune(**self.args)
module.fasterprune(actorder=actorder, **self.args)

self.layer.apply(prune)

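
One note on the compress() change: passing actorder explicitly next to **self.args only works if "actorder" never appears in self.args; otherwise Python raises a duplicate-keyword TypeError. A tiny illustration with hypothetical stand-in values:

def fasterprune(actorder: bool = False, blocksize: int = 128, percdamp: float = 0.01):
    return actorder, blocksize, percdamp

args = {"blocksize": 64, "percdamp": 0.1}
print(fasterprune(actorder=True, **args))   # (True, 64, 0.1)

# fasterprune(actorder=True, **{"actorder": False})
# TypeError: fasterprune() got multiple values for keyword argument 'actorder'
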