Xilinx · Giuseppe5 · May 6, 2025 · Apr 10, 2025 · Apr 12, 2025 · Apr 13, 2025
diff --git a/src/brevitas/graph/gptq.py b/src/brevitas/graph/gptq.py
@@ -52,14 +52,16 @@ def __init__(
         self.blocksize = math.ceil(self.columns / num_blocks)
 
         # Initialize Hessian matrix and counter. We need it in float32 to compute the inverse
-        self.H = torch.zeros((self.groups, self.columns, self.columns),
-                             device='cpu',
-                             dtype=torch.float32,
-                             pin_memory=torch.cuda.is_available())
-        self.B = torch.zeros((self.groups, self.columns, self.columns),
-                             device='cpu',
-                             dtype=torch.float32,
-                             pin_memory=torch.cuda.is_available())
+        self.H = torch.zeros(
+            (self.groups, self.columns, self.columns),
+            device='cpu',
+            dtype=torch.float32,
+        )
+        self.B = torch.zeros(
+            (self.groups, self.columns, self.columns),
+            device='cpu',
+            dtype=torch.float32,
+        )
         self.nsamples = 0
 
         assert torch_version >= version.parse('1.10'), "GPTQ requires torch 1.10 or higher"

diff --git a/src/brevitas/graph/gpxq.py b/src/brevitas/graph/gpxq.py
@@ -242,13 +242,26 @@ def single_layer_update(self):
         pass
 
     def get_quant_weights(self, i, i1, permutation_list, with_quant_history=False):
+        from brevitas.quant_tensor import _unpack_quant_tensor
+
         # We need to recompute quant weights at runtime since our float weights are being updated
         # Add offset in case of blockwise computation
         i = i1 + i
+
         # For QuantLinear and for some QuantConvolutional layers, we exploit the possibility
         # of quantizing only a subset of the entire matrix speeding up the computation of GPxQ
+        no_slice = False
+        # Groupwise Quantization does not support slicing
+        no_slice = no_slice or self.layer.weight_quant.is_groupwise
+        # If we need quantization of past channels, we do not use slicing
+        no_slice = no_slice or with_quant_history
+        # If we are in export mode (i.e., inference mode), we do not slice for torch.compile
+        # compatibility
+        no_slice = no_slice or self.layer.weight_quant.export_mode
+
         if isinstance(self.layer, qnn.QuantLinear):
-            if self.layer.weight_quant.is_groupwise or with_quant_history:
+            if no_slice:
+
                 # No slicing, not optimized
                 q = self.layer.quant_weight(quant_input=self.quant_metadata)
                 q = _unpack_quant_tensor(q).unsqueeze(0)  # [1, OC, IC]
@@ -264,11 +277,11 @@ def get_quant_weights(self, i, i1, permutation_list, with_quant_history=False):
                         subtensor_slice_list=subtensor_slice_list,
                         quant_input=self.quant_metadata)).unsqueeze(0)  # [1, OC, 1]
         elif isinstance(self.layer, SUPPORTED_CONV_OP):
-            # For depthwise and ConvTranspose we fall back to quantizing the entire martix.
-            # For all other cases, we create a mask that represent the slicing we will perform on the weight matrix
-            # and we quantize only the selected dimensions.
-            if self.layer.weight_quant.is_groupwise or with_quant_history or self.groups > 1 or (
-                    self.groups == 1 and is_conv_transposed(self.layer)):
+            # Depthwise and ConvTranspose layers do not support slicing
+            no_slice_conv = no_slice or (self.groups > 1 or is_conv_transposed(self.layer))
+
+            if no_slice_conv:
+
                 quant_weight = self.layer.quant_weight(quant_input=self.quant_metadata)
                 quant_weight = _unpack_quant_tensor(quant_weight)
 

diff --git a/src/brevitas_examples/stable_diffusion/README.md b/src/brevitas_examples/stable_diffusion/README.md
@@ -97,7 +97,7 @@ usage: main.py [-h] [-m MODEL] [-d DEVICE] [-b BATCH_SIZE] [--prompt PROMPT]
                [--weight-quant-format WEIGHT_QUANT_FORMAT]
                [--input-quant-format INPUT_QUANT_FORMAT]
                [--weight-quant-granularity {per_channel,per_tensor,per_group}]
-               [--input-quant-granularity {per_tensor,per_group}]
+               [--input-quant-granularity {per_tensor,per_group,per_row}]
                [--input-scale-type {static,dynamic}]
                [--weight-group-size WEIGHT_GROUP_SIZE]
                [--input-group-size INPUT_GROUP_SIZE]
@@ -116,6 +116,8 @@ usage: main.py [-h] [-m MODEL] [-d DEVICE] [-b BATCH_SIZE] [--prompt PROMPT]
                [--inference-pipeline {samples,reference_images,mlperf}]
                [--caption-path CAPTION_PATH]
                [--reference-images-path REFERENCE_IMAGES_PATH]
+               [--few-shot-calibration [FEW_SHOT_CALIBRATION ...]]
+               [--calibration-batch-size CALIBRATION_BATCH_SIZE]
                [--quantize-weight-zero-point | --no-quantize-weight-zero-point]
                [--exclude-blacklist-act-eq | --no-exclude-blacklist-act-eq]
                [--quantize-input-zero-point | --no-quantize-input-zero-point]
@@ -251,7 +253,7 @@ options:
   --weight-quant-granularity {per_channel,per_tensor,per_group}
                         Granularity for scales/zero-point of weights. Default:
                         per_channel.
-  --input-quant-granularity {per_tensor,per_group}
+  --input-quant-granularity {per_tensor,per_group,per_row}
                         Granularity for scales/zero-point of inputs. Default:
                         per_tensor.
   --input-scale-type {static,dynamic}
@@ -307,6 +309,11 @@ options:
                         Inference pipeline for evaluation. Default: None
   --reference-images-path REFERENCE_IMAGES_PATH
                         Inference pipeline for evaluation. Default: None
+  --few-shot-calibration [FEW_SHOT_CALIBRATION ...]
+                        What timesteps to use for few-shot-calibration.
+                        Default: []
+  --calibration-batch-size CALIBRATION_BATCH_SIZE
+                        Batch size for few-shot-calibration. Default: 1
   --quantize-weight-zero-point
                         Enable Quantize weight zero-point. Default: Enabled
   --no-quantize-weight-zero-point