Commit 6fbebea

Last touches
1 parent be753ec commit 6fbebea

2 files changed (+36, -24 lines)

src/brevitas/graph/gpxq.py

Lines changed: 17 additions & 7 deletions
@@ -252,10 +252,20 @@ def get_quant_weights(self, i, i1, permutation_list, with_quant_history=False):
         # We need to recompute quant weights at runtime since our float weights are being updated
         # Add offset in case of blockwise computation
         i = i1 + i
+
         # For QuantLinear and for some QuantConvolutional layers, we exploit the possibility
         # of quantizing only a subset of the entire matrix, speeding up the computation of GPxQ
+        no_slice = False
+        # Groupwise quantization does not support slicing
+        no_slice = no_slice or self.layer.weight_quant.is_groupwise
+        # If we need quantization of past channels, we do not use slicing
+        no_slice = no_slice or with_quant_history
+        # If we are in export mode (i.e., inference mode), we do not slice, for torch.compile
+        # compatibility
+        no_slice = no_slice or self.layer.weight_quant.export_mode
+
         if isinstance(self.layer, qnn.QuantLinear):
-            if True:  #self.layer.weight_quant.is_groupwise or with_quant_history:
+            if no_slice:
 
                 # No slicing, not optimized
                 q = self.layer.quant_weight(quant_input=self.quant_metadata)
@@ -272,12 +282,12 @@ def get_quant_weights(self, i, i1, permutation_list, with_quant_history=False):
                         subtensor_slice_list=subtensor_slice_list,
                         quant_input=self.quant_metadata)).unsqueeze(0)  # [1, OC, 1]
         elif isinstance(self.layer, SUPPORTED_CONV_OP):
-            # For depthwise and ConvTranspose we fall back to quantizing the entire matrix.
-            # For all other cases, we create a mask that represents the slicing we will perform on the weight matrix
-            # and we quantize only the selected dimensions.
-            if True:  #self.layer.weight_quant.is_groupwise or with_quant_history or self.groups > 1 or (
-            #         self.groups == 1 and
-            #         isinstance(self.layer, (qnn.QuantConvTranspose1d, qnn.QuantConvTranspose2d))):
+            # Depthwise and ConvTranspose layers do not support slicing
+            no_slice_conv = no_slice or (
+                self.groups > 1 or
+                isinstance(self.layer, (qnn.QuantConvTranspose1d, qnn.QuantConvTranspose2d)))
+
+            if no_slice_conv:
 
                 quant_weight = self.layer.quant_weight(quant_input=self.quant_metadata)
                 quant_weight = _unpack_quant_tensor(quant_weight)
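
For context, the change above folds every condition under which GPxQ must skip weight-matrix slicing into a single no_slice flag, which convolutional layers then extend with their own constraints. A minimal standalone sketch of that decision logic, with plain booleans standing in for the real weight_quant and layer attributes:

# Illustrative sketch only: `is_groupwise`, `export_mode`, `with_quant_history`,
# `groups`, and `is_conv_transpose` stand in for the corresponding attributes
# of the Brevitas layer and its weight quantizer.

def should_skip_slicing(is_groupwise: bool,
                        with_quant_history: bool,
                        export_mode: bool) -> bool:
    """Slicing must be skipped whenever any blocking condition holds."""
    no_slice = False
    # Groupwise quantization does not support slicing
    no_slice = no_slice or is_groupwise
    # Quantizing past channels needs the full weight matrix
    no_slice = no_slice or with_quant_history
    # Export (inference) mode avoids slicing for torch.compile compatibility
    no_slice = no_slice or export_mode
    return no_slice


def should_skip_slicing_conv(no_slice: bool,
                             groups: int,
                             is_conv_transpose: bool) -> bool:
    """Convolutions add two more blockers: grouped/depthwise and transposed
    convolutions always quantize the full weight tensor."""
    return no_slice or groups > 1 or is_conv_transpose

Expressed this way, each per-layer branch reduces to a single boolean test, which is exactly what replaces the hard-coded if True: guards in the diff.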

src/brevitas_examples/stable_diffusion/main.py

Lines changed: 19 additions & 17 deletions
@@ -301,13 +301,14 @@ def main(args):
         subfolder='float')

     if len(args.few_shot_calibration) > 0:
+        args.few_shot_calibration = list(map(int, args.few_shot_calibration))
         pipe.set_progress_bar_config(disable=True)
-        new_calib_set = []
+        few_shot_calibration_prompts = []
         counter = [0]

         def calib_hook(module, inp, inp_kwargs):
             if counter[0] in args.few_shot_calibration:
-                new_calib_set.append((inp, inp_kwargs))
+                few_shot_calibration_prompts.append((inp, inp_kwargs))
             counter[0] += 1
             if counter[0] == args.calibration_steps:
                 counter[0] = 0
@@ -329,6 +330,8 @@ def calib_hook(module, inp, inp_kwargs):
             is_unet=is_unet,
             batch=args.calibration_batch_size)
         h.remove()
+    else:
+        few_shot_calibration_prompts = calibration_prompts

     # Detect Stable Diffusion XL pipeline
     is_sd_xl = isinstance(pipe, StableDiffusionXLPipeline)
@@ -358,15 +361,18 @@ def calib_hook(module, inp, inp_kwargs):
         if hasattr(m, 'lora_layer') and m.lora_layer is not None:
             raise RuntimeError("LoRA layers should be fused in before calling into quantization.")

-    def calibration_step(calibration_prompts, force_full_evaluation=False):
-        if len(args.few_shot_calibration) > 0 or not force_full_evaluation:
-            for i, (inp_args, inp_kwargs) in enumerate(new_calib_set):
+    def calibration_step(force_full_calibration=False, num_prompts=None):
+        if len(args.few_shot_calibration) > 0 or not force_full_calibration:
+            for i, (inp_args, inp_kwargs) in enumerate(few_shot_calibration_prompts):
                 denoising_network(*inp_args, **inp_kwargs)
+                if num_prompts is not None and i == num_prompts:
+                    break
         else:
+            prompts_subset = calibration_prompts[:num_prompts] if num_prompts is not None else calibration_prompts
             run_val_inference(
                 pipe,
                 args.resolution,
-                calibration_prompts,
+                prompts_subset,
                 test_seeds,
                 args.device,
                 dtype,
@@ -389,12 +395,11 @@ def calibration_step(calibration_prompts, force_full_evaluation=False):
     for m in denoising_network.modules():
         if isinstance(m, KwargsForwardHook) and hasattr(m.module, 'in_features'):
             m.in_features = m.module.in_features
-
-    if args.dry_run or args.load_checkpoint is not None:
-        calibration_prompts = [calibration_prompts[0]]
+    act_eq_num_prompts = 1 if args.dry_run or args.load_checkpoint else len(
+        calibration_prompts)

     # SmoothQuant seems to make better use of all the timesteps
-    calibration_step(calibration_prompts, force_full_evaluation=True)
+    calibration_step(force_full_calibration=True, num_prompts=act_eq_num_prompts)

     # Workaround to expose `in_features` attribute from the EqualizedModule Wrapper
     for m in denoising_network.modules():
@@ -601,7 +606,7 @@ def sdpa_zp_stats_type():
     pipe.set_progress_bar_config(disable=True)

     with torch.no_grad():
-        calibration_step([calibration_prompts[0]])
+        calibration_step(num_prompts=1)

     if args.load_checkpoint is not None:
         with load_quant_model_mode(denoising_network):
@@ -616,7 +621,7 @@ def sdpa_zp_stats_type():
     if needs_calibration:
         print("Applying activation calibration")
         with torch.no_grad(), calibration_mode(denoising_network):
-            calibration_step(calibration_prompts)
+            calibration_step()

     if args.svd_quant:
         print("Apply SVDQuant...")
@@ -634,24 +639,21 @@ def sdpa_zp_stats_type():
             m.compile_quant()
     if args.gptq:
         print("Applying GPTQ. It can take several hours")
-        gptq_subset = calibration_prompts[:128]
         with torch.no_grad(), quant_inference_mode(denoising_network, compile=True):
-            calibration_step([gptq_subset[0]])
             with gptq_mode(denoising_network,
                            create_weight_orig=False,
                            use_quant_activations=True,
                            return_forward_output=False,
                            act_order=True) as gptq:
                 for _ in tqdm(range(gptq.num_layers)):
-                    calibration_step(gptq_subset)
+                    calibration_step(num_prompts=128)
                 gptq.update()

     if args.bias_correction:
         print("Applying bias correction")
         with torch.no_grad(), quant_inference_mode(denoising_network, compile=True):
-            calibration_step([calibration_prompts[0]])
             with bias_correction_mode(denoising_network):
-                calibration_step(calibration_prompts)
+                calibration_step()

     if args.vae_fp16_fix and is_sd_xl:
         vae_fix_scale = 128
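
Taken together, these hunks change calibration_step from receiving a prompt list to receiving an optional num_prompts cap, so call sites such as the GPTQ loop no longer slice calibration_prompts themselves. Below is a simplified, self-contained sketch of the new control flow; few_shot_prompts, full_prompts, run_model, and make_calibration_step are hypothetical stand-ins for the closure state and inference calls inside main():

# Hypothetical simplification of the refactored calibration_step; the real
# function closes over args, few_shot_calibration_prompts, calibration_prompts,
# denoising_network, and run_val_inference inside main().

def make_calibration_step(few_shot_prompts, full_prompts, run_model):

    def calibration_step(force_full_calibration=False, num_prompts=None):
        if len(few_shot_prompts) > 0 or not force_full_calibration:
            # Few-shot path: replay the captured (args, kwargs) pairs,
            # stopping once the loop index reaches num_prompts.
            for i, (inp_args, inp_kwargs) in enumerate(few_shot_prompts):
                run_model(*inp_args, **inp_kwargs)
                if num_prompts is not None and i == num_prompts:
                    break
        else:
            # Full path: run inference over a prefix of the prompt list.
            subset = full_prompts[:num_prompts] if num_prompts is not None else full_prompts
            for prompt in subset:
                run_model(prompt)

    return calibration_step

Under this refactor, the old warm-up call calibration_step([calibration_prompts[0]]) becomes calibration_step(num_prompts=1), and the GPTQ loop's calibration_step(gptq_subset) becomes calibration_step(num_prompts=128).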
