format

stamalakhov · stamalakhov · commit 428e5ee88319 · 2026-05-14T13:33:46.000+03:00
TICO-DCO-1.0-Signed-off-by: s.malakhov <s.malakhov@partner.samsung.com>
diff --git a/tico/quantization/algorithm/fpi_gptq/fpi_gptq.py b/tico/quantization/algorithm/fpi_gptq/fpi_gptq.py
@@ -32,7 +32,8 @@
 )
 
 from tico.quantization.algorithm.gptq.quant import quantize, Quantizer
-from tico.quantization.algorithm.fpi_gptq.util import quantize, iterate_GPTQ
+from tico.quantization.algorithm.fpi_gptq.util import iterate_GPTQ, quantize
+
 
 class FPI_GPTQ:
     def __init__(self, layer):
diff --git a/tico/quantization/algorithm/fpi_gptq/util.py b/tico/quantization/algorithm/fpi_gptq/util.py
@@ -20,6 +20,7 @@
 
 import torch
 
+
 def quantize(x, scale, zero, maxq):
     if maxq < 0:
         return (x > scale / 2).float() * scale + (x < zero / 2).float() * zero
@@ -49,7 +50,7 @@ def iterate_GPTQ(scale, zero, maxq, W, Hinv, max_num_of_iters=50):
 
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
-            
+
     cur_Q = quantize(cur_weights, scale, zero, maxq)
 
     return cur_Q, cur_weights
diff --git a/tico/quantization/algorithm/gptq/gptq.py b/tico/quantization/algorithm/gptq/gptq.py
@@ -360,9 +360,9 @@ def fasterquant(
         H = torch.cholesky_inverse(H)
         H = torch.linalg.cholesky(H, upper=True).float()
         Hinv = H
-        
+
         self.quantizer.update(W, Hinv, perm)
-    
+
         assert isinstance(Hinv, torch.Tensor)
         for i1 in range(0, self.columns, blocksize):
             i2 = min(i1 + blocksize, self.columns)
diff --git a/tico/quantization/algorithm/gptq/quant.py b/tico/quantization/algorithm/gptq/quant.py
@@ -23,6 +23,7 @@
 
 from tico.quantization.algorithm.fpi_gptq.util import iterate_GPTQ
 
+
 def quantize(x, scale, zero, maxq):
     if maxq < 0:
         return (x > scale / 2).float() * scale + (x < zero / 2).float() * zero
@@ -62,11 +63,11 @@ def configure(
 
     def _prepare_tensor(self, x, weight=False):
         """Prepare tensor for quantization by flattening according to per-channel setting.
-        
+
         Args:
             x: Input tensor to prepare
             weight: Whether the tensor is a weight (affects flattening for activations)
-            
+
         Returns:
             Tuple of (prepared tensor, original shape)
         """
@@ -88,10 +89,10 @@ def _prepare_tensor(self, x, weight=False):
 
     def _compute_scale_zero_bounds(self, x):
         """Compute scale and zero bounds from tensor values.
-        
+
         Args:
             x: Prepared tensor (flattened according to per-channel setting)
-            
+
         Returns:
             Tuple of (scale, zero, xmin, xmax) computed from tensor bounds
         """
@@ -123,17 +124,17 @@ def _compute_scale_zero_bounds(self, x):
 
     def _reshape_scale_zero(self, shape, weight=False):
         """Reshape scale and zero tensors according to the original tensor shape.
-        
+
         Args:
             shape: Original tensor shape before preparation
             weight: Whether the tensor is a weight (affects reshape for activations)
         """
         if weight:
             shape = [-1] + [1] * (len(shape) - 1)
-            self.scale = self.scale.reshape(shape)
-            self.zero = self.zero.reshape(shape)
+            self.scale = self.scale.reshape(shape)  # type: ignore[has-type]
+            self.zero = self.zero.reshape(shape)  # type: ignore[has-type]
             return
-            
+
         if len(shape) == 4:
             self.scale = self.scale.reshape((1, -1, 1, 1))
             self.zero = self.zero.reshape((1, -1, 1, 1))
@@ -146,7 +147,7 @@ def _reshape_scale_zero(self, shape, weight=False):
 
     def _expand_for_per_tensor(self, shape, weight=False):
         """Expand scale and zero for per-tensor quantization.
-        
+
         Args:
             shape: Original tensor shape before preparation
             weight: Whether the tensor is a weight
@@ -169,20 +170,24 @@ def find_params(self, x, weight=False):
 
         self.scale, self.zero, xmin, xmax = self._compute_scale_zero_bounds(x)
 
-        if self.mse is not None and self.mse != "smse_for_gptq" and self.mse != "mse_for_gptq":
+        if (
+            self.mse is not None
+            and self.mse != "smse_for_gptq"
+            and self.mse != "mse_for_gptq"
+        ):
             self._optimize_mse(x, xmin, xmax)
 
         self._expand_for_per_tensor(shape, weight)
         self._reshape_scale_zero(shape, weight)
 
     def _compute_shrink_params(self, p, xmin, xmax):
         """Compute scale and zero for a shrink factor p.
-        
+
         Args:
             p: Shrink factor (1 - i / grid)
             xmin: Minimum values per channel
             xmax: Maximum values per channel
-            
+
         Returns:
             Tuple of (scale1, zero1) for the given shrink factor
         """
@@ -194,13 +199,13 @@ def _compute_shrink_params(self, p, xmin, xmax):
 
     def _update_best_params(self, best, err, scale1, zero1):
         """Update best parameters if current error is lower.
-        
+
         Args:
             best: Current best error values
             err: Current iteration error values
             scale1: Current iteration scale values
             zero1: Current iteration zero values
-            
+
         Returns:
             Updated best error values
         """
@@ -213,7 +218,7 @@ def _update_best_params(self, best, err, scale1, zero1):
 
     def _grid_search(self, x, xmin, xmax, compute_error_fn):
         """Common grid search loop for MSE optimization.
-        
+
         Args:
             x: Prepared tensor
             xmin: Minimum values per channel
@@ -230,25 +235,28 @@ def _grid_search(self, x, xmin, xmax, compute_error_fn):
 
     def _optimize_mse(self, x, xmin, xmax):
         """Optimize scale and zero using MSE-based grid search.
-        
+
         Args:
             x: Prepared tensor
             xmin: Minimum values per channel
             xmax: Maximum values per channel
         """
+
         def compute_error(x, scale1, zero1):
             q = quantize(x, scale1.unsqueeze(1), zero1.unsqueeze(1), self.maxq)
             q -= x
             q.abs_()
             if self.mse == "smse":  # sensitivity weighted mse
                 # in case sensitivity is a second order derivatives of some global loss
                 # (q**2) * self.sensitivity is just a global loss change due to quantization.
-                q = (q**2) * self.sensitivity.to(q.device)  # estimate global target change
+                q = (q**2) * self.sensitivity.to(
+                    q.device
+                )  # estimate global target change
             else:
                 assert self.mse == "mse"
                 q.pow_(self.norm)
             return torch.sum(q, 1)
-        
+
         self._grid_search(x, xmin, xmax, compute_error)
 
     def update(self, x, Hinv, perm):
@@ -269,13 +277,13 @@ def update(self, x, Hinv, perm):
         self._optimize_mse_for_gptq(x, Hinv, sensitivity, xmin, xmax)
 
         self._reshape_scale_zero(shape, weight=True)
-        
+
         del sensitivity
         sensitivity = None
 
     def _optimize_mse_for_gptq(self, x, Hinv, sensitivity, xmin, xmax):
         """Optimize scale and zero using GPTQ-aware MSE grid search.
-        
+
         Args:
             x: Prepared tensor
             Hinv: Inverse Hessian matrix
@@ -284,7 +292,7 @@ def _optimize_mse_for_gptq(self, x, Hinv, sensitivity, xmin, xmax):
             xmax: Maximum values per channel
         """
         num_of_iters = 15
-        
+
         def compute_error(x, scale1, zero1):
             q, _ = iterate_GPTQ(
                 scale1.unsqueeze(1),
@@ -298,7 +306,7 @@ def compute_error(x, scale1, zero1):
             assert self.mse == "smse_for_gptq"
             err = ((q - x) ** 2) * sensitivity.to(q.device)
             return torch.sum(err, 1)
-        
+
         self._grid_search(x, xmin, xmax, compute_error)
 
     def quantize(self, x):
diff --git a/tico/quantization/algorithm/gptq/quantizer.py b/tico/quantization/algorithm/gptq/quantizer.py
@@ -478,7 +478,7 @@ def _hook(_, inp, out):
         model.lm_head = model.lm_head.to(old_device)
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
-        
+
         device = next(layer.parameters()).device  # in case lm_head is located on cpu
         for batch_idx in tqdm(
             range(batch_num),
diff --git a/tico/quantization/algorithm/gptq/utils.py b/tico/quantization/algorithm/gptq/utils.py
@@ -223,7 +223,7 @@ def compute_sensitivity_info(self):
         if model.device.type != "cpu":
             torch.cuda.synchronize()
             torch.cuda.empty_cache()
-                
+
         model = model.to(dtype)
 
         return sensitivity
diff --git a/tico/quantization/wrapq/examples/quantize_full_qmodel_with_gptq.py b/tico/quantization/wrapq/examples/quantize_full_qmodel_with_gptq.py
@@ -370,7 +370,7 @@ def _print_sample(title, items):
         _print_sample("unused GPTQ entries", unused)
 
 
-def evaluate_ppl_of_model_on_dataset(model, dataset, device: str = "cuda"):
+def evaluate_ppl_of_model_on_dataset(model, dataset, device):
     if hasattr(model, "device") and model.device.type != device.type:
         if hasattr(model, "to"):
             model.to(device)
@@ -415,6 +415,7 @@ def evaluate_ppl_of_model_on_dataset(model, dataset, device: str = "cuda"):
     ppl = np.exp(torch.cat(nlls, dim=-1).mean().item())
     return ppl
 
+
 # -------------------------------------------------------------------------
 # Helper — clear gptq quantizers after injection
 # -------------------------------------------------------------------------
@@ -1349,12 +1350,11 @@ def main():
 
     calib_inputs = build_calibration_inputs(model, tokenizer, args, device)
     train_ppl_ioqdtype = evaluate_ppl_of_model_on_dataset(
-            model, calib_inputs, device=device
-        )
+        model, calib_inputs, device=device
+    )
     print("\n┌── Wikitext-2 train perplexity ─────────────")
     print(f"│ FP32 : {train_ppl_ioqdtype:8.2f}")
     print("└───────────────────────────────────────────")
-    
 
     model = apply_spinquant(model, args)
     model = apply_cle(model, args)
@@ -1363,14 +1363,14 @@ def main():
     q_m = quantize_using_PTQ(model, calib_inputs, args)
 
     evaluate(q_m, tokenizer, dataset_test, args)
-    
+
     train_ppl_ioqdtype = evaluate_ppl_of_model_on_dataset(
-            q_m, calib_inputs, device=device
-        )
+        q_m, calib_inputs, device=device
+    )
     print("\n┌── Wikitext-2 train perplexity ─────────────")
     print(f"│ int16 : {train_ppl_ioqdtype:8.2f}")
     print("└───────────────────────────────────────────")
-    
+
     save_requested_artifacts(q_m, tokenizer, calib_inputs, args)
 
 

Original file line number	Diff line number	Diff line change
`@@ -32,7 +32,8 @@`
`32`	`32`	`)`
`33`	`33`
`34`	`34`	`from tico.quantization.algorithm.gptq.quant import quantize, Quantizer`
`35`		`-from tico.quantization.algorithm.fpi_gptq.util import quantize, iterate_GPTQ`
	`35`	`+from tico.quantization.algorithm.fpi_gptq.util import iterate_GPTQ, quantize`
	`36`	`+`
`36`	`37`
`37`	`38`	`class FPI_GPTQ:`
`38`	`39`	`def __init__(self, layer):`