Commit d77f7bf

address reviews
1 parent 332a7d5 commit d77f7bf

4 files changed: 37 additions, 19 deletions


guides/ipynb/writing_quantization_compatible_layers.ipynb

Lines changed: 11 additions & 5 deletions
@@ -140,10 +140,12 @@
 " )\n",
 " scale = ops.squeeze(scale, axis=0)\n",
 "\n",
+" kernel_shape = self._kernel.shape\n",
+"\n",
 " del self._kernel\n",
 "\n",
 " # Allocate INT8 variables. Discussed in the next section.\n",
-" self._int8_build(kernel_shape=self._kernel.shape)\n",
+" self._int8_build(kernel_shape)\n",
 "\n",
 " self._kernel.assign(quantized_kernel)\n",
 " self.scale.assign(scale)\n",
@@ -246,6 +248,10 @@
 "quantized variables allocated in `_int8_build(...)` and de-scales the output\n",
 "back to floating-point.\n",
 "\n",
+"The base `keras.Layer` class automatically dispatches to this method when the\n",
+"layer is quantized. Your regular call() method will be used for the\n",
+"full-precision forward pass.\n",
+"\n",
 "The INT8 path mirrors the float computation `y = x * w` but performs:\n",
 "\n",
 "1. Elementwise multiply using the quantized weight.\n",
@@ -320,7 +326,7 @@
 "\n",
 " del self._kernel\n",
 "\n",
-" self._int8_build(kernel_shape=kernel_shape)\n",
+" self._int8_build(kernel_shape)\n",
 "\n",
 " self._kernel.assign(quantized_kernel)\n",
 " self.scale.assign(scale)\n",
@@ -587,7 +593,7 @@
 "\n",
 " del self._kernel\n",
 "\n",
-" self._int8_build(kernel_shape=kernel_shape)\n",
+" self._int8_build(kernel_shape)\n",
 "\n",
 " self._kernel.assign(quantized_kernel)\n",
 " self.scale.assign(scale)\n",
@@ -717,8 +723,8 @@
 " - The axis you packed along (e.g., `_int4_pack_axis`).\n",
 " - The original (unpacked) length on that axis (e.g., `_original_input_dim` or\n",
 " `_original_length_along_pack_axis`).\n",
-"- In `call(...)`, compute with the quantized buffers and de-scale back to float\n",
-" at the end, wherever possible. This allows you to leverage optimized\n",
+"- In quantized call hooks, compute with the quantized buffers and de-scale back\n",
+" to float at the end, wherever possible. This allows you to leverage optimized\n",
 " low-precision kernels (e.g., cuBLAS INT8 GEMM).\n",
 "\n",
 "- INT4 specifics (packed nibbles)\n",

guides/md/writing_quantization_compatible_layers.md

Lines changed: 15 additions & 9 deletions
@@ -99,10 +99,12 @@ def quantize(self, mode, **kwargs):
         )
         scale = ops.squeeze(scale, axis=0)
 
+        kernel_shape = self._kernel.shape
+
         del self._kernel
 
         # Allocate INT8 variables. Discussed in the next section.
-        self._int8_build(kernel_shape=self._kernel.shape)
+        self._int8_build(kernel_shape)
 
         self._kernel.assign(quantized_kernel)
         self.scale.assign(scale)
@@ -172,6 +174,10 @@ The `_int8_call(...)` method implements a minimal INT8 forward path. It uses the
 quantized variables allocated in `_int8_build(...)` and de-scales the output
 back to floating-point.
 
+The base `keras.Layer` class automatically dispatches to this method when the
+layer is quantized. Your regular call() method will be used for the
+full-precision forward pass.
+
 The INT8 path mirrors the float computation `y = x * w` but performs:
 
 1. Elementwise multiply using the quantized weight.
@@ -227,7 +233,7 @@ class SimpleScale(Layer):
 
         del self._kernel
 
-        self._int8_build(kernel_shape=kernel_shape)
+        self._int8_build(kernel_shape)
 
         self._kernel.assign(quantized_kernel)
         self.scale.assign(scale)
@@ -288,8 +294,8 @@ print("SimpleScale INT8 sample:", y_int8[0].numpy())
 
 <div class="k-default-codeblock">
 ```
-SimpleScale FP32 sample: [-0.00756585 -0.0135909 -0.02137992 0.01047459]
-SimpleScale INT8 sample: [-0.00756123 -0.01362174 -0.02146736 0.01047461]
+SimpleScale FP32 sample: [ 0.00074363 -0.02807784 -0.0032404 -0.03456082]
+SimpleScale INT8 sample: [ 0.00074166 -0.0279077 -0.00322246 -0.03456089]
 ```
 </div>
 
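The refreshed samples show the FP32 and INT8 paths agreeing to roughly three decimal places. A quick hedged check of that agreement, assuming outputs named `y_fp32` and `y_int8` (names inferred from the printed labels; hypothetical if the guide uses others):

```python
import numpy as np

# Maximum elementwise deviation introduced by INT8 quantization.
max_err = np.max(np.abs(np.asarray(y_fp32) - np.asarray(y_int8)))
print(f"max |fp32 - int8| = {max_err:.2e}")  # roughly 1e-4 for these samples
```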
@@ -446,7 +452,7 @@ class SimpleScale(Layer):
 
         del self._kernel
 
-        self._int8_build(kernel_shape=kernel_shape)
+        self._int8_build(kernel_shape)
 
         self._kernel.assign(quantized_kernel)
         self.scale.assign(scale)
@@ -542,8 +548,8 @@ print("Loaded INT8 sample:", y_loaded[0].numpy())
 
 <div class="k-default-codeblock">
 ```
-SimpleScale INT8 sample: [ 0.02398201 -0.00298704 0.02251735 0.0029661 ]
-Loaded INT8 sample: [ 0.02398201 -0.00298704 0.02251735 0.0029661 ]
+SimpleScale INT8 sample: [-0.00047286 0.02926966 -0.00708966 0.03041461]
+Loaded INT8 sample: [-0.00047286 0.02926966 -0.00708966 0.03041461]
 
 /Users/jyotindersingh/miniconda3/envs/keras-io-env-3.12/lib/python3.12/site-packages/keras/src/models/model.py:472: UserWarning: Layer InputLayer does not have a `quantize` method implemented.
 warnings.warn(str(e))
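
The identical pre- and post-load samples come from a save/load round trip of the quantized model (the `simplescale_int8.keras` binary updated in this commit). A hedged sketch of that round trip, assuming a built Keras model `model` and input batch `x` as in the guide:

```python
import keras

model.quantize("int8")  # swaps eligible layers to INT8 variables in place
model.save("simplescale_int8.keras")

loaded = keras.saving.load_model("simplescale_int8.keras")
print("Loaded INT8 sample:", loaded(x)[0].numpy())  # matches pre-save output
```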
@@ -562,8 +568,8 @@ Here are concrete patterns you can reuse when making your own layers PTQ-friendly
 - The axis you packed along (e.g., `_int4_pack_axis`).
 - The original (unpacked) length on that axis (e.g., `_original_input_dim` or
   `_original_length_along_pack_axis`).
-- In `call(...)`, compute with the quantized buffers and de-scale back to float
-  at the end, wherever possible. This allows you to leverage optimized
+- In quantized call hooks, compute with the quantized buffers and de-scale back
+  to float at the end, wherever possible. This allows you to leverage optimized
   low-precision kernels (e.g., cuBLAS INT8 GEMM).
 
 - INT4 specifics (packed nibbles)

guides/writing_quantization_compatible_layers.py

Lines changed: 11 additions & 5 deletions
@@ -96,10 +96,12 @@ def quantize(self, mode, **kwargs):
         )
         scale = ops.squeeze(scale, axis=0)
 
+        kernel_shape = self._kernel.shape
+
         del self._kernel
 
         # Allocate INT8 variables. Discussed in the next section.
-        self._int8_build(kernel_shape=self._kernel.shape)
+        self._int8_build(kernel_shape)
 
         self._kernel.assign(quantized_kernel)
         self.scale.assign(scale)
@@ -171,6 +173,10 @@ def _int8_build(self, kernel_shape):
 quantized variables allocated in `_int8_build(...)` and de-scales the output
 back to floating-point.
 
+The base `keras.Layer` class automatically dispatches to this method when the
+layer is quantized. Your regular call() method will be used for the
+full-precision forward pass.
+
 The INT8 path mirrors the float computation `y = x * w` but performs:
 
 1. Elementwise multiply using the quantized weight.
@@ -223,7 +229,7 @@ def quantize(self, mode, **kwargs):
 
         del self._kernel
 
-        self._int8_build(kernel_shape=kernel_shape)
+        self._int8_build(kernel_shape)
 
         self._kernel.assign(quantized_kernel)
         self.scale.assign(scale)
@@ -429,7 +435,7 @@ def quantize(self, mode, **kwargs):
 
         del self._kernel
 
-        self._int8_build(kernel_shape=kernel_shape)
+        self._int8_build(kernel_shape)
 
         self._kernel.assign(quantized_kernel)
         self.scale.assign(scale)
@@ -532,8 +538,8 @@ def load_own_variables(self, store):
 - The axis you packed along (e.g., `_int4_pack_axis`).
 - The original (unpacked) length on that axis (e.g., `_original_input_dim` or
   `_original_length_along_pack_axis`).
-- In `call(...)`, compute with the quantized buffers and de-scale back to float
-  at the end, wherever possible. This allows you to leverage optimized
+- In quantized call hooks, compute with the quantized buffers and de-scale back
+  to float at the end, wherever possible. This allows you to leverage optimized
   low-precision kernels (e.g., cuBLAS INT8 GEMM).
 
 - INT4 specifics (packed nibbles)

simplescale_int8.keras

Binary file changed: -10.8 KB (contents not shown).
