Commit b710449

address reviews
1 parent bf4aa31 commit b710449

File tree: 6 files changed (+84, −34 lines)


guides/awq_quantization_in_keras.py

Lines changed: 8 additions & 5 deletions

@@ -51,9 +51,9 @@
 parameter) causal language model.

 """
+from datasets import load_dataset
 import keras
 from keras_hub.models import Gemma3CausalLM
-from datasets import load_dataset


 prompt = "Keras is a"
@@ -93,9 +93,12 @@
 # Calibration slice (use a larger/representative set in practice)
 texts = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1%]")["text"]

-calibration_dataset = [
-    s + "." for text in texts for s in map(str.strip, text.split(".")) if s
-]
+calibration_dataset = []
+for text in texts:
+    for s in text.split("."):
+        s = s.strip()
+        if s:
+            calibration_dataset.append(s + ".")

 awq_config = keras.quantizers.AWQConfig(
     dataset=calibration_dataset,
@@ -161,7 +164,7 @@
 | ------ | --- | ---- |
 | **Algorithm** | Grid search for activation-aware scales | Hessian-based second-order optimization |
 | **Quantization speed** | Faster (no Hessian computation) | Slower (requires Hessian estimation) |
-| **Bit-widths supported** | only 4-bit supported for now | 2/3/4/8-bit |
+| **Bit-widths supported** | 4-bit | 2/3/4/8-bit |
 | **Accuracy** | Competitive, especially on encoder models | Often slightly better on decoder LLMs |
 | **Memory during quantization** | Lower | Higher (Hessian storage) |
 | **Calibration sensitivity** | Less prone to overfitting | May overfit calibration set, affecting out-of-distribution performance |
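The change above swaps a nested comprehension for an explicit loop, which the reviewers apparently found easier to read. Pulled out as a standalone sketch (the helper name is illustrative; the guide builds the list inline):

```python
def build_calibration_dataset(texts):
    """Split raw text chunks into trimmed, period-terminated sentences."""
    calibration_dataset = []
    for text in texts:
        for s in text.split("."):
            s = s.strip()
            if s:  # drop empty fragments between consecutive periods
                calibration_dataset.append(s + ".")
    return calibration_dataset

print(build_calibration_dataset(["Keras is an API. It runs on JAX. ", ""]))
# ['Keras is an API.', 'It runs on JAX.']
```

Both the old comprehension and the loop produce the same list; the loop form simply makes the strip-and-filter steps explicit.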

guides/gptq_quantization_in_keras.py

Lines changed: 2 additions & 2 deletions

@@ -31,9 +31,9 @@
 parameter) causal language model.

 """
+from datasets import load_dataset
 import keras
 from keras_hub.models import Gemma3CausalLM
-from datasets import load_dataset


 prompt = "Keras is a"
@@ -140,7 +140,7 @@
 | ------ | ---- | --- |
 | **Algorithm** | Hessian-based second-order optimization | Grid search for activation-aware scales |
 | **Quantization speed** | Slower (requires Hessian estimation) | Faster (no Hessian computation) |
-| **Bit-widths supported** | 2/3/4/8-bit | Only 4-bit supported for now |
+| **Bit-widths supported** | 2/3/4/8-bit | 4-bit |
 | **Accuracy** | Often slightly better on decoder LLMs | Competitive, especially on encoder models |
 | **Memory during quantization** | Higher (Hessian storage) | Lower |
 | **Calibration sensitivity** | May overfit calibration set, affecting out-of-distribution performance | Less prone to overfitting |
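The GPTQ-vs-AWQ table this commit touches in both guides boils down to a simple decision rule. A minimal sketch (the helper and its arguments are hypothetical, distilled from the table rather than from any Keras API):

```python
def pick_quantization_method(bits, prioritize_speed=False):
    """Choose a weight-only method per the GPTQ-vs-AWQ comparison table."""
    if bits not in (2, 3, 4, 8):
        raise ValueError(f"unsupported bit-width: {bits}")
    if bits != 4:
        return "gptq"  # AWQ supports 4-bit only
    # At 4-bit, AWQ quantizes faster and uses less memory during
    # calibration; GPTQ is often slightly more accurate on decoder LLMs.
    return "awq" if prioritize_speed else "gptq"

print(pick_quantization_method(4, prioritize_speed=True))  # awq
print(pick_quantization_method(2))  # gptq
```

The rule also encodes the calibration-sensitivity row: if out-of-distribution robustness matters more than peak accuracy, pass `prioritize_speed=True` to land on AWQ.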

guides/ipynb/awq_quantization_in_keras.ipynb

Lines changed: 8 additions & 5 deletions

@@ -76,9 +76,9 @@
 },
 "outputs": [],
 "source": [
+"from datasets import load_dataset\n",
 "import keras\n",
 "from keras_hub.models import Gemma3CausalLM\n",
-"from datasets import load_dataset\n",
 "\n",
 "\n",
 "prompt = \"Keras is a\"\n",
@@ -132,9 +132,12 @@
 "# Calibration slice (use a larger/representative set in practice)\n",
 "texts = load_dataset(\"wikitext\", \"wikitext-2-raw-v1\", split=\"train[:1%]\")[\"text\"]\n",
 "\n",
-"calibration_dataset = [\n",
-"    s + \".\" for text in texts for s in map(str.strip, text.split(\".\")) if s\n",
-"]\n",
+"calibration_dataset = []\n",
+"for text in texts:\n",
+"    for s in text.split(\".\"):\n",
+"        s = s.strip()\n",
+"        if s:\n",
+"            calibration_dataset.append(s + \".\")\n",
 "\n",
 "awq_config = keras.quantizers.AWQConfig(\n",
 "    dataset=calibration_dataset,\n",
@@ -225,7 +228,7 @@
 "| ------ | --- | ---- |\n",
 "| **Algorithm** | Grid search for activation-aware scales | Hessian-based second-order optimization |\n",
 "| **Quantization speed** | Faster (no Hessian computation) | Slower (requires Hessian estimation) |\n",
-"| **Bit-widths supported** | only 4-bit supported for now | 2/3/4/8-bit |\n",
+"| **Bit-widths supported** | 4-bit | 2/3/4/8-bit |\n",
 "| **Accuracy** | Competitive, especially on encoder models | Often slightly better on decoder LLMs |\n",
 "| **Memory during quantization** | Lower | Higher (Hessian storage) |\n",
 "| **Calibration sensitivity** | Less prone to overfitting | May overfit calibration set, affecting out-of-distribution performance |\n",

guides/ipynb/gptq_quantization_in_keras.ipynb

Lines changed: 37 additions & 13 deletions

@@ -167,12 +167,12 @@
 "Dataset: WikiText-2.\n",
 "\n",
 "\n",
-"| Model (preset) | Perplexity Increase % (↓ better) | Disk Storage Reduction Δ % (↓ better) | VRAM Reduction Δ % (↓ better) | First-token Latency Δ % (↓ better) | Throughput Δ % (↑ better) |\n",
-"| ------------------------------------------- | -------------------------------: | ------------------------------------: | ----------------------------: | ---------------------------------: | ------------------------: |\n",
-"| GPT2 (gpt2_base_en_cnn_dailymail) | 1.0% | -50.1% | -41.1% | +0.7% | +20.1% |\n",
-"| OPT (opt_125m_en) | 10.0% | -49.8% | -47.0% | +6.7% | -15.7% |\n",
-"| Bloom (bloom_1.1b_multi) | 7.0% | -47.0% | -54.0% | +1.8% | -15.7% |\n",
-"| Gemma3 (gemma3_1b) | 3.0% | -51.5% | -51.8% | +39.5% | +5.7% |\n",
+"| Model (preset) | Perplexity Increase % (\u2193 better) | Disk Storage Reduction \u0394 % (\u2193 better) | VRAM Reduction \u0394 % (\u2193 better) | First-token Latency \u0394 % (\u2193 better) | Throughput \u0394 % (\u2191 better) |\n",
+"| --------------------------------- | -------------------------------: | ------------------------------------: | ----------------------------: | ---------------------------------: | ------------------------: |\n",
+"| GPT2 (gpt2_base_en_cnn_dailymail) | 1.0% | -50.1% \u2193 | -41.1% \u2193 | +0.7% \u2191 | +20.1% \u2191 |\n",
+"| OPT (opt_125m_en) | 10.0% | -49.8% \u2193 | -47.0% \u2193 | +6.7% \u2191 | -15.7% \u2193 |\n",
+"| Bloom (bloom_1.1b_multi) | 7.0% | -47.0% \u2193 | -54.0% \u2193 | +1.8% \u2191 | -15.7% \u2193 |\n",
+"| Gemma3 (gemma3_1b) | 3.0% | -51.5% \u2193 | -51.8% \u2193 | +39.5% \u2191 | +5.7% \u2191 |\n",
 "\n",
 "\n",
 "Detailed benchmarking numbers and scripts are available\n",
@@ -191,8 +191,37 @@
 },
 {
 "cell_type": "markdown",
-"source": "## GPTQ vs AWQ?\n\nBoth GPTQ and AWQ are weight-only quantization methods that require calibration\ndata. Here's how to choose between them:\n\n| Aspect | GPTQ | AWQ |\n| ------ | ---- | --- |\n| **Algorithm** | Hessian-based second-order optimization | Grid search for activation-aware scales |\n| **Quantization speed** | Slower (requires Hessian estimation) | Faster (no Hessian computation) |\n| **Bit-widths supported** | 2/3/4/8-bit | Only 4-bit supported for now |\n| **Accuracy** | Often slightly better on decoder LLMs | Competitive, especially on encoder models |\n| **Memory during quantization** | Higher (Hessian storage) | Lower |\n| **Calibration sensitivity** | May overfit calibration set, affecting out-of-distribution performance | Less prone to overfitting |\n\n**Choose GPTQ when:**\n\n* You need bit-widths other than 4 (e.g., 2-bit or 8-bit).\n* Maximum accuracy is critical and you can afford longer quantization time.\n* You're working with decoder-only LLMs where GPTQ may have a slight edge.\n\n**Choose AWQ when:**\n\n* You need faster quantization (AWQ is typically 2-3x faster than GPTQ).\n* Memory during quantization is constrained.\n* 4-bit is sufficient for your use case.\n* Your model will be used on diverse/out-of-distribution data (AWQ is less prone to overfitting on calibration data).",
-"metadata": {}
+"metadata": {
+"colab_type": "text"
+},
+"source": [
+"## GPTQ vs AWQ?\n",
+"\n",
+"Both GPTQ and AWQ are weight-only quantization methods that require calibration\n",
+"data. Here's how to choose between them:\n",
+"\n",
+"| Aspect | GPTQ | AWQ |\n",
+"| ------ | ---- | --- |\n",
+"| **Algorithm** | Hessian-based second-order optimization | Grid search for activation-aware scales |\n",
+"| **Quantization speed** | Slower (requires Hessian estimation) | Faster (no Hessian computation) |\n",
+"| **Bit-widths supported** | 2/3/4/8-bit | 4-bit |\n",
+"| **Accuracy** | Often slightly better on decoder LLMs | Competitive, especially on encoder models |\n",
+"| **Memory during quantization** | Higher (Hessian storage) | Lower |\n",
+"| **Calibration sensitivity** | May overfit calibration set, affecting out-of-distribution performance | Less prone to overfitting |\n",
+"\n",
+"**Choose GPTQ when:**\n",
+"\n",
+"* You need bit-widths other than 4 (e.g., 2-bit or 8-bit).\n",
+"* Maximum accuracy is critical and you can afford longer quantization time.\n",
+"* You're working with decoder-only LLMs where GPTQ may have a slight edge.\n",
+"\n",
+"**Choose AWQ when:**\n",
+"\n",
+"* You need faster quantization (AWQ is typically 2-3x faster than GPTQ).\n",
+"* Memory during quantization is constrained.\n",
+"* 4-bit is sufficient for your use case.\n",
+"* Your model will be used on diverse/out-of-distribution data (AWQ is less prone to overfitting on calibration data)."
+]
 },
 {
 "cell_type": "markdown",
@@ -207,11 +236,6 @@
 "* Use a representative calibration set; small slices are only for demos.\n",
 "* Start with W4 group_size=128; tune per model/task."
 ]
-},
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": []
 }
 ],
 "metadata": {

guides/md/awq_quantization_in_keras.md

Lines changed: 9 additions & 6 deletions

@@ -54,9 +54,9 @@ parameter) causal language model.


 ```python
+from datasets import load_dataset
 import keras
 from keras_hub.models import Gemma3CausalLM
-from datasets import load_dataset


 prompt = "Keras is a"
@@ -104,9 +104,12 @@ the model using the `.quantize(...)` API.
 # Calibration slice (use a larger/representative set in practice)
 texts = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1%]")["text"]

-calibration_dataset = [
-    s + "." for text in texts for s in map(str.strip, text.split(".")) if s
-]
+calibration_dataset = []
+for text in texts:
+    for s in text.split("."):
+        s = s.strip()
+        if s:
+            calibration_dataset.append(s + ".")

 awq_config = keras.quantizers.AWQConfig(
     dataset=calibration_dataset,
@@ -126,7 +129,7 @@ print(outputs)

 <div class="k-default-codeblock">
 ```
-26/26 ━━━━━━━━━━━━━━━━━━━━ 240s 9s/step
+26/26 ━━━━━━━━━━━━━━━━━━━━ 239s 9s/step

 Keras is a Python library for deep learning. It is a high-level interface to the TensorFlow library.

@@ -192,7 +195,7 @@ data. Here's how to choose between them:
 | ------ | --- | ---- |
 | **Algorithm** | Grid search for activation-aware scales | Hessian-based second-order optimization |
 | **Quantization speed** | Faster (no Hessian computation) | Slower (requires Hessian estimation) |
-| **Bit-widths supported** | only 4-bit supported for now | 2/3/4/8-bit |
+| **Bit-widths supported** | 4-bit | 2/3/4/8-bit |
 | **Accuracy** | Competitive, especially on encoder models | Often slightly better on decoder LLMs |
 | **Memory during quantization** | Lower | Higher (Hessian storage) |
 | **Calibration sensitivity** | Less prone to overfitting | May overfit calibration set, affecting out-of-distribution performance |

guides/md/gptq_quantization_in_keras.md

Lines changed: 20 additions & 3 deletions

@@ -34,9 +34,9 @@ parameter) causal language model.


 ```python
+from datasets import load_dataset
 import keras
 from keras_hub.models import Gemma3CausalLM
-from datasets import load_dataset


 prompt = "Keras is a"
@@ -101,6 +101,24 @@ print(outputs)

 <div class="k-default-codeblock">
 ```
+/home/jyotinder/anaconda3/envs/keras-io/lib/python3.12/site-packages/keras/src/models/model.py:547: UserWarning: Layer InputLayer does not have a `quantize` method implemented.
+  warnings.warn(str(e))
+/home/jyotinder/anaconda3/envs/keras-io/lib/python3.12/site-packages/keras/src/models/model.py:547: UserWarning: Layer RMSNormalization does not have a `quantize` method implemented.
+  warnings.warn(str(e))
+/home/jyotinder/anaconda3/envs/keras-io/lib/python3.12/site-packages/keras/src/models/model.py:547: UserWarning: Layer RotaryEmbedding does not have a `quantize` method implemented.
+  warnings.warn(str(e))
+/home/jyotinder/anaconda3/envs/keras-io/lib/python3.12/site-packages/keras/src/models/model.py:547: UserWarning: Layer Softmax does not have a `quantize` method implemented.
+  warnings.warn(str(e))
+/home/jyotinder/anaconda3/envs/keras-io/lib/python3.12/site-packages/keras/src/models/model.py:547: UserWarning: Layer Dropout does not have a `quantize` method implemented.
+  warnings.warn(str(e))
+
+/home/jyotinder/anaconda3/envs/keras-io/lib/python3.12/site-packages/keras/src/models/model.py:547: UserWarning: Invalid quantization mode. Expected one of ('int8', 'int4'). Received: quantization_mode=gptq
+  warnings.warn(str(e))
+
+I0000 00:00:1769055417.299190 54325 cuda_solvers.cc:175] Creating GpuSolver handles for stream 0x1f32b460
+
+26/26 ━━━━━━━━━━━━━━━━━━━━ 1235s 47s/step
+
 Keras is a Python library for deep learning. It is a high-level interface to the TensorFlow library.

 Keras is a great library
@@ -119,7 +137,6 @@ model.save_to_preset("gemma3_gptq_w4gs128_preset")
 model_from_preset = Gemma3CausalLM.from_preset("gemma3_gptq_w4gs128_preset")
 output = model_from_preset.generate(prompt, max_length=30)
 print(output)
-
 ```

 <div class="k-default-codeblock">
@@ -170,7 +187,7 @@ data. Here's how to choose between them:
 | ------ | ---- | --- |
 | **Algorithm** | Hessian-based second-order optimization | Grid search for activation-aware scales |
 | **Quantization speed** | Slower (requires Hessian estimation) | Faster (no Hessian computation) |
-| **Bit-widths supported** | 2/3/4/8-bit | Only 4-bit supported for now |
+| **Bit-widths supported** | 2/3/4/8-bit | 4-bit |
 | **Accuracy** | Often slightly better on decoder LLMs | Competitive, especially on encoder models |
 | **Memory during quantization** | Higher (Hessian storage) | Lower |
 | **Calibration sensitivity** | May overfit calibration set, affecting out-of-distribution performance | Less prone to overfitting |
