14 changes: 13 additions & 1 deletion examples/awq/README.md
@@ -6,14 +6,26 @@ The AWQ implementation found in LLM Compressor is derived from the pioneering work

## AWQ Recipe ##

The AWQ recipe has been interfaced as follows, where the `AWQModifier` adjusts model scales ahead of efficient weight quantization by the `QuantizationModifier`.
AWQ is a **pre-quantization transform** — it computes and applies smoothing scales to model weights, but does not produce final quantized weights on its own. A downstream quantizer (`QuantizationModifier` or `GPTQModifier`) must follow AWQ in the recipe to finalize quantization.

```python
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.modifiers.quantization import QuantizationModifier

recipe = [
AWQModifier(ignore=["lm_head"], scheme="W4A16_ASYM", targets=["Linear"]),
QuantizationModifier(ignore=["lm_head"], scheme="W4A16_ASYM", targets=["Linear"]),
]
```

The `scheme` on `AWQModifier` tells AWQ how the downstream quantizer will quantize, so that the grid search optimizes for the correct quantization format. It must match the downstream quantizer's scheme.

AWQ can also be stacked with other transforms and quantizers:
```python
recipe = [AWQModifier(...), GPTQModifier(...)]
recipe = [AWQModifier(...), QuantizationModifier(...)]
```
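Since both modifiers must share the same `scheme`, `ignore`, and `targets`, the pairing can be factored into a small helper. A minimal sketch, using plain dicts as stand-ins for the modifier objects (`make_paired_recipe` is a hypothetical name, not part of the llmcompressor API):

```python
def make_paired_recipe(scheme, ignore, targets, **awq_kwargs):
    """Build an AWQ + quantizer recipe pair whose downstream quantizer is
    guaranteed by construction to use the same scheme/ignore/targets as
    the AWQ pass (dicts stand in for the real modifier objects)."""
    shared = {"scheme": scheme, "ignore": list(ignore), "targets": list(targets)}
    return [
        {"modifier": "AWQModifier", **shared, **awq_kwargs},
        {"modifier": "QuantizationModifier", **shared},
    ]

recipe = make_paired_recipe("W4A16_ASYM", ignore=["lm_head"], targets=["Linear"])
```

Because the shared arguments are written once, the two modifiers cannot drift out of sync, which is the failure mode the scheme-matching requirement above guards against.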

## Compressing Your Own Model ##
To use your own model, start with an existing example and change the `model_id` to match your own model stub.
10 changes: 7 additions & 3 deletions examples/awq/fp8_block_llama_example.py
@@ -4,6 +4,7 @@

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.modifiers.quantization import QuantizationModifier

# Select model and load it.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
@@ -49,10 +50,13 @@ def tokenize(sample):


# Configure the quantization algorithm to run.
# AWQModifier applies smoothing, then QuantizationModifier finalizes quantization.
_ignore = ["lm_head"]
_scheme = "FP8_BLOCK"
_targets = ["Linear"]
recipe = [
AWQModifier(
ignore=["lm_head"], scheme="FP8_BLOCK", targets=["Linear"], duo_scaling="both"
),
AWQModifier(ignore=_ignore, scheme=_scheme, targets=_targets, duo_scaling="both"),
QuantizationModifier(ignore=_ignore, scheme=_scheme, targets=_targets),
]

**Collaborator:** Rather than requiring the user to set this explicitly, and to retain backward compatibility, I propose we append this when the recipe is parsed: if a user provides AWQ without a follow-on modifier that quantizes, we should append one with the same `ignore`, `scheme`, `targets`, and `config_groups`.
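The appending behavior the collaborator proposes could look roughly like this. A sketch only, under the assumption that modifiers are represented as dicts with a `type` key; `normalize_recipe` and `QUANTIZING_TYPES` are hypothetical names, and this is not current llmcompressor behavior:

```python
QUANTIZING_TYPES = {"QuantizationModifier", "GPTQModifier"}
SHARED_KEYS = ("ignore", "scheme", "targets", "config_groups")

def normalize_recipe(recipe):
    """If the recipe contains an AWQModifier but no quantizing modifier,
    append a QuantizationModifier mirroring AWQ's shared arguments."""
    has_quantizer = any(m["type"] in QUANTIZING_TYPES for m in recipe)
    awq = next((m for m in recipe if m["type"] == "AWQModifier"), None)
    if awq is not None and not has_quantizer:
        # Copy only the keys the collaborator lists, and only if AWQ set them.
        shared = {k: awq[k] for k in SHARED_KEYS if k in awq}
        return recipe + [{"type": "QuantizationModifier", **shared}]
    return recipe
```

This keeps existing two-modifier recipes untouched while making bare-AWQ recipes backward compatible, as the comment suggests.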
**Contributor** (comment on lines 57 to 60, medium): To improve maintainability and avoid duplicating parameters between `AWQModifier` and `QuantizationModifier`, consider defining the shared arguments in variables. This makes it easier to keep them in sync.

```python
_ignore = ["lm_head"]
_scheme = "FP8_BLOCK"
_targets = ["Linear"]
recipe = [
    AWQModifier(
        ignore=_ignore, scheme=_scheme, targets=_targets, duo_scaling="both"
    ),
    QuantizationModifier(ignore=_ignore, scheme=_scheme, targets=_targets),
]
```


# Apply algorithms.
10 changes: 7 additions & 3 deletions examples/awq/fp8_dynamic_llama_example.py
@@ -4,6 +4,7 @@

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.modifiers.quantization import QuantizationModifier

# Select model and load it.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
@@ -49,10 +50,13 @@ def tokenize(sample):


# Configure the quantization algorithm to run.
# AWQModifier applies smoothing, then QuantizationModifier finalizes quantization.
_ignore = ["lm_head"]
_scheme = "FP8_DYNAMIC"
_targets = ["Linear"]
recipe = [
AWQModifier(
ignore=["lm_head"], scheme="FP8_DYNAMIC", targets=["Linear"], duo_scaling="both"
),
AWQModifier(ignore=_ignore, scheme=_scheme, targets=_targets, duo_scaling="both"),
QuantizationModifier(ignore=_ignore, scheme=_scheme, targets=_targets),
]
**Contributor** (comment on lines 57 to 60, medium): To improve maintainability and avoid duplicating parameters between `AWQModifier` and `QuantizationModifier`, consider defining the shared arguments in variables. This makes it easier to keep them in sync.

```python
_ignore = ["lm_head"]
_scheme = "FP8_DYNAMIC"
_targets = ["Linear"]
recipe = [
    AWQModifier(
        ignore=_ignore, scheme=_scheme, targets=_targets, duo_scaling="both"
    ),
    QuantizationModifier(ignore=_ignore, scheme=_scheme, targets=_targets),
]
```


# Apply algorithms.
10 changes: 7 additions & 3 deletions examples/awq/llama_example.py
@@ -4,6 +4,7 @@

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.modifiers.quantization import QuantizationModifier

# Select model and load it.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
@@ -49,10 +50,13 @@ def tokenize(sample):


# Configure the quantization algorithm to run.
# AWQModifier applies smoothing, then QuantizationModifier finalizes quantization.
_ignore = ["lm_head"]
_scheme = "W4A16_ASYM"
_targets = ["Linear"]
recipe = [
AWQModifier(
ignore=["lm_head"], scheme="W4A16_ASYM", targets=["Linear"], duo_scaling="both"
),
AWQModifier(ignore=_ignore, scheme=_scheme, targets=_targets, duo_scaling="both"),
QuantizationModifier(ignore=_ignore, scheme=_scheme, targets=_targets),
]
**Contributor** (comment on lines 57 to 60, medium): To improve maintainability and avoid duplicating parameters between `AWQModifier` and `QuantizationModifier`, consider defining the shared arguments in variables. This makes it easier to keep them in sync.

```python
_ignore = ["lm_head"]
_scheme = "W4A16_ASYM"
_targets = ["Linear"]
recipe = [
    AWQModifier(
        ignore=_ignore, scheme=_scheme, targets=_targets, duo_scaling="both"
    ),
    QuantizationModifier(ignore=_ignore, scheme=_scheme, targets=_targets),
]
```


# Apply algorithms.
70 changes: 40 additions & 30 deletions examples/awq/qwen3-vl-30b-a3b-Instruct-example.py
@@ -5,6 +5,7 @@

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.modifiers.quantization import QuantizationModifier

MODEL_ID = "Qwen/Qwen3-VL-30B-A3B-Instruct"

@@ -60,37 +61,46 @@ def data_collator(batch):
}


# Configure AWQ quantization with smoothing and balancing
# Configure AWQ smoothing + downstream quantization.
# NOTE: This recipe uses W4A16 quantization with group_size=32
# rather than the default preset with group_size=128
recipe = AWQModifier(
ignore=[
"re:.*embed_tokens",
"re:.*input_layernorm$",
"re:.*mlp[.]gate$",
"re:.*post_attention_layernorm$",
"re:.*norm$",
"re:model[.]visual.*",
"re:visual.*",
"lm_head",
],
duo_scaling=True,
config_groups={
"group_0": {
"targets": ["Linear"],
"weights": {
"num_bits": 4,
"type": "int",
"symmetric": True,
"group_size": 32,
"strategy": "group",
"dynamic": False,
"actorder": None,
"observer": "mse",
},
}
},
)
# rather than the default preset with group_size=128.
# AWQModifier applies smoothing, then QuantizationModifier finalizes quantization.
_ignore = [
"re:.*embed_tokens",
"re:.*input_layernorm$",
"re:.*mlp[.]gate$",
"re:.*post_attention_layernorm$",
"re:.*norm$",
"re:model[.]visual.*",
"re:visual.*",
"lm_head",
]
_config_groups = {
"group_0": {
"targets": ["Linear"],
"weights": {
"num_bits": 4,
"type": "int",
"symmetric": True,
"group_size": 32,
"strategy": "group",
"dynamic": False,
"actorder": None,
"observer": "mse",
},
}
}
recipe = [
AWQModifier(
ignore=_ignore,
duo_scaling=True,
config_groups=_config_groups,
),
QuantizationModifier(
ignore=_ignore,
config_groups=_config_groups,
),
]

# Apply AWQ quantization.
oneshot(
13 changes: 7 additions & 6 deletions examples/awq/qwen3_coder_moe_example.py
@@ -4,18 +4,19 @@

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.modifiers.quantization import QuantizationModifier

MODEL_ID = "Qwen/Qwen3-Coder-30B-A3B-Instruct"
SAVE_DIR = MODEL_ID.split("/")[-1] + "-W4A16-awq"

# Configure the quantization algorithm to run.
# AWQModifier applies smoothing, then QuantizationModifier finalizes quantization.
_ignore = ["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"]
_scheme = "W4A16"
_targets = ["Linear"]
recipe = [
AWQModifier(
duo_scaling=False,
ignore=["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"],
scheme="W4A16",
targets=["Linear"],
),
AWQModifier(duo_scaling=False, ignore=_ignore, scheme=_scheme, targets=_targets),
QuantizationModifier(ignore=_ignore, scheme=_scheme, targets=_targets),
]
**Contributor** (comment on lines 17 to 20, medium): To improve maintainability and avoid duplicating parameters between `AWQModifier` and `QuantizationModifier`, consider defining the shared arguments in variables. This makes it easier to keep them in sync. A similar approach is used in examples/awq/qwen3-vl-30b-a3b-Instruct-example.py.

Suggested change:

```python
_ignore = ["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"]
_scheme = "W4A16"
_targets = ["Linear"]
recipe = [
    AWQModifier(
        duo_scaling=False,
        ignore=_ignore,
        scheme=_scheme,
        targets=_targets,
    ),
    QuantizationModifier(
        ignore=_ignore,
        scheme=_scheme,
        targets=_targets,
    ),
]
```


# Select calibration dataset.
12 changes: 7 additions & 5 deletions examples/awq/qwen3_moe_example.py
@@ -4,6 +4,7 @@

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.modifiers.quantization import QuantizationModifier

# Select model and load it.
MODEL_ID = "Qwen/Qwen3-30B-A3B"
@@ -50,12 +51,13 @@ def tokenize(sample):

# Configure the quantization algorithm to run.
# NOTE: vllm currently does not support asym MoE, using symmetric here
# AWQModifier applies smoothing, then QuantizationModifier finalizes quantization.
_ignore = ["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"]
_scheme = "W4A16"
_targets = ["Linear"]
recipe = [
AWQModifier(
ignore=["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"],
scheme="W4A16",
targets=["Linear"],
),
AWQModifier(ignore=_ignore, scheme=_scheme, targets=_targets),
QuantizationModifier(ignore=_ignore, scheme=_scheme, targets=_targets),
]
**Contributor** (comment on lines 58 to 61, medium): To improve maintainability and avoid duplicating parameters between `AWQModifier` and `QuantizationModifier`, consider defining the shared arguments in variables. This makes it easier to keep them in sync.

```python
_ignore = ["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"]
_scheme = "W4A16"
_targets = ["Linear"]
recipe = [
    AWQModifier(
        ignore=_ignore,
        scheme=_scheme,
        targets=_targets,
    ),
    QuantizationModifier(
        ignore=_ignore,
        scheme=_scheme,
        targets=_targets,
    ),
]
```


# Apply algorithms.
13 changes: 7 additions & 6 deletions examples/awq/w4a8_fp8_llama_example.py
@@ -4,6 +4,7 @@

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.modifiers.quantization import QuantizationModifier

# Select model and load it.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
@@ -39,13 +40,13 @@ def preprocess(example):
# Configure the quantization algorithm to run.
# W4AFP8 scheme: 4-bit integer weights (group 128) + FP8 dynamic per-token activations
# AWQ smooths the weights before quantization to reduce quantization error.
# AWQModifier applies smoothing, then QuantizationModifier finalizes quantization.
_ignore = ["lm_head"]
_scheme = "W4AFP8"
_targets = ["Linear"]
recipe = [
AWQModifier(
ignore=["lm_head"],
scheme="W4AFP8",
targets=["Linear"],
duo_scaling=True,
),
AWQModifier(ignore=_ignore, scheme=_scheme, targets=_targets, duo_scaling=True),
QuantizationModifier(ignore=_ignore, scheme=_scheme, targets=_targets),
]
**Contributor** (comment on lines 47 to 50, medium): To improve maintainability and avoid duplicating parameters between `AWQModifier` and `QuantizationModifier`, consider defining the shared arguments in variables. This makes it easier to keep them in sync.

```python
_ignore = ["lm_head"]
_scheme = "W4AFP8"
_targets = ["Linear"]
recipe = [
    AWQModifier(
        ignore=_ignore,
        scheme=_scheme,
        targets=_targets,
        duo_scaling=True,
    ),
    QuantizationModifier(
        ignore=_ignore,
        scheme=_scheme,
        targets=_targets,
    ),
]
```


# Apply algorithms.