
Commit 1fb9697

[Examples] Add Gemma 4 E4B NVFP4A16 quantization example
Add NVFP4A16 weight-only quantization example for google/gemma-4-E4B-it.

Includes a Dockerfile, since Gemma 4 requires transformers from git main, which is newer than the version currently pinned by llmcompressor. The ignore list skips the vision_tower, audio_tower, embed_vision, and embed_audio modules, which are specific to Gemma 4's multimodal architecture. Uses AutoModelForImageTextToText and AutoProcessor, as required by the Gemma 4 model class.

Tested end-to-end: quantization, sample generation, and model saving all complete successfully.

Signed-off-by: Ziming <frankziming26@outlook.com>
1 parent 026c917 commit 1fb9697

File tree

2 files changed: +69 −0 lines changed

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@

```dockerfile
FROM nvcr.io/nvidia/pytorch:25.04-py3

WORKDIR /workspace

# Install llmcompressor and upgrade transformers for Gemma 4 support.
# Gemma 4 (model_type: gemma4) requires transformers from git main, which is newer
# than the version currently pinned by llmcompressor.
#
# Step 1: Install llmcompressor (keeps NVIDIA constraint file so torch/cuda stay).
# Step 2: Force-upgrade transformers, huggingface_hub, regex (bypass constraints).
RUN pip install --no-deps git+https://github.com/vllm-project/llm-compressor.git \
        "compressed-tensors>=0.14.1a2" loguru "datasets>=4.0.0" accelerate \
        "auto-round>=0.10.2" nvidia-ml-py && \
    PIP_CONSTRAINT="" pip install multiprocess dill xxhash fsspec && \
    PIP_CONSTRAINT="" pip install --force-reinstall \
        git+https://github.com/huggingface/transformers.git \
        "huggingface_hub>=1.5.0" \
        "regex>=2025.10.22" \
        tokenizers safetensors && \
    pip install "numpy<2"

COPY gemma4_example.py .

ENTRYPOINT ["python", "gemma4_example.py"]
```
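The diff does not show how the image is invoked; a minimal usage sketch follows. The image tag `gemma4-nvfp4a16` and the volume mount are assumptions, not part of the commit; a GPU host with Docker plus a Hugging Face token for the gated model are required, and the mount keeps the saved checkpoint after the container exits.

```shell
# Assumed tag; build from the directory holding the Dockerfile and gemma4_example.py
docker build -t gemma4-nvfp4a16 .

# Run on a GPU host. HF_TOKEN authenticates against the gated Gemma repo; the
# mounted path matches SAVE_DIR in the script so the quantized model persists.
docker run --gpus all -e HF_TOKEN \
    -v "$PWD/gemma-4-E4B-it-NVFP4A16:/workspace/gemma-4-E4B-it-NVFP4A16" \
    gemma4-nvfp4a16
```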
Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
1+
from compressed_tensors.offload import dispatch_model
2+
from transformers import AutoModelForImageTextToText, AutoProcessor
3+
4+
from llmcompressor import oneshot
5+
from llmcompressor.modifiers.quantization import QuantizationModifier
6+
7+
# Load model.
8+
MODEL_ID = "google/gemma-4-E4B-it"
9+
model = AutoModelForImageTextToText.from_pretrained(MODEL_ID, dtype="auto")
10+
processor = AutoProcessor.from_pretrained(MODEL_ID)
11+
12+
# Configure the quantization algorithm and scheme.
13+
# In this case, we:
14+
# * quantize the weights to fp4 with per group 16 via ptq
15+
# * skip the vision encoder, audio encoder, embedding projections, and lm_head
16+
recipe = QuantizationModifier(
17+
targets="Linear",
18+
scheme="NVFP4A16",
19+
ignore=[
20+
"lm_head",
21+
"re:.*vision_tower.*",
22+
"re:.*audio_tower.*",
23+
"re:.*embed_vision.*",
24+
"re:.*embed_audio.*",
25+
],
26+
)
27+
28+
# Apply quantization.
29+
oneshot(model=model, recipe=recipe)
30+
31+
print("\n\n========== SAMPLE GENERATION ==============")
32+
dispatch_model(model)
33+
messages = [
34+
{"role": "user", "content": "Hello my name is"},
35+
]
36+
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
37+
inputs = processor(text=text, return_tensors="pt").to(model.device)
38+
output = model.generate(**inputs, max_new_tokens=100)
39+
print(processor.decode(output[0], skip_special_tokens=True))
40+
print("==========================================\n\n")
41+
42+
# Save to disk in compressed-tensors format.
43+
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-NVFP4A16"
44+
model.save_pretrained(SAVE_DIR, save_compressed=True)
45+
processor.save_pretrained(SAVE_DIR)
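The `re:`-prefixed entries in the ignore list are regular expressions matched against module names. A standalone sketch of the resulting skip/quantize split, plus the SAVE_DIR naming from the script: the module names below are illustrative, not taken from the real Gemma checkpoint, and because of the `.*` wrappers it does not matter here whether llmcompressor anchors or searches the pattern.

```python
import re

# Regex bodies from the recipe's ignore list (the "re:" prefix stripped).
ignore_patterns = [
    r".*vision_tower.*",
    r".*audio_tower.*",
    r".*embed_vision.*",
    r".*embed_audio.*",
]

def is_ignored(module_name: str) -> bool:
    """True if the module is skipped by the recipe's ignore list."""
    return module_name == "lm_head" or any(
        re.match(p, module_name) for p in ignore_patterns
    )

# Multimodal towers and projections are skipped; language-model Linears are quantized.
assert is_ignored("model.vision_tower.blocks.0.attn.q_proj")
assert is_ignored("model.embed_audio.projection")
assert is_ignored("lm_head")
assert not is_ignored("model.language_model.layers.0.mlp.gate_proj")

# SAVE_DIR derivation: keep only the repo name, append the scheme suffix.
MODEL_ID = "google/gemma-4-E4B-it"
save_dir = MODEL_ID.rstrip("/").split("/")[-1] + "-NVFP4A16"
print(save_dir)  # gemma-4-E4B-it-NVFP4A16
```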
