
Commit 201d06c

Remove gpt_oss test code and add in examples

- Add gpt_oss_20b_example.py, which performs the model conversion and quantization
- Clean up gpt_oss.py by removing the test code

Signed-off-by: Sharif Inamdar <[email protected]>

1 parent a30a86a commit 201d06c

File tree

2 files changed: +86, -51 lines
gpt_oss_20b_example.py (new file)

Lines changed: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

from compressed_tensors.quantization import QuantizationScheme
from compressed_tensors.quantization.quant_args import (
    QuantizationArgs,
    QuantizationStrategy,
    QuantizationType,
)

from llmcompressor.modeling.gpt_oss import convert_model_for_quantization_gptoss


def main():
    MODEL_ID = "openai/gpt-oss-20b"
    BASE_NAME = MODEL_ID.rstrip("/").split("/")[-1]
    OUTPUT_DIR = f"{BASE_NAME}-w4a8-channelwise"

    print(f"[GPT-OSS] Loading model: {MODEL_ID}")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

    # ---- GPT-OSS MoE → linear experts conversion ----
    print("[GPT-OSS] Converting fused MoE experts to SequentialGPTOSSMoE for quantization...")
    convert_model_for_quantization_gptoss(model)
    print("[GPT-OSS] Conversion completed.")

    # ---- Quantization config: W4A8 (int4 weights, int8 activations) ----

    # Weights: 4-bit, channelwise, symmetric, static
    weights_args = QuantizationArgs(
        num_bits=4,
        type=QuantizationType.INT,
        strategy=QuantizationStrategy.CHANNEL,
        symmetric=True,
        dynamic=False,
    )

    # Activations: 8-bit, per-token, asymmetric, dynamic
    activations_args = QuantizationArgs(
        num_bits=8,
        type=QuantizationType.INT,
        strategy=QuantizationStrategy.TOKEN,
        symmetric=False,
        dynamic=True,
        observer=None,
    )

    # Apply to all Linear layers, excluding lm_head
    scheme = QuantizationScheme(
        targets=["Linear"],
        weights=weights_args,
        input_activations=activations_args,
    )

    recipe = QuantizationModifier(
        config_groups={"group_0": scheme},
        ignore=["lm_head"],
    )

    print(f"[GPT-OSS] Starting oneshot quantization → {OUTPUT_DIR}")
    oneshot(
        model=model,
        recipe=recipe,
        tokenizer=tokenizer,
        output_dir=OUTPUT_DIR,
        trust_remote_code_model=True,
    )
    print(f"[GPT-OSS] Quantization finished. Quantized model written to: {OUTPUT_DIR}")


if __name__ == "__main__":
    main()
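
Once the example finishes, oneshot will have written the compressed checkpoint to OUTPUT_DIR. The snippet below is a minimal sanity check, not part of the commit: it assumes the installed transformers build can deserialize compressed-tensors checkpoints (such checkpoints are more commonly served with vLLM), and the prompt and generation length are arbitrary.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

OUTPUT_DIR = "gpt-oss-20b-w4a8-channelwise"  # matches the example's output directory

# Reload the quantized checkpoint and run a short generation as a sanity check.
model = AutoModelForCausalLM.from_pretrained(
    OUTPUT_DIR,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR, trust_remote_code=True)

inputs = tokenizer("The capital of France is", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=16)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))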

src/llmcompressor/modeling/gpt_oss.py

Lines changed: 6 additions & 51 deletions
@@ -1,12 +1,8 @@
 import gc
 import torch
 from torch import nn
-import os
-from transformers import AutoModelForCausalLM, AutoTokenizer
 
-from llmcompressor import oneshot
 from llmcompressor.utils.dev import skip_weights_initialize
-from llmcompressor.modifiers.quantization import QuantizationModifier
 from compressed_tensors.utils import align_module_device, update_offload_parameter
 
 def convert_model_for_quantization_gptoss(model):
@@ -49,8 +45,9 @@ def convert_model_for_quantization_gptoss(model):
     if to_delete:
         gc.collect()
         try:
-            torch.cuda.synchronize()
-            torch.cuda.empty_cache()
+            if torch.cuda.is_available():
+                torch.cuda.synchronize()
+                torch.cuda.empty_cache()
         except Exception as e:
             print(f"[GPT-OSS] Warning: Failed to empty CUDA cache: {e}", flush=True)
 
@@ -163,54 +160,12 @@ def forward(self, hidden_states):
         for j in range(self.top_k):
             idx = router_indices[:, j]
             w = router_scores[torch.arange(idx.size(0), device=idx.device), idx].unsqueeze(-1)
-            unique_experts = torch.unique(idx)
-            for e in unique_experts:
+            for e in range(self.num_experts):
                 mask = (idx == e)
-                out[mask] += self.experts[e](x[mask]) * w[mask]
+                if mask.any():
+                    out[mask] += self.experts[e](x[mask]) * w[mask]
 
         out = out.view(B, T, H)
         router_scores = router_scores.view(B * T, -1)  # shape doesn't matter much; it's ignored by the decoder
         return out, router_scores
 
-
-model_id = "unsloth/gpt-oss-120b-BF16"
-
-model = AutoModelForCausalLM.from_pretrained(
-    model_id,
-    torch_dtype=torch.bfloat16,
-    device_map="auto",
-    trust_remote_code=True,
-)
-tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-
-convert_model_for_quantization_gptoss(model)
-
-# -----------------------------
-# Quantization recipe
-# -----------------------------
-recipe = QuantizationModifier(
-    targets="Linear",
-    scheme="FP8_DYNAMIC",
-    ignore=[
-        "re:.*lm_head",
-        "re:.*self_attn",
-        "re:.*attn",
-        "re:.*attention.*",
-        "re:.*router",
-    ],
-)
-
-SAVE_DIR = f"{model_id.split('/')[-1]}-FP8-Dynamic"
-
-# Oneshot quantization
-oneshot(
-    model=model,
-    tokenizer=tokenizer,
-    recipe=recipe,
-    trust_remote_code_model=True,
-    output_dir=SAVE_DIR,
-)
-
-# Save compressed
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-tokenizer.save_pretrained(SAVE_DIR)
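
The updated forward loop visits every expert index for each routing slot and applies an expert only to the tokens whose top-k index selects it, skipping experts that received no tokens. The standalone toy below sketches that masked-dispatch pattern for a single routing slot; the sizes and the plain nn.Linear experts are illustrative, not the repository's SequentialGPTOSSMoE module.

import torch
from torch import nn

num_experts, hidden = 4, 8
experts = nn.ModuleList([nn.Linear(hidden, hidden) for _ in range(num_experts)])

x = torch.randn(6, hidden)                 # 6 flattened tokens
idx = torch.randint(0, num_experts, (6,))  # expert chosen for each token in this slot
w = torch.rand(6, 1)                       # routing weight per token

out = torch.zeros_like(x)
for e in range(num_experts):
    mask = (idx == e)
    if mask.any():                         # skip experts with no routed tokens
        out[mask] += experts[e](x[mask]) * w[mask]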
