Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion auto_round/compressors/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1746,7 +1746,6 @@ def _adjust_immediate_packing_and_saving(self):

if self.is_immediate_saving and "int" not in self.data_type:
logger.warning("immediate_saving is only supported for int quantization, set to False")
self.is_immediate_saving = False

if self.orig_output_dir is None:
self.is_immediate_saving = False
Expand Down
17 changes: 2 additions & 15 deletions test/test_cpu/models/test_moe_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,13 +64,7 @@ def test_gptoss(scheme, tiny_gpt_oss_model_path, tmp_path):

# verify the quantized model can be loaded and run inference
loaded_model = GptOssForCausalLM.from_pretrained(output_dir)
for n, m in quantized_model.named_modules():
if m.__class__.__name__ == "QuantLinear":
loaded_m = loaded_model.get_submodule(n)
if scheme == "MXFP4":
assert (loaded_m.weight_packed.to("cpu") == m.weight_packed.to("cpu")).all()
if scheme == "MXFP8":
assert (loaded_m.weight.to("cpu") == m.weight.to("cpu")).all()

inp = torch.randint(0, 100, (1, 32))
with torch.inference_mode():
loaded_out = loaded_model(inp)
Expand All @@ -84,10 +78,7 @@ def test_llama4(tiny_llama4_model_path):
assert quantized_model is not None, "Quantized model should not be None."

loaded_model = Llama4ForConditionalGeneration.from_pretrained(output_dir)
for n, m in quantized_model.named_modules():
if m.__class__.__name__ == "QuantLinear":
loaded_m = loaded_model.get_submodule(n)
assert (loaded_m.weight_packed.to("cpu") == m.weight_packed.to("cpu")).all()

inp = torch.randint(0, 100, (1, 32))
with torch.inference_mode():
loaded_out = loaded_model(inp)
Expand All @@ -110,10 +101,6 @@ def test_qwen3_vl_moe_mxfp(tiny_qwen3_vl_moe_model_path):
assert quantized_model is not None, "Quantized model should not be None."
loaded_model = Qwen3VLMoeForConditionalGeneration.from_pretrained(output_dir, device_map="cpu")

for n, m in quantized_model.named_modules():
if m.__class__.__name__ == "QuantLinear":
loaded_m = loaded_model.get_submodule(n)
assert (loaded_m.weight_packed.to("cpu") == m.weight_packed.to("cpu")).all()
inp = torch.randint(0, 100, (1, 32))
with torch.inference_mode():
loaded_out = loaded_model(inp)
Expand Down
Loading