Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
dde4ba9
fix cpu ut for transformersv5
sys-lpot-val Jan 23, 2026
735ca68
minor fix
sys-lpot-val Jan 23, 2026
7d335a7
merge main
sys-lpot-val Jan 23, 2026
dafcc5f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 23, 2026
f45be48
update issue link
sys-lpot-val Jan 26, 2026
f35e42a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 26, 2026
e0c16ba
fix wf init
sys-lpot-val Jan 26, 2026
dbf8cb4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 26, 2026
e0623fa
revert wf
sys-lpot-val Jan 26, 2026
b4f0108
fix mixed type
sys-lpot-val Jan 26, 2026
fc1a96b
fix copy out of meta tensor issue
WeiweiZhang1 Jan 27, 2026
a51ff10
fix copy out of meta tensor issue
WeiweiZhang1 Jan 27, 2026
6f3691a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 27, 2026
e10c525
fix lm_eval import issue and update requirement to suit transformers …
xin3he Jan 27, 2026
8b4f2d5
skip PhiConfig has no attribute 'pad_token_id'
sys-lpot-val Jan 27, 2026
c3fec96
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 27, 2026
a55841c
fix mixed type in qlinera_torch_zp
sys-lpot-val Jan 27, 2026
e725099
update no_init_weights import for gpt-oss
xin3he Jan 27, 2026
85eb094
skip diffusers
sys-lpot-val Jan 27, 2026
a93c9e3
skip phiconfig
sys-lpot-val Jan 27, 2026
eb6d09e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 27, 2026
d9a9e4e
update vllm and sglang to use git main branch
xin3he Jan 27, 2026
8ca34fe
skip diffusers import
sys-lpot-val Jan 27, 2026
de7819f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 27, 2026
ec19894
Merge branch 'main' into kaihui/v5_cpu_ut
Kaihui-intel Jan 27, 2026
4bd8b48
set trust_remote_code=False for tiny-deepseek
xin3he Jan 27, 2026
fb21e4e
set pad_token_id to None in setup_llama4 for compatibility with Llama…
xin3he Jan 27, 2026
c4c74f0
fix
XuehaoSun Jan 27, 2026
2ba863a
Merge branch 'main' into kaihui/v5_cpu_ut
n1ck-guo Jan 28, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion auto_round/modelling/llama4.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,13 @@
# limitations under the License.
# Note: adapted from # https://github.com/vllm-project/llm-compressor/blob/main/src/llmcompressor/modeling/llama4.py
import torch
from transformers.modeling_utils import no_init_weights
import transformers
from packaging import version
transformers_version = version.parse(transformers.__version__)
if transformers_version < version.parse("5.0.0"):
from transformers.modeling_utils import no_init_weights
else:
from transformers.initialization import no_init_weights
from transformers.models.llama4.modeling_llama4 import Llama4Config, Llama4TextMLP, Llama4TextMoe

from auto_round.modelling.replace_modules import ReplacementModuleBase
Expand Down
4 changes: 4 additions & 0 deletions test/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@
import transformers

from auto_round.utils import get_attr, llm_load_model, mllm_load_model, set_attr
from packaging import version

transformers_version = version.parse(transformers.__version__)



# Automatic choose local path or model name.
Expand Down
4 changes: 2 additions & 2 deletions test/test_cpu/backends/test_torch_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,10 @@ def test_torch_4bits_asym(self, dataloader):
torch.cuda.empty_cache()

model = AutoModelForCausalLM.from_pretrained(
self.save_folder, dtype=torch.bfloat16, device_map="cpu", quantization_config=quantization_config
self.save_folder, dtype=torch.float32, device_map="cpu", quantization_config=quantization_config
)

tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
tokenizer = AutoTokenizer.from_pretrained(self.save_folder, torch_dtype=torch.bfloat16)
model_infer(model, tokenizer)
result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai", limit=10)
print(result["results"]["lambada_openai"]["acc,none"])
Expand Down
11 changes: 8 additions & 3 deletions test/test_cpu/core/test_autoround.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@
import pytest
import torch
from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer
from packaging import version

from auto_round import AutoRound
from auto_round.eval.evaluation import simple_evaluate_user_model
from auto_round.utils import get_module

from ...helpers import get_model_path, model_infer, opt_name_or_path, qwen_name_or_path
from ...helpers import get_model_path, model_infer, opt_name_or_path, qwen_name_or_path, transformers_version


class TestAutoRound:
Expand Down Expand Up @@ -456,8 +457,12 @@ def test_not_convert_modules(self):
model = Qwen2VLForConditionalGeneration.from_pretrained(
model_name, quantization_config=quantization_config, device_map="cpu", torch_dtype=torch.float16
)
assert isinstance(model.visual.blocks[0].attn.qkv, torch.nn.Linear)
assert not isinstance(model.visual.merger.mlp[0], QuantLinear)
if transformers_version < version.parse("5.0.0"):
assert isinstance(model.visual.blocks[0].attn.qkv, torch.nn.Linear)
assert not isinstance(model.visual.merger.mlp[0], QuantLinear)
else:
assert isinstance(model.model.visual.blocks[0].attn.qkv, torch.nn.Linear)
assert not isinstance(model.model.visual.merger.mlp[0], QuantLinear)
if hasattr(model.model, "language_model"):
assert isinstance(model.model.language_model.layers[0].self_attn.v_proj, QuantLinear)
else:
Expand Down
5 changes: 3 additions & 2 deletions test/test_cpu/export/test_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,8 @@ def test_static_afp8_export(self, static_kv_dtype):
assert "model.decoder.layers.8.self_attn.v_scale" in f.keys()
assert f.get_tensor("model.decoder.layers.5.self_attn.v_scale").shape == torch.Size([1])
assert f.get_tensor("model.decoder.layers.5.self_attn.k_scale").shape == torch.Size([1])
assert f.get_tensor("model.decoder.layers.5.self_attn.k_scale").dtype == torch.float32
assert f.get_tensor("model.decoder.layers.5.self_attn.k_scale").dtype == torch.float32 or \
f.get_tensor("model.decoder.layers.5.self_attn.k_scale").dtype == torch.bfloat16
shutil.rmtree(quantized_model_path, ignore_errors=True)

model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
Expand Down Expand Up @@ -318,7 +319,7 @@ def test_static_fp8_attn(self):
weight_name = f"model.decoder.layers.8.self_attn.{attr}"
assert weight_name in f.keys()
assert f.get_tensor(weight_name).shape == torch.Size([1])
assert f.get_tensor(weight_name).dtype == torch.float32
assert f.get_tensor(weight_name).dtype == torch.float32 or f.get_tensor(weight_name).dtype == torch.bfloat16

shutil.rmtree(quantized_model_path, ignore_errors=True)

Expand Down
11 changes: 9 additions & 2 deletions test/test_cpu/export/test_gguf_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,13 @@

from auto_round import AutoRound

from ...helpers import get_model_path, get_tiny_model
from ...helpers import get_model_path, get_tiny_model, transformers_version
from packaging import version

AUTO_ROUND_PATH = __file__.split("/")
AUTO_ROUND_PATH = "/".join(AUTO_ROUND_PATH[: AUTO_ROUND_PATH.index("test")])


@pytest.mark.skipif(transformers_version >= version.parse("5.0.0"), reason="GGUF format saveing and loading falied in transformers v5")
class TestGGUF:

@classmethod
Expand Down Expand Up @@ -60,6 +61,12 @@ def test_q4_0(self):

autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q4_0")
gguf_file = os.listdir(quantized_model_path)[0]

# TODO: fix the issue of gguf loading error in transformers v5
# cls = transformers.generation.configuration_utils.GenerationConfig'>, json_file = None
# def _dict_from_json_file(cls, json_file: str | os.PathLike):
# > with open(json_file, "r", encoding="utf-8") as reader:
# E TypeError: expected str, bytes or os.PathLike object, not NoneType
model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto")
text = "There is a girl who likes adventure,"
inputs = self.tokenizer(text, return_tensors="pt").to(model.device)
Expand Down
5 changes: 4 additions & 1 deletion test/test_cpu/integrations/test_llmc_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from llmcompressor import oneshot
from llmcompressor.modifiers.autoround import AutoRoundModifier
from transformers import AutoModelForCausalLM, AutoTokenizer
from ...helpers import transformers_version
from packaging import version

from auto_round.calib_dataset import get_dataset

Expand Down Expand Up @@ -38,7 +40,8 @@
},
)


@pytest.mark.skipif(transformers_version >= version.parse("5.0.0"), \
reason="transformers 5.0 use_auth_token is deprecated and llmcompressor oneshot has not been updated yet")
@pytest.mark.parametrize(
"recipe",
[
Expand Down
9 changes: 5 additions & 4 deletions test/test_cpu/models/test_moe_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@

from auto_round import AutoRound

from ...helpers import get_model_path
from ...helpers import get_model_path, transformers_version
from packaging import version

gpt_oss_name_or_path = get_model_path("unsloth/gpt-oss-20b-BF16")
llama4_name_or_path = get_model_path("meta-llama/Llama-4-Scout-17B-16E-Instruct")
Expand Down Expand Up @@ -36,11 +37,11 @@ def setup_llama4():
config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
config.vision_config.num_hidden_layers = 1 # Reduce layers for testing
config.text_config.num_hidden_layers = 1
# config.vision_config.rope_theta = config.vision_config.rope_parameters["rope_theta"] # for transformers >= 5.0
model = Llama4ForConditionalGeneration(config)
output_dir = "./tmp/test_quantized_llama4"
return model, tokenizer, output_dir, config



@pytest.fixture
def setup_qwen3_vl_moe():
"""Fixture to set up the qwen3_vl_moe model and tokenizer."""
Expand Down Expand Up @@ -120,7 +121,7 @@ def test_gptoss(setup_gpt_oss, scheme):
# clean the output directory after test
shutil.rmtree(output_dir, ignore_errors=True)


@pytest.mark.skipif(transformers_version >= version.parse("5.0.0"), reason="transformers v5 'Llama4VisionConfig' object has no attribute 'rope_theta'")
def test_llama4(setup_llama4):
model, tokenizer, output_dir, config = setup_llama4

Expand Down
9 changes: 7 additions & 2 deletions test/test_cpu/quantization/test_mxfp_nvfp.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@

from auto_round import AutoRound

from ...helpers import is_model_outputs_similar
from ...helpers import is_model_outputs_similar, transformers_version
from packaging import version


def _get_folder_size(path: str) -> float:
Expand All @@ -30,7 +31,8 @@ def setup_class(self):
def teardown_class(self):
shutil.rmtree("./saved", ignore_errors=True)
shutil.rmtree("runs", ignore_errors=True)


@pytest.mark.skipif(transformers_version >= version.parse("5.0.0"), reason="transformers v5 MOE model has breaking changes")
def test_nvfp4_moe_actmax_rtn(self, tiny_deepseek_v2_model_path, dataloader):
model_name = tiny_deepseek_v2_model_path
layer_config = {
Expand Down Expand Up @@ -58,6 +60,7 @@ def test_nvfp4_moe_actmax_rtn(self, tiny_deepseek_v2_model_path, dataloader):
), "Illegal NVFP4 quantization for lm_head layer"
shutil.rmtree(self.save_dir, ignore_errors=True)

@pytest.mark.skipif(transformers_version >= version.parse("5.0.0"), reason="transformers v5 MOE model has breaking changes")
def test_nvfp4_moe_actmax_ar(self, tiny_deepseek_v2_model_path, dataloader):
model_name = tiny_deepseek_v2_model_path
layer_config = {
Expand Down Expand Up @@ -90,6 +93,7 @@ def test_nvfp4_moe_actmax_ar(self, tiny_deepseek_v2_model_path, dataloader):
assert is_model_outputs_similar(model_name, quantized_model_path)
shutil.rmtree(self.save_dir, ignore_errors=True)

@pytest.mark.skipif(transformers_version >= version.parse("5.0.0"), reason="transformers v5 MOE model has breaking changes")
def test_mxfp4_moe_ar(self, tiny_deepseek_v2_model_path, dataloader):
model_name = tiny_deepseek_v2_model_path
layer_config = {
Expand Down Expand Up @@ -323,6 +327,7 @@ def test_nvfp4_autoround_save_quantized(self, tiny_opt_model_path, dataloader):
), "Illegal NVFP4 packing name or data_type or shape"
shutil.rmtree(quantized_model_path, ignore_errors=True)

@pytest.mark.skipif(transformers_version >= version.parse("5.0.0"), reason="transformers v5 MOE model has breaking changes")
def test_qwen_moe_quant_infer(self, tiny_qwen_moe_model_path, dataloader):
model_name = tiny_qwen_moe_model_path
layer_config = {
Expand Down