intel · Kaihui-intel · Jan 28, 2026 · Jan 23, 2026 · Jan 23, 2026 · Jan 23, 2026
diff --git a/auto_round/modelling/llama4.py b/auto_round/modelling/llama4.py
@@ -13,7 +13,13 @@
 # limitations under the License.
 # Note: adapted from # https://github.com/vllm-project/llm-compressor/blob/main/src/llmcompressor/modeling/llama4.py
 import torch
-from transformers.modeling_utils import no_init_weights
+import transformers
+from packaging import version
+transformers_version = version.parse(transformers.__version__)
+if transformers_version < version.parse("5.0.0"):
+    from transformers.modeling_utils import no_init_weights
+else:
+    from transformers.initialization import no_init_weights
 from transformers.models.llama4.modeling_llama4 import Llama4Config, Llama4TextMLP, Llama4TextMoe
 
 from auto_round.modelling.replace_modules import ReplacementModuleBase

diff --git a/test/helpers.py b/test/helpers.py
@@ -6,6 +6,10 @@
 import transformers
 
 from auto_round.utils import get_attr, llm_load_model, mllm_load_model, set_attr
+from packaging import version
+
+transformers_version = version.parse(transformers.__version__)
+
 
 
 # Automatic choose local path or model name.

diff --git a/test/test_cpu/backends/test_torch_backend.py b/test/test_cpu/backends/test_torch_backend.py
@@ -53,10 +53,10 @@ def test_torch_4bits_asym(self, dataloader):
         torch.cuda.empty_cache()
 
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_folder, dtype=torch.bfloat16, device_map="cpu", quantization_config=quantization_config
+            self.save_folder, dtype=torch.float32, device_map="cpu", quantization_config=quantization_config
         )
 
-        tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
+        tokenizer = AutoTokenizer.from_pretrained(self.save_folder, torch_dtype=torch.bfloat16)
         model_infer(model, tokenizer)
         result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai", limit=10)
         print(result["results"]["lambada_openai"]["acc,none"])

diff --git a/test/test_cpu/core/test_autoround.py b/test/test_cpu/core/test_autoround.py
@@ -4,12 +4,13 @@
 import pytest
 import torch
 from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer
+from packaging import version
 
 from auto_round import AutoRound
 from auto_round.eval.evaluation import simple_evaluate_user_model
 from auto_round.utils import get_module
 
-from ...helpers import get_model_path, model_infer, opt_name_or_path, qwen_name_or_path
+from ...helpers import get_model_path, model_infer, opt_name_or_path, qwen_name_or_path, transformers_version
 
 
 class TestAutoRound:
@@ -456,8 +457,12 @@ def test_not_convert_modules(self):
         model = Qwen2VLForConditionalGeneration.from_pretrained(
             model_name, quantization_config=quantization_config, device_map="cpu", torch_dtype=torch.float16
         )
-        assert isinstance(model.visual.blocks[0].attn.qkv, torch.nn.Linear)
-        assert not isinstance(model.visual.merger.mlp[0], QuantLinear)
+        if transformers_version < version.parse("5.0.0"):
+            assert isinstance(model.visual.blocks[0].attn.qkv, torch.nn.Linear)
+            assert not isinstance(model.visual.merger.mlp[0], QuantLinear)
+        else:
+            assert isinstance(model.model.visual.blocks[0].attn.qkv, torch.nn.Linear)
+            assert not isinstance(model.model.visual.merger.mlp[0], QuantLinear)
         if hasattr(model.model, "language_model"):
             assert isinstance(model.model.language_model.layers[0].self_attn.v_proj, QuantLinear)
         else:

diff --git a/test/test_cpu/export/test_export.py b/test/test_cpu/export/test_export.py
@@ -262,7 +262,8 @@ def test_static_afp8_export(self, static_kv_dtype):
             assert "model.decoder.layers.8.self_attn.v_scale" in f.keys()
             assert f.get_tensor("model.decoder.layers.5.self_attn.v_scale").shape == torch.Size([1])
             assert f.get_tensor("model.decoder.layers.5.self_attn.k_scale").shape == torch.Size([1])
-            assert f.get_tensor("model.decoder.layers.5.self_attn.k_scale").dtype == torch.float32
+            assert f.get_tensor("model.decoder.layers.5.self_attn.k_scale").dtype == torch.float32 or \
+                f.get_tensor("model.decoder.layers.5.self_attn.k_scale").dtype == torch.bfloat16
         shutil.rmtree(quantized_model_path, ignore_errors=True)
 
         model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
@@ -318,7 +319,7 @@ def test_static_fp8_attn(self):
             weight_name = f"model.decoder.layers.8.self_attn.{attr}"
             assert weight_name in f.keys()
             assert f.get_tensor(weight_name).shape == torch.Size([1])
-            assert f.get_tensor(weight_name).dtype == torch.float32
+            assert f.get_tensor(weight_name).dtype == torch.float32 or f.get_tensor(weight_name).dtype == torch.bfloat16
 
         shutil.rmtree(quantized_model_path, ignore_errors=True)
 

diff --git a/test/test_cpu/export/test_gguf_format.py b/test/test_cpu/export/test_gguf_format.py
@@ -8,12 +8,13 @@
 
 from auto_round import AutoRound
 
-from ...helpers import get_model_path, get_tiny_model
+from ...helpers import get_model_path, get_tiny_model, transformers_version
+from packaging import version
 
 AUTO_ROUND_PATH = __file__.split("/")
 AUTO_ROUND_PATH = "/".join(AUTO_ROUND_PATH[: AUTO_ROUND_PATH.index("test")])
 
-
+@pytest.mark.skipif(transformers_version >= version.parse("5.0.0"), reason="GGUF format saveing and loading falied in transformers v5")
 class TestGGUF:
 
     @classmethod
@@ -60,6 +61,12 @@ def test_q4_0(self):
 
         autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q4_0")
         gguf_file = os.listdir(quantized_model_path)[0]
+
+        # TODO: fix the issue of gguf loading error in transformers v5
+        # cls = transformers.generation.configuration_utils.GenerationConfig'>, json_file = None
+        #     def _dict_from_json_file(cls, json_file: str | os.PathLike):
+        # >       with open(json_file, "r", encoding="utf-8") as reader:
+        # E       TypeError: expected str, bytes or os.PathLike object, not NoneType
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto")
         text = "There is a girl who likes adventure,"
         inputs = self.tokenizer(text, return_tensors="pt").to(model.device)

diff --git a/test/test_cpu/integrations/test_llmc_integration.py b/test/test_cpu/integrations/test_llmc_integration.py
@@ -4,6 +4,8 @@
 from llmcompressor import oneshot
 from llmcompressor.modifiers.autoround import AutoRoundModifier
 from transformers import AutoModelForCausalLM, AutoTokenizer
+from ...helpers import transformers_version
+from packaging import version
 
 from auto_round.calib_dataset import get_dataset
 
@@ -38,7 +40,8 @@
     },
 )
 
-
+@pytest.mark.skipif(transformers_version >= version.parse("5.0.0"), \
+    reason="transformers 5.0 use_auth_token is deprecated and llmcompressor oneshot has not been updated yet")
 @pytest.mark.parametrize(
     "recipe",
     [

diff --git a/test/test_cpu/models/test_moe_model.py b/test/test_cpu/models/test_moe_model.py
@@ -7,7 +7,8 @@
 
 from auto_round import AutoRound
 
-from ...helpers import get_model_path
+from ...helpers import get_model_path, transformers_version
+from packaging import version
 
 gpt_oss_name_or_path = get_model_path("unsloth/gpt-oss-20b-BF16")
 llama4_name_or_path = get_model_path("meta-llama/Llama-4-Scout-17B-16E-Instruct")
@@ -36,11 +37,11 @@ def setup_llama4():
     config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
     config.vision_config.num_hidden_layers = 1  # Reduce layers for testing
     config.text_config.num_hidden_layers = 1
+    # config.vision_config.rope_theta = config.vision_config.rope_parameters["rope_theta"] # for transformers >= 5.0
     model = Llama4ForConditionalGeneration(config)
     output_dir = "./tmp/test_quantized_llama4"
     return model, tokenizer, output_dir, config
-
-
+
 @pytest.fixture
 def setup_qwen3_vl_moe():
     """Fixture to set up the qwen3_vl_moe model and tokenizer."""
@@ -120,7 +121,7 @@ def test_gptoss(setup_gpt_oss, scheme):
     # clean the output directory after test
     shutil.rmtree(output_dir, ignore_errors=True)
 
-
+@pytest.mark.skipif(transformers_version >= version.parse("5.0.0"), reason="transformers v5 'Llama4VisionConfig' object has no attribute 'rope_theta'")
 def test_llama4(setup_llama4):
     model, tokenizer, output_dir, config = setup_llama4
 

diff --git a/test/test_cpu/quantization/test_mxfp_nvfp.py b/test/test_cpu/quantization/test_mxfp_nvfp.py
@@ -7,7 +7,8 @@
 
 from auto_round import AutoRound
 
-from ...helpers import is_model_outputs_similar
+from ...helpers import is_model_outputs_similar, transformers_version
+from packaging import version
 
 
 def _get_folder_size(path: str) -> float:
@@ -30,7 +31,8 @@ def setup_class(self):
     def teardown_class(self):
         shutil.rmtree("./saved", ignore_errors=True)
         shutil.rmtree("runs", ignore_errors=True)
-
+
+    @pytest.mark.skipif(transformers_version >= version.parse("5.0.0"), reason="transformers v5 MOE model has breaking changes")
     def test_nvfp4_moe_actmax_rtn(self, tiny_deepseek_v2_model_path, dataloader):
         model_name = tiny_deepseek_v2_model_path
         layer_config = {
@@ -58,6 +60,7 @@ def test_nvfp4_moe_actmax_rtn(self, tiny_deepseek_v2_model_path, dataloader):
         ), "Illegal NVFP4 quantization for lm_head layer"
         shutil.rmtree(self.save_dir, ignore_errors=True)
 
+    @pytest.mark.skipif(transformers_version >= version.parse("5.0.0"), reason="transformers v5 MOE model has breaking changes")
     def test_nvfp4_moe_actmax_ar(self, tiny_deepseek_v2_model_path, dataloader):
         model_name = tiny_deepseek_v2_model_path
         layer_config = {
@@ -90,6 +93,7 @@ def test_nvfp4_moe_actmax_ar(self, tiny_deepseek_v2_model_path, dataloader):
         assert is_model_outputs_similar(model_name, quantized_model_path)
         shutil.rmtree(self.save_dir, ignore_errors=True)
 
+    @pytest.mark.skipif(transformers_version >= version.parse("5.0.0"), reason="transformers v5 MOE model has breaking changes")
     def test_mxfp4_moe_ar(self, tiny_deepseek_v2_model_path, dataloader):
         model_name = tiny_deepseek_v2_model_path
         layer_config = {
@@ -323,6 +327,7 @@ def test_nvfp4_autoround_save_quantized(self, tiny_opt_model_path, dataloader):
         ), "Illegal NVFP4 packing name or data_type or shape"
         shutil.rmtree(quantized_model_path, ignore_errors=True)
 
+    @pytest.mark.skipif(transformers_version >= version.parse("5.0.0"), reason="transformers v5 MOE model has breaking changes")
     def test_qwen_moe_quant_infer(self, tiny_qwen_moe_model_path, dataloader):
         model_name = tiny_qwen_moe_model_path
         layer_config = {