Merged
29 commits
dde4ba9
fix cpu ut for transformersv5
sys-lpot-val Jan 23, 2026
735ca68
minor fix
sys-lpot-val Jan 23, 2026
7d335a7
merge main
sys-lpot-val Jan 23, 2026
dafcc5f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 23, 2026
f45be48
update issue link
sys-lpot-val Jan 26, 2026
f35e42a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 26, 2026
e0c16ba
fix wf init
sys-lpot-val Jan 26, 2026
dbf8cb4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 26, 2026
e0623fa
revert wf
sys-lpot-val Jan 26, 2026
b4f0108
fix mixed type
sys-lpot-val Jan 26, 2026
fc1a96b
fix copy out of meta tensor issue
WeiweiZhang1 Jan 27, 2026
a51ff10
fix copy out of meta tensor issue
WeiweiZhang1 Jan 27, 2026
6f3691a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 27, 2026
e10c525
fix lm_eval import issue and update requirement to suit transformers …
xin3he Jan 27, 2026
8b4f2d5
skip PhiConfig has no attribute 'pad_token_id'
sys-lpot-val Jan 27, 2026
c3fec96
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 27, 2026
a55841c
fix mixed type in qlinera_torch_zp
sys-lpot-val Jan 27, 2026
e725099
update no_init_weights import for gpt-oss
xin3he Jan 27, 2026
85eb094
skip diffusers
sys-lpot-val Jan 27, 2026
a93c9e3
skip phiconfig
sys-lpot-val Jan 27, 2026
eb6d09e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 27, 2026
d9a9e4e
update vllm and sglang to use git main branch
xin3he Jan 27, 2026
8ca34fe
skip diffusers import
sys-lpot-val Jan 27, 2026
de7819f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 27, 2026
ec19894
Merge branch 'main' into kaihui/v5_cpu_ut
Kaihui-intel Jan 27, 2026
4bd8b48
set trust_remote_code=False for tiny-deepseek
xin3he Jan 27, 2026
fb21e4e
set pad_token_id to None in setup_llama4 for compatibility with Llama…
xin3he Jan 27, 2026
c4c74f0
fix
XuehaoSun Jan 27, 2026
2ba863a
Merge branch 'main' into kaihui/v5_cpu_ut
n1ck-guo Jan 28, 2026
10 changes: 3 additions & 7 deletions auto_round/eval/eval_cli.py
@@ -279,11 +279,7 @@ def eval_task_by_task(
device_str, parallelism = get_device_and_parallelism(device)

# load after _eval_int in order to make sure import torch after set CUDA_VISIBLE_DEVICES
import traceback

from lm_eval import simple_evaluate as lm_simple_evaluate # pylint: disable=E0401
from lm_eval.models.huggingface import HFLM # pylint: disable=E0401
from transformers import AutoModelForCausalLM, AutoTokenizer

from auto_round.utils import logger

@@ -402,7 +398,7 @@ def _evaluate_tasks_with_retry(tasks, hflm, device_str, batch_size, limit, retry
import time
import traceback

from lm_eval import simple_evaluate as lm_simple_evaluate # pylint: disable=E0401
import lm_eval # pylint: disable=E0401
from lm_eval.utils import make_table # pylint: disable=E0401

from auto_round.utils import logger
@@ -418,7 +414,7 @@ def _evaluate_tasks_with_retry(tasks, hflm, device_str, batch_size, limit, retry
current_retry_times = retry_times
while current_retry_times:
try:
res = lm_simple_evaluate(
res = lm_eval.simple_evaluate(
model=hflm, model_args=None, device=device_str, tasks=task, batch_size=batch_size, limit=limit
)
break
@@ -430,7 +426,7 @@ def _evaluate_tasks_with_retry(tasks, hflm, device_str, batch_size, limit, retry
for k, v in hflm.batch_sizes.items():
hflm.batch_sizes[k] = max(v // 2, 1)
logger.warning(f"Out of memory, reset batch_size to {hflm.batch_sizes} and re-try.")
res = lm_simple_evaluate(
res = lm_eval.simple_evaluate(
model=hflm, model_args=None, device=device_str, tasks=task, batch_size=1, limit=limit
)
hflm.batch_sizes = ori_batch_sizes
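A minimal sketch (outside the diff) of the call style these hunks switch to, importing lm_eval at module level and going through lm_eval.simple_evaluate; the model name and task below are placeholders.

import lm_eval  # pylint: disable=E0401
from lm_eval.models.huggingface import HFLM  # pylint: disable=E0401

# Wrap any HF causal LM, then evaluate through the module-level entry point.
hflm = HFLM(pretrained="facebook/opt-125m", batch_size=8)
res = lm_eval.simple_evaluate(model=hflm, tasks=["lambada_openai"], limit=10)
print(res["results"])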
8 changes: 4 additions & 4 deletions auto_round/eval/evaluation.py
@@ -31,7 +31,7 @@ def simple_evaluate_user_model(
mllm: bool = False,
**kwargs,
):
from lm_eval import simple_evaluate as lm_simple_evaluate # pylint: disable=E0401
import lm_eval # pylint: disable=E0401
from lm_eval.models.huggingface import HFLM # pylint: disable=E0401

if mllm:
@@ -57,7 +57,7 @@ def simple_evaluate_user_model(
dtype=eval_model_dtype,
add_bos_token=add_bos_token,
)
return lm_simple_evaluate(
return lm_eval.simple_evaluate(
model=hflm, model_args=None, batch_size=batch_size, max_batch_size=max_batch_size, limit=limit, **kwargs
)

@@ -71,9 +71,9 @@ def simple_evaluate(
device: Optional[str] = None,
**kwargs,
):
from lm_eval import simple_evaluate as lm_simple_evaluate # pylint: disable=E0401
import lm_eval # pylint: disable=E0401

return lm_simple_evaluate(
return lm_eval.simple_evaluate(
model=model,
model_args=model_args,
batch_size=batch_size,
12 changes: 10 additions & 2 deletions auto_round/modelling/gpt_oss.py
@@ -14,8 +14,16 @@


import torch
import transformers
from packaging import version
from torch import nn
from transformers.modeling_utils import no_init_weights as skip_weights_initialize

transformers_version = version.parse(transformers.__version__)
if transformers_version < version.parse("5.0.0"):
from transformers.modeling_utils import no_init_weights
else:
from transformers.initialization import no_init_weights

from transformers.models.gpt_oss.configuration_gpt_oss import GptOssConfig
from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP

@@ -71,7 +79,7 @@ def __init__(self, original: GptOssMLP, config: GptOssConfig):
# Build per-expert MLPs
self.experts = nn.ModuleList()
target_device = next(original.experts.parameters()).device
with skip_weights_initialize(), torch.device("meta"):
with no_init_weights(), torch.device("meta"):
for _ in range(E):
self.experts.append(GPTOssSingleExpert(hidden_size, intermediate_size, dtype=dtype))

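For reference (not part of the diff), the version gate above, repeated in llama4.py below, resolves no_init_weights from its pre-/post-v5 location; a self-contained sketch of the pattern, assuming only what the hunk itself shows about transformers 5.0.

import torch
import transformers
from packaging import version
from torch import nn

if version.parse(transformers.__version__) < version.parse("5.0.0"):
    from transformers.modeling_utils import no_init_weights
else:
    from transformers.initialization import no_init_weights

# Allocate modules on the meta device without running weight initialization,
# as GPTOssMoE does when rebuilding its per-expert MLPs.
with no_init_weights(), torch.device("meta"):
    expert = nn.Linear(32, 32, bias=False)
assert expert.weight.device.type == "meta"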
9 changes: 8 additions & 1 deletion auto_round/modelling/llama4.py
@@ -13,7 +13,14 @@
# limitations under the License.
# Note: adapted from # https://github.com/vllm-project/llm-compressor/blob/main/src/llmcompressor/modeling/llama4.py
import torch
from transformers.modeling_utils import no_init_weights
import transformers
from packaging import version

transformers_version = version.parse(transformers.__version__)
if transformers_version < version.parse("5.0.0"):
from transformers.modeling_utils import no_init_weights
else:
from transformers.initialization import no_init_weights
from transformers.models.llama4.modeling_llama4 import Llama4Config, Llama4TextMLP, Llama4TextMoe

from auto_round.modelling.replace_modules import ReplacementModuleBase
41 changes: 28 additions & 13 deletions auto_round_extension/torch/qlinear_torch.py
@@ -25,6 +25,25 @@
logger = getLogger(__name__)


_DEVICE_WF_3BITS_TENSORS = {}
# Bit-shift offsets used to unpack 3-bit weights from packed int32 words
_wf_3bits = [
[0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0],
[0, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31],
[0, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0],
]


def get_wf_3bits_tensor(device):
"""Get device-specific wf_3bits tensor, creating it if needed."""
device_str = str(device)
if device_str not in _DEVICE_WF_3BITS_TENSORS:
_DEVICE_WF_3BITS_TENSORS[device_str] = torch.tensor(_wf_3bits, dtype=torch.int32, device=device).reshape(
1, 3, 12
)
return _DEVICE_WF_3BITS_TENSORS[device_str]


class QuantLinear(nn.Module):
"""
Torch quantized linear layer.
@@ -72,17 +91,11 @@ def __init__(self, bits, group_size, infeatures, outfeatures, bias, trainable=Fa

# is performed by unpacking the weights and using torch.matmul
if self.bits in [2, 4, 8]:
self.wf = torch.tensor(list(range(0, 32, self.bits)), dtype=torch.int32).unsqueeze(0)
elif self.bits == 3:
self.wf = torch.tensor(
[
[0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0],
[0, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31],
[0, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0],
],
dtype=torch.int32,
).reshape(1, 3, 12)

list(range(0, 32, self.bits)), dtype=torch.int32, device=self.qweight.device
).unsqueeze(0)
elif self.bits == 3:
self.wf = get_wf_3bits_tensor(device=self.qweight.device)
self.dequant_dtype = torch.int16 if self.bits == 8 else torch.int8

def post_init(self):
@@ -277,7 +290,9 @@ def forward(self, x):

if self.bits in [2, 4, 8]:
if self.wf.device != self.qzeros.device:
self.wf = self.wf.to(self.qzeros.device)
self.wf = torch.tensor(
list(range(0, 32, self.bits)), dtype=torch.int32, device=self.qzeros.device
).unsqueeze(0)
zeros = torch.bitwise_right_shift(
torch.unsqueeze(self.qzeros, 2).expand(-1, -1, 32 // self.bits),
self.wf.unsqueeze(0),
@@ -293,7 +308,7 @@ def forward(self, x):
)
elif self.bits == 3:
if self.wf.device != self.qzeros.device:
self.wf = self.wf.to(self.qzeros.device)
self.wf = get_wf_3bits_tensor(self.qzeros.device)
zeros = self.qzeros.reshape(self.qzeros.shape[0], self.qzeros.shape[1] // 3, 3, 1).expand(-1, -1, -1, 12)
zeros = zeros >> self.wf.unsqueeze(0)
zeros[:, :, 0, 10] = (zeros[:, :, 0, 10] & 0x3) | ((zeros[:, :, 1, 0] << 2) & 0x4)
@@ -335,7 +350,7 @@ def forward(self, x):
out = torch.matmul(x, weights)
out = out.to(x_dtype)
out = out.reshape(out_shape)
out = out + self.bias if self.bias is not None else out
out = (out + self.bias).to(x_dtype) if self.bias is not None else out
return out


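Illustrative sketch (not part of the diff): the new get_wf_3bits_tensor helper builds the 3-bit shift-offset table once per device string and reuses it, instead of re-creating it or moving it with .to() on every device mismatch.

import torch
from auto_round_extension.torch.qlinear_torch import get_wf_3bits_tensor

first = get_wf_3bits_tensor("cpu")
second = get_wf_3bits_tensor("cpu")
assert first is second              # same cached tensor object per device
assert first.shape == (1, 3, 12)    # offsets laid out for 3-bit unpacking
assert first.dtype == torch.int32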
22 changes: 10 additions & 12 deletions auto_round_extension/torch/qlinear_torch_zp.py
@@ -21,6 +21,7 @@
import transformers

from auto_round.utils import get_packing_device
from auto_round_extension.torch.qlinear_torch import get_wf_3bits_tensor

logger = getLogger(__name__)

@@ -72,16 +73,11 @@ def __init__(self, bits, group_size, infeatures, outfeatures, bias, trainable=Fa

# is performed by unpacking the weights and using torch.matmul
if self.bits in [2, 4, 8]:
self.wf = torch.tensor(list(range(0, 32, self.bits)), dtype=torch.int32).unsqueeze(0)
else: ## bits == 3
self.wf = torch.tensor(
[
[0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0],
[0, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31],
[0, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0],
],
dtype=torch.int32,
).reshape(1, 3, 12)
list(range(0, 32, self.bits)), dtype=torch.int32, device=self.qweight.device
).unsqueeze(0)
else: ## bits == 3
self.wf = get_wf_3bits_tensor(device=self.qweight.device)

self.dequant_dtype = torch.int16 if self.bits == 8 else torch.int8

@@ -276,7 +272,9 @@ def forward(self, x):

if self.bits in [2, 4, 8]:
if self.wf.device != self.qzeros.device:
self.wf = self.wf.to(self.qzeros.device)
self.wf = torch.tensor(
list(range(0, 32, self.bits)), dtype=torch.int32, device=self.qzeros.device
).unsqueeze(0)
zeros = torch.bitwise_right_shift(
torch.unsqueeze(self.qzeros, 2).expand(-1, -1, 32 // self.bits),
self.wf.unsqueeze(0),
@@ -292,7 +290,7 @@ def forward(self, x):
)
elif self.bits == 3:
if self.wf.device != self.qzeros.device:
self.wf = self.wf.to(self.qzeros.device)
self.wf = get_wf_3bits_tensor(device=self.qzeros.device)
zeros = self.qzeros.reshape(self.qzeros.shape[0], self.qzeros.shape[1] // 3, 3, 1).expand(-1, -1, -1, 12)
zeros = zeros >> self.wf.unsqueeze(0)
zeros[:, :, 0, 10] = (zeros[:, :, 0, 10] & 0x3) | ((zeros[:, :, 1, 0] << 2) & 0x4)
@@ -335,7 +333,7 @@ def forward(self, x):
out = torch.matmul(x, weights)
out = out.to(x_dtype)
out = out.reshape(out_shape)
out = out + self.bias if self.bias is not None else out
out = (out + self.bias).to(x_dtype) if self.bias is not None else out
return out


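A short illustration (not part of the diff) of what the (out + self.bias).to(x_dtype) change in both QuantLinear forwards guards against: silent dtype promotion when the bias is stored in a wider dtype than the activations.

import torch

x_dtype = torch.bfloat16
out = torch.randn(2, 4, dtype=x_dtype)       # matmul result in the activation dtype
bias = torch.randn(4, dtype=torch.float32)   # bias kept in float32

assert (out + bias).dtype == torch.float32               # promotion without the cast
assert (out + bias).to(x_dtype).dtype == torch.bfloat16  # cast restores x_dtype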
3 changes: 3 additions & 0 deletions test/helpers.py
@@ -4,9 +4,12 @@
import pytest
import torch
import transformers
from packaging import version

from auto_round.utils import get_attr, llm_load_model, mllm_load_model, set_attr

transformers_version = version.parse(transformers.__version__)


# Automatic choose local path or model name.
def get_model_path(model_name: str) -> str:
2 changes: 1 addition & 1 deletion test/test_ark/requirements.txt
@@ -1,2 +1,2 @@
auto-round-kernel
lm-eval
lm_eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@main
15 changes: 12 additions & 3 deletions test/test_cpu/core/test_autoround.py
@@ -3,13 +3,14 @@

import pytest
import torch
from packaging import version
from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer

from auto_round import AutoRound
from auto_round.eval.evaluation import simple_evaluate_user_model
from auto_round.utils import get_module

from ...helpers import get_model_path, model_infer, opt_name_or_path, qwen_name_or_path
from ...helpers import get_model_path, model_infer, opt_name_or_path, qwen_name_or_path, transformers_version


class TestAutoRound:
@@ -72,6 +73,10 @@ def test_remove_whole_block(self, tiny_opt_model_path, dataloader):
)
autoround.quantize()

@pytest.mark.skipif(
transformers_version >= version.parse("5.0"),
reason="PhiConfig missing pad_token_id, https://github.com/huggingface/transformers/pull/43453",
)
def test_consecutive_quant(self, tiny_opt_model_path, tiny_phi2_model_path, dataloader):
bits, group_size, sym = 4, -1, False
autoround = AutoRound(
@@ -456,8 +461,12 @@ def test_not_convert_modules(self):
model = Qwen2VLForConditionalGeneration.from_pretrained(
model_name, quantization_config=quantization_config, device_map="cpu", torch_dtype=torch.float16
)
assert isinstance(model.visual.blocks[0].attn.qkv, torch.nn.Linear)
assert not isinstance(model.visual.merger.mlp[0], QuantLinear)
if transformers_version < version.parse("5.0.0"):
assert isinstance(model.visual.blocks[0].attn.qkv, torch.nn.Linear)
assert not isinstance(model.visual.merger.mlp[0], QuantLinear)
else:
assert isinstance(model.model.visual.blocks[0].attn.qkv, torch.nn.Linear)
assert not isinstance(model.model.visual.merger.mlp[0], QuantLinear)
if hasattr(model.model, "language_model"):
assert isinstance(model.model.language_model.layers[0].self_attn.v_proj, QuantLinear)
else:
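A hypothetical helper (not part of the diff) capturing the same split the test branches on: in transformers >= 5.0 the Qwen2-VL vision tower is reached via model.model.visual rather than model.visual.

from packaging import version

def get_visual_tower(model, transformers_version):
    """Return the vision tower for either transformers major version."""
    if transformers_version < version.parse("5.0.0"):
        return model.visual
    return model.model.visual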
7 changes: 5 additions & 2 deletions test/test_cpu/export/test_export.py
@@ -262,7 +262,10 @@ def test_static_afp8_export(self, static_kv_dtype):
assert "model.decoder.layers.8.self_attn.v_scale" in f.keys()
assert f.get_tensor("model.decoder.layers.5.self_attn.v_scale").shape == torch.Size([1])
assert f.get_tensor("model.decoder.layers.5.self_attn.k_scale").shape == torch.Size([1])
assert f.get_tensor("model.decoder.layers.5.self_attn.k_scale").dtype == torch.float32
assert (
f.get_tensor("model.decoder.layers.5.self_attn.k_scale").dtype == torch.float32
or f.get_tensor("model.decoder.layers.5.self_attn.k_scale").dtype == torch.bfloat16
)
shutil.rmtree(quantized_model_path, ignore_errors=True)

model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
@@ -318,7 +321,7 @@ def test_static_fp8_attn(self):
weight_name = f"model.decoder.layers.8.self_attn.{attr}"
assert weight_name in f.keys()
assert f.get_tensor(weight_name).shape == torch.Size([1])
assert f.get_tensor(weight_name).dtype == torch.float32
assert f.get_tensor(weight_name).dtype == torch.float32 or f.get_tensor(weight_name).dtype == torch.bfloat16

shutil.rmtree(quantized_model_path, ignore_errors=True)

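As a side note (not part of the diff), the relaxed dtype checks above could equally be written as a membership test, which keeps each assertion on one line.

assert f.get_tensor(weight_name).dtype in (torch.float32, torch.bfloat16)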
14 changes: 13 additions & 1 deletion test/test_cpu/export/test_gguf_format.py
@@ -4,16 +4,22 @@

import pytest
import torch
from packaging import version
from transformers import AutoModelForCausalLM, AutoTokenizer

from auto_round import AutoRound

from ...helpers import get_model_path, get_tiny_model, save_tiny_model
from ...helpers import get_model_path, get_tiny_model, save_tiny_model, transformers_version

AUTO_ROUND_PATH = __file__.split("/")
AUTO_ROUND_PATH = "/".join(AUTO_ROUND_PATH[: AUTO_ROUND_PATH.index("test")])


@pytest.mark.skipif(
transformers_version >= version.parse("5.0.0"),
reason="GGUF format saving and loading failed in transformers v5, \
https://github.com/huggingface/transformers/issues/43482",
)
class TestGGUF:

@classmethod
@@ -60,6 +66,12 @@ def test_q4_0(self):

autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q4_0")
gguf_file = os.listdir(quantized_model_path)[0]

# TODO: fix the issue of gguf loading error in transformers v5
# cls = <class 'transformers.generation.configuration_utils.GenerationConfig'>, json_file = None
# def _dict_from_json_file(cls, json_file: str | os.PathLike):
# > with open(json_file, "r", encoding="utf-8") as reader:
# E TypeError: expected str, bytes or os.PathLike object, not NoneType
model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto")
text = "There is a girl who likes adventure,"
inputs = self.tokenizer(text, return_tensors="pt").to(model.device)
8 changes: 8 additions & 0 deletions test/test_cpu/integrations/test_llmc_integration.py
@@ -3,10 +3,13 @@
from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme
from llmcompressor import oneshot
from llmcompressor.modifiers.autoround import AutoRoundModifier
from packaging import version
from transformers import AutoModelForCausalLM, AutoTokenizer

from auto_round.calib_dataset import get_dataset

from ...helpers import transformers_version

recipe_str = """
quant_stage:
quant_modifiers:
@@ -39,6 +42,11 @@
)


@pytest.mark.skipif(
transformers_version >= version.parse("5.0.0"),
reason="transformers 5.0 use_auth_token is deprecated and llmcompressor oneshot has not been updated yet, \
https://github.com/vllm-project/llm-compressor/issues/2289",
)
@pytest.mark.parametrize(
"recipe",
[