Merged
29 commits
dde4ba9
fix cpu ut for transformersv5
sys-lpot-val Jan 23, 2026
735ca68
minor fix
sys-lpot-val Jan 23, 2026
7d335a7
merge main
sys-lpot-val Jan 23, 2026
dafcc5f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 23, 2026
f45be48
update issue link
sys-lpot-val Jan 26, 2026
f35e42a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 26, 2026
e0c16ba
fix wf init
sys-lpot-val Jan 26, 2026
dbf8cb4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 26, 2026
e0623fa
revert wf
sys-lpot-val Jan 26, 2026
b4f0108
fix mixed type
sys-lpot-val Jan 26, 2026
fc1a96b
fix copy out of meta tensor issue
WeiweiZhang1 Jan 27, 2026
a51ff10
fix copy out of meta tensor issue
WeiweiZhang1 Jan 27, 2026
6f3691a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 27, 2026
e10c525
fix lm_eval import issue and update requirement to suit transformers …
xin3he Jan 27, 2026
8b4f2d5
skip PhiConfig has no attribute 'pad_token_id'
sys-lpot-val Jan 27, 2026
c3fec96
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 27, 2026
a55841c
fix mixed type in qlinera_torch_zp
sys-lpot-val Jan 27, 2026
e725099
update no_init_weights import for gpt-oss
xin3he Jan 27, 2026
85eb094
skip diffusers
sys-lpot-val Jan 27, 2026
a93c9e3
skip phiconfig
sys-lpot-val Jan 27, 2026
eb6d09e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 27, 2026
d9a9e4e
update vllm and sglang to use git main branch
xin3he Jan 27, 2026
8ca34fe
skip diffusers import
sys-lpot-val Jan 27, 2026
de7819f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 27, 2026
ec19894
Merge branch 'main' into kaihui/v5_cpu_ut
Kaihui-intel Jan 27, 2026
4bd8b48
set trust_remote_code=False for tiny-deepseek
xin3he Jan 27, 2026
fb21e4e
set pad_token_id to None in setup_llama4 for compatibility with Llama…
xin3he Jan 27, 2026
c4c74f0
fix
XuehaoSun Jan 27, 2026
2ba863a
Merge branch 'main' into kaihui/v5_cpu_ut
n1ck-guo Jan 28, 2026
10 changes: 3 additions & 7 deletions auto_round/eval/eval_cli.py
@@ -279,11 +279,7 @@ def eval_task_by_task(
device_str, parallelism = get_device_and_parallelism(device)

# load after _eval_int in order to make sure import torch after set CUDA_VISIBLE_DEVICES
import traceback

from lm_eval import simple_evaluate as lm_simple_evaluate # pylint: disable=E0401
from lm_eval.models.huggingface import HFLM # pylint: disable=E0401
from transformers import AutoModelForCausalLM, AutoTokenizer

from auto_round.utils import logger

@@ -402,7 +398,7 @@ def _evaluate_tasks_with_retry(tasks, hflm, device_str, batch_size, limit, retry
import time
import traceback

from lm_eval import simple_evaluate as lm_simple_evaluate # pylint: disable=E0401
import lm_eval # pylint: disable=E0401
from lm_eval.utils import make_table # pylint: disable=E0401

from auto_round.utils import logger
@@ -418,7 +414,7 @@ def _evaluate_tasks_with_retry(tasks, hflm, device_str, batch_size, limit, retry
current_retry_times = retry_times
while current_retry_times:
try:
res = lm_simple_evaluate(
res = lm_eval.simple_evaluate(
model=hflm, model_args=None, device=device_str, tasks=task, batch_size=batch_size, limit=limit
)
break
@@ -430,7 +426,7 @@ def _evaluate_tasks_with_retry(tasks, hflm, device_str, batch_size, limit, retry
for k, v in hflm.batch_sizes.items():
hflm.batch_sizes[k] = max(v // 2, 1)
logger.warning(f"Out of memory, reset batch_size to {hflm.batch_sizes} and re-try.")
res = lm_simple_evaluate(
res = lm_eval.simple_evaluate(
model=hflm, model_args=None, device=device_str, tasks=task, batch_size=1, limit=limit
)
hflm.batch_sizes = ori_batch_sizes
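A minimal sketch (outside the diff) of the call style these hunks switch to, importing lm_eval at module level and going through lm_eval.simple_evaluate; the model name and task below are placeholders.

import lm_eval  # pylint: disable=E0401
from lm_eval.models.huggingface import HFLM  # pylint: disable=E0401

# Wrap any HF causal LM, then evaluate through the module-level entry point.
hflm = HFLM(pretrained="facebook/opt-125m", batch_size=8)
res = lm_eval.simple_evaluate(model=hflm, tasks=["lambada_openai"], limit=10)
print(res["results"])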
8 changes: 4 additions & 4 deletions auto_round/eval/evaluation.py
@@ -31,7 +31,7 @@ def simple_evaluate_user_model(
mllm: bool = False,
**kwargs,
):
from lm_eval import simple_evaluate as lm_simple_evaluate # pylint: disable=E0401
import lm_eval # pylint: disable=E0401
from lm_eval.models.huggingface import HFLM # pylint: disable=E0401

if mllm:
@@ -57,7 +57,7 @@ def simple_evaluate_user_model(
dtype=eval_model_dtype,
add_bos_token=add_bos_token,
)
return lm_simple_evaluate(
return lm_eval.simple_evaluate(
model=hflm, model_args=None, batch_size=batch_size, max_batch_size=max_batch_size, limit=limit, **kwargs
)

@@ -71,9 +71,9 @@ def simple_evaluate(
device: Optional[str] = None,
**kwargs,
):
from lm_eval import simple_evaluate as lm_simple_evaluate # pylint: disable=E0401
import lm_eval # pylint: disable=E0401

return lm_simple_evaluate(
return lm_eval.simple_evaluate(
model=model,
model_args=model_args,
batch_size=batch_size,
12 changes: 10 additions & 2 deletions auto_round/modelling/gpt_oss.py
@@ -14,8 +14,16 @@


import torch
import transformers
from packaging import version
from torch import nn
from transformers.modeling_utils import no_init_weights as skip_weights_initialize

transformers_version = version.parse(transformers.__version__)
if transformers_version < version.parse("5.0.0"):
from transformers.modeling_utils import no_init_weights
else:
from transformers.initialization import no_init_weights

from transformers.models.gpt_oss.configuration_gpt_oss import GptOssConfig
from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP

@@ -71,7 +79,7 @@ def __init__(self, original: GptOssMLP, config: GptOssConfig):
# Build per-expert MLPs
self.experts = nn.ModuleList()
target_device = next(original.experts.parameters()).device
with skip_weights_initialize(), torch.device("meta"):
with no_init_weights(), torch.device("meta"):
for _ in range(E):
self.experts.append(GPTOssSingleExpert(hidden_size, intermediate_size, dtype=dtype))

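For reference (not part of the diff), the version gate above, repeated in llama4.py below, resolves no_init_weights from its pre-/post-v5 location; a self-contained sketch of the pattern, assuming only what the hunk itself shows about transformers 5.0.

import torch
import transformers
from packaging import version
from torch import nn

if version.parse(transformers.__version__) < version.parse("5.0.0"):
    from transformers.modeling_utils import no_init_weights
else:
    from transformers.initialization import no_init_weights

# Allocate modules on the meta device without running weight initialization,
# as GPTOssMoE does when rebuilding its per-expert MLPs.
with no_init_weights(), torch.device("meta"):
    expert = nn.Linear(32, 32, bias=False)
assert expert.weight.device.type == "meta"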
9 changes: 8 additions & 1 deletion auto_round/modelling/llama4.py
@@ -13,7 +13,14 @@
# limitations under the License.
# Note: adapted from # https://github.com/vllm-project/llm-compressor/blob/main/src/llmcompressor/modeling/llama4.py
import torch
from transformers.modeling_utils import no_init_weights
import transformers
from packaging import version

transformers_version = version.parse(transformers.__version__)
if transformers_version < version.parse("5.0.0"):
from transformers.modeling_utils import no_init_weights
else:
from transformers.initialization import no_init_weights
from transformers.models.llama4.modeling_llama4 import Llama4Config, Llama4TextMLP, Llama4TextMoe

from auto_round.modelling.replace_modules import ReplacementModuleBase
41 changes: 28 additions & 13 deletions auto_round_extension/torch/qlinear_torch.py
@@ -25,6 +25,25 @@
logger = getLogger(__name__)


_DEVICE_WF_3BITS_TENSORS = {}
# Bit-shift offsets used to unpack 3-bit weights from packed int32 words
_wf_3bits = [
[0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0],
[0, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31],
[0, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0],
]


def get_wf_3bits_tensor(device):
"""Get device-specific wf_3bits tensor, creating it if needed."""
device_str = str(device)
if device_str not in _DEVICE_WF_3BITS_TENSORS:
_DEVICE_WF_3BITS_TENSORS[device_str] = torch.tensor(_wf_3bits, dtype=torch.int32, device=device).reshape(
1, 3, 12
)
return _DEVICE_WF_3BITS_TENSORS[device_str]


class QuantLinear(nn.Module):
"""
Torch quantized linear layer.
@@ -72,17 +91,11 @@ def __init__(self, bits, group_size, infeatures, outfeatures, bias, trainable=Fa

# is performed by unpacking the weights and using torch.matmul
if self.bits in [2, 4, 8]:
self.wf = torch.tensor(list(range(0, 32, self.bits)), dtype=torch.int32).unsqueeze(0)
elif self.bits == 3:
self.wf = torch.tensor(
[
[0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0],
[0, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31],
[0, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0],
],
dtype=torch.int32,
).reshape(1, 3, 12)

list(range(0, 32, self.bits)), dtype=torch.int32, device=self.qweight.device
).unsqueeze(0)
elif self.bits == 3:
self.wf = get_wf_3bits_tensor(device=self.qweight.device)
self.dequant_dtype = torch.int16 if self.bits == 8 else torch.int8

def post_init(self):
@@ -277,7 +290,9 @@ def forward(self, x):

if self.bits in [2, 4, 8]:
if self.wf.device != self.qzeros.device:
self.wf = self.wf.to(self.qzeros.device)
self.wf = torch.tensor(
list(range(0, 32, self.bits)), dtype=torch.int32, device=self.qzeros.device
).unsqueeze(0)
zeros = torch.bitwise_right_shift(
torch.unsqueeze(self.qzeros, 2).expand(-1, -1, 32 // self.bits),
self.wf.unsqueeze(0),
@@ -293,7 +308,7 @@ def forward(self, x):
)
elif self.bits == 3:
if self.wf.device != self.qzeros.device:
self.wf = self.wf.to(self.qzeros.device)
self.wf = get_wf_3bits_tensor(self.qzeros.device)
zeros = self.qzeros.reshape(self.qzeros.shape[0], self.qzeros.shape[1] // 3, 3, 1).expand(-1, -1, -1, 12)
zeros = zeros >> self.wf.unsqueeze(0)
zeros[:, :, 0, 10] = (zeros[:, :, 0, 10] & 0x3) | ((zeros[:, :, 1, 0] << 2) & 0x4)
@@ -335,7 +350,7 @@ def forward(self, x):
out = torch.matmul(x, weights)
out = out.to(x_dtype)
out = out.reshape(out_shape)
out = out + self.bias if self.bias is not None else out
out = (out + self.bias).to(x_dtype) if self.bias is not None else out
return out


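Illustrative sketch (not part of the diff): the new get_wf_3bits_tensor helper builds the 3-bit shift-offset table once per device string and reuses it, instead of re-creating it or moving it with .to() on every device mismatch.

import torch
from auto_round_extension.torch.qlinear_torch import get_wf_3bits_tensor

first = get_wf_3bits_tensor("cpu")
second = get_wf_3bits_tensor("cpu")
assert first is second              # same cached tensor object per device
assert first.shape == (1, 3, 12)    # offsets laid out for 3-bit unpacking
assert first.dtype == torch.int32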
22 changes: 10 additions & 12 deletions auto_round_extension/torch/qlinear_torch_zp.py
@@ -21,6 +21,7 @@
import transformers

from auto_round.utils import get_packing_device
from auto_round_extension.torch.qlinear_torch import get_wf_3bits_tensor

logger = getLogger(__name__)

@@ -72,16 +73,11 @@ def __init__(self, bits, group_size, infeatures, outfeatures, bias, trainable=Fa

# is performed by unpacking the weights and using torch.matmul
if self.bits in [2, 4, 8]:
self.wf = torch.tensor(list(range(0, 32, self.bits)), dtype=torch.int32).unsqueeze(0)
else: ## bits == 3
self.wf = torch.tensor(
[
[0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0],
[0, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31],
[0, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0],
],
dtype=torch.int32,
).reshape(1, 3, 12)
list(range(0, 32, self.bits)), dtype=torch.int32, device=self.qweight.device
).unsqueeze(0)
else: ## bits == 3
self.wf = get_wf_3bits_tensor(device=self.qweight.device)

self.dequant_dtype = torch.int16 if self.bits == 8 else torch.int8

@@ -276,7 +272,9 @@ def forward(self, x):

if self.bits in [2, 4, 8]:
if self.wf.device != self.qzeros.device:
self.wf = self.wf.to(self.qzeros.device)
self.wf = torch.tensor(
list(range(0, 32, self.bits)), dtype=torch.int32, device=self.qzeros.device
).unsqueeze(0)
zeros = torch.bitwise_right_shift(
torch.unsqueeze(self.qzeros, 2).expand(-1, -1, 32 // self.bits),
self.wf.unsqueeze(0),
@@ -292,7 +290,7 @@ def forward(self, x):
)
elif self.bits == 3:
if self.wf.device != self.qzeros.device:
self.wf = self.wf.to(self.qzeros.device)
self.wf = get_wf_3bits_tensor(device=self.qzeros.device)
zeros = self.qzeros.reshape(self.qzeros.shape[0], self.qzeros.shape[1] // 3, 3, 1).expand(-1, -1, -1, 12)
zeros = zeros >> self.wf.unsqueeze(0)
zeros[:, :, 0, 10] = (zeros[:, :, 0, 10] & 0x3) | ((zeros[:, :, 1, 0] << 2) & 0x4)
@@ -335,7 +333,7 @@ def forward(self, x):
out = torch.matmul(x, weights)
out = out.to(x_dtype)
out = out.reshape(out_shape)
out = out + self.bias if self.bias is not None else out
out = (out + self.bias).to(x_dtype) if self.bias is not None else out
return out


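A short illustration (not part of the diff) of what the (out + self.bias).to(x_dtype) change in both QuantLinear forwards guards against: silent dtype promotion when the bias is stored in a wider dtype than the activations.

import torch

x_dtype = torch.bfloat16
out = torch.randn(2, 4, dtype=x_dtype)       # matmul result in the activation dtype
bias = torch.randn(4, dtype=torch.float32)   # bias kept in float32

assert (out + bias).dtype == torch.float32               # promotion without the cast
assert (out + bias).to(x_dtype).dtype == torch.bfloat16  # cast restores x_dtype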
3 changes: 3 additions & 0 deletions test/helpers.py
@@ -4,9 +4,12 @@
import pytest
import torch
import transformers
from packaging import version

from auto_round.utils import get_attr, llm_load_model, mllm_load_model, set_attr

transformers_version = version.parse(transformers.__version__)


# Automatic choose local path or model name.
def get_model_path(model_name: str) -> str:
2 changes: 1 addition & 1 deletion test/test_ark/requirements.txt
@@ -1,2 +1,2 @@
auto-round-kernel
lm-eval
lm_eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@main
15 changes: 12 additions & 3 deletions test/test_cpu/core/test_autoround.py
@@ -3,13 +3,14 @@

import pytest
import torch
from packaging import version
from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer

from auto_round import AutoRound
from auto_round.eval.evaluation import simple_evaluate_user_model
from auto_round.utils import get_module

from ...helpers import get_model_path, model_infer, opt_name_or_path, qwen_name_or_path
from ...helpers import get_model_path, model_infer, opt_name_or_path, qwen_name_or_path, transformers_version


class TestAutoRound:
@@ -72,6 +73,10 @@ def test_remove_whole_block(self, tiny_opt_model_path, dataloader):
)
autoround.quantize()

@pytest.mark.skipif(
transformers_version >= version.parse("5.0"),
reason="PhiConfig missing pad_token_id, https://github.com/huggingface/transformers/pull/43453",
)
def test_consecutive_quant(self, tiny_opt_model_path, tiny_phi2_model_path, dataloader):
bits, group_size, sym = 4, -1, False
autoround = AutoRound(
@@ -456,8 +461,12 @@ def test_not_convert_modules(self):
model = Qwen2VLForConditionalGeneration.from_pretrained(
model_name, quantization_config=quantization_config, device_map="cpu", torch_dtype=torch.float16
)
assert isinstance(model.visual.blocks[0].attn.qkv, torch.nn.Linear)
assert not isinstance(model.visual.merger.mlp[0], QuantLinear)
if transformers_version < version.parse("5.0.0"):
assert isinstance(model.visual.blocks[0].attn.qkv, torch.nn.Linear)
assert not isinstance(model.visual.merger.mlp[0], QuantLinear)
else:
assert isinstance(model.model.visual.blocks[0].attn.qkv, torch.nn.Linear)
assert not isinstance(model.model.visual.merger.mlp[0], QuantLinear)
if hasattr(model.model, "language_model"):
assert isinstance(model.model.language_model.layers[0].self_attn.v_proj, QuantLinear)
else:
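A hypothetical helper (not part of the diff) capturing the same split the test branches on: in transformers >= 5.0 the Qwen2-VL vision tower is reached via model.model.visual rather than model.visual.

from packaging import version

def get_visual_tower(model, transformers_version):
    """Return the vision tower for either transformers major version."""
    if transformers_version < version.parse("5.0.0"):
        return model.visual
    return model.model.visual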
7 changes: 5 additions & 2 deletions test/test_cpu/export/test_export.py
@@ -262,7 +262,10 @@ def test_static_afp8_export(self, static_kv_dtype):
assert "model.decoder.layers.8.self_attn.v_scale" in f.keys()
assert f.get_tensor("model.decoder.layers.5.self_attn.v_scale").shape == torch.Size([1])
assert f.get_tensor("model.decoder.layers.5.self_attn.k_scale").shape == torch.Size([1])
assert f.get_tensor("model.decoder.layers.5.self_attn.k_scale").dtype == torch.float32
assert (
f.get_tensor("model.decoder.layers.5.self_attn.k_scale").dtype == torch.float32
or f.get_tensor("model.decoder.layers.5.self_attn.k_scale").dtype == torch.bfloat16
)
shutil.rmtree(quantized_model_path, ignore_errors=True)

model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
@@ -318,7 +321,7 @@ def test_static_fp8_attn(self):
weight_name = f"model.decoder.layers.8.self_attn.{attr}"
assert weight_name in f.keys()
assert f.get_tensor(weight_name).shape == torch.Size([1])
assert f.get_tensor(weight_name).dtype == torch.float32
assert f.get_tensor(weight_name).dtype == torch.float32 or f.get_tensor(weight_name).dtype == torch.bfloat16

shutil.rmtree(quantized_model_path, ignore_errors=True)

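As a side note (not part of the diff), the relaxed dtype checks above could equally be written as a membership test, which keeps each assertion on one line.

assert f.get_tensor(weight_name).dtype in (torch.float32, torch.bfloat16)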
14 changes: 13 additions & 1 deletion test/test_cpu/export/test_gguf_format.py
@@ -4,16 +4,22 @@

import pytest
import torch
from packaging import version
from transformers import AutoModelForCausalLM, AutoTokenizer

from auto_round import AutoRound

from ...helpers import get_model_path, get_tiny_model, save_tiny_model
from ...helpers import get_model_path, get_tiny_model, save_tiny_model, transformers_version

AUTO_ROUND_PATH = __file__.split("/")
AUTO_ROUND_PATH = "/".join(AUTO_ROUND_PATH[: AUTO_ROUND_PATH.index("test")])


@pytest.mark.skipif(
transformers_version >= version.parse("5.0.0"),
reason="GGUF format saving and loading failed in transformers v5, \
https://github.com/huggingface/transformers/issues/43482",
)
class TestGGUF:

@classmethod
@@ -60,6 +66,12 @@ def test_q4_0(self):

autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q4_0")
gguf_file = os.listdir(quantized_model_path)[0]

# TODO: fix the issue of gguf loading error in transformers v5
# cls = <class 'transformers.generation.configuration_utils.GenerationConfig'>, json_file = None
# def _dict_from_json_file(cls, json_file: str | os.PathLike):
# > with open(json_file, "r", encoding="utf-8") as reader:
# E TypeError: expected str, bytes or os.PathLike object, not NoneType
model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto")
text = "There is a girl who likes adventure,"
inputs = self.tokenizer(text, return_tensors="pt").to(model.device)
8 changes: 8 additions & 0 deletions test/test_cpu/integrations/test_llmc_integration.py
@@ -3,10 +3,13 @@
from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme
from llmcompressor import oneshot
from llmcompressor.modifiers.autoround import AutoRoundModifier
from packaging import version
from transformers import AutoModelForCausalLM, AutoTokenizer

from auto_round.calib_dataset import get_dataset

from ...helpers import transformers_version

recipe_str = """
quant_stage:
quant_modifiers:
@@ -39,6 +42,11 @@
)


@pytest.mark.skipif(
transformers_version >= version.parse("5.0.0"),
reason="transformers 5.0 use_auth_token is deprecated and llmcompressor oneshot has not been updated yet, \
https://github.com/vllm-project/llm-compressor/issues/2289",
)
@pytest.mark.parametrize(
"recipe",
[