Draft
33 commits
5ee2c2d
init moe support
yiliu30 Oct 23, 2025
c278f9d
add test
yiliu30 Oct 23, 2025
418e6a0
fix import
yiliu30 Oct 23, 2025
184783f
clean envs
yiliu30 Oct 23, 2025
b9da06f
add script for apply ext
yiliu30 Oct 23, 2025
187f38d
clean docs
yiliu30 Oct 23, 2025
4031724
fix license
yiliu30 Oct 23, 2025
5fe01ef
fix
yiliu30 Oct 23, 2025
73f1e9b
fix import and sitecustomize
yiliu30 Oct 23, 2025
8495854
move to ext
yiliu30 Oct 24, 2025
c473934
update mxfp4
yiliu30 Oct 24, 2025
9f65bd1
fix
yiliu30 Oct 24, 2025
8038a5f
fix model name
yiliu30 Oct 24, 2025
e0872b6
Merge branch 'main' into vllm-ext
yiliu30 Oct 24, 2025
c82bce1
fix
yiliu30 Oct 27, 2025
19e18c7
Merge branch 'vllm-ext' of https://github.com/intel/auto-round into v…
yiliu30 Oct 27, 2025
adf7ebf
use absolute path
yiliu30 Oct 27, 2025
59f5cd2
Merge branch 'main' into vllm-ext
yiliu30 Oct 27, 2025
8f27041
Merge branch 'main' into vllm-ext
yiliu30 Oct 30, 2025
ad8537c
fix
yiliu30 Oct 30, 2025
77844f6
mark round method as todo
yiliu30 Oct 30, 2025
ce985ef
tmp wa for llmc
yiliu30 Oct 31, 2025
8832530
tmp wa for llmc
yiliu30 Oct 31, 2025
361491f
return ds
yiliu30 Nov 2, 2025
db65d74
add more log
yiliu30 Nov 2, 2025
60a0023
refine code
yiliu30 Nov 3, 2025
2f96c13
Merge branch 'llmc' of https://github.com/intel/auto-round into llmc
yiliu30 Nov 3, 2025
7a1716e
refactor
Nov 3, 2025
a20f9df
refactor
Nov 4, 2025
553ee5c
fix offloaf
Nov 5, 2025
2bd3c4b
fix
Nov 5, 2025
b992c31
remove time
yiliu30 Nov 5, 2025
0354c2b
update
yiliu30 Nov 5, 2025
41 changes: 27 additions & 14 deletions auto_round/calib_dataset.py
@@ -632,15 +632,8 @@ def select_dataset(dataset, indices):
return dataset


def get_dataloader(
tokenizer,
seqlen,
dataset_name="NeelNanda/pile-10k",
seed=42,
bs=8,
nsamples=512,
):
"""Generate a DataLoader for calibration using specified parameters.
def get_dataset(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, bs=8, nsamples=512):
"""Generate a dataset for calibration.

Args:
tokenizer (Tokenizer): The tokenizer to use for tokenization.
@@ -655,7 +648,7 @@ def get_dataloader(
apply_chat_template: Whether to apply chat template in tokenization.

Returns:
DataLoader: The DataLoader for the calibrated dataset.
Dataset: The processed dataset ready for calibration.
"""
dataset_names = dataset_name.split(",")

@@ -823,7 +816,29 @@ def concat_dataset_element(dataset):
else:
dataset_final = datasets[0]

# dataset_final = datasets[0]
if len(dataset_final) > nsamples:
dataset_final = select_dataset(dataset_final, range(nsamples))
return dataset_final


def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, bs=8, nsamples=512):
"""Generate a DataLoader for calibration using specified parameters.

Args:
tokenizer (Tokenizer): The tokenizer to use for tokenization.
seqlen (int): The exact sequence length. samples < seqlen will be dropped,
samples longer than seqlen will be truncated
dataset_name (str, optional): The name of the dataset or datasets separated by commas.
Defaults to "NeelNanda/pile-10k".
split (str, optional): The data split to use. Defaults to None.
seed (int, optional): The random seed for reproducibility. Defaults to 42.
bs (int, optional): The batch size. Defaults to 4.
nsamples (int, optional): The total number of samples to include. Defaults to 512.
apply_chat_template: Whether to apply chat template in tokenization.

Returns:
DataLoader: The DataLoader for the calibrated dataset.
"""

@torch.no_grad()
def collate_batch(batch):
@@ -849,8 +864,6 @@ def collate_batch(batch):
res = {"input_ids": input_ids_new, "attention_mask": attention_mask_new}
return res

if len(dataset_final) > nsamples:
dataset_final = select_dataset(dataset_final, range(nsamples))

dataset_final = get_dataset(tokenizer, seqlen, dataset_name, seed, bs, nsamples)
calib_dataloader = DataLoader(dataset_final, batch_size=bs, shuffle=False, collate_fn=collate_batch)
return calib_dataloader
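
Note for reviewers: a minimal usage sketch of the split above, assuming only what the diff shows — get_dataset returns the tokenized, nsamples-capped dataset, and get_dataloader is now a thin wrapper that batches it with the padding collate_batch. The tokenizer name below is an arbitrary placeholder, not something this PR uses.

from transformers import AutoTokenizer
from auto_round.calib_dataset import get_dataset, get_dataloader

# Placeholder tokenizer; any HF tokenizer works for the sketch.
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")

# New entry point: returns the processed calibration dataset directly,
# so external callers (e.g. llmc) can build their own loader.
calib_ds = get_dataset(tokenizer, seqlen=2048, dataset_name="NeelNanda/pile-10k", nsamples=128)

# Existing entry point: internally calls get_dataset and wraps the result
# in a DataLoader with collate_batch for padding/truncation.
calib_loader = get_dataloader(tokenizer, seqlen=2048, nsamples=128, bs=8)
for batch in calib_loader:
    print(batch["input_ids"].shape)  # (bs, seqlen)
    break
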
100 changes: 52 additions & 48 deletions auto_round/compressors/base.py
@@ -20,7 +20,7 @@
import traceback
from collections import defaultdict
from dataclasses import asdict, fields
from typing import Any, Callable, Union
from typing import Any, Callable, Optional, Union

import accelerate
import torch
@@ -85,6 +85,7 @@
is_hpex_available,
llm_load_model,
mv_module_from_gpu,
normalize_input,
set_amax_for_all_moe_layers,
set_module,
to_device,
@@ -351,7 +352,8 @@ def __init__(
# Some helpers
if "hpu" in str(self.device):
self.inner_supported_types = tuple(x for x in INNER_SUPPORTED_LAYER_TYPES if x != "FP8Linear")
self.batch_dim = None
# TODO: check with heng/weiwei
self.batch_dim = 0
Reviewer comment (Contributor): if this is required, hide it in kwargs and add comments.

self.infer_bs_coeff = 1

self.block_forward = compile_func(block_forward, self.device) if self.enable_torch_compile else block_forward
@@ -1495,6 +1497,21 @@ def _update_inputs(self, inputs: dict, q_inputs: dict) -> tuple[dict, torch.Tens
q_inputs = q_inputs.pop(input_id_str[0], None)
return inputs, q_inputs

def configure_layer_config(self, enable_gguf_official_mixed: None | bool = False):
Reviewer comment (Contributor): better to set enable_gguf_official_mixed to True by default.

self.layer_config, self.has_qlayer_outside_block, self.regex_config = set_layer_config(
self.model,
self.layer_config,
self.scheme,
self.scale_dtype,
self.supported_types,
self.inner_supported_types,
self.quant_block_list,
self.fp_layers,
self.quant_lm_head,
enable_gguf_official_mixed=enable_gguf_official_mixed,
is_mllm=self.mllm,
)

def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]:
"""Quantize the model and return the quantized model along with layer configurations.The entry of AutoRound.
Returns:
@@ -1513,20 +1530,8 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]:
enable_gguf_official_mixed = True
else:
enable_gguf_official_mixed = False
self.layer_config, self.has_qlayer_outside_block, self.regex_config = set_layer_config(
self.model,
self.layer_config,
self.scheme,
self.scale_dtype,
self.supported_types,
self.inner_supported_types,
self.quant_block_list,
self.fp_layers,
self.quant_lm_head,
enable_gguf_official_mixed=enable_gguf_official_mixed,
is_mllm=self.mllm,
)

self.configure_layer_config(enable_gguf_official_mixed=enable_gguf_official_mixed)
if not hasattr(self, "formats"):
logger.warning("this API is deprecated, please use `quantize_and_save` instead")
else:
@@ -2420,13 +2425,14 @@ def _get_current_num_elm(
current_input_ids = [input_ids[i] for i in indices]
return sum(id.numel() for id in current_input_ids)

def _quantize_block(
def quantize_block(
self,
block: torch.nn.Module,
input_ids: Union[list[torch.Tensor], dict],
input_others: dict,
inputs: tuple[Union[list[torch.Tensor], dict, Any], Optional[dict]],
q_input: Union[torch.Tensor, dict, None] = None,
normalize_inputs: bool = False,
device: Union[str, torch.device] = "cpu",
auto_offload=True,
):
"""Quantize the weights of a given block of the model.

@@ -2445,30 +2451,34 @@ def _quantize_block(
if is_fp8_linear(m):
new_layer = convert_fp8_layer_to_linear(m, self.amp_dtype).to(device)
set_module(block, n, new_layer)
if normalize_inputs:
input_ids, input_others = normalize_input(inputs)
Reviewer comment (Contributor): why not move these changes to the llmc side?

else:
input_ids, input_others = inputs
if auto_offload:
if self.device_map == "auto" or (isinstance(self.device_map, str) and "," in self.device_map):
set_auto_device_map_for_block_with_tuning(
block, self.device_map, input_ids, self.low_gpu_mem_usage, self.mem_per_param_scale
)

if self.device_map == "auto" or (isinstance(self.device_map, str) and "," in self.device_map):
set_auto_device_map_for_block_with_tuning(
block, self.device_map, input_ids, self.low_gpu_mem_usage, self.mem_per_param_scale
)

if self.device_map is not None:
for n, m in block.named_modules():
if len(list(m.children())) != 0 or not hasattr(m, "tuning_device"):
continue
from accelerate.hooks import AlignDevicesHook, add_hook_to_module
if self.device_map is not None:
for n, m in block.named_modules():
if len(list(m.children())) != 0 or not hasattr(m, "tuning_device"):
continue
from accelerate.hooks import AlignDevicesHook, add_hook_to_module

hook = AlignDevicesHook(m.tuning_device, io_same_device=True)
add_hook_to_module(m, hook, True)
hook = AlignDevicesHook(m.tuning_device, io_same_device=True)
add_hook_to_module(m, hook, True)

if q_input is None:
hook_handles = self._register_act_max_hook(block)

output = self._get_block_outputs(
block, input_ids, input_others, self.batch_size * self.infer_bs_coeff, device, self.cache_device
)

for handle in hook_handles:
handle.remove()
if auto_offload:
for handle in hook_handles:
handle.remove()
else:
output = self._get_block_outputs(
block, input_ids, input_others, self.batch_size * self.infer_bs_coeff, device, self.cache_device
@@ -2565,6 +2575,7 @@ def _quantize_block(
best_params = {}
total_loss = 0
for i in range(self.iters):
logger.trace(f"Quant block iteration {i}/{self.iters}, best loss so far: {best_loss}")
total_loss = 0
if self.sampler == "rand":
whole_indices = torch.randperm(nsamples)[:pick_samples]
@@ -2587,7 +2598,7 @@
else:
tmp_attention_mask = 1.0
if self.amp:
with autocast(device_type=device.split(":")[0], dtype=self.amp_dtype):
with autocast(device_type=str(device).split(":")[0], dtype=self.amp_dtype):
loss = mse_loss( # pylint: disable=not-callable
output_q * tmp_attention_mask, current_output * tmp_attention_mask
)
@@ -2636,7 +2647,7 @@
if is_nv_fp(self.act_data_type):
# enable moe experts act_max automatic generation for WrapperWALayer
set_amax_for_all_moe_layers(block, attr_name="orig_layer.act_max")

q_outputs = None
if self.enable_quanted_input:
clear_memory()
q_outputs = self._get_block_outputs(
@@ -2647,19 +2658,13 @@
device,
cache_device=self.cache_device,
)
if auto_offload:
if self.device_map is not None:
accelerate.hooks.remove_hook_from_submodules(block)
mv_module_from_gpu(block)
clear_memory(input_ids)

return q_outputs, output
clear_memory(input_ids)

else:
if self.device_map is not None:
accelerate.hooks.remove_hook_from_submodules(block)
mv_module_from_gpu(block)
clear_memory(input_ids)
return None, output
return q_outputs, output

def _split_inputs(self, inputs: dict) -> tuple[torch.Tensor, dict]:
input_ids = inputs["input_ids"]
@@ -2733,9 +2738,9 @@ def _quantize_blocks(
else:
logger.info("using algorithm extension for quantization.")
except (ImportError, ModuleNotFoundError):
quantize_block = self._quantize_block
quantize_block = self.quantize_block
else:
quantize_block = self._quantize_block
quantize_block = self.quantize_block

if pbar is None:
pbar = tqdm(range(0, len(block_names), nblocks))
@@ -2756,8 +2761,7 @@
m = m.to(device)
q_input, input_ids = quantize_block(
m,
input_ids,
input_others,
(input_ids, input_others),
q_input=q_input,
device=device,
)
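Note for reviewers: a sketch of the new calling convention only, not a runnable end-to-end example — compressor, block, input_ids, input_others, q_input, and device stand for the objects _quantize_blocks already has in scope.

# Old internal form:
#   q_input, outputs = compressor._quantize_block(block, input_ids, input_others, q_input=q_input, device=device)
# New public form: the cached inputs travel as one tuple; external callers (e.g. llmc)
# can pass raw hook-captured inputs with normalize_inputs=True and keep the block
# on-device by setting auto_offload=False.
q_input, outputs = compressor.quantize_block(
    block,
    (input_ids, input_others),  # replaces the two positional arguments
    q_input=q_input,
    normalize_inputs=False,     # True when `inputs` is raw captured data needing normalize_input
    device=device,
    auto_offload=True,          # False skips device-map setup and the final offload of the block
)
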
2 changes: 1 addition & 1 deletion auto_round/compressors/utils.py
@@ -111,7 +111,7 @@ def block_forward(
alibi = input_others["alibi"]
input_others["alibi"] = alibi.reshape(-1, alibi.shape[2], alibi.shape[3])
if amp:
with autocast(device_type=device.split(":")[0], dtype=amp_dtype): # pragma: no cover
with autocast(device_type=str(device).split(":")[0], dtype=amp_dtype): # pragma: no cover
output = block(input_ids, *input_tuple, **input_others)
else:
output = block(input_ids, *input_tuple, **input_others)
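Note for reviewers: the str(device) change above guards against device arriving either as a string or as a torch.device object; a quick illustration of that assumption:

import torch

# "cuda:0".split(":") works, but torch.device("cuda:0") has no .split method,
# so both forms are normalized through str() before taking the device type.
for device in ("cuda:0", torch.device("cuda:0"), "cpu"):
    print(str(device).split(":")[0])  # cuda, cuda, cpu
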
25 changes: 25 additions & 0 deletions auto_round/utils/common.py
@@ -297,3 +297,28 @@ def get_reciprocal(tensor):
else:
tensor = torch.where(torch.abs(tensor) < 1e-30, 0, tensor)
return torch.where(tensor != 0, 1 / tensor, torch.zeros_like(tensor))


def normalize_input(cur_inputs):
input_ids = []
input_others = {}
positional_inputs = []
attention_mask = None
position_ids = None
cache_position = None
position_embeddings = (None, None)
for cur_inp in cur_inputs:
input_ids.append(cur_inp[0][0][0])
for key, val in cur_inp[0][1].items():
if key == "position_ids":
position_ids = val
elif key == "position_embeddings":
position_embeddings = val
elif key == "cache_position":
cache_position = val
input_others["position_ids"] = position_ids
input_others["positional_inputs"] = positional_inputs
input_others["attention_mask"] = attention_mask
input_others["position_embeddings"] = position_embeddings
input_others["cache_position"] = cache_position
return input_ids, input_others
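
Note for reviewers: from the indexing in normalize_input, each element of cur_inputs appears to be shaped like ((positional_args, kwargs), ...) as captured by a forward hook; the sketch below rests on that assumption (and on the module path auto_round.utils.common), not on documented behavior.

import torch
from auto_round.utils.common import normalize_input

hidden = torch.randn(1, 8, 16)                      # stand-in hidden states
captured = (
    (hidden,),                                      # positional args: first entry becomes input_ids
    {
        "position_ids": torch.arange(8).unsqueeze(0),
        "position_embeddings": (None, None),
        "cache_position": torch.arange(8),
    },
)
cur_inputs = [(captured,)]                          # one captured forward call

input_ids, input_others = normalize_input(cur_inputs)
print(len(input_ids), sorted(input_others.keys()))
# 1 ['attention_mask', 'cache_position', 'position_embeddings', 'position_ids', 'positional_inputs']
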
3 changes: 2 additions & 1 deletion auto_round/utils/model.py
@@ -443,7 +443,8 @@ def is_mllm_model(model_or_path: Union[str, torch.nn.Module]):
"pre_mm_projector_norm",
"vision",
]

# FIXME: yi, fix it later
return False
model_path = model_or_path if isinstance(model_or_path, str) else model_or_path.name_or_path
if not os.path.isdir(model_path):
model_path = download_hf_model(model_path)
Expand Down