Minor refactor for LLMC #993
Draft: yiliu30 wants to merge 33 commits into main from llmc (+107 −64)
Commits (33)
5ee2c2d  init moe support (yiliu30)
c278f9d  add test (yiliu30)
418e6a0  fix import (yiliu30)
184783f  clean envs (yiliu30)
b9da06f  add script for apply ext (yiliu30)
187f38d  clean docs (yiliu30)
4031724  fix license (yiliu30)
5fe01ef  fix (yiliu30)
73f1e9b  fix import and sitecustomize (yiliu30)
8495854  move to ext (yiliu30)
c473934  update mxfp4 (yiliu30)
9f65bd1  fix (yiliu30)
8038a5f  fix model name (yiliu30)
e0872b6  Merge branch 'main' into vllm-ext (yiliu30)
c82bce1  fix (yiliu30)
19e18c7  Merge branch 'vllm-ext' of https://github.com/intel/auto-round into v… (yiliu30)
adf7ebf  use absolute path (yiliu30)
59f5cd2  Merge branch 'main' into vllm-ext (yiliu30)
8f27041  Merge branch 'main' into vllm-ext (yiliu30)
ad8537c  fix (yiliu30)
77844f6  mark round method as todo (yiliu30)
ce985ef  tmp wa for llmc (yiliu30)
8832530  tmp wa for llmc (yiliu30)
361491f  return ds (yiliu30)
db65d74  add more log (yiliu30)
60a0023  refine code (yiliu30)
2f96c13  Merge branch 'llmc' of https://github.com/intel/auto-round into llmc (yiliu30)
7a1716e  refactor
a20f9df  refactor
553ee5c  fix offloaf
2bd3c4b  fix
b992c31  remove time (yiliu30)
0354c2b  update (yiliu30)
@@ -20,7 +20,7 @@
 import traceback
 from collections import defaultdict
 from dataclasses import asdict, fields
-from typing import Any, Callable, Union
+from typing import Any, Callable, Optional, Union

 import accelerate
 import torch
@@ -85,6 +85,7 @@
     is_hpex_available,
     llm_load_model,
     mv_module_from_gpu,
+    normalize_input,
     set_amax_for_all_moe_layers,
     set_module,
     to_device,
@@ -351,7 +352,8 @@ def __init__(
         # Some helpers
         if "hpu" in str(self.device):
             self.inner_supported_types = tuple(x for x in INNER_SUPPORTED_LAYER_TYPES if x != "FP8Linear")
-        self.batch_dim = None
+        # TODO: check with heng/weiwei
+        self.batch_dim = 0
         self.infer_bs_coeff = 1

         self.block_forward = compile_func(block_forward, self.device) if self.enable_torch_compile else block_forward
@@ -1495,6 +1497,21 @@ def _update_inputs(self, inputs: dict, q_inputs: dict) -> tuple[dict, torch.Tens
         q_inputs = q_inputs.pop(input_id_str[0], None)
         return inputs, q_inputs

+    def configure_layer_config(self, enable_gguf_official_mixed: None | bool = False):
+        self.layer_config, self.has_qlayer_outside_block, self.regex_config = set_layer_config(
+            self.model,
+            self.layer_config,
+            self.scheme,
+            self.scale_dtype,
+            self.supported_types,
+            self.inner_supported_types,
+            self.quant_block_list,
+            self.fp_layers,
+            self.quant_lm_head,
+            enable_gguf_official_mixed=enable_gguf_official_mixed,
+            is_mllm=self.mllm,
+        )
+
     def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]:
         """Quantize the model and return the quantized model along with layer configurations.The entry of AutoRound.
         Returns:

Review comment (on the new `configure_layer_config`): better to set `enable_gguf_official_mixed` to `True` by default.
@@ -1513,20 +1530,8 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]:
             enable_gguf_official_mixed = True
         else:
             enable_gguf_official_mixed = False
-        self.layer_config, self.has_qlayer_outside_block, self.regex_config = set_layer_config(
-            self.model,
-            self.layer_config,
-            self.scheme,
-            self.scale_dtype,
-            self.supported_types,
-            self.inner_supported_types,
-            self.quant_block_list,
-            self.fp_layers,
-            self.quant_lm_head,
-            enable_gguf_official_mixed=enable_gguf_official_mixed,
-            is_mllm=self.mllm,
-        )
+        self.configure_layer_config(enable_gguf_official_mixed=enable_gguf_official_mixed)

         if not hasattr(self, "formats"):
             logger.warning("this API is deprecated, please use `quantize_and_save` instead")
         else:
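The extracted step can now be run on its own before block-wise tuning. A rough usage sketch (the `ar` instance and its construction are illustrative assumptions, not code from this PR):

# Sketch only: assumes an already-constructed AutoRound compressor `ar`
# (model, scheme and calibration data set up elsewhere).
ar.configure_layer_config(enable_gguf_official_mixed=False)

# The resolved settings are then available on the instance:
print(len(ar.layer_config))         # per-layer quantization config
print(ar.has_qlayer_outside_block)  # whether quantized layers exist outside blocks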
@@ -2420,13 +2425,14 @@ def _get_current_num_elm(
         current_input_ids = [input_ids[i] for i in indices]
         return sum(id.numel() for id in current_input_ids)

-    def _quantize_block(
+    def quantize_block(
         self,
         block: torch.nn.Module,
-        input_ids: Union[list[torch.Tensor], dict],
-        input_others: dict,
+        inputs: tuple[Union[list[torch.Tensor], dict, Any], Optional[dict]],
         q_input: Union[torch.Tensor, dict, None] = None,
+        normalize_inputs: bool = False,
         device: Union[str, torch.device] = "cpu",
         auto_offload=True,
     ):
         """Quantize the weights of a given block of the model.
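For reference, a minimal call sketch against the new signature (names are illustrative; `input_ids` and `input_others` are assumed to be the cached block inputs from the calibration pass):

# Sketch only, not code from this PR: the two positional input arguments
# are now packed into a single tuple.
q_input, output = ar.quantize_block(
    block,                      # one decoder block of the model
    (input_ids, input_others),  # cached inputs, already in normalized form
    q_input=q_input,            # outputs of the previously quantized block, or None
    device="cuda:0",            # auto_offload keeps its default of True
)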
@@ -2445,30 +2451,34 @@ def _quantize_block(
             if is_fp8_linear(m):
                 new_layer = convert_fp8_layer_to_linear(m, self.amp_dtype).to(device)
                 set_module(block, n, new_layer)
+        if normalize_inputs:
+            input_ids, input_others = normalize_input(inputs)
+        else:
+            input_ids, input_others = inputs
+        if auto_offload:
+            if self.device_map == "auto" or (isinstance(self.device_map, str) and "," in self.device_map):
+                set_auto_device_map_for_block_with_tuning(
+                    block, self.device_map, input_ids, self.low_gpu_mem_usage, self.mem_per_param_scale
+                )
-
-        if self.device_map == "auto" or (isinstance(self.device_map, str) and "," in self.device_map):
-            set_auto_device_map_for_block_with_tuning(
-                block, self.device_map, input_ids, self.low_gpu_mem_usage, self.mem_per_param_scale
-            )
-
-        if self.device_map is not None:
-            for n, m in block.named_modules():
-                if len(list(m.children())) != 0 or not hasattr(m, "tuning_device"):
-                    continue
-                from accelerate.hooks import AlignDevicesHook, add_hook_to_module
+            if self.device_map is not None:
+                for n, m in block.named_modules():
+                    if len(list(m.children())) != 0 or not hasattr(m, "tuning_device"):
+                        continue
+                    from accelerate.hooks import AlignDevicesHook, add_hook_to_module

-                hook = AlignDevicesHook(m.tuning_device, io_same_device=True)
-                add_hook_to_module(m, hook, True)
+                    hook = AlignDevicesHook(m.tuning_device, io_same_device=True)
+                    add_hook_to_module(m, hook, True)

         if q_input is None:
             hook_handles = self._register_act_max_hook(block)

             output = self._get_block_outputs(
                 block, input_ids, input_others, self.batch_size * self.infer_bs_coeff, device, self.cache_device
             )

-            for handle in hook_handles:
-                handle.remove()
+            if auto_offload:
+                for handle in hook_handles:
+                    handle.remove()
         else:
             output = self._get_block_outputs(
                 block, input_ids, input_others, self.batch_size * self.infer_bs_coeff, device, self.cache_device

Review comment (on the `normalize_inputs` handling): why not move these changes to the LLMC side?
@@ -2565,6 +2575,7 @@ def _quantize_block(
         best_params = {}
         total_loss = 0
         for i in range(self.iters):
+            logger.trace(f"Quant block iteration {i}/{self.iters}, best loss so far: {best_loss}")
             total_loss = 0
             if self.sampler == "rand":
                 whole_indices = torch.randperm(nsamples)[:pick_samples]
@@ -2587,7 +2598,7 @@ def _quantize_block(
                 else:
                     tmp_attention_mask = 1.0
                 if self.amp:
-                    with autocast(device_type=device.split(":")[0], dtype=self.amp_dtype):
+                    with autocast(device_type=str(device).split(":")[0], dtype=self.amp_dtype):
                         loss = mse_loss(  # pylint: disable=not-callable
                             output_q * tmp_attention_mask, current_output * tmp_attention_mask
                         )
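The `str(device)` change matters because `device` can arrive here as a `torch.device` rather than a plain string; `torch.device` has no `.split` method, while `autocast` only needs the bare device type. A small illustration:

import torch

dev = torch.device("cuda:0")
# dev.split(":")            # would raise AttributeError on a torch.device object
device_type = str(dev).split(":")[0]
print(device_type)          # "cuda" -> a valid device_type for torch.autocast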
@@ -2636,7 +2647,7 @@ def _quantize_block(
         if is_nv_fp(self.act_data_type):
             # enable moe experts act_max automatic generation for WrapperWALayer
             set_amax_for_all_moe_layers(block, attr_name="orig_layer.act_max")
-
+        q_outputs = None
         if self.enable_quanted_input:
             clear_memory()
             q_outputs = self._get_block_outputs(
@@ -2647,19 +2658,13 @@ def _quantize_block(
                 device,
                 cache_device=self.cache_device,
             )
-            if self.device_map is not None:
-                accelerate.hooks.remove_hook_from_submodules(block)
-            mv_module_from_gpu(block)
-            clear_memory(input_ids)
-
-            return q_outputs, output
-
-        else:
+        if auto_offload:
             if self.device_map is not None:
                 accelerate.hooks.remove_hook_from_submodules(block)
             mv_module_from_gpu(block)
-            clear_memory(input_ids)
-            return None, output
+        clear_memory(input_ids)
+
+        return q_outputs, output

     def _split_inputs(self, inputs: dict) -> tuple[torch.Tensor, dict]:
         input_ids = inputs["input_ids"]
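Together with the unified return path (`q_outputs` defaults to `None` and both cases return `(q_outputs, output)`), the `auto_offload=False` mode lets an external runner, such as an LLMC-style pipeline, keep control of device placement itself. A hedged sketch of such a driver loop; the surrounding objects (`ar`, `blocks`, `cached_inputs`) and the exact layout of the cached inputs are assumptions, not part of this PR:

q_input = None
for block in blocks:
    block.to("cuda:0")                 # caller handles placement/offload itself
    q_input, _ = ar.quantize_block(
        block,
        cached_inputs,                 # raw captured inputs for this block
        q_input=q_input,
        normalize_inputs=True,         # let quantize_block unpack them via normalize_input
        auto_offload=False,            # skip AutoRound's device-map tuning and GPU eviction
        device="cuda:0",
    )
    block.to("cpu")                    # input propagation between blocks elided here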
@@ -2733,9 +2738,9 @@ def _quantize_blocks(
                 else:
                     logger.info("using algorithm extension for quantization.")
             except (ImportError, ModuleNotFoundError):
-                quantize_block = self._quantize_block
+                quantize_block = self.quantize_block
         else:
-            quantize_block = self._quantize_block
+            quantize_block = self.quantize_block

         if pbar is None:
             pbar = tqdm(range(0, len(block_names), nblocks))
@@ -2756,8 +2761,7 @@ def _quantize_blocks(
                 m = m.to(device)
             q_input, input_ids = quantize_block(
                 m,
-                input_ids,
-                input_others,
+                (input_ids, input_others),
                 q_input=q_input,
                 device=device,
             )
Review comment: if this is required, hide it in kwargs and add comments.