diff --git a/examples/quantization_w8a8_fp8/qwen3_reranker_example.py b/examples/quantization_w8a8_fp8/qwen3_reranker_example.py index 91dfba842d..08313e8140 100644 --- a/examples/quantization_w8a8_fp8/qwen3_reranker_example.py +++ b/examples/quantization_w8a8_fp8/qwen3_reranker_example.py @@ -61,7 +61,7 @@ scores = outputs.logits[:, -1, :].max(dim=-1).values for i, (doc, score) in enumerate(zip(documents, scores)): - print(f"Document {i+1} score: {score.item():.4f}") + print(f"Document {i + 1} score: {score.item():.4f}") print(f" Content: {doc[:80]}...") print("==========================================") diff --git a/examples/quantization_w8a8_int8/benchmark_smoothquant_ddp.py b/examples/quantization_w8a8_int8/benchmark_smoothquant_ddp.py index b9680644cf..522e2f2525 100644 --- a/examples/quantization_w8a8_int8/benchmark_smoothquant_ddp.py +++ b/examples/quantization_w8a8_int8/benchmark_smoothquant_ddp.py @@ -137,7 +137,7 @@ def tokenize(sample): logger.info(f"Calibration: {NUM_CALIBRATION_SAMPLES} samples total") logger.info(f"Samples/rank: {samples_per_rank}") logger.info(f"World size: {world_size}") - logger.info(f"Total time: {elapsed:.1f}s ({elapsed/60:.2f} min)") + logger.info(f"Total time: {elapsed:.1f}s ({elapsed / 60:.2f} min)") logger.info(f"Peak GPU mem: {peak_mem_gb:.2f} GB (rank 0)") logger.info("=" * 60) diff --git a/src/llmcompressor/pipelines/sequential/helpers.py b/src/llmcompressor/pipelines/sequential/helpers.py index fbbd7c9d51..53fc7d2386 100644 --- a/src/llmcompressor/pipelines/sequential/helpers.py +++ b/src/llmcompressor/pipelines/sequential/helpers.py @@ -4,11 +4,11 @@ from dataclasses import dataclass from functools import wraps from types import FunctionType, MethodType -from typing import TYPE_CHECKING, Any, Callable, Optional +from typing import TYPE_CHECKING, Any, Callable import torch from accelerate.hooks import remove_hook_from_module -from compressed_tensors.offload import disable_onloading, offload_model +from compressed_tensors.offload import disable_onloading from compressed_tensors.utils import patch_attr from compressed_tensors.utils.match import match_named_modules from loguru import logger @@ -22,7 +22,6 @@ from llmcompressor.modifiers import Modifier from llmcompressor.modifiers.utils.hooks import HooksMixin from llmcompressor.pipelines.sequential.transformers_helpers import HFTracer -from llmcompressor.utils.dev import get_main_device from llmcompressor.utils.helpers import calibration_forward_context from llmcompressor.utils.pytorch.module import get_no_split_params @@ -35,7 +34,6 @@ "trace_subgraphs", "Subgraph", "get_sequential_targets", - "dispatch_for_sequential", "handle_sequential_oom", ] @@ -516,24 +514,6 @@ def is_ancestor(module: Module) -> bool: return ancestors -def dispatch_for_sequential( - model: PreTrainedModel, - onload_device: Optional[torch.device | str] = None, - offload_device: Optional[torch.device | str] = None, -) -> PreTrainedModel: - """ - Dispatch a model for sequential calibration using a sequential pipeline. - The model will be offloaded to the CPU and dispatched to CUDA/XPU device - if available. Removes any existing hooks. - - :param model: model to dispatch - :return: dispatched model - """ - if onload_device is None: - onload_device = get_main_device() - return offload_model(model, onload_device, offload_device) - - def _get_autowrap_functions() -> tuple[Callable[[Any], Any], ...]: try: from transformers.masking_utils import LAYER_PATTERN_TO_MASK_FUNCTION_MAPPING diff --git a/src/llmcompressor/pipelines/sequential/pipeline.py b/src/llmcompressor/pipelines/sequential/pipeline.py index 600ba1061b..4bd3e16050 100644 --- a/src/llmcompressor/pipelines/sequential/pipeline.py +++ b/src/llmcompressor/pipelines/sequential/pipeline.py @@ -2,6 +2,7 @@ from typing import TYPE_CHECKING, Iterator import torch +from compressed_tensors.offload import set_onload_device from compressed_tensors.utils import disable_offloading from torch.utils.data.dataloader import DataLoader from tqdm import tqdm @@ -11,7 +12,6 @@ from llmcompressor.pipelines.cache import IntermediatesCache from llmcompressor.pipelines.registry import CalibrationPipeline from llmcompressor.pipelines.sequential.helpers import ( - dispatch_for_sequential, get_sequential_targets, handle_sequential_oom, trace_subgraphs, @@ -89,7 +89,7 @@ def __call__( # prepare model for sequential onloading onload_device = get_main_device() offload_device = torch.device(dataset_args.sequential_offload_device) - dispatch_for_sequential(model, onload_device) + set_onload_device(model, onload_device) # prepare to trace subgraphs modifiers = session.lifecycle.recipe.modifiers diff --git a/tests/llmcompressor/utils/test_helpers.py b/tests/llmcompressor/utils/test_helpers.py index cec66e507e..4ce1e19166 100644 --- a/tests/llmcompressor/utils/test_helpers.py +++ b/tests/llmcompressor/utils/test_helpers.py @@ -1,6 +1,6 @@ import pytest import torch -from compressed_tensors.offload import dispatch_model, offload_model +from compressed_tensors.offload import dispatch_model, set_onload_device from transformers import ( AutoModelForCausalLM, MllamaForConditionalGeneration, @@ -71,7 +71,7 @@ def test_disable_cache(model_cls, model_stub): def test_disable_lm_head(offload): model = AutoModelForCausalLM.from_pretrained("nm-testing/tinysmokellama-3.2") if offload == "sequential": - offload_model(model, "cuda") + set_onload_device(model, "cuda") if offload == "basic": dispatch_model(model) if offload == "none":