diff --git a/src/llmcompressor/entrypoints/model_free/lifecycle.py b/src/llmcompressor/entrypoints/model_free/lifecycle.py index b1bbf1ac9b..71d2d593e1 100644 --- a/src/llmcompressor/entrypoints/model_free/lifecycle.py +++ b/src/llmcompressor/entrypoints/model_free/lifecycle.py @@ -1,6 +1,4 @@ import torch -from compressed_tensors.compressors import BaseCompressor -from compressed_tensors.config.format import _get_quant_compression_format from compressed_tensors.quantization import ( QuantizationScheme, initialize_module_for_quantization, @@ -20,7 +18,6 @@ "validate_weight_for_quantization", "calibrate_global_scale", "calibrate_scale_zp", - "compress_module", ] @@ -64,29 +61,3 @@ def calibrate_scale_zp(module: torch.nn.Linear): apply_calibration_status(module) update_weight_zp_scale(module) freeze_module_quantization(module) - - -def compress_module(module: torch.nn.Linear): - scheme: QuantizationScheme = getattr(module, "quantization_scheme") - - format = _get_quant_compression_format(scheme.input_activations, scheme.weights) - scheme.format = format.value - - compressor = BaseCompressor.load_from_registry(format.value) - data = compressor.compress_weight( - module.weight, - quantization_args=scheme.weights, - scale=getattr(module, "weight_scale"), - zero_point=getattr(module, "weight_zero_point", None), - global_scale=getattr(module, "weight_global_scale", None), - ) - - # `compress_weight` is a messy api - delattr(module, "weight") - for key, value in data.items(): - if hasattr(module, key): - getattr(module, key).data = value - else: - module.register_parameter( - key, torch.nn.Parameter(value, requires_grad=False) - ) diff --git a/src/llmcompressor/entrypoints/model_free/process.py b/src/llmcompressor/entrypoints/model_free/process.py index 44835c7f81..66dcf32469 100644 --- a/src/llmcompressor/entrypoints/model_free/process.py +++ b/src/llmcompressor/entrypoints/model_free/process.py @@ -3,6 +3,7 @@ from typing import Iterable import torch +from 
compressed_tensors.compressors import compress_module from compressed_tensors.entrypoints.convert import Converter from compressed_tensors.quantization import QuantizationScheme from compressed_tensors.utils import match_quantizable_tensors @@ -12,7 +13,6 @@ from llmcompressor.entrypoints.model_free.lifecycle import ( calibrate_global_scale, calibrate_scale_zp, - compress_module, initialize_quantized_linear, validate_weight_for_quantization, ) diff --git a/src/llmcompressor/transformers/compression/compressed_tensors_utils.py b/src/llmcompressor/transformers/compression/compressed_tensors_utils.py index 188b757c33..c65a5cbd5a 100644 --- a/src/llmcompressor/transformers/compression/compressed_tensors_utils.py +++ b/src/llmcompressor/transformers/compression/compressed_tensors_utils.py @@ -3,7 +3,6 @@ from functools import wraps import torch -from accelerate.accelerator import get_state_dict_offloaded_model from compressed_tensors import ( ModelCompressor, SparsityCompressionConfig, @@ -15,9 +14,6 @@ from llmcompressor.core import active_session from llmcompressor.pytorch.model_load.helpers import copy_python_files_from_model_cache -from llmcompressor.transformers.compression.sparsity_metadata_config import ( - SparsityConfigMetadata, -) from llmcompressor.transformers.utils import RECIPE_FILE_NAME from llmcompressor.transformers.utils.helpers import infer_recipe_from_model_path @@ -143,59 +139,14 @@ def get_model_compressor( :param disable_sparse_compression: bool to skip sparse compression """ - if sparsity_config is None: - """ - Case 1: No sparsity config is provided - 1. Will either skip sparsity compression - 2. 
Or we will infer sparsity from the model directly - - Check recipe for applied sparsity: - - Set skip_sparsity_compression_stats to False if don't find a - sparsity structure from the recipe - - If we identify sparsity based on the recipe or the user - set skip_sparsity_compression_stats to False, generate config - """ - sparsity_structure = SparsityConfigMetadata.infer_sparsity_structure( - model, check_only_modifiers=True + if ( + sparsity_config is not None + or not skip_sparsity_compression_stats + or disable_sparse_compression + ): + logger.warning( + "Sparse compression is no longer supported by compressed-tensors" ) - if sparsity_structure is not None: - skip_sparsity_compression_stats = False - - if skip_sparsity_compression_stats: - logger.info( - "skip_sparsity_compression_stats set to True. Skipping sparsity " - "compression statistic calculations. No sparsity compressor will " - "be applied." - ) - sparsity_config = None - else: - state_dict = get_state_dict_offloaded_model(model) - - sparsity_config = SparsityConfigMetadata.from_pretrained( - model, - state_dict=state_dict, - compress=save_compressed, - quantization_format=quantization_format, - disable_sparse_compression=disable_sparse_compression, - sparsity_structure=sparsity_structure, - ) - else: - """ - # Case 2: User provides a Sparsity Config - - This is the case when there is existing sparsity in the - model that we'd like to account for while compressing - - Users should provide a SparsityConfig, conveying the model's - sparsity structure when saving the model - """ - if sparsity_config.sparsity_structure is None: - logger.info( - "SparsityConfigMetadata provided without indicating ", - "the sparsity structure. Sparisty will be inferred from the model. 
" - "Consider providing the structure to skip this step ", - ) - sparsity_config.sparsity_structure = ( - SparsityConfigMetadata.infer_sparsity_structure(model) - ) if not save_compressed: if quantization_format not in (None, CompressionFormat.dense.value): @@ -209,7 +160,6 @@ def get_model_compressor( return ModelCompressor.from_pretrained_model( model, - sparsity_config_or_format=sparsity_config, quantization_format=quantization_format, ) diff --git a/tests/llmcompressor/transformers/compression/decompression_configs/fp8_dynamic.yaml b/tests/llmcompressor/transformers/compression/decompression_configs/fp8_dynamic.yaml index 6685efb1ea..070d8718e9 100644 --- a/tests/llmcompressor/transformers/compression/decompression_configs/fp8_dynamic.yaml +++ b/tests/llmcompressor/transformers/compression/decompression_configs/fp8_dynamic.yaml @@ -1,4 +1,3 @@ cadence: "commit" test_type: "regression" -compressed_model_stub: "nm-testing/tinyllama-fp8-dynamic-compressed" -skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" \ No newline at end of file +compressed_model_stub: "nm-testing/tinyllama-fp8-dynamic-compressed" \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/decompression_configs/w4a16.yaml b/tests/llmcompressor/transformers/compression/decompression_configs/w4a16.yaml index 330023a801..1246d535e9 100644 --- a/tests/llmcompressor/transformers/compression/decompression_configs/w4a16.yaml +++ b/tests/llmcompressor/transformers/compression/decompression_configs/w4a16.yaml @@ -1,4 +1,3 @@ cadence: "nightly" test_type: "regression" -compressed_model_stub: "nm-testing/tinyllama-w4a16-compressed" -skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" \ No newline at end of file +compressed_model_stub: "nm-testing/tinyllama-w4a16-compressed" \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/decompression_configs/w8a16_dense.yaml 
b/tests/llmcompressor/transformers/compression/decompression_configs/w8a16_dense.yaml index 337e6c19e5..86225c2618 100644 --- a/tests/llmcompressor/transformers/compression/decompression_configs/w8a16_dense.yaml +++ b/tests/llmcompressor/transformers/compression/decompression_configs/w8a16_dense.yaml @@ -1,4 +1,3 @@ cadence: "nightly" test_type: "regression" -compressed_model_stub: "nm-testing/tinyllama-w8a16-dense" -skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" \ No newline at end of file +compressed_model_stub: "nm-testing/tinyllama-w8a16-dense" \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/decompression_configs/w8a8.yaml b/tests/llmcompressor/transformers/compression/decompression_configs/w8a8.yaml index b5a846cbc7..16827150fc 100644 --- a/tests/llmcompressor/transformers/compression/decompression_configs/w8a8.yaml +++ b/tests/llmcompressor/transformers/compression/decompression_configs/w8a8.yaml @@ -1,4 +1,3 @@ cadence: "commit" test_type: "regression" -compressed_model_stub: "nm-testing/tinyllama-w8a8-compressed" -skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" \ No newline at end of file +compressed_model_stub: "nm-testing/tinyllama-w8a8-compressed" \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/test_compress_tensor_utils.py b/tests/llmcompressor/transformers/compression/test_compress_tensor_utils.py index a47cb1440d..13dca54e22 100644 --- a/tests/llmcompressor/transformers/compression/test_compress_tensor_utils.py +++ b/tests/llmcompressor/transformers/compression/test_compress_tensor_utils.py @@ -1,4 +1,3 @@ -import math import os import shutil @@ -6,9 +5,8 @@ import torch from accelerate import dispatch_model from accelerate.accelerator import get_state_dict_offloaded_model -from compressed_tensors import QUANTIZATION_CONFIG_NAME, CompressionFormat -from compressed_tensors.compressors import ModelCompressor -from 
compressed_tensors.config import BitmaskConfig, DenseSparsityConfig +from compressed_tensors import QUANTIZATION_CONFIG_NAME +from compressed_tensors.compressors.format import infer_model_format from compressed_tensors.quantization import ( QuantizationConfig, QuantizationStatus, @@ -19,140 +17,20 @@ from transformers.utils.quantization_config import CompressedTensorsConfig from llmcompressor import oneshot -from llmcompressor.core import reset_session -from llmcompressor.pytorch.utils.helpers import tensor_sparsity from llmcompressor.transformers.compression.compressed_tensors_utils import ( - get_model_compressor, modify_save_pretrained, ) -from llmcompressor.transformers.compression.sparsity_metadata_config import ( - SparsityConfigMetadata, -) from llmcompressor.utils import untie_word_embeddings from tests.testing_utils import requires_gpu -@pytest.mark.parametrize( - "compressed,config,dtype", - [ - [True, None, torch.float32], - [False, DenseSparsityConfig(), torch.float16], - [True, BitmaskConfig(), torch.bfloat16], - [False, BitmaskConfig(), torch.float32], - [False, None, torch.float16], - ], -) -def test_sparse_model_reload(compressed, config, dtype, tmp_path): - recipe_str = "tests/llmcompressor/transformers/sparsegpt/recipes/test_tiny2.yaml" - expected_sparsity = 0.5 - model_path = "nm-testing/tinysmokellama-3.2" - dataset = "open_platypus" - concatenate_data = False - num_calibration_samples = 64 - output_dir = tmp_path / "oneshot_out" - splits = {"calibration": "train[:10%]"} - one_of_sparse_weights = "model.layers.1.mlp.up_proj.weight" - - # create a sparse model - oneshot( - model=model_path, - dataset=dataset, - output_dir=output_dir, - num_calibration_samples=num_calibration_samples, - recipe=recipe_str, - concatenate_data=concatenate_data, - splits=splits, - precision=dtype, - clear_sparse_session=False, - tie_word_embeddings=False, - ) - - model = AutoModelForCausalLM.from_pretrained(tmp_path / "oneshot_out", dtype=dtype) - - # assert that 
sample layer has the intended sparsity - assert math.isclose( - tensor_sparsity(model.state_dict()[one_of_sparse_weights]), - expected_sparsity, - rel_tol=1e-3, - ) - - inferred_structure = SparsityConfigMetadata.infer_sparsity_structure() - assert inferred_structure == "0:0" - - model.save_pretrained( - tmp_path / "compress_out", - sparsity_config=config, - save_compressed=compressed, - ) - - config = AutoConfig.from_pretrained(tmp_path / "compress_out") - compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None) - sparsity_config = ModelCompressor.parse_sparsity_config(compression_config) - assert ( - sparsity_config["format"] == "dense" - if (not compressed and config is None) - else "sparse_bitmask" - ) - assert sparsity_config[ - "global_sparsity" - ] == SparsityConfigMetadata.infer_global_sparsity(model) - assert sparsity_config["sparsity_structure"] == inferred_structure - - dense_model = AutoModelForCausalLM.from_pretrained( - tmp_path / "compress_out", dtype="auto" - ) - - og_state_dict = model.state_dict() - reconstructed_state_dict = dense_model.state_dict() - assert len(og_state_dict) == len(reconstructed_state_dict) - for key in og_state_dict.keys(): - dense_tensor = og_state_dict[key] - reconstructed_tensor = reconstructed_state_dict[key] - assert dense_tensor.dtype == reconstructed_tensor.dtype == dtype - assert torch.equal(dense_tensor, reconstructed_tensor) - - if os.path.isdir(tmp_path): - shutil.rmtree(tmp_path) - - -@pytest.mark.parametrize( - "skip_compression_stats,save_compressed", - [[True, True], [True, False], [False, True], [False, False]], -) -def test_dense_model_save(tmp_path, skip_compression_stats, save_compressed): - reset_session() - - model_path = "nm-testing/tinysmokellama-3.2" - model = AutoModelForCausalLM.from_pretrained(model_path) - - inferred_global_sparsity = SparsityConfigMetadata.infer_global_sparsity(model) - assert math.isclose(inferred_global_sparsity, 0.0, rel_tol=1e-3) - inferred_structure = 
SparsityConfigMetadata.infer_sparsity_structure() - assert inferred_structure == "unstructured" - - model.save_pretrained( - tmp_path / "dense_out", - skip_compression_stats=skip_compression_stats, - save_compressed=save_compressed, - ) - - # for models with 0% sparsity no sparsity config is saved regardless - config = AutoConfig.from_pretrained(tmp_path / "dense_out") - compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None) - sparsity_config = ModelCompressor.parse_sparsity_config(compression_config) - assert sparsity_config is None - - if os.path.isdir(tmp_path): - shutil.rmtree(tmp_path) - - @pytest.mark.parametrize( "format,dtype", [ - ["dense", torch.float32], - ["dense", torch.float16], - # TODO: Int8 Decompression fails for transformers>4.49 - # ["int_quantized", torch.float32], + ["dense", torch.bfloat16], + # NOTE: Int8 Decompression fails for transformers>4.49 due to bug in loading + # parameters with integers (hf attempts to attach gradients to int params) + # ["int-quantized", torch.bfloat16], ], ) def test_quant_model_reload(format, dtype, tmp_path): @@ -164,7 +42,7 @@ def test_quant_model_reload(format, dtype, tmp_path): dataset = "open_platypus" concatenate_data = False num_calibration_samples = 16 - splits = {"calibration": "train[:10%]"} + splits = {"calibration": f"train[:{num_calibration_samples}]"} # create a quantized model model = oneshot( @@ -191,7 +69,7 @@ def test_quant_model_reload(format, dtype, tmp_path): module.quantization_status == QuantizationStatus.FROZEN ), f"Module {name} has incorrect quantization status" - # Save to disk + # Save to disk, override format model.save_pretrained( save_path_compressed, quantization_format=format, @@ -200,8 +78,7 @@ def test_quant_model_reload(format, dtype, tmp_path): # Verify config on disk config = AutoConfig.from_pretrained(save_path_compressed) - compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None) - quant_config = 
ModelCompressor.parse_quantization_config(compression_config) +    quant_config = getattr(config, QUANTIZATION_CONFIG_NAME)     assert quant_config["format"] == format      decompressed_model = AutoModelForCausalLM.from_pretrained( @@ -210,6 +87,7 @@         quantization_config=CompressedTensorsConfig(run_compressed=False),     )  +    og_state_dict = _remove_zp(og_state_dict)  # HACK: remove extra zero points added during quant init     reconstructed_state_dict = decompressed_model.state_dict()     assert len(og_state_dict) == len(reconstructed_state_dict)     for key in og_state_dict.keys(): @@ -280,188 +158,12 @@ def test_model_reload_gpu(offload, dtype, tie_word_embeddings, device, tmp_path)     test_model_reload(offload, dtype, tie_word_embeddings, device, tmp_path)   -@requires_gpu -@pytest.mark.parametrize( -    "model_stub, recipe, sparse_format, quant_format", -    [ -        ( -            "nm-testing/tinysmokellama-3.2", -            "tests/llmcompressor/transformers/compression/recipes/sparse_24_fp8.yaml", -            CompressionFormat.sparse_24_bitmask.value, -            CompressionFormat.float_quantized.value, -        ), -    ], -) -def test_compressor_stacking(model_stub, recipe, sparse_format, quant_format, tmp_path): -    from llmcompressor.pytorch.model_load.helpers import get_session_model - -    device = "cuda:0" if not torch.cuda.is_available() else "cpu" -    dataset = "open_platypus" -    concatenate_data = False -    num_calibration_samples = 64 -    splits = {"calibration": "train[:10%]"} - -    oneshot( -        model=model_stub, -        dataset=dataset, -        num_calibration_samples=num_calibration_samples, -        recipe=recipe, -        concatenate_data=concatenate_data, -        splits=splits, -    ) - -    # Fetch the oneshot model -    model = get_session_model() -    og_state_dict = model.state_dict() -    path = tmp_path / "compressed" - -    # As HFQuantizer doesn't decompress the model, use the compressor to decompress -    # the model instead -    compressor = ModelCompressor.from_pretrained_model( -        model, sparsity_config_or_format=sparse_format, quantization_format=quant_format 
- ) - - assert ( - compressor.sparsity_compressor is not None - ), "Sparse compressor not initialized" - assert compressor.sparsity_config.format == sparse_format - - assert ( - compressor.quantization_compressor is not None - ), "Quantization compressor not initialized" - - compressor.compress_model(model) - compressor.decompress_model(model) - compressor.quantization_config.quantization_status = QuantizationStatus.FROZEN - - # Verify the abs difference between the decompressed model - # and the original model - reconstructed_state_dict = model.state_dict() - for key in reconstructed_state_dict.keys(): - dense_tensor = og_state_dict[key].to(device) - reconstructed_tensor = reconstructed_state_dict[key].to(device) - assert dense_tensor.dtype == reconstructed_tensor.dtype - if key.endswith("weight") and quant_format != "dense": - # we don't expect an exact match for compressed - diff = torch.abs(dense_tensor - reconstructed_tensor) - # maximum quantization error as a result of compression is ~0.025 - assert not torch.any(diff > 0.025), f"Max diff: {torch.max(diff)}" - else: - assert torch.equal(dense_tensor, reconstructed_tensor) - - # Recompress and save; validate correct formats used - model.save_pretrained(path) - config = AutoConfig.from_pretrained(path) - compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None) - quant_config = ModelCompressor.parse_quantization_config(compression_config) - sparsity_config = ModelCompressor.parse_sparsity_config(compression_config) - assert quant_config["format"] == quant_format - assert sparsity_config["format"] == sparse_format - - if os.path.isdir(tmp_path): - shutil.rmtree(tmp_path) - - -@pytest.mark.parametrize( - "model_stub, recipe, sparse_format", - [ - ( - "nm-testing/tinysmokellama-3.2", - "tests/llmcompressor/transformers/compression/recipes/sparse_24.yaml", - CompressionFormat.sparse_24_bitmask.value, - ), - ], -) -def test_sparse_24_compressor_is_lossless(model_stub, recipe, sparse_format, tmp_path): - 
device = "cuda:0" if not torch.cuda.is_available() else "cpu" - dataset = "open_platypus" - concatenate_data = False - num_calibration_samples = 64 - splits = {"calibration": "train[:10%]"} - empty_model = AutoModelForCausalLM.from_pretrained(model_stub, dtype="auto") - - model = oneshot( - model=model_stub, - dataset=dataset, - num_calibration_samples=num_calibration_samples, - recipe=recipe, - concatenate_data=concatenate_data, - splits=splits, - clear_sparse_session=False, - ) - - og_state_dict = model.state_dict() - path = tmp_path / "compressed" - - # Compress and save - model.save_pretrained( - path, - save_compressed=True, - ) - - # Verify config on disk - config = AutoConfig.from_pretrained(path) - compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None) - - # As HFQuantizer doesn't decompress the model, use the compressor to decompress - # the model instead - compressor = ModelCompressor.from_compression_config(compression_config) - - assert ( - compressor.sparsity_compressor is not None - ), "Sparse compressor not initialized" - assert compressor.sparsity_config.format == sparse_format - - compressor.decompress(model_path=path, model=empty_model) - - # Verify the abs difference between the decompressed model - # and the original model - reconstructed_state_dict = empty_model.state_dict() - assert len(og_state_dict) == len(reconstructed_state_dict) - for key in og_state_dict.keys(): - dense_tensor = og_state_dict[key].to(device) - reconstructed_tensor = reconstructed_state_dict[key].to(device) - assert dense_tensor.dtype == reconstructed_tensor.dtype - if key.endswith("weight"): - assert torch.equal(dense_tensor, reconstructed_tensor) - if os.path.isdir(tmp_path): - shutil.rmtree(tmp_path) - - -def test_disable_sparse_compression_flag(tmp_path): - two_four_sparse_model_id = "nm-testing/llama2.c-stories42M-pruned2.4" - two_four_sparse_model = AutoModelForCausalLM.from_pretrained( - two_four_sparse_model_id, dtype="auto" - ) - 
modify_save_pretrained(two_four_sparse_model) - - save_path = tmp_path / "no_sparse_compression_model" - sparsity_config = SparsityConfigMetadata.from_pretrained( - two_four_sparse_model, - sparsity_structure="2:4", - ) - two_four_sparse_model.save_pretrained( - save_path, disable_sparse_compression=True, sparsity_config=sparsity_config - ) - - config = AutoConfig.from_pretrained(save_path) - quantization_config = getattr(config, QUANTIZATION_CONFIG_NAME, None) - - assert quantization_config - sparsity_config = quantization_config.get("sparsity_config") - - assert sparsity_config - assert sparsity_config["format"] == "dense" - if os.path.isdir(tmp_path): - shutil.rmtree(tmp_path) - - class DummyLinearModel(nn.Module): """ A dummy linear model for testing purposes, simulating a quantized linear layer. """ - def __init__(self, weights, weight_scale=None, weight_zero_point=None): + def __init__(self, weights, weight_scale=None, zero_point=None): super().__init__() out_features, in_features = weights.shape @@ -470,14 +172,8 @@ def __init__(self, weights, weight_scale=None, weight_zero_point=None): self.linear.weight = nn.Parameter(weights, requires_grad=True) # Attach scale and zero-point if provided - if weight_scale is not None: - self.linear.weight_scale = nn.Parameter( - torch.tensor(weight_scale), requires_grad=False - ) - if weight_zero_point is not None: - self.linear.weight_zero_point = nn.Parameter( - torch.tensor(weight_zero_point), requires_grad=False - ) + self.linear.weight_scale = nn.Parameter(weight_scale, requires_grad=False) + self.linear.weight_zero_point = nn.Parameter(zero_point, requires_grad=False) def forward(self, x): return self.linear(x) @@ -541,53 +237,23 @@ def _quantization_config_from_string(config_str, q_type): ) -def _make_24_sparse(tensor): - """ - Apply 2:4 sparsity pattern to the given tensor. 
- """ - reshaped_tensor = tensor.view(tensor.size(0), -1, 4) - mask = torch.zeros_like(reshaped_tensor, dtype=torch.bool) - mask[..., :2] = True - sparsified_tensor = torch.where( - mask, reshaped_tensor, torch.tensor(0.0, dtype=tensor.dtype) - ) - return sparsified_tensor.view_as(tensor) - - @pytest.mark.parametrize( - "quant_style, quant_type, is_24, expected_quant_compressor, " - "expected_sparsity_compressor", + "quant_style,quant_type,expected_format", [ - ("W8A8", "int", False, "int-quantized", "dense"), - ("W4A16", "int", False, "pack-quantized", "dense"), - ("W8A16", "int", False, "pack-quantized", "dense"), - ("W8A8", "int", True, "int-quantized", "sparse-24-bitmask"), - ("W4A16", "int", True, "marlin-24", "dense"), - ("W8A16", "int", True, "marlin-24", "dense"), - ("W8A8", "float", False, "float-quantized", "dense"), - ("W8A16", "float", False, "naive-quantized", "dense"), - ("W8A8", "float", True, "float-quantized", "sparse-24-bitmask"), - ("W8A16", "float", True, "naive-quantized", "dense"), + ("W8A8", "int", "int-quantized"), + ("W4A16", "int", "pack-quantized"), + ("W8A16", "int", "pack-quantized"), + ("W8A8", "float", "float-quantized"), + ("W8A16", "float", "naive-quantized"), ], ) def test_correct_compressor_inferred( quant_style, quant_type, - is_24, - expected_quant_compressor, - expected_sparsity_compressor, + expected_format, ): - """ - Test if the correct compressor is inferred based on - quantization and sparsity configurations. 
- """ + """Test if the correct compressor is inferred based on quantization""" weights = torch.rand(10, 4) - if is_24: - weights = _make_24_sparse(weights) - else: - weights[0, :] = torch.ones( - 4, - ) # guarantee not 24 sparse quantization_config = _quantization_config_from_string(quant_style, quant_type) quantization_args = quantization_config.config_groups["group_0"].weights @@ -607,20 +273,12 @@ def test_correct_compressor_inferred( model.linear.quantization_scheme = quantization_config.config_groups["group_0"] model.linear.quantization_status = QuantizationStatus.FROZEN - if is_24: - sparsity_config = SparsityConfigMetadata.from_pretrained( - model, sparsity_structure="2:4", compress=True - ) - else: - sparsity_config = None - compressor = get_model_compressor(model, sparsity_config=sparsity_config) + assert infer_model_format(model) == expected_format - assert compressor.quantization_config.format == expected_quant_compressor - if expected_sparsity_compressor == "dense": - assert ( - compressor.sparsity_config is None - or compressor.sparsity_config.format == expected_sparsity_compressor - ) - else: - assert compressor.sparsity_config.format == expected_sparsity_compressor +def _remove_zp(state_dict: dict) -> dict: + return { + key: value + for key, value in state_dict.items() + if not key.endswith("zero_point") + } diff --git a/tests/llmcompressor/transformers/compression/test_decompress.py b/tests/llmcompressor/transformers/compression/test_decompress.py index 3b8cc6cbad..0148a1afc9 100644 --- a/tests/llmcompressor/transformers/compression/test_decompress.py +++ b/tests/llmcompressor/transformers/compression/test_decompress.py @@ -1,11 +1,7 @@ -import copy - import pytest import torch -from compressed_tensors import QUANTIZATION_CONFIG_NAME from compressed_tensors.compressors import ModelCompressor -from compressed_tensors.quantization import QuantizationStatus -from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer +from transformers 
import AutoModelForCausalLM, AutoTokenizer from transformers.utils.quantization_config import CompressedTensorsConfig from tests.testing_utils import parse_params, requires_gpu @@ -19,87 +15,37 @@ def test_hf_quantizer_decompress_match_manual_decompress(config): """ Check that HFQuantizer decompression is working as expected. Manually decompress a compressed model and compare the generations - - Decompression: - Given a skeleton model and path to the optimized model, - write the optimized model's safetensors to the skeleton model and decompress - Ex. write weight_scale to the skeleton model and then convert from fp4 to fp16 - """ compressed_model_stub = config["compressed_model_stub"] - skeleton_model_stub = config["skeleton_model_stub"] sample_inputs = [ "I love 4-bit quantization because", "What is the capital of France?", "def fibonacci(n):", ] - tokenizer = AutoTokenizer.from_pretrained(compressed_model_stub) # Decompress using HFQuantizer from AutoModelForCausalLM - decompressed_model_hf_quantizer = AutoModelForCausalLM.from_pretrained( + hf_quantizer_model = AutoModelForCausalLM.from_pretrained( compressed_model_stub, dtype="auto", device_map="auto", quantization_config=CompressedTensorsConfig(run_compressed=False), ) - # Manually decompress this model - dense_model = AutoModelForCausalLM.from_pretrained( - skeleton_model_stub, - dtype=decompressed_model_hf_quantizer.dtype, - device_map=decompressed_model_hf_quantizer.device, - ) - - # decompression from HFQuantizer should populate weight_scale - assert hasattr( - decompressed_model_hf_quantizer.model.layers[0].self_attn.q_proj, - "weight_scale", - ) - - # dense model should not have weight_scale populated - assert not hasattr(dense_model.model.layers[0].self_attn.q_proj, "weight_scale") - - config = AutoConfig.from_pretrained(compressed_model_stub) - - compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None) - compressor = ModelCompressor.from_compression_config(compression_config) - 
compressor.quantization_config.quantization_status = QuantizationStatus.FROZEN - - # use the model_path to load the decompressed weights into dense_model - orig_dense_model = copy.deepcopy(dense_model) - - # overwrite the weights of the dense model - compressor.decompress( - model_path=compressed_model_stub, - model=dense_model, - ) - - # self.dense_model should be decompressed - assert dense_model is not orig_dense_model - - decompressed_model_manual = dense_model - - assert hasattr( - decompressed_model_manual.model.layers[0].self_attn.q_proj, - "weight_scale", + # Manually decompress from compressed model + manual_model = AutoModelForCausalLM.from_pretrained( + compressed_model_stub, + dtype=hf_quantizer_model.dtype, + device_map=hf_quantizer_model.device, ) + ModelCompressor().decompress_model(manual_model) - device = decompressed_model_manual.device - + # Check generations + device = manual_model.device for input in sample_inputs: inputs = tokenizer(input, return_tensors="pt", padding=True).to(device) - - decompressed_model_manual_output = decompressed_model_manual.generate( - **inputs, max_length=50 - ) - - decompressed_model_hf_quantizer_out = decompressed_model_hf_quantizer.generate( - **inputs, max_length=50 - ) - - assert torch.equal( - decompressed_model_hf_quantizer_out, decompressed_model_manual_output - ) + manual_output = manual_model.generate(**inputs, max_length=15) + hf_quantizer_output = hf_quantizer_model.generate(**inputs, max_length=15) + assert torch.equal(manual_output, hf_quantizer_output) diff --git a/tests/llmcompressor/transformers/compression/test_quantization.py b/tests/llmcompressor/transformers/compression/test_quantization.py index de0f503402..dd62c01426 100644 --- a/tests/llmcompressor/transformers/compression/test_quantization.py +++ b/tests/llmcompressor/transformers/compression/test_quantization.py @@ -39,20 +39,16 @@ def _get_quant_info(model): for name, module in model.named_modules(): with align_module_device(module): if 
is_module_quantized(module): + # skip zero points, as these are removed between + # compression/decompression for symmetric models + if module.quantization_scheme.weights is not None: - quant_info_weights[name] = ( - module.weight_scale, - module.weight_zero_point, - module.weight, - ) + quant_info_weights[name] = (module.weight_scale, module.weight) if module.quantization_scheme.input_activations is not None: is_dynamic = module.quantization_scheme.input_activations.dynamic if not is_dynamic: - quant_info_inputs[name] = ( - module.input_scale, - module.input_zero_point, - ) + quant_info_inputs[name] = (module.input_scale,) return quant_info_weights, quant_info_inputs @@ -110,23 +106,19 @@ def test_quantization_reload(setup_model_and_config): # TODO: can remove `to` calls after # https://github.com/neuralmagic/compressed-tensors/pull/427 - for name, (o_scale, o_zp, o_weight) in og_weights.items(): - n_scale, n_zp, n_weight = reloaded_weights[name] + for name, (o_scale, o_weight) in og_weights.items(): + n_scale, n_weight = reloaded_weights[name] assert o_scale.dtype == n_scale.dtype == config["weight_dtype"] assert torch.equal(o_scale, n_scale.to(o_scale.device)) - assert o_zp.dtype == n_zp.dtype - assert torch.equal(o_zp, n_zp.to(o_zp.device)) # we don't expect an exact match here because o_weight still has the # original weight and n_weight has been fake_quantized assert n_weight.dtype == o_weight.dtype == config["weight_dtype"] - for name, (o_scale, o_zp) in og_inputs.items(): - n_scale, n_zp = reloaded_inputs[name] + for name, (o_scale,) in og_inputs.items(): + (n_scale,) = reloaded_inputs[name] assert o_scale.dtype == n_scale.dtype == config["weight_dtype"] assert torch.equal(o_scale, n_scale.to(o_scale.device)) - assert o_zp.dtype == n_zp.dtype - assert torch.equal(o_zp, n_zp.to(o_zp.device)) @requires_gpu