2 changes: 1 addition & 1 deletion docs/Algorithms.md
@@ -11,7 +11,7 @@
- Symmetric 8 bit compression mode
- Symmetric and asymmetric 4 bit compression mode
- NF4 compression mode
- Arbitrary look-up table (CODEBOOK) or predefined lookup table based on NF4 (CB4_F8E4M3)
- Arbitrary look-up table (CODEBOOK) or predefined lookup table based on NF4 (CB4)
- MX-compliant types - MXFP4 and MXFP8_E4M3
- FP types - FP8_E4M3 and FP4
- Mixed precision weights compression
@@ -44,11 +44,11 @@ NNCF can automatically distribute precision assignments based on quantization se
| INT4_ASYM | INT4 | FP16 | Per-channel / Group-wise | [Asymmetric quantization](/docs/usage/training_time_compression/Quantization.md#asymmetric-quantization) |
| NF4 | FP32 | FP16 | Per-channel / Group-wise | [NormalFloat-4](https://arxiv.org/pdf/2305.14314v1.pdf) lookup table with 16 FP32 values |
| CODEBOOK | Any | FP16 | Per-channel / Group-wise | Arbitrary lookup table (codebook) |
| CB4_F8E4M3 | E4M3 | FP16 | Per-channel / Group-wise | A fixed lookup table with 16 E4M3 values based on NF4 values |
| CB4 | E4M3 | FP16 | Per-channel / Group-wise | A fixed lookup table with 16 E4M3 values based on NF4 values |
| MXFP4 | E2M1 | E8M0 | Group-wise (32) | [MX-compliant FP4](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) |
| MXFP8_E4M3 | E4M3 | E8M0 | Group-wise (32) | [MX-compliant FP8](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) |
| FP8_E4M3 | E4M3 | FP16 | Per-channel / Group-wise | [FP8](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) |
| FP4 | E2M1 | FP16 | Per-channel / Group-wise | [FP4](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) |
| FP4 | E2M1 | FP16 | Per-channel / Group-wise | [FP4](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) |

**Note**: Granularity refers to the scope of elements sharing quantization parameters. "Per-channel" applies different parameters for each output channel, while "Group-wise" divides weights into groups (e.g., group_size=128) that share the same parameters.
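For illustration, a minimal NumPy sketch of the two granularities; the weight shape, group size, and symmetric 4-bit range are assumptions, not values taken from this table:

```python
import numpy as np

# Toy weight matrix: 8 output channels x 256 input channels (assumed shape).
weight = np.random.randn(8, 256).astype(np.float32)

# Per-channel: one scale per output channel, shared by all 256 inputs.
per_channel_scale = np.abs(weight).max(axis=1, keepdims=True) / 7  # assuming a [-7, 7] integer range

# Group-wise: split each channel into groups of 128 inputs; one scale per group.
group_size = 128
grouped = weight.reshape(8, 256 // group_size, group_size)
group_scale = np.abs(grouped).max(axis=-1, keepdims=True) / 7

print(per_channel_scale.shape)  # (8, 1)    -> 8 scales in total
print(group_scale.shape)        # (8, 2, 1) -> 16 scales in total
```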

@@ -110,7 +110,7 @@ def default_codebook_example(model_id: str, compressed_model_id: str) -> list[st
answers_by_questions = generate_answers(QUESTIONS, model, tokenizer)
print_answers("Non-optimized model outputs:\n", answers_by_questions)

model.model = nncf.compress_weights(model.model, mode=nncf.CompressWeightsMode.CB4_F8E4M3, ratio=1.0, group_size=64)
model.model = nncf.compress_weights(model.model, mode=nncf.CompressWeightsMode.CB4, ratio=1.0, group_size=64)
model.save_pretrained(compressed_model_id)
tokenizer.save_pretrained(compressed_model_id)
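For contrast with the fixed CB4 table, a hedged sketch of how an arbitrary CODEBOOK might be supplied; the `advanced_parameters=nncf.AdvancedCompressionParameters(codebook=...)` route is inferred from the algorithm code later in this diff (`self._advanced_parameters.codebook`) and should be checked against the current API:

```python
import numpy as np
import nncf

# 16 arbitrary centroids (assumed values) acting as the lookup table.
custom_codebook = np.linspace(-1.0, 1.0, 16, dtype=np.float32)

model.model = nncf.compress_weights(
    model.model,
    mode=nncf.CompressWeightsMode.CODEBOOK,
    ratio=1.0,
    group_size=64,
    advanced_parameters=nncf.AdvancedCompressionParameters(codebook=custom_codebook),
)
```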

4 changes: 2 additions & 2 deletions src/nncf/parameters.py
@@ -95,15 +95,15 @@ class CompressWeightsMode(StrEnum):
:param FP8_E4M3: A FP8 format with E4M3 values sharing group-level fp16 scale.
:param FP4: A FP4 format with E2M1 values sharing group-level fp16 scale.
:param CODEBOOK: Codebook (LUT) quantization format.
:param CB4_F8E4M3: Codebook (LUT) format with 16 fixed fp8 values in E4M3 format.
:param CB4: Codebook (LUT) format with 16 fixed fp8 values in E4M3 format.
"""

INT8_SYM = "int8_sym"
INT8_ASYM = "int8_asym"
INT4_SYM = "int4_sym"
INT4_ASYM = "int4_asym"
NF4 = "nf4"
CB4_F8E4M3 = "cb4_f8e4m3"
CB4 = "cb4"
INT8 = "int8" # Deprecated mode
MXFP4 = "mxfp4"
MXFP8_E4M3 = "mxfp8_e4m3"
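Because `CompressWeightsMode` is a `StrEnum`, the rename also changes the accepted string spelling. A small sketch of the caller-visible effect, assuming no backward-compatible alias is kept elsewhere:

```python
import nncf

# Member lookup by value and string comparison both use the new spelling.
assert nncf.CompressWeightsMode("cb4") is nncf.CompressWeightsMode.CB4
assert nncf.CompressWeightsMode.CB4 == "cb4"

# The old value no longer resolves to a member.
try:
    nncf.CompressWeightsMode("cb4_f8e4m3")
except ValueError:
    print("'cb4_f8e4m3' is not a valid mode anymore")
```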
@@ -93,7 +93,7 @@ def get_weight_compression_configuration(
elif group_size is None and mode in NON_INT8_MODES:
if mode in [CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3]:
group_size = 32
elif mode in [CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3]:
elif mode in [CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4]:
group_size = -1
else:
group_size = 128
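Mirroring the branch above, a caller that omits `group_size` would effectively get a mode-dependent default (an illustrative summary of this function, not an additional API; `model` is a placeholder for an OpenVINO model):

```python
import nncf

# group_size left unset -> the configuration helper fills in a default:
#   MXFP4 / MXFP8_E4M3 -> 32   (fixed MX block size)
#   CODEBOOK / CB4     -> -1   (per-channel, no grouping)
#   other non-INT8     -> 128
compressed = nncf.compress_weights(model, mode=nncf.CompressWeightsMode.CB4)
```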
@@ -523,7 +523,7 @@ def _get_backup_config(self, weight_dtype: TensorDataType) -> Optional[WeightCom
def _get_primary_config(self, group_size: int) -> WeightCompressionConfig:
codebook_values = None

if self._mode == CompressWeightsMode.CB4_F8E4M3:
if self._mode == CompressWeightsMode.CB4:
codebook_values = Tensor(CB4_QUANTILES)
elif self._mode == CompressWeightsMode.CODEBOOK:
codebook_values = Tensor(self._advanced_parameters.codebook)
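To make the codebook path concrete, a rough NumPy sketch of how a 16-entry lookup table plus per-group scales reconstruct weights; the values and shapes are illustrative and do not reproduce `CB4_QUANTILES` or the actual NNCF kernels:

```python
import numpy as np

# Assumed toy codebook: 16 values standing in for CB4_QUANTILES or a user-provided table.
codebook = np.linspace(-1.0, 1.0, 16, dtype=np.float32)

# Compressed representation: 4-bit indices into the codebook plus one scale per group.
indices = np.random.randint(0, 16, size=(8, 2, 128))    # (out_channels, n_groups, group_size)
scales = np.random.rand(8, 2, 1).astype(np.float32)     # per-group scales

# Dequantization: look up the codebook entry, then rescale.
dequantized = codebook[indices] * scales
print(dequantized.shape)  # (8, 2, 128)
```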
@@ -959,9 +959,9 @@ def get_weight_compression_parameters(
# MoE operations are usually matmuls, so the check for matmul metatype is done
# This is to avoid raising the error for non-MoE cases with 3D weights.
parsed_ov_version = f"{ov_version[0]}.{ov_version[1]}.{ov_version[2]}-{ov_version[3]}"
msg = f"""NNCF compression algorithms do not support 3D weights with current version of
OpenVINO {parsed_ov_version} due to a known issue in statistics collection
Ticket - 176465. Please update to the latest OpenVINO nightly version.
msg = f"""NNCF compression algorithms do not support 3D weights with current version of
OpenVINO {parsed_ov_version} due to a known issue in statistics collection
Ticket - 176465. Please update to the latest OpenVINO nightly version.
Node with weight: {node.node_name}."""
raise nncf.UnsupportedModelError(msg)

@@ -71,15 +71,15 @@ def is_integer(self):
CompressWeightsMode.FP8_E4M3,
CompressWeightsMode.FP4,
CompressWeightsMode.CODEBOOK,
CompressWeightsMode.CB4_F8E4M3,
CompressWeightsMode.CB4,
]

@property
def is_codebook(self):
"""
:return: True if compression type is codebook, else False.
"""
return self.mode in [CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3]
return self.mode in [CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4]

@property
def compression_dtype(self) -> TensorDataType:
6 changes: 3 additions & 3 deletions src/nncf/quantization/quantize_model.py
@@ -511,7 +511,7 @@ def compress_weights(
CompressWeightsMode.FP8_E4M3,
CompressWeightsMode.FP4,
CompressWeightsMode.CODEBOOK,
CompressWeightsMode.CB4_F8E4M3,
CompressWeightsMode.CB4,
]
if mode in not_supported_modes:
msg = (
@@ -559,7 +559,7 @@ def compress_weights(
CompressWeightsMode.FP8_E4M3,
CompressWeightsMode.FP4,
CompressWeightsMode.CODEBOOK,
CompressWeightsMode.CB4_F8E4M3,
CompressWeightsMode.CB4,
]
if mode in not_supported_modes:
msg = (
@@ -634,7 +634,7 @@ def compress_weights(
CompressWeightsMode.FP8_E4M3,
CompressWeightsMode.FP4,
CompressWeightsMode.CODEBOOK,
CompressWeightsMode.CB4_F8E4M3,
CompressWeightsMode.CB4,
]
if mode in not_supported_modes:
msg = (
@@ -371,7 +371,7 @@ def get_mixed_mapping(primary_fn: Callable, list_layers: list[str]):
(CompressWeightsMode.INT4_SYM, 3, get_mixed_mapping(check_int4_sym_grouped, TEST_MODELS[IntegerModel])),
(CompressWeightsMode.INT4_ASYM, 3, get_mixed_mapping(check_int4_asym_grouped, TEST_MODELS[IntegerModel])),
(CompressWeightsMode.NF4, 3, get_mixed_mapping(check_nf4_grouped, TEST_MODELS[IntegerModel])),
(CompressWeightsMode.CB4_F8E4M3, 3, get_mixed_mapping(check_codebook_grouped, TEST_MODELS[IntegerModel])),
(CompressWeightsMode.CB4, 3, get_mixed_mapping(check_codebook_grouped, TEST_MODELS[IntegerModel])),
(CompressWeightsMode.MXFP4, 32, get_mixed_mapping(check_mxfp4, TEST_MODELS[IntegerModel])),
(CompressWeightsMode.MXFP8_E4M3, 32, get_mixed_mapping(check_mxfp8, TEST_MODELS[IntegerModel])),
(CompressWeightsMode.FP8_E4M3, 3, get_mixed_mapping(check_fp8, TEST_MODELS[IntegerModel])),
@@ -1330,7 +1330,7 @@ def test_mixed_precision_codebook(mode, all_layers, ratio, ref_ids):
model = SequentialMatmulModel().ov_model
compressed_model = compress_weights(
model,
mode=CompressWeightsMode.CB4_F8E4M3,
mode=CompressWeightsMode.CB4,
ratio=ratio,
group_size=1,
all_layers=all_layers,