
Commit daa5174

use default dynamic quantization group size (#3495)
### Changes

- Removed the explicit disabling of dynamic quantization
- Updated test durations from the latest job

### Reason for changes

Test the default configuration.

### Related tickets

157896

### Tests

- No change in example tests: https://github.com/openvinotoolkit/nncf/actions/runs/15041910960
- No change in conformance PTWC: https://github.com/openvinotoolkit/nncf/actions/runs/15041905952
1 parent 955a2b9 commit daa5174
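In effect, every example and test that previously pinned DYNAMIC_QUANTIZATION_GROUP_SIZE to "0" (which disables runtime dynamic quantization) now relies on the OpenVINO runtime default. A minimal before/after sketch of the pattern, with a placeholder model directory:

from optimum.intel import OVModelForCausalLM

# Before: dynamic quantization explicitly disabled via the group size property.
model = OVModelForCausalLM.from_pretrained(
    "model_dir",  # placeholder path
    ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0", "KV_CACHE_PRECISION": "f16"},
)

# After: the property is omitted, so the runtime's default group size applies.
model = OVModelForCausalLM.from_pretrained("model_dir", ov_config={"KV_CACHE_PRECISION": "f16"})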

8 files changed (+37, -42 lines)


examples/llm_compression/openvino/smollm2_360m_fp8/main.py

Lines changed: 1 addition & 3 deletions
@@ -116,9 +116,7 @@ def main():
     model.save_pretrained(OUTPUT_DIR)
     tokenizer.save_pretrained(OUTPUT_DIR)

-    model = OVModelForCausalLM.from_pretrained(
-        OUTPUT_DIR, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0", "INFERENCE_PRECISION_HINT": "f32"}
-    )
+    model = OVModelForCausalLM.from_pretrained(OUTPUT_DIR, ov_config={"INFERENCE_PRECISION_HINT": "f32"})
     answers_by_questions = generate_answers(questions, model, tokenizer)
     print(f"Optimized model outputs:\n{answers_by_questions}\n")
     return answers_by_questions

examples/llm_compression/openvino/tiny_llama/main.py

Lines changed: 1 addition & 3 deletions
@@ -57,9 +57,7 @@ def transform_fn(data, tokenizer):
     )
     model.save_pretrained(OUTPUT_DIR)

-    model = OVModelForCausalLM.from_pretrained(
-        OUTPUT_DIR, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0", "KV_CACHE_PRECISION": "f16"}
-    )
+    model = OVModelForCausalLM.from_pretrained(OUTPUT_DIR, ov_config={"KV_CACHE_PRECISION": "f16"})
     input_ids = tokenizer("What is PyTorch?", return_tensors="pt").to(device=model.device)

     start_t = time.time()

examples/llm_compression/openvino/tiny_llama_find_hyperparams/main.py

Lines changed: 0 additions & 1 deletion
@@ -248,7 +248,6 @@ def main():
         "PERFORMANCE_HINT": "LATENCY",
         "NUM_STREAMS": "1",
         "CACHE_DIR": "",
-        "DYNAMIC_QUANTIZATION_GROUP_SIZE": "0",
         "KV_CACHE_PRECISION": "f16",
     }
     model = OVModelForCausalLM.from_pretrained(
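For context, a sketch of the full ov_config this example now builds, reconstructed from the keys visible in the hunk above (the model path is a placeholder):

from optimum.intel import OVModelForCausalLM

ov_config = {
    "PERFORMANCE_HINT": "LATENCY",  # optimize for per-request latency
    "NUM_STREAMS": "1",             # single inference stream
    "CACHE_DIR": "",                # no on-disk compiled-model cache
    "KV_CACHE_PRECISION": "f16",    # keep the KV cache in f16
    # DYNAMIC_QUANTIZATION_GROUP_SIZE is no longer set; the default applies.
}
model = OVModelForCausalLM.from_pretrained("model_dir", ov_config=ov_config)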

examples/llm_compression/torch/qat_with_lora/main.py

Lines changed: 1 addition & 1 deletion
@@ -272,7 +272,7 @@ def export_to_openvino(pretrained: str, ckpt_file: Path, ir_dir: Path) -> OVMode
         trust_remote_code=True,
         load_in_8bit=False,
         compile=True,
-        ov_config={"KV_CACHE_PRECISION": "f16", "DYNAMIC_QUANTIZATION_GROUP_SIZE": "0"},
+        ov_config={"KV_CACHE_PRECISION": "f16"},
     )
Lines changed: 18 additions & 18 deletions
@@ -1,20 +1,20 @@
 {
-    "tests/cross_fw/examples/test_examples.py::test_examples[llm_compression]": 222.974,
-    "tests/cross_fw/examples/test_examples.py::test_examples[llm_compression_synthetic]": 873.780,
-    "tests/cross_fw/examples/test_examples.py::test_examples[llm_tune_params]": 1018.932,
-    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_onnx_mobilenet_v2]": 178.509,
-    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_onnx_yolo8_quantize_with_accuracy_control]": 292.766,
-    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_openvino_anomaly_stfpm_quantize_with_accuracy_control]": 443.025,
-    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_openvino_mobilenet_v2_quantize]": 169.789,
-    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_openvino_yolo8_quantize]": 170.593,
-    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_openvino_yolo8_quantize_with_accuracy_control]": 205.533,
-    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_tensorflow_mobilenet_v2]": 149.202,
-    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_torch_mobilenet_v2]": 192.227,
-    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_torch_ssd300_vgg16]": 231.613,
-    "tests/cross_fw/examples/test_examples.py::test_examples[quantization_aware_training_torch_anomalib]": 478.797,
-    "tests/cross_fw/examples/test_examples.py::test_examples[quantization_aware_training_torch_resnet18]": 1251.144,
-    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_torch_fx_resnet18]": 412.243,
-    "tests/cross_fw/examples/test_examples.py::test_examples[fp8_llm_quantization]": 229.69,
-    "tests/cross_fw/examples/test_examples.py::test_examples[quantization_aware_training_tensorflow_mobilenet_v2]": 1500.00,
-    "tests/cross_fw/examples/test_examples.py::test_examples[llm_compression_qat_with_lora]": 665
+    "tests/cross_fw/examples/test_examples.py::test_examples[llm_compression]": 197,
+    "tests/cross_fw/examples/test_examples.py::test_examples[llm_compression_synthetic]": 717,
+    "tests/cross_fw/examples/test_examples.py::test_examples[llm_tune_params]": 824,
+    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_onnx_mobilenet_v2]": 207,
+    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_onnx_yolo8_quantize_with_accuracy_control]": 800,
+    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_openvino_anomaly_stfpm_quantize_with_accuracy_control]": 455,
+    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_openvino_mobilenet_v2_quantize]": 171,
+    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_openvino_yolo8_quantize]": 192,
+    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_openvino_yolo8_quantize_with_accuracy_control]": 203,
+    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_tensorflow_mobilenet_v2]": 148,
+    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_torch_mobilenet_v2]": 185,
+    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_torch_ssd300_vgg16]": 221,
+    "tests/cross_fw/examples/test_examples.py::test_examples[quantization_aware_training_torch_anomalib]": 547,
+    "tests/cross_fw/examples/test_examples.py::test_examples[quantization_aware_training_torch_resnet18]": 1189,
+    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_torch_fx_resnet18]": 178,
+    "tests/cross_fw/examples/test_examples.py::test_examples[fp8_llm_quantization]": 226,
+    "tests/cross_fw/examples/test_examples.py::test_examples[quantization_aware_training_tensorflow_mobilenet_v2]": 891,
+    "tests/cross_fw/examples/test_examples.py::test_examples[llm_compression_qat_with_lora]": 591
 }
Lines changed: 14 additions & 14 deletions
@@ -1,17 +1,17 @@
 {
-    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_awq_backup_mode_none_backend_OV]": 269,
-    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_aware_awq_scale_estimation_backend_OV]": 421,
-    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_aware_awq_scale_estimation_stateful_backend_OV]": 374,
-    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_aware_awq_stateful_backend_OV]": 243,
-    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_aware_backend_OV]": 190,
-    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_aware_gptq_scale_estimation_stateful_backend_OV]": 1463,
-    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_aware_lora_stateful_backend_OV]": 483,
+    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_awq_backup_mode_none_backend_OV]": 249,
+    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_aware_awq_scale_estimation_backend_OV]": 368,
+    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_aware_awq_scale_estimation_stateful_backend_OV]": 371,
+    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_aware_awq_stateful_backend_OV]": 205,
+    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_aware_backend_OV]": 206,
+    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_aware_gptq_scale_estimation_stateful_backend_OV]": 1161,
+    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_aware_lora_stateful_backend_OV]": 473,
     "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_free_backend_FP32]": 0,
-    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_free_backend_OV]": 196,
-    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_int4_data_free_backend_TORCH]": 133,
-    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_int8_data_free_backend_TORCH]": 154,
-    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_NF4_scale_estimation_stateful_per_channel_backend_OV]": 256,
-    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_scale_estimation_per_channel_backend_OV]": 258,
-    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_free_awq_backend_OV]": 200,
-    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_free_awq_backend_TORCH]": 200
+    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_free_backend_OV]": 187,
+    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_int4_data_free_backend_TORCH]": 123,
+    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_int8_data_free_backend_TORCH]": 165,
+    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_NF4_scale_estimation_stateful_per_channel_backend_OV]": 193,
+    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_scale_estimation_per_channel_backend_OV]": 251,
+    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_free_awq_backend_OV]": 164,
+    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_free_awq_backend_TORCH]": 210
 }

tests/post_training/pipelines/lm_weight_compression.py

Lines changed: 1 addition & 1 deletion
@@ -366,7 +366,7 @@ def _validate(self) -> None:
             load_in_8bit=False,
             compile=False,
             stateful=is_stateful,
-            ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0", "KV_CACHE_PRECISION": "f16"},
+            ov_config={"KV_CACHE_PRECISION": "f16"},
         )
         if self.backend == BackendType.FX_TORCH:
             compressed_model_hf = FXAutoModelForCausalLM(self.model, self.model_config)
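If a run needs the previous behavior (for example, to rule out dynamic quantization when comparing metrics), the property can still be passed explicitly; a hedged sketch with a placeholder path:

from optimum.intel import OVModelForCausalLM

# Setting the group size to "0" disables runtime dynamic quantization,
# restoring the configuration these tests used before this commit.
model = OVModelForCausalLM.from_pretrained(
    "model_dir",  # placeholder
    ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0", "KV_CACHE_PRECISION": "f16"},
)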

tests/torch2/function_hook/quantization/test_fq_lora.py

Lines changed: 1 addition & 1 deletion
@@ -82,7 +82,7 @@ def get_ov_model(model: AutoModelForCausalLM, tmp_path: str) -> OVModelForCausal
         trust_remote_code=True,
         load_in_8bit=False,
         compile=True,
-        ov_config={"KV_CACHE_PRECISION": "f16", "DYNAMIC_QUANTIZATION_GROUP_SIZE": "0"},
+        ov_config={"KV_CACHE_PRECISION": "f16"},
     )