openvinotoolkit · ljaljushkin · May 20, 2025 · May 15, 2025 · May 15, 2025 · May 15, 2025
@@ -116,9 +116,7 @@ def main():
     model.save_pretrained(OUTPUT_DIR)
     tokenizer.save_pretrained(OUTPUT_DIR)
 
-    model = OVModelForCausalLM.from_pretrained(
-        OUTPUT_DIR, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0", "INFERENCE_PRECISION_HINT": "f32"}
-    )
+    model = OVModelForCausalLM.from_pretrained(OUTPUT_DIR, ov_config={"INFERENCE_PRECISION_HINT": "f32"})
     answers_by_questions = generate_answers(questions, model, tokenizer)
     print(f"Optimized model outputs:\n{answers_by_questions}\n")
     return answers_by_questions

@@ -57,9 +57,7 @@ def transform_fn(data, tokenizer):
     )
     model.save_pretrained(OUTPUT_DIR)
 
-    model = OVModelForCausalLM.from_pretrained(
-        OUTPUT_DIR, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0", "KV_CACHE_PRECISION": "f16"}
-    )
+    model = OVModelForCausalLM.from_pretrained(OUTPUT_DIR, ov_config={"KV_CACHE_PRECISION": "f16"})
     input_ids = tokenizer("What is PyTorch?", return_tensors="pt").to(device=model.device)
 
     start_t = time.time()

@@ -248,7 +248,6 @@ def main():
         "PERFORMANCE_HINT": "LATENCY",
         "NUM_STREAMS": "1",
         "CACHE_DIR": "",
-        "DYNAMIC_QUANTIZATION_GROUP_SIZE": "0",
         "KV_CACHE_PRECISION": "f16",
     }
     model = OVModelForCausalLM.from_pretrained(

@@ -272,7 +272,7 @@ def export_to_openvino(pretrained: str, ckpt_file: Path, ir_dir: Path) -> OVMode
         trust_remote_code=True,
         load_in_8bit=False,
         compile=True,
-        ov_config={"KV_CACHE_PRECISION": "f16", "DYNAMIC_QUANTIZATION_GROUP_SIZE": "0"},
+        ov_config={"KV_CACHE_PRECISION": "f16"},
     )
 
 

@@ -1,20 +1,20 @@
 {
-    "tests/cross_fw/examples/test_examples.py::test_examples[llm_compression]": 222.974,
-    "tests/cross_fw/examples/test_examples.py::test_examples[llm_compression_synthetic]": 873.780,
-    "tests/cross_fw/examples/test_examples.py::test_examples[llm_tune_params]": 1018.932,
-    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_onnx_mobilenet_v2]": 178.509,
-    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_onnx_yolo8_quantize_with_accuracy_control]": 292.766,
-    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_openvino_anomaly_stfpm_quantize_with_accuracy_control]": 443.025,
-    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_openvino_mobilenet_v2_quantize]": 169.789,
-    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_openvino_yolo8_quantize]": 170.593,
-    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_openvino_yolo8_quantize_with_accuracy_control]": 205.533,
-    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_tensorflow_mobilenet_v2]": 149.202,
-    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_torch_mobilenet_v2]": 192.227,
-    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_torch_ssd300_vgg16]": 231.613,
-    "tests/cross_fw/examples/test_examples.py::test_examples[quantization_aware_training_torch_anomalib]": 478.797,
-    "tests/cross_fw/examples/test_examples.py::test_examples[quantization_aware_training_torch_resnet18]": 1251.144,
-    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_torch_fx_resnet18]": 412.243,
-    "tests/cross_fw/examples/test_examples.py::test_examples[fp8_llm_quantization]": 229.69,
-    "tests/cross_fw/examples/test_examples.py::test_examples[quantization_aware_training_tensorflow_mobilenet_v2]": 1500.00,
-    "tests/cross_fw/examples/test_examples.py::test_examples[llm_compression_qat_with_lora]": 665
+    "tests/cross_fw/examples/test_examples.py::test_examples[llm_compression]": 197,
+    "tests/cross_fw/examples/test_examples.py::test_examples[llm_compression_synthetic]": 717,
+    "tests/cross_fw/examples/test_examples.py::test_examples[llm_tune_params]": 824,
+    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_onnx_mobilenet_v2]": 207,
+    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_onnx_yolo8_quantize_with_accuracy_control]": 800,
+    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_openvino_anomaly_stfpm_quantize_with_accuracy_control]": 455,
+    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_openvino_mobilenet_v2_quantize]": 171,
+    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_openvino_yolo8_quantize]": 192,
+    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_openvino_yolo8_quantize_with_accuracy_control]": 203,
+    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_tensorflow_mobilenet_v2]": 148,
+    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_torch_mobilenet_v2]": 185,
+    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_torch_ssd300_vgg16]": 221,
+    "tests/cross_fw/examples/test_examples.py::test_examples[quantization_aware_training_torch_anomalib]": 547,
+    "tests/cross_fw/examples/test_examples.py::test_examples[quantization_aware_training_torch_resnet18]": 1189,
+    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_torch_fx_resnet18]": 178,
+    "tests/cross_fw/examples/test_examples.py::test_examples[fp8_llm_quantization]": 226,
+    "tests/cross_fw/examples/test_examples.py::test_examples[quantization_aware_training_tensorflow_mobilenet_v2]": 891,
+    "tests/cross_fw/examples/test_examples.py::test_examples[llm_compression_qat_with_lora]": 591
 }
@@ -1,17 +1,17 @@
 {
-    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_awq_backup_mode_none_backend_OV]": 269,
-    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_aware_awq_scale_estimation_backend_OV]": 421,
-    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_aware_awq_scale_estimation_stateful_backend_OV]": 374,
-    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_aware_awq_stateful_backend_OV]": 243,
-    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_aware_backend_OV]": 190,
-    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_aware_gptq_scale_estimation_stateful_backend_OV]": 1463,
-    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_aware_lora_stateful_backend_OV]": 483,
+    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_awq_backup_mode_none_backend_OV]": 249,
+    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_aware_awq_scale_estimation_backend_OV]": 368,
+    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_aware_awq_scale_estimation_stateful_backend_OV]": 371,
+    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_aware_awq_stateful_backend_OV]": 205,
+    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_aware_backend_OV]": 206,
+    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_aware_gptq_scale_estimation_stateful_backend_OV]": 1161,
+    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_aware_lora_stateful_backend_OV]": 473,
     "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_free_backend_FP32]": 0,
-    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_free_backend_OV]": 196,
-    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_int4_data_free_backend_TORCH]": 133,
-    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_int8_data_free_backend_TORCH]": 154,
-    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_NF4_scale_estimation_stateful_per_channel_backend_OV]": 256,
-    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_scale_estimation_per_channel_backend_OV]": 258,
-    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_free_awq_backend_OV]": 200,
-    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_free_awq_backend_TORCH]": 200
+    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_free_backend_OV]": 187,
+    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_int4_data_free_backend_TORCH]": 123,
+    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_int8_data_free_backend_TORCH]": 165,
+    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_NF4_scale_estimation_stateful_per_channel_backend_OV]": 193,
+    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_scale_estimation_per_channel_backend_OV]": 251,
+    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_free_awq_backend_OV]": 164,
+    "tests/post_training/test_quantize_conformance.py::test_weight_compression[tinyllama_data_free_awq_backend_TORCH]": 210
 }
@@ -366,7 +366,7 @@ def _validate(self) -> None:
                 load_in_8bit=False,
                 compile=False,
                 stateful=is_stateful,
-                ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0", "KV_CACHE_PRECISION": "f16"},
+                ov_config={"KV_CACHE_PRECISION": "f16"},
             )
         if self.backend == BackendType.FX_TORCH:
             compressed_model_hf = FXAutoModelForCausalLM(self.model, self.model_config)

@@ -82,7 +82,7 @@ def get_ov_model(model: AutoModelForCausalLM, tmp_path: str) -> OVModelForCausal
         trust_remote_code=True,
         load_in_8bit=False,
         compile=True,
-        ov_config={"KV_CACHE_PRECISION": "f16", "DYNAMIC_QUANTIZATION_GROUP_SIZE": "0"},
+        ov_config={"KV_CACHE_PRECISION": "f16"},
     )