Skip to content

Aanuf/sdpa v fp8 #3485

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 34 commits into from
May 30, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
488cacc
Support scale estimation inside GPTQ
alexsu52 Jun 10, 2024
ee64877
fix for INT4_ASYM
alexsu52 Sep 4, 2024
f22e411
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Sep 23, 2024
51b4d7b
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Sep 26, 2024
f66cd1e
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Sep 30, 2024
7ce5a53
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Oct 2, 2024
f74d156
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Nov 11, 2024
5288c79
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Nov 11, 2024
1becf15
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Nov 14, 2024
047d7d9
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Dec 10, 2024
c0c7e57
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Dec 16, 2024
b74dea1
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Dec 27, 2024
26a9a77
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Jan 7, 2025
25fcc2c
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Feb 25, 2025
26d4887
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Mar 12, 2025
7748233
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Apr 1, 2025
df251b3
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Apr 8, 2025
4c134c4
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Apr 9, 2025
6147097
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Apr 14, 2025
2b94d28
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr May 7, 2025
b77d1d6
Enabled quantization of V in SDPA for FP8 type.
andreyanufr May 8, 2025
e04c939
Fix.
andreyanufr May 8, 2025
ebc3715
Merge remote-tracking branch 'upstream/develop' into aanuf/SDPA_V_fp8
andreyanufr May 20, 2025
e87c3a1
Fixed mypy test.
andreyanufr May 20, 2025
47a5194
Added test for fp8 SDPA.
andreyanufr May 21, 2025
4965782
Fixed for mypy.
andreyanufr May 21, 2025
08f35bb
Fixed for mypy.
andreyanufr May 21, 2025
edb8578
Apply suggestions.
andreyanufr May 27, 2025
9f198e2
rfc
alexsu52 May 28, 2025
8d8e0f5
next commit
alexsu52 May 28, 2025
6bbdfd3
target_inputs_port
alexsu52 May 28, 2025
fe7982a
1
alexsu52 May 28, 2025
05650ce
SDPA fp8 V using scope overrides.
andreyanufr May 28, 2025
6946e8e
Fixed bug.
andreyanufr May 28, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions nncf/common/quantization/quantizer_propagation/solver.py
Original file line number Diff line number Diff line change
Expand Up @@ -1099,12 +1099,21 @@ def _setup_initial_quantizers_for_operator_node(
qconf_list = deepcopy(self.default_global_qconfig_list)
assert qconf_list is not None

nncf_node_name = next(
iter(quant_prop_graph.op_node_keys_to_underlying_nodes_mapping[operator_node_key])
).node_name
if not HWConfig.is_wildcard_quantization(qconf_list):
nncf_node_ref = next(iter(quant_prop_graph.op_node_keys_to_underlying_nodes_mapping[operator_node_key]))
qconf_list = self._filter_qconfigs_according_to_scope(qconf_list, nncf_node_ref.node_name)
qconf_list = self._filter_qconfigs_according_to_scope(qconf_list, nncf_node_name)
else:
qconf_list = [deepcopy(DEFAULT_QUANTIZER_CONFIG)]

op_override_params = {}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you think about introducing a separate function?

op_scope_overrides = self._scope_overrides.get("operations", {})
for overridden_scope, scoped_override_dict in op_scope_overrides.items():
if matches_any(nncf_node_name, overridden_scope):
op_override_params.update(scoped_override_dict)
target_input_ports = op_override_params.get("target_input_ports", metatype.target_input_ports)

is_unified_scale = metatype in self._unified_scales_operation_set
if is_unified_scale:
# Filtering out the per-channel cases in the unified scale scenario.
Expand Down Expand Up @@ -1147,7 +1156,7 @@ def _setup_initial_quantizers_for_operator_node(
if input_port_id in metatype.ignored_input_ports:
continue

if metatype.target_input_ports is not None and input_port_id not in metatype.target_input_ports:
if target_input_ports is not None and input_port_id not in target_input_ports:
continue

edge = quant_prop_graph.edges[pred_ip_key, operator_node_key]
Expand Down
11 changes: 9 additions & 2 deletions nncf/quantization/algorithms/min_max/algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,11 +289,14 @@ def _set_mode_based_params(self) -> None:
if getattr(self, self_name) is None:
setattr(self, self_name, default_value)

def _is_fp8(self):
    """Return True if the configured quantization mode is one of the FP8 variants."""
    fp8_modes = (QuantizationMode.FP8_E4M3, QuantizationMode.FP8_E5M2)
    return self._mode in fp8_modes

def _review_mode_based_params(self):
"""
Reviews parameter values because mode option doesn't support them.
"""
if self._mode in (QuantizationMode.FP8_E4M3, QuantizationMode.FP8_E5M2):
if self._is_fp8():
nncf_logger.warning(f"You're using experimental option mode with {self._mode} value.")

if self._preset != QuantizationPreset.PERFORMANCE:
Expand Down Expand Up @@ -635,10 +638,14 @@ def _get_scope_overrides(self, inference_nncf_graph: NNCFGraph) -> dict:
)
]

target_input_ports = [0, 1, 2] if self._is_fp8() else [0, 1]

scope_overrides_activations = {}
scope_overrides_operations = {}
for node_name in scaled_dot_product_attention_node_names:
scope_overrides_activations[node_name] = {"mode": "symmetric"}
return {"activations": scope_overrides_activations}
scope_overrides_operations[node_name] = {"target_input_ports": target_input_ports}
return {"activations": scope_overrides_activations, "operations": scope_overrides_operations}

def _get_quantizer_setup(
self,
Expand Down
5 changes: 4 additions & 1 deletion tests/openvino/native/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -886,7 +886,7 @@ def _create_ov_model(self, weights_dtype: Optional[ov.Type] = None, activation_d


class ScaledDotProductAttentionModel(OVReferenceModel):
def _create_ov_model(self):
def _create_ov_model(self, with_weights=False):
input_ = opset.parameter([1, 1, 1, 64], name="Input_1")
attn_mask = opset.parameter([1, 1, 1, 1], name="Input_2")
x = opset.reshape(input_, [64], False)
Expand All @@ -898,6 +898,9 @@ def _create_ov_model(self):
for _ in range(3):
x_ = opset.reshape(x, [64], False)
x_ = opset.reshape(x_, [1, 1, 1, 64], False)
if with_weights:
w_ = opset.constant(self._rng.random((64, 64)), dtype=np.float32)
x_ = opset.matmul(x_, w_, transpose_a=False, transpose_b=False)
inputs.append(x_)

attn = opset.scaled_dot_product_attention(*inputs, attn_mask)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from tests.openvino.native.models import FPModel
from tests.openvino.native.models import LinearModel
from tests.openvino.native.models import MatMul2DModel
from tests.openvino.native.models import ScaledDotProductAttentionModel
from tests.openvino.native.models import UnifiedScalesModel
from tests.openvino.native.models import WeightsModel
from tests.openvino.native.models import get_torch_model_info
Expand Down Expand Up @@ -215,3 +216,21 @@ def test_fq_precision_orig_fp32model(const_dtype, input_dtype, inplace_statistic
fq_input_node = inp_node.get_source_output().get_node()
if fq_input_node.get_type_name() == "Constant":
assert op.get_element_type() == input_dtype


@pytest.mark.parametrize(
    "mode, num_quantizers, quantizer_name",
    (
        (QuantizationMode.FP8_E4M3, 7, "FakeConvert"),  # 3 for weights + 1 activation + 3 for SDPA
        (QuantizationMode.FP8_E5M2, 7, "FakeConvert"),  # 3 for weights + 1 activation + 3 for SDPA
        (None, 6, "FakeQuantize"),  # 3 for weights + 1 activation + 2 for SDPA
    ),
)
@pytest.mark.parametrize("model_creator_func", [ScaledDotProductAttentionModel])
def test_sdpa_layer(mode, num_quantizers, quantizer_name, model_creator_func):
    """Quantize an SDPA model (with weighted Q/K/V branches) in the given mode
    and check that the expected number of quantizer nodes is inserted."""
    reference_model = model_creator_func(with_weights=True)
    quantized = quantize_model(reference_model.ov_model, {"mode": mode})

    fq_stats = get_fq_nodes_stats_algo(quantized)

    assert len(fq_stats) == num_quantizers, f"Expected {num_quantizers} {quantizer_name}, but got {len(fq_stats)}"
Loading