-
Notifications
You must be signed in to change notification settings - Fork 3.1k
[NPUW] gemma-2 patterns added to preserve tail constants matcher #32465
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 22 commits
6ad8b26
1209046
8c2da6f
29cb65e
1ea470e
3c19c32
be1e6d4
88c3e74
3125293
84cb7bc
5274f04
268048a
0a811c9
b872130
2ca0339
a9af6f9
fb12f9a
240e41b
a0186bd
d673f36
d52dc2e
8e04ee1
87029d7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -43,6 +43,23 @@ | |
|
|
||
| namespace opp = ov::pass::pattern; | ||
|
|
||
| // specific function that matches a subgraph appearing as a result of LPT transformations | ||
| auto match_down_up_convert_subgraph_after_lpt = [](const ov::Output<ov::Node>& input) { | ||
| auto upconvert = opp::wrap_type<ov::op::v0::Convert>({input}, opp::type_matches(ov::element::f32)); | ||
|
|
||
| auto upscale = opp::wrap_type<ov::op::v0::Constant>(opp::rank_equals(0)); | ||
| auto upmul = opp::wrap_type<ov::op::v1::Multiply>({upconvert, upscale}); | ||
|
|
||
| auto downscale = opp::wrap_type<ov::op::v0::Constant>(opp::rank_equals(0)); | ||
| auto downmul = opp::wrap_type<ov::op::v1::Multiply>({upmul, downscale}); | ||
|
|
||
| auto downconvert = | ||
| opp::wrap_type<ov::op::v0::Convert>({downmul}, | ||
| opp::type_matches_any({ov::element::f8e4m3, ov::element::f8e5m2})); | ||
|
|
||
| return downconvert; | ||
| }; | ||
|
|
||
| class RemoveEmptyKVTensors : public ov::pass::MatcherPass { | ||
| public: | ||
| OPENVINO_MATCHER_PASS_RTTI("npuw::LLMCompiledModel::RemoveEmptyKVTensors"); | ||
|
|
@@ -54,7 +71,10 @@ class RemoveEmptyKVTensors : public ov::pass::MatcherPass { | |
|
|
||
| RemoveEmptyKVTensors(Context::Ref ctx) { | ||
| auto param = opp::wrap_type<ov::op::v0::Parameter>(); | ||
| auto concat = opp::wrap_type<ov::op::v0::Concat>({param, opp::any_input()}); | ||
| auto param_or = | ||
| std::make_shared<opp::op::Or>(ov::OutputVector{param, match_down_up_convert_subgraph_after_lpt(param)}); | ||
|
|
||
| auto concat = opp::wrap_type<ov::op::v0::Concat>({param_or, opp::any_input()}); | ||
|
|
||
| auto callback = [=](opp::Matcher& m) { | ||
| auto& node_to_output = m.get_pattern_value_map(); | ||
|
|
@@ -63,15 +83,28 @@ class RemoveEmptyKVTensors : public ov::pass::MatcherPass { | |
|
|
||
| ctx.get().old_params.push_back(matched_param); | ||
|
|
||
| auto users = matched_param->get_users(); | ||
| if (users.size() == 2u) { | ||
| auto shapeof_node = ov::is_type<ov::op::v3::ShapeOf>(users[0]) ? users[0] : users[1]; | ||
| NPUW_ASSERT(ov::is_type<ov::op::v3::ShapeOf>(shapeof_node)); | ||
| auto cst_node = | ||
| ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, matched_param->get_shape()); | ||
| ov::replace_node(shapeof_node, cst_node); | ||
| } else { | ||
| NPUW_ASSERT(users.size() == 1u); | ||
| // Use concat's first input source node to find ShapeOf users. | ||
| // This works universally for both the plain parameter and the down_up_convert subgraph cases, | ||
| // because in the subgraph case matched_param->get_users() would return the Convert | ||
| // node (first node of the subgraph), not the ShapeOf. | ||
| auto concat_input0_node = matched_node_concat->input(0).get_source_output().get_node_shared_ptr(); | ||
| auto users = concat_input0_node->get_users(); | ||
|
|
||
| // In the subgraph case the parameter itself may also have a ShapeOf user, | ||
| // so check both the concat input node and the parameter. | ||
| if (concat_input0_node != matched_param) { | ||
| auto param_users = matched_param->get_users(); | ||
| users.insert(users.end(), param_users.begin(), param_users.end()); | ||
| } | ||
|
|
||
| // Remove duplicates (concat itself will appear in users) | ||
|
||
| // Find and replace ShapeOf nodes with constants | ||
| for (auto& user : users) { | ||
| if (ov::is_type<ov::op::v3::ShapeOf>(user)) { | ||
| auto cst_node = | ||
| ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, matched_param->get_shape()); | ||
| ov::replace_node(user, cst_node); | ||
| } | ||
| } | ||
|
|
||
| // Redirect second concat input to every node which reads from concat | ||
|
|
@@ -323,22 +356,6 @@ class GroupQueryAttentionDecomposition : public ov::pass::MatcherPass { | |
| class RedirectNewKvToOutput : public ov::pass::MatcherPass { | ||
| public: | ||
| RedirectNewKvToOutput() { | ||
| auto match_down_up_convert_subgraph = [](const ov::Output<ov::Node>& input) { | ||
| auto upconvert = opp::wrap_type<ov::op::v0::Convert>({input}, opp::type_matches(ov::element::f32)); | ||
|
|
||
| auto upscale = opp::wrap_type<ov::op::v0::Constant>(opp::rank_equals(0)); | ||
| auto upmul = opp::wrap_type<ov::op::v1::Multiply>({upconvert, upscale}); | ||
|
|
||
| auto downscale = opp::wrap_type<ov::op::v0::Constant>(opp::rank_equals(0)); | ||
| auto downmul = opp::wrap_type<ov::op::v1::Multiply>({upmul, downscale}); | ||
|
|
||
| auto downconvert = | ||
| opp::wrap_type<ov::op::v0::Convert>({downmul}, | ||
| opp::type_matches_any({ov::element::f8e4m3, ov::element::f8e5m2})); | ||
|
|
||
| return downconvert; | ||
| }; | ||
|
|
||
| // example of fp8 inputs to concat | ||
| // input0 : float8e4m3[1,32,1151,96] | ||
| // input1 : float8e4m3[1,32,1,96] | ||
|
|
@@ -348,13 +365,13 @@ class RedirectNewKvToOutput : public ov::pass::MatcherPass { | |
| // TODO: this matcher logic would be better covered by unit tests | ||
| auto input0 = opp::wrap_type<ov::op::v0::Parameter>(); | ||
| auto input0_or = | ||
| std::make_shared<opp::op::Or>(ov::OutputVector{input0, match_down_up_convert_subgraph(input0)}); | ||
| std::make_shared<opp::op::Or>(ov::OutputVector{input0, match_down_up_convert_subgraph_after_lpt(input0)}); | ||
|
|
||
| auto input1 = opp::any_input(); | ||
|
|
||
| auto kv_concat = opp::wrap_type<ov::op::v0::Concat>({input0_or, input1}); | ||
| auto result1 = opp::wrap_type<ov::op::v0::Result>(kv_concat); | ||
| auto result2 = opp::wrap_type<ov::op::v0::Result>(match_down_up_convert_subgraph(kv_concat)); | ||
| auto result2 = opp::wrap_type<ov::op::v0::Result>(match_down_up_convert_subgraph_after_lpt(kv_concat)); | ||
|
|
||
| auto result_or = std::make_shared<opp::op::Or>(ov::OutputVector{result1, result2}); | ||
|
|
||
|
|
@@ -1162,6 +1179,7 @@ struct NPUDesc { | |
| std::string arch; | ||
| int64_t max_tiles = 0; | ||
| bool compiler_dq = false; | ||
| bool compiler_matmul_gate = false; | ||
| int64_t compiler_ver = 0; | ||
| bool support_flash_attention_tile = false; | ||
| }; | ||
|
|
@@ -1199,6 +1217,19 @@ std::optional<NPUDesc> extract_npu_descriptor(const std::shared_ptr<const ov::IP | |
| ov::AnyMap{{ov::intel_npu::compiler_type.name(), target_compiler_type}}) | ||
| .as<int64_t>(); | ||
| } | ||
| LOG_INFO("Compiler version: " << ONEAPI_VERSION_MAJOR(desc.compiler_ver) << "." | ||
| << ONEAPI_VERSION_MINOR(desc.compiler_ver)); | ||
|
|
||
| constexpr std::string_view compiler_gate_support_msg = | ||
| "Compiler: accurate gated matmul (MatMul -> Divide -> Tanh -> Multiply -> Result) : "; | ||
|
|
||
| if (desc.compiler_ver >= ONEAPI_MAKE_VERSION(7, 28)) { | ||
| // accuracy for gated matmul fixed at 7.28 | ||
| desc.compiler_matmul_gate = true; | ||
| LOG_INFO(compiler_gate_support_msg << "supported"); | ||
| } else { | ||
| LOG_WARN(compiler_gate_support_msg << "unsupported"); | ||
| } | ||
|
|
||
| if (desc.arch == "5010" && desc.compiler_ver >= ONEAPI_MAKE_VERSION(7, 29)) { | ||
| // Flash attention tile is supported starting from compiler version 7.29 on NPU5010 | ||
|
|
@@ -1247,6 +1278,13 @@ ov::AnyMap get_baseline_common_config(const std::optional<NPUDesc>& npudesc) { | |
| config.erase("NPUW_DCOFF_TYPE"); | ||
| config.erase("NPUW_DCOFF_SCALE"); | ||
| } | ||
|
|
||
| // default value is ON | ||
| // for compiler versions >= 7.28 value is ON | ||
| // for other compiler versions value is OFF | ||
| if (npudesc.has_value()) { | ||
| config.emplace("NPUW_MM_GATED", (npudesc->compiler_matmul_gate ? "YES" : "NO")); | ||
| } | ||
| return config; | ||
| } | ||
|
|
||
|
|
@@ -1877,7 +1915,6 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m | |
|
|
||
| if (!m_is_embedding) { | ||
| if (!m_use_chunk_prefill) { | ||
| // TODO: sometimes it is ok if we cannot find any empty inputs or not? | ||
| NPUW_ASSERT(remove_empty_kv_inputs(prefill_model)); | ||
| } else { | ||
| LOG_DEBUG("Don't remove input key/values from prefill model."); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Great catch!!