From 6ad8b26806e5a462fe353f1483aa712001fa4b18 Mon Sep 17 00:00:00 2001 From: Eugene Smirnov Date: Sat, 18 Oct 2025 00:39:39 +0200 Subject: [PATCH 01/14] gemma-2 patterns added to preserve tail constants matcher --- .../plugin/npuw/partitioning/patterns/opt.cpp | 31 ++++++++++++++++--- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp index 2f8df0ed28e326..7860470234aa38 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp @@ -1929,7 +1929,17 @@ PreserveConstDictMatMulAsymm::PreserveConstDictMatMulAsymm(PreserveConstDictMatM auto qcvtm = opp::wrap_type({qmuls}); auto qmmi = opp::any_input(); auto qmm = opp::wrap_type({qmmi, qcvtm}); - auto qres = opp::wrap_type({qmm}); + + + // MatMul -> Divide -> Tanh -> Multiply -> Result + auto div = opp::wrap_type({qmm, opp::any_input()}); + auto tanh = opp::wrap_type({div}); + auto matmul_multiply = opp::wrap_type({tanh, opp::any_input()}); + + auto matmul_or = std::make_shared(ov::OutputVector{qmm->output(0), + matmul_multiply->output(0)}); + + auto qres = opp::wrap_type({matmul_or}); // Note: Use [=] to make sure the above objects stay alive in the callback auto callback = [=](ov::pass::pattern::Matcher& m) { @@ -1968,9 +1978,19 @@ PreserveConstDictMatMulSymm::PreserveConstDictMatMulSymm(PreserveConstDictMatMul auto qcoeff = opp::wrap_type(); auto qcvtw = opp::wrap_type({qweight}); auto qmuls = opp::wrap_type({qcvtw, qcoeff}); + auto optional_kvt = opp::optional({qmuls}); auto qmmi = opp::any_input(); - auto qmm = opp::wrap_type({qmmi, qmuls}); - auto qres = opp::wrap_type({qmm}); + auto qmm = opp::wrap_type({qmmi, optional_kvt}); + + // MatMul -> Divide -> Tanh -> Multiply -> Result + auto div = opp::wrap_type({qmm, opp::any_input()}); + auto tanh = opp::wrap_type({div}); + auto matmul_multiply = opp::wrap_type({tanh, opp::any_input()}); + + auto matmul_or = std::make_shared(ov::OutputVector{qmm->output(0), + matmul_multiply->output(0)}); + + auto qres = opp::wrap_type({matmul_or}); // Note: Use [=] to make sure the above objects stay alive in the callback auto callback = [=](ov::pass::pattern::Matcher& m) { @@ -1988,11 +2008,12 @@ PreserveConstDictMatMulSymm::PreserveConstDictMatMulSymm(PreserveConstDictMatMul if ((ov::element::f8e4m3 == matched_qweight->get_element_type() || ov::element::f8e5m2 == matched_qweight->get_element_type() || - ov::element::f8e8m0 == matched_qweight->get_element_type()) && + ov::element::f8e8m0 == matched_qweight->get_element_type() || + ov::element::i8 == matched_qweight->get_element_type()) && qcoeff_shape[1] == 1 && !matched_matmul->get_transpose_a() && matched_matmul->get_transpose_b()) { to_keep.get().push_back(matched_qweight); to_keep.get().push_back(matched_qcoeff); - return false; // root hasn't changed + return true; // root hasn't changed } return false; // root hasn't changed }; From 29cb65e8b145c8d64dfe368b3cbd918d35d58a79 Mon Sep 17 00:00:00 2001 From: esmirno1 Date: Mon, 15 Dec 2025 11:46:00 +0000 Subject: [PATCH 02/14] review fixes --- .../src/plugin/npuw/partitioning/patterns/opt.cpp | 15 +++++++-------- .../src/plugin/npuw/partitioning/patterns/opt.hpp | 6 +++--- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp index 7860470234aa38..6e26bf8e98a2fe 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp @@ -1930,14 +1930,13 @@ PreserveConstDictMatMulAsymm::PreserveConstDictMatMulAsymm(PreserveConstDictMatM auto qmmi = opp::any_input(); auto qmm = opp::wrap_type({qmmi, qcvtm}); - // MatMul -> Divide -> Tanh -> Multiply -> Result auto div = opp::wrap_type({qmm, opp::any_input()}); auto tanh = opp::wrap_type({div}); auto matmul_multiply = opp::wrap_type({tanh, opp::any_input()}); - auto matmul_or = std::make_shared(ov::OutputVector{qmm->output(0), - matmul_multiply->output(0)}); + auto matmul_or = + std::make_shared(ov::OutputVector{qmm->output(0), matmul_multiply->output(0)}); auto qres = opp::wrap_type({matmul_or}); @@ -1973,7 +1972,7 @@ PreserveConstDictMatMulAsymm::PreserveConstDictMatMulAsymm(PreserveConstDictMatM // Const(S) ----------------> Multiply -> MatMul -> Result // ???(Act) ----------------------------> -PreserveConstDictMatMulSymm::PreserveConstDictMatMulSymm(PreserveConstDictMatMulSymm::Results to_keep) { +PreserveConstDictMatMulFP8::PreserveConstDictMatMulFP8(PreserveConstDictMatMulFP8::Results to_keep) { auto qweight = opp::wrap_type(); auto qcoeff = opp::wrap_type(); auto qcvtw = opp::wrap_type({qweight}); @@ -1987,8 +1986,8 @@ PreserveConstDictMatMulSymm::PreserveConstDictMatMulSymm(PreserveConstDictMatMul auto tanh = opp::wrap_type({div}); auto matmul_multiply = opp::wrap_type({tanh, opp::any_input()}); - auto matmul_or = std::make_shared(ov::OutputVector{qmm->output(0), - matmul_multiply->output(0)}); + auto matmul_or = + std::make_shared(ov::OutputVector{qmm->output(0), matmul_multiply->output(0)}); auto qres = opp::wrap_type({matmul_or}); @@ -2013,11 +2012,11 @@ PreserveConstDictMatMulSymm::PreserveConstDictMatMulSymm(PreserveConstDictMatMul qcoeff_shape[1] == 1 && !matched_matmul->get_transpose_a() && matched_matmul->get_transpose_b()) { to_keep.get().push_back(matched_qweight); to_keep.get().push_back(matched_qcoeff); - return true; // root hasn't changed + return false; // root hasn't changed } return false; // root hasn't changed }; - register_matcher(std::make_shared(qres, "OptPreserveConstDictMatMulSymm"), std::move(callback)); + register_matcher(std::make_shared(qres, "OptPreserveConstDictMatMulFP8"), std::move(callback)); } SliceLastMatmul::SliceLastMatmul() { diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp index f25f03c4b0bb94..9cc5c4ca5b1b04 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp @@ -232,14 +232,14 @@ class PreserveConstDictMatMulAsymm : public ov::pass::MatcherPass { PreserveConstDictMatMulAsymm(Results to_keep); }; -class PreserveConstDictMatMulSymm : public ov::pass::MatcherPass { +class PreserveConstDictMatMulFP8 : public ov::pass::MatcherPass { public: - OPENVINO_MATCHER_PASS_RTTI("npuw::patterns::opt::PreserveConstDictMatMulSymm"); + OPENVINO_MATCHER_PASS_RTTI("npuw::patterns::opt::PreserveConstDictMatMulFP8"); using CPtr = std::shared_ptr; using Results = std::reference_wrapper>; - PreserveConstDictMatMulSymm(Results to_keep); + PreserveConstDictMatMulFP8(Results to_keep); }; // Slice last Matmul From 3c19c327fea170d3af8ee7690378b31fb1c193d0 Mon Sep 17 00:00:00 2001 From: Eugene Smirnov Date: Mon, 15 Dec 2025 23:04:10 +0100 Subject: [PATCH 03/14] build fixes --- src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp | 2 +- .../intel_npu/src/plugin/npuw/partitioning/partitioning.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp index 27a21d64704001..4167a744b59eb1 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp @@ -609,7 +609,7 @@ bool ov::npuw::CompiledModel::should_use_quantized_host_gather(const std::shared ov::pass::GraphRewrite rewr2; rewr2.add_matcher(std::ref(to_keep)); - rewr2.add_matcher(std::ref(to_keep)); + rewr2.add_matcher(std::ref(to_keep)); rewr2.run_on_model(model); // FIXME: since 3-model pipeline is the default option, the tail will be separate, // so we need to match either head or tail pattern here for host gather quantized feature to work. diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp index 491f1bd477eff5..cfb883e7c9966f 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp @@ -1462,7 +1462,7 @@ void Partitioner::saveTailDictConstants(const std::string& func_name) { ov::pass::GraphRewrite rewr; rewr.add_matcher(std::ref(to_keep)); - rewr.add_matcher(std::ref(to_keep)); + rewr.add_matcher(std::ref(to_keep)); rewr.run_on_model(model_group.front()); for (auto&& const_to_keep : to_keep) { From 31252936d73460329c8c272ba1b173bf34443d3c Mon Sep 17 00:00:00 2001 From: Eugene Smirnov Date: Mon, 9 Mar 2026 18:34:20 +0100 Subject: [PATCH 04/14] version check added to preserve constants feature --- .../al/include/intel_npu/config/config.hpp | 7 +++ .../src/al/include/intel_npu/config/npuw.hpp | 1 + .../intel_npu/npuw_private_properties.hpp | 9 ++++ .../intel_npu/src/al/src/config/npuw.cpp | 1 + .../src/plugin/npuw/compiled_model.cpp | 7 ++- .../src/plugin/npuw/llm_compiled_model.cpp | 23 +++++++++- .../plugin/npuw/partitioning/partitioning.cpp | 7 ++- .../plugin/npuw/partitioning/partitioning.hpp | 9 ++-- .../plugin/npuw/partitioning/patterns/opt.cpp | 45 +++++++++++-------- .../plugin/npuw/partitioning/patterns/opt.hpp | 5 ++- .../npuw/partitioning/patterns/sdpa.cpp | 4 +- .../intel_npu/src/plugin/src/plugin.cpp | 1 + .../intel_npu/src/plugin/src/properties.cpp | 1 + .../internal/plugin/test_properties.hpp | 1 + 14 files changed, 89 insertions(+), 32 deletions(-) diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/config.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/config.hpp index ebae76152394f6..8204217ed27e79 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/config/config.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/config.hpp @@ -57,6 +57,13 @@ TYPE_PRINTER(std::size_t) #ifndef ONEAPI_MAKE_VERSION /// @brief Generates generic 'oneAPI' API versions # define ONEAPI_MAKE_VERSION(_major, _minor) ((_major << 16) | (_minor & 0x0000ffff)) + +/// @brief extract 'oneAPI' API major version +# define ONEAPI_VERSION_MAJOR(_version) ((_version) >> 16) + +/// @brief extract 'oneAPI' API minor version +# define ONEAPI_VERSION_MINOR(_version) ((_version) & 0x0000ffff) + #endif // ONEAPI_MAKE_VERSION // diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp index 4773cf0c06c476..31825a5de6ae77 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp @@ -112,6 +112,7 @@ DEFINE_OPT(NPUW_CWAI, bool, false, npuw::partitioning::cwai, RunTime); DEFINE_OPT(NPUW_DQ, bool, false, npuw::partitioning::dyn_quant, RunTime); DEFINE_OPT(NPUW_DQ_FULL, bool, true, npuw::partitioning::dyn_quant_full, RunTime); DEFINE_OPT(NPUW_PMM, std::string, "2", npuw::partitioning::par_matmul_merge_dims, RunTime); +DEFINE_OPT(NPUW_MM_GATED, bool, true, npuw::partitioning::matmul_gate_preserve_constants, RunTime); DEFINE_OPT(NPUW_SLICE_OUT, bool, false, npuw::partitioning::slice_out, RunTime); DEFINE_OPT(NPUW_HOST_GATHER, bool, true, npuw::partitioning::host_gather, RunTime); DEFINE_OPT(NPUW_SPATIAL, bool, false, npuw::partitioning::spatial, RunTime); diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp index 7771184166bb12..8fe337c82094ac 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp @@ -216,6 +216,15 @@ static constexpr ov::Property dyn_quant_full{"NPUW_DQ_FULL"}; */ static constexpr ov::Property par_matmul_merge_dims{"NPUW_PMM"}; +/** + * @brief + * Type: bool. + * whether to preserve constants for gated version of matmul + * on some version of compiler - might produce incorrect results when enabled + * Default value: YES + */ +static constexpr ov::Property matmul_gate_preserve_constants{"NPUW_MM_GATED"}; + /** * @brief * Type: bool. diff --git a/src/plugins/intel_npu/src/al/src/config/npuw.cpp b/src/plugins/intel_npu/src/al/src/config/npuw.cpp index bdd984c2aac873..b503377ab061f3 100644 --- a/src/plugins/intel_npu/src/al/src/config/npuw.cpp +++ b/src/plugins/intel_npu/src/al/src/config/npuw.cpp @@ -29,6 +29,7 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) { desc.add(); desc.add(); desc.add(); + desc.add(); desc.add(); desc.add(); desc.add(); diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp index 4e6233a56e969d..8556124c02a317 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp @@ -635,8 +635,10 @@ bool ov::npuw::CompiledModel::should_use_quantized_host_gather(const std::shared std::vector to_keep; ov::pass::GraphRewrite rewr2; - rewr2.add_matcher(std::ref(to_keep)); - rewr2.add_matcher(std::ref(to_keep)); + ctx.mm_gate = m_cfg.get<::intel_npu::NPUW_MM_GATED>(); + + rewr2.add_matcher(std::ref(ctx), std::ref(to_keep)); + rewr2.add_matcher(std::ref(ctx), std::ref(to_keep)); rewr2.run_on_model(model); // FIXME: since 3-model pipeline is the default option, the tail will be separate, // so we need to match either head or tail pattern here for host gather quantized feature to work. @@ -2506,6 +2508,7 @@ void ov::npuw::CompiledModel::implement_properties() { BIND(npuw::partitioning::dyn_quant, NPUW_DQ), BIND(npuw::partitioning::dyn_quant_full, NPUW_DQ_FULL), BIND(npuw::partitioning::par_matmul_merge_dims, NPUW_PMM), + BIND(npuw::partitioning::matmul_gate_preserve_constants, NPUW_MM_GATED), BIND(npuw::partitioning::slice_out, NPUW_SLICE_OUT), BIND(npuw::partitioning::spatial, NPUW_SPATIAL), BIND(npuw::partitioning::spatial_nway, NPUW_SPATIAL_NWAY), diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp index b00973605e5a2e..9f5ba74a2e944f 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp @@ -1162,6 +1162,7 @@ struct NPUDesc { std::string arch; int64_t max_tiles = 0; bool compiler_dq = false; + bool compiler_matmul_gate = false; int64_t compiler_ver = 0; }; @@ -1184,6 +1185,18 @@ std::optional extract_npu_descriptor(const std::shared_ptrget_property(ov::intel_npu::compiler_version.name(), ov::AnyMap{}).as(); + LOG_INFO("Compiler version: " << ONEAPI_VERSION_MAJOR(desc.compiler_ver) << "." << ONEAPI_VERSION_MINOR(desc.compiler_ver)); + + constexpr std::string_view compiler_gate_support_msg + = "Compiler: accurate gated matmul (MatMul -> Divide -> Tanh -> Multiply -> Result) : "; + + if (desc.compiler_ver >= ONEAPI_MAKE_VERSION(7, 28)) { + // accuracy for gated matmul fixed at 7.28 + desc.compiler_matmul_gate = true; + LOG_INFO(compiler_gate_support_msg << "supported"); + } else { + LOG_WARN(compiler_gate_support_msg << "unsupported"); + } return std::make_optional(std::move(desc)); } @@ -1227,6 +1240,12 @@ ov::AnyMap get_baseline_common_config(const std::optional& npudesc) { config.erase("NPUW_DCOFF_TYPE"); config.erase("NPUW_DCOFF_SCALE"); } + + //default version is ON - while for older compiler it might be turned off + if (npudesc.has_value()) { + config.emplace("NPUW_MM_GATED", (npudesc->compiler_matmul_gate ? "YES" : "NO")); + } + return config; } @@ -1239,6 +1258,7 @@ ov::AnyMap get_default_common_config(const std::optional& npudesc) { } else { config.emplace("NPUW_FUNCALL_FOR_ALL", "YES"); } + return config; } @@ -1255,6 +1275,7 @@ ov::AnyMap get_default_prefill_config(const std::shared_ptr& model, c config.emplace("NPUW_PMM", "NO"); } } + return config; } @@ -1838,7 +1859,7 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr& m if (!m_is_embedding) { if (!m_use_chunk_prefill) { // TODO: sometimes it is ok if we cannot find any empty inputs or not? - NPUW_ASSERT(remove_empty_kv_inputs(prefill_model)); + remove_empty_kv_inputs(prefill_model); } else { LOG_DEBUG("Don't remove input key/values from prefill model."); LOG_DEBUG("Ask prefill model to output key/values for prefill chunk size tokens."); diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp index 5b3b4800b3be69..afd6c69de967c4 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp @@ -1494,9 +1494,12 @@ void Partitioner::saveTailDictConstants(const std::string& func_name) { using CPtr = std::shared_ptr; std::vector to_keep; + ov::npuw::patterns::opt::Context ctx; + ctx.mm_gate = cfg.get<::intel_npu::NPUW_MM_GATED>(); + ov::pass::GraphRewrite rewr; - rewr.add_matcher(std::ref(to_keep)); - rewr.add_matcher(std::ref(to_keep)); + rewr.add_matcher(std::ref(ctx), std::ref(to_keep)); + rewr.add_matcher(std::ref(ctx), std::ref(to_keep)); rewr.run_on_model(model_group.front()); for (auto&& const_to_keep : to_keep) { diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp index c8dff61741a6dd..51389725663892 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp @@ -76,7 +76,7 @@ struct Subgraph { using Ref = std::reference_wrapper; - void settag(const std::string& t) { + void settag(const std::string & t) { LOG_DEBUG("Subgraph set-tag=" << t); _tag = t; } @@ -111,16 +111,15 @@ struct Function { // FIXME: shouldn't be here. Needed to not unpack some lazy closures in DCOFF std::set _idx_lazy_unpack; - void settag(const std::string& t) { + void settag(const std::string & t) { LOG_DEBUG("Function set-tag=" << t); _tag = t; } std::string gettag() const { return _tag; } - private: - std::string _tag; // derived from the partitioning + std::string _tag; // derived from the partitioning }; struct Group { @@ -140,7 +139,7 @@ struct Group { ov::npuw::Subgraph sg; - void settag(const std::string& t) { + void settag(const std::string & t) { LOG_DEBUG("group set-tag=" << t); _tag = t; } diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp index 3fe2ae44830602..29bb144a5314ae 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp @@ -1919,7 +1919,7 @@ CompressDictMatMulf32::CompressDictMatMulf32(Context::Ref ctx) { // Const(S) ---------------------> Multiply -> to(f32) -> MatMul -> Result // ???(Act) --------------------------------------------> -PreserveConstDictMatMulAsymm::PreserveConstDictMatMulAsymm(PreserveConstDictMatMulAsymm::Results to_keep) { +PreserveConstDictMatMulAsymm::PreserveConstDictMatMulAsymm(Context::Ref ctx, PreserveConstDictMatMulAsymm::Results to_keep) { auto qweight = opp::wrap_type(); auto qcoeff = opp::wrap_type(); auto qzerop = opp::wrap_type(); @@ -1930,16 +1930,22 @@ PreserveConstDictMatMulAsymm::PreserveConstDictMatMulAsymm(PreserveConstDictMatM auto qcvtm = opp::wrap_type({qmuls}); auto qmmi = opp::any_input(); auto qmm = opp::wrap_type({qmmi, qcvtm}); + std::shared_ptr qres; + // MatMul -> Divide -> Tanh -> Multiply -> Result - auto div = opp::wrap_type({qmm, opp::any_input()}); - auto tanh = opp::wrap_type({div}); - auto matmul_multiply = opp::wrap_type({tanh, opp::any_input()}); + if (ctx.get().mm_gate) { + auto div = opp::wrap_type({qmm, opp::any_input()}); + auto tanh = opp::wrap_type({div}); + auto matmul_multiply = opp::wrap_type({tanh, opp::any_input()}); - auto matmul_or = - std::make_shared(ov::OutputVector{qmm->output(0), matmul_multiply->output(0)}); + auto matmul_or = + std::make_shared(ov::OutputVector{qmm->output(0), matmul_multiply->output(0)}); - auto qres = opp::wrap_type({matmul_or}); + qres = opp::wrap_type({matmul_or}); + } else { + qres = opp::wrap_type({qmm}); + } // Note: Use [=] to make sure the above objects stay alive in the callback auto callback = [=](ov::pass::pattern::Matcher& m) { @@ -1973,7 +1979,7 @@ PreserveConstDictMatMulAsymm::PreserveConstDictMatMulAsymm(PreserveConstDictMatM // Const(S) ----------------> Multiply -> MatMul -> Result // ???(Act) ----------------------------> -PreserveConstDictMatMulFP8::PreserveConstDictMatMulFP8(PreserveConstDictMatMulFP8::Results to_keep) { +PreserveConstDictMatMulFP8::PreserveConstDictMatMulFP8(Context::Ref ctx, PreserveConstDictMatMulFP8::Results to_keep) { auto qweight = opp::wrap_type(); auto qcoeff = opp::wrap_type(); auto qcvtw = opp::wrap_type({qweight}); @@ -1981,16 +1987,20 @@ PreserveConstDictMatMulFP8::PreserveConstDictMatMulFP8(PreserveConstDictMatMulFP auto optional_kvt = opp::optional({qmuls}); auto qmmi = opp::any_input(); auto qmm = opp::wrap_type({qmmi, optional_kvt}); + std::shared_ptr qres; + // // MatMul -> Divide -> Tanh -> Multiply -> Result + if (ctx.get().mm_gate) { + auto div = opp::wrap_type({qmm, opp::any_input()}); + auto tanh = opp::wrap_type({div}); + auto matmul_multiply = opp::wrap_type({tanh, opp::any_input()}); - // MatMul -> Divide -> Tanh -> Multiply -> Result - auto div = opp::wrap_type({qmm, opp::any_input()}); - auto tanh = opp::wrap_type({div}); - auto matmul_multiply = opp::wrap_type({tanh, opp::any_input()}); - - auto matmul_or = - std::make_shared(ov::OutputVector{qmm->output(0), matmul_multiply->output(0)}); + auto matmul_or = + std::make_shared(ov::OutputVector{qmm->output(0), matmul_multiply->output(0)}); - auto qres = opp::wrap_type({matmul_or}); + qres = opp::wrap_type({matmul_or}); + } else { + qres = opp::wrap_type({qmm}); + } // Note: Use [=] to make sure the above objects stay alive in the callback auto callback = [=](ov::pass::pattern::Matcher& m) { @@ -2008,8 +2018,7 @@ PreserveConstDictMatMulFP8::PreserveConstDictMatMulFP8(PreserveConstDictMatMulFP if ((ov::element::f8e4m3 == matched_qweight->get_element_type() || ov::element::f8e5m2 == matched_qweight->get_element_type() || - ov::element::f8e8m0 == matched_qweight->get_element_type() || - ov::element::i8 == matched_qweight->get_element_type()) && + ov::element::f8e8m0 == matched_qweight->get_element_type()) && qcoeff_shape[1] == 1 && !matched_matmul->get_transpose_a() && matched_matmul->get_transpose_b()) { to_keep.get().push_back(matched_qweight); to_keep.get().push_back(matched_qcoeff); diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp index 03d69748866823..c9463f249cb8cb 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp @@ -23,6 +23,7 @@ struct Context { std::string pmm_dims; bool is_spatial = false; bool mm_dq_full = true; + bool mm_gate = false; using PPtr = std::shared_ptr; using NPtr = std::shared_ptr; @@ -229,7 +230,7 @@ class PreserveConstDictMatMulAsymm : public ov::pass::MatcherPass { using CPtr = std::shared_ptr; using Results = std::reference_wrapper>; - PreserveConstDictMatMulAsymm(Results to_keep); + PreserveConstDictMatMulAsymm(Context::Ref ctx, Results to_keep); }; class PreserveConstDictMatMulFP8 : public ov::pass::MatcherPass { @@ -239,7 +240,7 @@ class PreserveConstDictMatMulFP8 : public ov::pass::MatcherPass { using CPtr = std::shared_ptr; using Results = std::reference_wrapper>; - PreserveConstDictMatMulFP8(Results to_keep); + PreserveConstDictMatMulFP8(Context::Ref ctx, Results to_keep); }; // Slice last Matmul diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/sdpa.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/sdpa.cpp index f1fc3e31449c51..eff81a3bc4ddeb 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/sdpa.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/sdpa.cpp @@ -109,7 +109,7 @@ SDPADecomposed::SDPADecomposed(const std::shared_ptr auto convert1 = opp::wrap_type({opp::any_input()}); auto concat1 = opp::wrap_type({convert1, opp::any_input()}); - // GQA optional nodes + //GQA optional nodes auto unsqueeze1 = opp::optional({concat1, opp::any_input()}); auto broadcast1 = opp::optional({unsqueeze1, opp::any_input()}); auto reshape1 = opp::optional({broadcast1, opp::any_input()}); @@ -117,7 +117,7 @@ SDPADecomposed::SDPADecomposed(const std::shared_ptr auto convert2 = opp::wrap_type({opp::any_input()}); auto concat2 = opp::wrap_type({convert2, opp::any_input()}); - // GQA optional nodes + //GQA optional nodes auto unsqueeze2 = opp::optional({concat2, opp::any_input()}); auto broadcast2 = opp::optional({unsqueeze2, opp::any_input()}); auto reshape2 = opp::optional({broadcast2, opp::any_input()}); diff --git a/src/plugins/intel_npu/src/plugin/src/plugin.cpp b/src/plugins/intel_npu/src/plugin/src/plugin.cpp index 85b73eaec383d7..36c483efbf390d 100644 --- a/src/plugins/intel_npu/src/plugin/src/plugin.cpp +++ b/src/plugins/intel_npu/src/plugin/src/plugin.cpp @@ -304,6 +304,7 @@ void init_config(const IEngineBackend* backend, OptionsDesc& options, FilteredCo REGISTER_OPTION(NPUW_DQ); REGISTER_OPTION(NPUW_DQ_FULL); REGISTER_OPTION(NPUW_PMM); + REGISTER_OPTION(NPUW_MM_GATED); REGISTER_OPTION(NPUW_SLICE_OUT); REGISTER_OPTION(NPUW_SPATIAL); REGISTER_OPTION(NPUW_SPATIAL_NWAY); diff --git a/src/plugins/intel_npu/src/plugin/src/properties.cpp b/src/plugins/intel_npu/src/plugin/src/properties.cpp index c46b3b80c26a02..dbbd48e254ec86 100644 --- a/src/plugins/intel_npu/src/plugin/src/properties.cpp +++ b/src/plugins/intel_npu/src/plugin/src/properties.cpp @@ -591,6 +591,7 @@ void Properties::registerPluginProperties() { TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::partitioning::dyn_quant, NPUW_DQ); TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::partitioning::dyn_quant_full, NPUW_DQ_FULL); TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::partitioning::par_matmul_merge_dims, NPUW_PMM); + TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::partitioning::matmul_gate_preserve_constants, NPUW_MM_GATED); TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::partitioning::slice_out, NPUW_SLICE_OUT); TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::partitioning::spatial, NPUW_SPATIAL); TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::partitioning::spatial_nway, NPUW_SPATIAL_NWAY); diff --git a/src/plugins/intel_npu/tests/functional/internal/plugin/test_properties.hpp b/src/plugins/intel_npu/tests/functional/internal/plugin/test_properties.hpp index f5f67d180c776c..4b96ec16fb6b62 100644 --- a/src/plugins/intel_npu/tests/functional/internal/plugin/test_properties.hpp +++ b/src/plugins/intel_npu/tests/functional/internal/plugin/test_properties.hpp @@ -199,6 +199,7 @@ class PropertiesManagerTests : public ov::test::behavior::OVPluginTestBase, REGISTER_OPTION(NPUW_DQ); REGISTER_OPTION(NPUW_DQ_FULL); REGISTER_OPTION(NPUW_PMM); + REGISTER_OPTION(NPUW_MM_GATED); REGISTER_OPTION(NPUW_SLICE_OUT); REGISTER_OPTION(NPUW_SPATIAL); REGISTER_OPTION(NPUW_SPATIAL_NWAY); From 5274f046a80d9e5752a972459b0dafacc13af6c1 Mon Sep 17 00:00:00 2001 From: Eugene Smirnov Date: Mon, 9 Mar 2026 22:45:58 +0100 Subject: [PATCH 05/14] extra spaces removed --- src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp index d0f2e7d18dbc50..ae62c28cd0bb14 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp @@ -1259,7 +1259,6 @@ ov::AnyMap get_baseline_common_config(const std::optional& npudesc) { if (npudesc.has_value()) { config.emplace("NPUW_MM_GATED", (npudesc->compiler_matmul_gate ? "YES" : "NO")); } - return config; } @@ -1272,7 +1271,6 @@ ov::AnyMap get_default_common_config(const std::optional& npudesc) { } else { config.emplace("NPUW_FUNCALL_FOR_ALL", "YES"); } - return config; } @@ -1289,7 +1287,6 @@ ov::AnyMap get_default_prefill_config(const std::shared_ptr& model, c config.emplace("NPUW_PMM", "NO"); } } - return config; } From 268048a84bb77f62b26b52b431af0c76fbec5a90 Mon Sep 17 00:00:00 2001 From: esmirno1 Date: Mon, 9 Mar 2026 22:03:28 +0000 Subject: [PATCH 06/14] clang-format fixed --- src/plugins/intel_cpu/thirdparty/shl | 1 + src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp | 2 +- .../intel_npu/src/plugin/npuw/llm_compiled_model.cpp | 9 +++++---- .../src/plugin/npuw/partitioning/partitioning.cpp | 2 +- .../src/plugin/npuw/partitioning/partitioning.hpp | 9 +++++---- .../src/plugin/npuw/partitioning/patterns/opt.cpp | 4 ++-- .../src/plugin/npuw/partitioning/patterns/sdpa.cpp | 4 ++-- 7 files changed, 17 insertions(+), 14 deletions(-) create mode 160000 src/plugins/intel_cpu/thirdparty/shl diff --git a/src/plugins/intel_cpu/thirdparty/shl b/src/plugins/intel_cpu/thirdparty/shl new file mode 160000 index 00000000000000..27992eaf41ef96 --- /dev/null +++ b/src/plugins/intel_cpu/thirdparty/shl @@ -0,0 +1 @@ +Subproject commit 27992eaf41ef967ed228ea8d801b1aa489ea8997 diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp index 8556124c02a317..72e68655d1658b 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp @@ -635,7 +635,7 @@ bool ov::npuw::CompiledModel::should_use_quantized_host_gather(const std::shared std::vector to_keep; ov::pass::GraphRewrite rewr2; - ctx.mm_gate = m_cfg.get<::intel_npu::NPUW_MM_GATED>(); + ctx.mm_gate = m_cfg.get<::intel_npu::NPUW_MM_GATED>(); rewr2.add_matcher(std::ref(ctx), std::ref(to_keep)); rewr2.add_matcher(std::ref(ctx), std::ref(to_keep)); diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp index ae62c28cd0bb14..9c0e3adf9fc826 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp @@ -1199,10 +1199,11 @@ std::optional extract_npu_descriptor(const std::shared_ptr(); } - LOG_INFO("Compiler version: " << ONEAPI_VERSION_MAJOR(desc.compiler_ver) << "." << ONEAPI_VERSION_MINOR(desc.compiler_ver)); + LOG_INFO("Compiler version: " << ONEAPI_VERSION_MAJOR(desc.compiler_ver) << "." + << ONEAPI_VERSION_MINOR(desc.compiler_ver)); - constexpr std::string_view compiler_gate_support_msg - = "Compiler: accurate gated matmul (MatMul -> Divide -> Tanh -> Multiply -> Result) : "; + constexpr std::string_view compiler_gate_support_msg = + "Compiler: accurate gated matmul (MatMul -> Divide -> Tanh -> Multiply -> Result) : "; if (desc.compiler_ver >= ONEAPI_MAKE_VERSION(7, 28)) { // accuracy for gated matmul fixed at 7.28 @@ -1255,7 +1256,7 @@ ov::AnyMap get_baseline_common_config(const std::optional& npudesc) { config.erase("NPUW_DCOFF_SCALE"); } - //default version is ON - while for older compiler it might be turned off + // default version is ON - while for older compiler it might be turned off if (npudesc.has_value()) { config.emplace("NPUW_MM_GATED", (npudesc->compiler_matmul_gate ? "YES" : "NO")); } diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp index afd6c69de967c4..d558932e187155 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp @@ -1495,7 +1495,7 @@ void Partitioner::saveTailDictConstants(const std::string& func_name) { std::vector to_keep; ov::npuw::patterns::opt::Context ctx; - ctx.mm_gate = cfg.get<::intel_npu::NPUW_MM_GATED>(); + ctx.mm_gate = cfg.get<::intel_npu::NPUW_MM_GATED>(); ov::pass::GraphRewrite rewr; rewr.add_matcher(std::ref(ctx), std::ref(to_keep)); diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp index 51389725663892..c8dff61741a6dd 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp @@ -76,7 +76,7 @@ struct Subgraph { using Ref = std::reference_wrapper; - void settag(const std::string & t) { + void settag(const std::string& t) { LOG_DEBUG("Subgraph set-tag=" << t); _tag = t; } @@ -111,15 +111,16 @@ struct Function { // FIXME: shouldn't be here. Needed to not unpack some lazy closures in DCOFF std::set _idx_lazy_unpack; - void settag(const std::string & t) { + void settag(const std::string& t) { LOG_DEBUG("Function set-tag=" << t); _tag = t; } std::string gettag() const { return _tag; } + private: - std::string _tag; // derived from the partitioning + std::string _tag; // derived from the partitioning }; struct Group { @@ -139,7 +140,7 @@ struct Group { ov::npuw::Subgraph sg; - void settag(const std::string & t) { + void settag(const std::string& t) { LOG_DEBUG("group set-tag=" << t); _tag = t; } diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp index 29bb144a5314ae..ef510b32e9c4e3 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp @@ -1919,7 +1919,8 @@ CompressDictMatMulf32::CompressDictMatMulf32(Context::Ref ctx) { // Const(S) ---------------------> Multiply -> to(f32) -> MatMul -> Result // ???(Act) --------------------------------------------> -PreserveConstDictMatMulAsymm::PreserveConstDictMatMulAsymm(Context::Ref ctx, PreserveConstDictMatMulAsymm::Results to_keep) { +PreserveConstDictMatMulAsymm::PreserveConstDictMatMulAsymm(Context::Ref ctx, + PreserveConstDictMatMulAsymm::Results to_keep) { auto qweight = opp::wrap_type(); auto qcoeff = opp::wrap_type(); auto qzerop = opp::wrap_type(); @@ -1932,7 +1933,6 @@ PreserveConstDictMatMulAsymm::PreserveConstDictMatMulAsymm(Context::Ref ctx, Pre auto qmm = opp::wrap_type({qmmi, qcvtm}); std::shared_ptr qres; - // MatMul -> Divide -> Tanh -> Multiply -> Result if (ctx.get().mm_gate) { auto div = opp::wrap_type({qmm, opp::any_input()}); diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/sdpa.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/sdpa.cpp index eff81a3bc4ddeb..f1fc3e31449c51 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/sdpa.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/sdpa.cpp @@ -109,7 +109,7 @@ SDPADecomposed::SDPADecomposed(const std::shared_ptr auto convert1 = opp::wrap_type({opp::any_input()}); auto concat1 = opp::wrap_type({convert1, opp::any_input()}); - //GQA optional nodes + // GQA optional nodes auto unsqueeze1 = opp::optional({concat1, opp::any_input()}); auto broadcast1 = opp::optional({unsqueeze1, opp::any_input()}); auto reshape1 = opp::optional({broadcast1, opp::any_input()}); @@ -117,7 +117,7 @@ SDPADecomposed::SDPADecomposed(const std::shared_ptr auto convert2 = opp::wrap_type({opp::any_input()}); auto concat2 = opp::wrap_type({convert2, opp::any_input()}); - //GQA optional nodes + // GQA optional nodes auto unsqueeze2 = opp::optional({concat2, opp::any_input()}); auto broadcast2 = opp::optional({unsqueeze2, opp::any_input()}); auto reshape2 = opp::optional({broadcast2, opp::any_input()}); From 0a811c9fd2006c6bc29ed179ab1d71592f4fb51c Mon Sep 17 00:00:00 2001 From: esmirno1 Date: Mon, 9 Mar 2026 22:12:28 +0000 Subject: [PATCH 07/14] Revert "clang-format fixed" This reverts commit 268048a84bb77f62b26b52b431af0c76fbec5a90. --- src/plugins/intel_cpu/thirdparty/shl | 1 - src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp | 2 +- .../intel_npu/src/plugin/npuw/llm_compiled_model.cpp | 9 ++++----- .../src/plugin/npuw/partitioning/partitioning.cpp | 2 +- .../src/plugin/npuw/partitioning/partitioning.hpp | 9 ++++----- .../src/plugin/npuw/partitioning/patterns/opt.cpp | 4 ++-- .../src/plugin/npuw/partitioning/patterns/sdpa.cpp | 4 ++-- 7 files changed, 14 insertions(+), 17 deletions(-) delete mode 160000 src/plugins/intel_cpu/thirdparty/shl diff --git a/src/plugins/intel_cpu/thirdparty/shl b/src/plugins/intel_cpu/thirdparty/shl deleted file mode 160000 index 27992eaf41ef96..00000000000000 --- a/src/plugins/intel_cpu/thirdparty/shl +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 27992eaf41ef967ed228ea8d801b1aa489ea8997 diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp index 72e68655d1658b..8556124c02a317 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp @@ -635,7 +635,7 @@ bool ov::npuw::CompiledModel::should_use_quantized_host_gather(const std::shared std::vector to_keep; ov::pass::GraphRewrite rewr2; - ctx.mm_gate = m_cfg.get<::intel_npu::NPUW_MM_GATED>(); + ctx.mm_gate = m_cfg.get<::intel_npu::NPUW_MM_GATED>(); rewr2.add_matcher(std::ref(ctx), std::ref(to_keep)); rewr2.add_matcher(std::ref(ctx), std::ref(to_keep)); diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp index 9c0e3adf9fc826..ae62c28cd0bb14 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp @@ -1199,11 +1199,10 @@ std::optional extract_npu_descriptor(const std::shared_ptr(); } - LOG_INFO("Compiler version: " << ONEAPI_VERSION_MAJOR(desc.compiler_ver) << "." - << ONEAPI_VERSION_MINOR(desc.compiler_ver)); + LOG_INFO("Compiler version: " << ONEAPI_VERSION_MAJOR(desc.compiler_ver) << "." << ONEAPI_VERSION_MINOR(desc.compiler_ver)); - constexpr std::string_view compiler_gate_support_msg = - "Compiler: accurate gated matmul (MatMul -> Divide -> Tanh -> Multiply -> Result) : "; + constexpr std::string_view compiler_gate_support_msg + = "Compiler: accurate gated matmul (MatMul -> Divide -> Tanh -> Multiply -> Result) : "; if (desc.compiler_ver >= ONEAPI_MAKE_VERSION(7, 28)) { // accuracy for gated matmul fixed at 7.28 @@ -1256,7 +1255,7 @@ ov::AnyMap get_baseline_common_config(const std::optional& npudesc) { config.erase("NPUW_DCOFF_SCALE"); } - // default version is ON - while for older compiler it might be turned off + //default version is ON - while for older compiler it might be turned off if (npudesc.has_value()) { config.emplace("NPUW_MM_GATED", (npudesc->compiler_matmul_gate ? "YES" : "NO")); } diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp index d558932e187155..afd6c69de967c4 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp @@ -1495,7 +1495,7 @@ void Partitioner::saveTailDictConstants(const std::string& func_name) { std::vector to_keep; ov::npuw::patterns::opt::Context ctx; - ctx.mm_gate = cfg.get<::intel_npu::NPUW_MM_GATED>(); + ctx.mm_gate = cfg.get<::intel_npu::NPUW_MM_GATED>(); ov::pass::GraphRewrite rewr; rewr.add_matcher(std::ref(ctx), std::ref(to_keep)); diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp index c8dff61741a6dd..51389725663892 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp @@ -76,7 +76,7 @@ struct Subgraph { using Ref = std::reference_wrapper; - void settag(const std::string& t) { + void settag(const std::string & t) { LOG_DEBUG("Subgraph set-tag=" << t); _tag = t; } @@ -111,16 +111,15 @@ struct Function { // FIXME: shouldn't be here. Needed to not unpack some lazy closures in DCOFF std::set _idx_lazy_unpack; - void settag(const std::string& t) { + void settag(const std::string & t) { LOG_DEBUG("Function set-tag=" << t); _tag = t; } std::string gettag() const { return _tag; } - private: - std::string _tag; // derived from the partitioning + std::string _tag; // derived from the partitioning }; struct Group { @@ -140,7 +139,7 @@ struct Group { ov::npuw::Subgraph sg; - void settag(const std::string& t) { + void settag(const std::string & t) { LOG_DEBUG("group set-tag=" << t); _tag = t; } diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp index ef510b32e9c4e3..29bb144a5314ae 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp @@ -1919,8 +1919,7 @@ CompressDictMatMulf32::CompressDictMatMulf32(Context::Ref ctx) { // Const(S) ---------------------> Multiply -> to(f32) -> MatMul -> Result // ???(Act) --------------------------------------------> -PreserveConstDictMatMulAsymm::PreserveConstDictMatMulAsymm(Context::Ref ctx, - PreserveConstDictMatMulAsymm::Results to_keep) { +PreserveConstDictMatMulAsymm::PreserveConstDictMatMulAsymm(Context::Ref ctx, PreserveConstDictMatMulAsymm::Results to_keep) { auto qweight = opp::wrap_type(); auto qcoeff = opp::wrap_type(); auto qzerop = opp::wrap_type(); @@ -1933,6 +1932,7 @@ PreserveConstDictMatMulAsymm::PreserveConstDictMatMulAsymm(Context::Ref ctx, auto qmm = opp::wrap_type({qmmi, qcvtm}); std::shared_ptr qres; + // MatMul -> Divide -> Tanh -> Multiply -> Result if (ctx.get().mm_gate) { auto div = opp::wrap_type({qmm, opp::any_input()}); diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/sdpa.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/sdpa.cpp index f1fc3e31449c51..eff81a3bc4ddeb 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/sdpa.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/sdpa.cpp @@ -109,7 +109,7 @@ SDPADecomposed::SDPADecomposed(const std::shared_ptr auto convert1 = opp::wrap_type({opp::any_input()}); auto concat1 = opp::wrap_type({convert1, opp::any_input()}); - // GQA optional nodes + //GQA optional nodes auto unsqueeze1 = opp::optional({concat1, opp::any_input()}); auto broadcast1 = opp::optional({unsqueeze1, opp::any_input()}); auto reshape1 = opp::optional({broadcast1, opp::any_input()}); @@ -117,7 +117,7 @@ SDPADecomposed::SDPADecomposed(const std::shared_ptr auto convert2 = opp::wrap_type({opp::any_input()}); auto concat2 = opp::wrap_type({convert2, opp::any_input()}); - // GQA optional nodes + //GQA optional nodes auto unsqueeze2 = opp::optional({concat2, opp::any_input()}); auto broadcast2 = opp::optional({unsqueeze2, opp::any_input()}); auto reshape2 = opp::optional({broadcast2, opp::any_input()}); From b872130e2a0da1864f31fe11b5b5c616347c64cc Mon Sep 17 00:00:00 2001 From: esmirno1 Date: Mon, 9 Mar 2026 22:15:16 +0000 Subject: [PATCH 08/14] clang-format fixed --- src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp | 2 +- .../intel_npu/src/plugin/npuw/llm_compiled_model.cpp | 9 +++++---- .../src/plugin/npuw/partitioning/partitioning.cpp | 2 +- .../src/plugin/npuw/partitioning/partitioning.hpp | 9 +++++---- .../src/plugin/npuw/partitioning/patterns/opt.cpp | 4 ++-- .../src/plugin/npuw/partitioning/patterns/sdpa.cpp | 4 ++-- 6 files changed, 16 insertions(+), 14 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp index 8556124c02a317..72e68655d1658b 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp @@ -635,7 +635,7 @@ bool ov::npuw::CompiledModel::should_use_quantized_host_gather(const std::shared std::vector to_keep; ov::pass::GraphRewrite rewr2; - ctx.mm_gate = m_cfg.get<::intel_npu::NPUW_MM_GATED>(); + ctx.mm_gate = m_cfg.get<::intel_npu::NPUW_MM_GATED>(); rewr2.add_matcher(std::ref(ctx), std::ref(to_keep)); rewr2.add_matcher(std::ref(ctx), std::ref(to_keep)); diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp index ae62c28cd0bb14..9c0e3adf9fc826 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp @@ -1199,10 +1199,11 @@ std::optional extract_npu_descriptor(const std::shared_ptr(); } - LOG_INFO("Compiler version: " << ONEAPI_VERSION_MAJOR(desc.compiler_ver) << "." << ONEAPI_VERSION_MINOR(desc.compiler_ver)); + LOG_INFO("Compiler version: " << ONEAPI_VERSION_MAJOR(desc.compiler_ver) << "." + << ONEAPI_VERSION_MINOR(desc.compiler_ver)); - constexpr std::string_view compiler_gate_support_msg - = "Compiler: accurate gated matmul (MatMul -> Divide -> Tanh -> Multiply -> Result) : "; + constexpr std::string_view compiler_gate_support_msg = + "Compiler: accurate gated matmul (MatMul -> Divide -> Tanh -> Multiply -> Result) : "; if (desc.compiler_ver >= ONEAPI_MAKE_VERSION(7, 28)) { // accuracy for gated matmul fixed at 7.28 @@ -1255,7 +1256,7 @@ ov::AnyMap get_baseline_common_config(const std::optional& npudesc) { config.erase("NPUW_DCOFF_SCALE"); } - //default version is ON - while for older compiler it might be turned off + // default version is ON - while for older compiler it might be turned off if (npudesc.has_value()) { config.emplace("NPUW_MM_GATED", (npudesc->compiler_matmul_gate ? "YES" : "NO")); } diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp index afd6c69de967c4..d558932e187155 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp @@ -1495,7 +1495,7 @@ void Partitioner::saveTailDictConstants(const std::string& func_name) { std::vector to_keep; ov::npuw::patterns::opt::Context ctx; - ctx.mm_gate = cfg.get<::intel_npu::NPUW_MM_GATED>(); + ctx.mm_gate = cfg.get<::intel_npu::NPUW_MM_GATED>(); ov::pass::GraphRewrite rewr; rewr.add_matcher(std::ref(ctx), std::ref(to_keep)); diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp index 51389725663892..c8dff61741a6dd 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp @@ -76,7 +76,7 @@ struct Subgraph { using Ref = std::reference_wrapper; - void settag(const std::string & t) { + void settag(const std::string& t) { LOG_DEBUG("Subgraph set-tag=" << t); _tag = t; } @@ -111,15 +111,16 @@ struct Function { // FIXME: shouldn't be here. Needed to not unpack some lazy closures in DCOFF std::set _idx_lazy_unpack; - void settag(const std::string & t) { + void settag(const std::string& t) { LOG_DEBUG("Function set-tag=" << t); _tag = t; } std::string gettag() const { return _tag; } + private: - std::string _tag; // derived from the partitioning + std::string _tag; // derived from the partitioning }; struct Group { @@ -139,7 +140,7 @@ struct Group { ov::npuw::Subgraph sg; - void settag(const std::string & t) { + void settag(const std::string& t) { LOG_DEBUG("group set-tag=" << t); _tag = t; } diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp index 29bb144a5314ae..ef510b32e9c4e3 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp @@ -1919,7 +1919,8 @@ CompressDictMatMulf32::CompressDictMatMulf32(Context::Ref ctx) { // Const(S) ---------------------> Multiply -> to(f32) -> MatMul -> Result // ???(Act) --------------------------------------------> -PreserveConstDictMatMulAsymm::PreserveConstDictMatMulAsymm(Context::Ref ctx, PreserveConstDictMatMulAsymm::Results to_keep) { +PreserveConstDictMatMulAsymm::PreserveConstDictMatMulAsymm(Context::Ref ctx, + PreserveConstDictMatMulAsymm::Results to_keep) { auto qweight = opp::wrap_type(); auto qcoeff = opp::wrap_type(); auto qzerop = opp::wrap_type(); @@ -1932,7 +1933,6 @@ PreserveConstDictMatMulAsymm::PreserveConstDictMatMulAsymm(Context::Ref ctx, Pre auto qmm = opp::wrap_type({qmmi, qcvtm}); std::shared_ptr qres; - // MatMul -> Divide -> Tanh -> Multiply -> Result if (ctx.get().mm_gate) { auto div = opp::wrap_type({qmm, opp::any_input()}); diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/sdpa.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/sdpa.cpp index eff81a3bc4ddeb..f1fc3e31449c51 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/sdpa.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/sdpa.cpp @@ -109,7 +109,7 @@ SDPADecomposed::SDPADecomposed(const std::shared_ptr auto convert1 = opp::wrap_type({opp::any_input()}); auto concat1 = opp::wrap_type({convert1, opp::any_input()}); - //GQA optional nodes + // GQA optional nodes auto unsqueeze1 = opp::optional({concat1, opp::any_input()}); auto broadcast1 = opp::optional({unsqueeze1, opp::any_input()}); auto reshape1 = opp::optional({broadcast1, opp::any_input()}); @@ -117,7 +117,7 @@ SDPADecomposed::SDPADecomposed(const std::shared_ptr auto convert2 = opp::wrap_type({opp::any_input()}); auto concat2 = opp::wrap_type({convert2, opp::any_input()}); - //GQA optional nodes + // GQA optional nodes auto unsqueeze2 = opp::optional({concat2, opp::any_input()}); auto broadcast2 = opp::optional({unsqueeze2, opp::any_input()}); auto reshape2 = opp::optional({broadcast2, opp::any_input()}); From fb12f9ac50e3c5875d498a80c8ccc81c841a29a3 Mon Sep 17 00:00:00 2001 From: Eugene Smirnov Date: Wed, 11 Mar 2026 23:53:16 +0100 Subject: [PATCH 09/14] code-review-fixes --- .../src/plugin/include/properties.hpp | 1 + .../src/plugin/npuw/llm_compiled_model.cpp | 80 ++++++++++++------- .../plugin/npuw/partitioning/patterns/opt.cpp | 4 +- 3 files changed, 52 insertions(+), 33 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/include/properties.hpp b/src/plugins/intel_npu/src/plugin/include/properties.hpp index c95e07fc17fa6b..fbab3a9451b1fa 100644 --- a/src/plugins/intel_npu/src/plugin/include/properties.hpp +++ b/src/plugins/intel_npu/src/plugin/include/properties.hpp @@ -156,6 +156,7 @@ class Properties final { ov::intel_npu::npuw::partitioning::dyn_quant.name(), ov::intel_npu::npuw::partitioning::dyn_quant_full.name(), ov::intel_npu::npuw::partitioning::par_matmul_merge_dims.name(), + ov::intel_npu::npuw::partitioning::matmul_gate_preserve_constants.name(), ov::intel_npu::npuw::partitioning::slice_out.name(), ov::intel_npu::npuw::partitioning::spatial.name(), ov::intel_npu::npuw::partitioning::spatial_nway.name(), diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp index e869a9c486d21e..b2ed9557bf5960 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp @@ -43,6 +43,23 @@ namespace opp = ov::pass::pattern; +// specific function that match subgraph appeared as result of lpt transformations +auto match_down_up_convert_subgraph_after_lpt = [](const ov::Output& input) { + auto upconvert = opp::wrap_type({input}, opp::type_matches(ov::element::f32)); + + auto upscale = opp::wrap_type(opp::rank_equals(0)); + auto upmul = opp::wrap_type({upconvert, upscale}); + + auto downscale = opp::wrap_type(opp::rank_equals(0)); + auto downmul = opp::wrap_type({upmul, downscale}); + + auto downconvert = + opp::wrap_type({downmul}, + opp::type_matches_any({ov::element::f8e4m3, ov::element::f8e5m2})); + + return downconvert; +}; + class RemoveEmptyKVTensors : public ov::pass::MatcherPass { public: OPENVINO_MATCHER_PASS_RTTI("npuw::LLMCompiledModel::RemoveEmptyKVTensors"); @@ -54,7 +71,10 @@ class RemoveEmptyKVTensors : public ov::pass::MatcherPass { RemoveEmptyKVTensors(Context::Ref ctx) { auto param = opp::wrap_type(); - auto concat = opp::wrap_type({param, opp::any_input()}); + auto param_or = + std::make_shared(ov::OutputVector{param, match_down_up_convert_subgraph_after_lpt(param)}); + + auto concat = opp::wrap_type({param_or, opp::any_input()}); auto callback = [=](opp::Matcher& m) { auto& node_to_output = m.get_pattern_value_map(); @@ -63,15 +83,28 @@ class RemoveEmptyKVTensors : public ov::pass::MatcherPass { ctx.get().old_params.push_back(matched_param); - auto users = matched_param->get_users(); - if (users.size() == 2u) { - auto shapeof_node = ov::is_type(users[0]) ? users[0] : users[1]; - NPUW_ASSERT(ov::is_type(shapeof_node)); - auto cst_node = - ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, matched_param->get_shape()); - ov::replace_node(shapeof_node, cst_node); - } else { - NPUW_ASSERT(users.size() == 1u); + // Use concat's first input source node to find ShapeOf users. + // This works universally for both plain parameter and down_up_convert subgraph cases, + // because in the subgraph case matched_param->get_users() would return the Convert node + // (first node of the subgraph), not the ShapeOf. + auto concat_input0_node = matched_node_concat->input(0).get_source_output().get_node_shared_ptr(); + auto users = concat_input0_node->get_users(); + + // In subgraph case the parameter itself may also have a ShapeOf user, + // so check both the concat input node and the parameter. + if (concat_input0_node != matched_param) { + auto param_users = matched_param->get_users(); + users.insert(users.end(), param_users.begin(), param_users.end()); + } + + // Remove duplicates (concat itself will appear in users) + // Find and replace ShapeOf nodes with constants + for (auto& user : users) { + if (ov::is_type(user)) { + auto cst_node = + ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, matched_param->get_shape()); + ov::replace_node(user, cst_node); + } } // Redirect second concat input to every node which reads from concat @@ -323,22 +356,6 @@ class GroupQueryAttentionDecomposition : public ov::pass::MatcherPass { class RedirectNewKvToOutput : public ov::pass::MatcherPass { public: RedirectNewKvToOutput() { - auto match_down_up_convert_subgraph = [](const ov::Output& input) { - auto upconvert = opp::wrap_type({input}, opp::type_matches(ov::element::f32)); - - auto upscale = opp::wrap_type(opp::rank_equals(0)); - auto upmul = opp::wrap_type({upconvert, upscale}); - - auto downscale = opp::wrap_type(opp::rank_equals(0)); - auto downmul = opp::wrap_type({upmul, downscale}); - - auto downconvert = - opp::wrap_type({downmul}, - opp::type_matches_any({ov::element::f8e4m3, ov::element::f8e5m2})); - - return downconvert; - }; - // example of fp8 inputs to concat // input0 : float8e4m3[1,32,1151,96] // input1 : float8e4m3[1,32,1,96] @@ -348,13 +365,13 @@ class RedirectNewKvToOutput : public ov::pass::MatcherPass { // TODO: this matcher logic better to cover with unit-tests auto input0 = opp::wrap_type(); auto input0_or = - std::make_shared(ov::OutputVector{input0, match_down_up_convert_subgraph(input0)}); + std::make_shared(ov::OutputVector{input0, match_down_up_convert_subgraph_after_lpt(input0)}); auto input1 = opp::any_input(); auto kv_concat = opp::wrap_type({input0_or, input1}); auto result1 = opp::wrap_type(kv_concat); - auto result2 = opp::wrap_type(match_down_up_convert_subgraph(kv_concat)); + auto result2 = opp::wrap_type(match_down_up_convert_subgraph_after_lpt(kv_concat)); auto result_or = std::make_shared(ov::OutputVector{result1, result2}); @@ -1262,7 +1279,9 @@ ov::AnyMap get_baseline_common_config(const std::optional& npudesc) { config.erase("NPUW_DCOFF_SCALE"); } - // default version is ON - while for older compiler it might be turned off + // default value is ON + // for compiler versions >= 7.28 value is ON + // for other compiler versions value is OFF if (npudesc.has_value()) { config.emplace("NPUW_MM_GATED", (npudesc->compiler_matmul_gate ? "YES" : "NO")); } @@ -1896,8 +1915,7 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr& m if (!m_is_embedding) { if (!m_use_chunk_prefill) { - // TODO: sometimes it is ok if we cannot find any empty inputs or not? - remove_empty_kv_inputs(prefill_model); + NPUW_ASSERT(remove_empty_kv_inputs(prefill_model)); } else { LOG_DEBUG("Don't remove input key/values from prefill model."); LOG_DEBUG("Ask prefill model to output key/values for prefill chunk size tokens."); diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp index ef510b32e9c4e3..a0a941f55e2209 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp @@ -1984,9 +1984,9 @@ PreserveConstDictMatMulFP8::PreserveConstDictMatMulFP8(Context::Ref ctx, Preserv auto qcoeff = opp::wrap_type(); auto qcvtw = opp::wrap_type({qweight}); auto qmuls = opp::wrap_type({qcvtw, qcoeff}); - auto optional_kvt = opp::optional({qmuls}); + auto optional_cvt = opp::optional({qmuls}); auto qmmi = opp::any_input(); - auto qmm = opp::wrap_type({qmmi, optional_kvt}); + auto qmm = opp::wrap_type({qmmi, optional_cvt}); std::shared_ptr qres; // // MatMul -> Divide -> Tanh -> Multiply -> Result if (ctx.get().mm_gate) { From a0186bd99dca06cda0a54a4b375eae2dcd135523 Mon Sep 17 00:00:00 2001 From: esmirno1 Date: Wed, 11 Mar 2026 23:23:59 +0000 Subject: [PATCH 10/14] clang-format-fixes --- src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp index 3d4a6f97d068b4..e8ceead8870324 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp @@ -72,7 +72,7 @@ class RemoveEmptyKVTensors : public ov::pass::MatcherPass { RemoveEmptyKVTensors(Context::Ref ctx) { auto param = opp::wrap_type(); auto param_or = - std::make_shared(ov::OutputVector{param, match_down_up_convert_subgraph_after_lpt(param)}); + std::make_shared(ov::OutputVector{param, match_down_up_convert_subgraph_after_lpt(param)}); auto concat = opp::wrap_type({param_or, opp::any_input()}); From d673f36f07ea6f6bc6d63478652d6daac85b21e0 Mon Sep 17 00:00:00 2001 From: Eugene Smirnov Date: Thu, 12 Mar 2026 00:51:23 +0100 Subject: [PATCH 11/14] clang-format-fixes --- .../src/plugin/npuw/llm_compiled_model.cpp | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp index e8ceead8870324..966c957f756632 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp @@ -72,22 +72,25 @@ class RemoveEmptyKVTensors : public ov::pass::MatcherPass { RemoveEmptyKVTensors(Context::Ref ctx) { auto param = opp::wrap_type(); auto param_or = - std::make_shared(ov::OutputVector{param, match_down_up_convert_subgraph_after_lpt(param)}); + std::make_shared(ov::OutputVector{param, + match_down_up_convert_subgraph_after_lpt(param)}); auto concat = opp::wrap_type({param_or, opp::any_input()}); auto callback = [=](opp::Matcher& m) { auto& node_to_output = m.get_pattern_value_map(); - auto matched_param = ov::as_type_ptr(node_to_output.at(param).get_node_shared_ptr()); + auto matched_param = + ov::as_type_ptr(node_to_output.at(param).get_node_shared_ptr()); auto matched_node_concat = node_to_output.at(concat).get_node_shared_ptr(); ctx.get().old_params.push_back(matched_param); // Use concat's first input source node to find ShapeOf users. // This works universally for both plain parameter and down_up_convert subgraph cases, - // because in the subgraph case matched_param->get_users() would return the Convert node - // (first node of the subgraph), not the ShapeOf. - auto concat_input0_node = matched_node_concat->input(0).get_source_output().get_node_shared_ptr(); + // because in the subgraph case matched_param->get_users() would return the Convert + // node (first node of the subgraph), not the ShapeOf. + auto concat_input0_node = + matched_node_concat->input(0).get_source_output().get_node_shared_ptr(); auto users = concat_input0_node->get_users(); // In subgraph case the parameter itself may also have a ShapeOf user, @@ -542,7 +545,7 @@ class OldPhi3SlidingMaskMatcher : public ov::pass::MatcherPass { "Sliding window size constant must be of size 1, but got " + std::to_string(matched_neg_window_size->get_output_size())); - // 1.(K range <= (Q_pos range - sliding window).T) | (K range > Q range.T) + // 1.(K range > (Q_pos range - sliding window).T) & (K range <= Q range.T) auto query_range_as_pos_ids = std::make_shared(matched_pos_ids_input, ov::element::f32); std::vector vector_shape{-1, 1}; @@ -556,7 +559,7 @@ class OldPhi3SlidingMaskMatcher : public ov::pass::MatcherPass { std::make_shared(matched_key_range_f32, query_range_as_pos_left_bound); matched_bitwise_or->input(1).replace_source_output(forget_left_mask_for_right_padding); - // 2. (K range <= (Q range - sliding window).T) & (K range >= shape(past_key_values, 2)) + // 2. (K range > (Q range - sliding window).T) | (K range < shape(past_key_values, 2)) auto past_kv_len_f32 = std::make_shared(matched_past_kv_len, ov::element::f32); auto only_present_tokens_mask = std::make_shared(matched_key_range_f32, past_kv_len_f32); From d52dc2e3d767310b7bff0a861d584cf8ab34fae6 Mon Sep 17 00:00:00 2001 From: Eugene Smirnov Date: Thu, 12 Mar 2026 02:07:09 +0100 Subject: [PATCH 12/14] clang-format fixes --- .../intel_npu/src/plugin/npuw/llm_compiled_model.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp index 966c957f756632..e43781e03b080f 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp @@ -72,15 +72,13 @@ class RemoveEmptyKVTensors : public ov::pass::MatcherPass { RemoveEmptyKVTensors(Context::Ref ctx) { auto param = opp::wrap_type(); auto param_or = - std::make_shared(ov::OutputVector{param, - match_down_up_convert_subgraph_after_lpt(param)}); + std::make_shared(ov::OutputVector{param, match_down_up_convert_subgraph_after_lpt(param)}); auto concat = opp::wrap_type({param_or, opp::any_input()}); auto callback = [=](opp::Matcher& m) { auto& node_to_output = m.get_pattern_value_map(); - auto matched_param = - ov::as_type_ptr(node_to_output.at(param).get_node_shared_ptr()); + auto matched_param = ov::as_type_ptr(node_to_output.at(param).get_node_shared_ptr()); auto matched_node_concat = node_to_output.at(concat).get_node_shared_ptr(); ctx.get().old_params.push_back(matched_param); @@ -89,8 +87,7 @@ class RemoveEmptyKVTensors : public ov::pass::MatcherPass { // This works universally for both plain parameter and down_up_convert subgraph cases, // because in the subgraph case matched_param->get_users() would return the Convert // node (first node of the subgraph), not the ShapeOf. - auto concat_input0_node = - matched_node_concat->input(0).get_source_output().get_node_shared_ptr(); + auto concat_input0_node = matched_node_concat->input(0).get_source_output().get_node_shared_ptr(); auto users = concat_input0_node->get_users(); // In subgraph case the parameter itself may also have a ShapeOf user, From 8e04ee147c02c52b25af50168ff849ae2a1f901b Mon Sep 17 00:00:00 2001 From: Eugene Smirnov Date: Thu, 12 Mar 2026 13:13:44 +0100 Subject: [PATCH 13/14] comments restored --- src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp index e43781e03b080f..603e55bd38ecc1 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp @@ -542,7 +542,7 @@ class OldPhi3SlidingMaskMatcher : public ov::pass::MatcherPass { "Sliding window size constant must be of size 1, but got " + std::to_string(matched_neg_window_size->get_output_size())); - // 1.(K range > (Q_pos range - sliding window).T) & (K range <= Q range.T) + // 1.(K range <= (Q_pos range - sliding window).T) | (K range > Q range.T) auto query_range_as_pos_ids = std::make_shared(matched_pos_ids_input, ov::element::f32); std::vector vector_shape{-1, 1}; @@ -556,7 +556,7 @@ class OldPhi3SlidingMaskMatcher : public ov::pass::MatcherPass { std::make_shared(matched_key_range_f32, query_range_as_pos_left_bound); matched_bitwise_or->input(1).replace_source_output(forget_left_mask_for_right_padding); - // 2. (K range > (Q range - sliding window).T) | (K range < shape(past_key_values, 2)) + // 2. (K range <= (Q range - sliding window).T) & (K range >= shape(past_key_values, 2)) auto past_kv_len_f32 = std::make_shared(matched_past_kv_len, ov::element::f32); auto only_present_tokens_mask = std::make_shared(matched_key_range_f32, past_kv_len_f32); From 87029d78794a045304ea0083f25233ead5c83ea9 Mon Sep 17 00:00:00 2001 From: Eugene Smirnov Date: Thu, 12 Mar 2026 13:36:13 +0100 Subject: [PATCH 14/14] comment corrected --- src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp index 603e55bd38ecc1..fd1db77da35611 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp @@ -97,7 +97,6 @@ class RemoveEmptyKVTensors : public ov::pass::MatcherPass { users.insert(users.end(), param_users.begin(), param_users.end()); } - // Remove duplicates (concat itself will appear in users) // Find and replace ShapeOf nodes with constants for (auto& user : users) { if (ov::is_type(user)) {