diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/config.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/config.hpp index ebae76152394f6..8204217ed27e79 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/config/config.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/config.hpp @@ -57,6 +57,13 @@ TYPE_PRINTER(std::size_t) #ifndef ONEAPI_MAKE_VERSION /// @brief Generates generic 'oneAPI' API versions # define ONEAPI_MAKE_VERSION(_major, _minor) ((_major << 16) | (_minor & 0x0000ffff)) + +/// @brief extract 'oneAPI' API major version +# define ONEAPI_VERSION_MAJOR(_version) ((_version) >> 16) + +/// @brief extract 'oneAPI' API minor version +# define ONEAPI_VERSION_MINOR(_version) ((_version) & 0x0000ffff) + #endif // ONEAPI_MAKE_VERSION // diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp index 1427063d8ca532..26766745bb176d 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp @@ -112,6 +112,7 @@ DEFINE_OPT(NPUW_CWAI, bool, false, npuw::partitioning::cwai, RunTime); DEFINE_OPT(NPUW_DQ, bool, false, npuw::partitioning::dyn_quant, RunTime); DEFINE_OPT(NPUW_DQ_FULL, bool, true, npuw::partitioning::dyn_quant_full, RunTime); DEFINE_OPT(NPUW_PMM, std::string, "2", npuw::partitioning::par_matmul_merge_dims, RunTime); +DEFINE_OPT(NPUW_MM_GATED, bool, true, npuw::partitioning::matmul_gate_preserve_constants, RunTime); DEFINE_OPT(NPUW_SLICE_OUT, bool, false, npuw::partitioning::slice_out, RunTime); DEFINE_OPT(NPUW_HOST_GATHER, bool, true, npuw::partitioning::host_gather, RunTime); DEFINE_OPT(NPUW_SPATIAL, bool, false, npuw::partitioning::spatial, RunTime); diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp index 
b4520482f0d0e3..6ef738ff16b044 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp @@ -216,6 +216,15 @@ static constexpr ov::Property dyn_quant_full{"NPUW_DQ_FULL"}; */ static constexpr ov::Property par_matmul_merge_dims{"NPUW_PMM"}; +/** + * @brief + * Type: bool. + * Whether to preserve constants for the gated version of MatMul. + * On some compiler versions this might produce incorrect results when enabled. + * Default value: YES + */ +static constexpr ov::Property matmul_gate_preserve_constants{"NPUW_MM_GATED"}; + /** * @brief * Type: bool. diff --git a/src/plugins/intel_npu/src/al/src/config/npuw.cpp b/src/plugins/intel_npu/src/al/src/config/npuw.cpp index 825103217532a0..79e72f405d5154 100644 --- a/src/plugins/intel_npu/src/al/src/config/npuw.cpp +++ b/src/plugins/intel_npu/src/al/src/config/npuw.cpp @@ -29,6 +29,7 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) { desc.add(); desc.add(); desc.add(); + desc.add(); desc.add(); desc.add(); desc.add(); diff --git a/src/plugins/intel_npu/src/plugin/include/properties.hpp b/src/plugins/intel_npu/src/plugin/include/properties.hpp index fdb8d1b52a1329..9320b2f471b954 100644 --- a/src/plugins/intel_npu/src/plugin/include/properties.hpp +++ b/src/plugins/intel_npu/src/plugin/include/properties.hpp @@ -172,6 +172,7 @@ class Properties final { ov::intel_npu::npuw::partitioning::dyn_quant.name(), ov::intel_npu::npuw::partitioning::dyn_quant_full.name(), ov::intel_npu::npuw::partitioning::par_matmul_merge_dims.name(), + ov::intel_npu::npuw::partitioning::matmul_gate_preserve_constants.name(), ov::intel_npu::npuw::partitioning::slice_out.name(), ov::intel_npu::npuw::partitioning::spatial.name(), ov::intel_npu::npuw::partitioning::spatial_nway.name(), diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp index 
dc98a1456ac360..6d11667d6bcbdf 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp @@ -635,8 +635,10 @@ bool ov::npuw::CompiledModel::should_use_quantized_host_gather(const std::shared std::vector to_keep; ov::pass::GraphRewrite rewr2; - rewr2.add_matcher(std::ref(to_keep)); - rewr2.add_matcher(std::ref(to_keep)); + ctx.mm_gate = m_cfg.get<::intel_npu::NPUW_MM_GATED>(); + + rewr2.add_matcher(std::ref(ctx), std::ref(to_keep)); + rewr2.add_matcher(std::ref(ctx), std::ref(to_keep)); rewr2.run_on_model(model); // FIXME: since 3-model pipeline is the default option, the tail will be separate, // so we need to match either head or tail pattern here for host gather quantized feature to work. @@ -2506,6 +2508,7 @@ void ov::npuw::CompiledModel::implement_properties() { BIND(npuw::partitioning::dyn_quant, NPUW_DQ), BIND(npuw::partitioning::dyn_quant_full, NPUW_DQ_FULL), BIND(npuw::partitioning::par_matmul_merge_dims, NPUW_PMM), + BIND(npuw::partitioning::matmul_gate_preserve_constants, NPUW_MM_GATED), BIND(npuw::partitioning::slice_out, NPUW_SLICE_OUT), BIND(npuw::partitioning::spatial, NPUW_SPATIAL), BIND(npuw::partitioning::spatial_nway, NPUW_SPATIAL_NWAY), diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp index 1fa6036eaf1bbf..fd1db77da35611 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp @@ -43,6 +43,23 @@ namespace opp = ov::pass::pattern; +// Helper that matches the subgraph which appears as a result of LPT transformations +auto match_down_up_convert_subgraph_after_lpt = [](const ov::Output& input) { + auto upconvert = opp::wrap_type({input}, opp::type_matches(ov::element::f32)); + + auto upscale = opp::wrap_type(opp::rank_equals(0)); + auto upmul = opp::wrap_type({upconvert, upscale}); + + auto downscale = 
opp::wrap_type(opp::rank_equals(0)); + auto downmul = opp::wrap_type({upmul, downscale}); + + auto downconvert = + opp::wrap_type({downmul}, + opp::type_matches_any({ov::element::f8e4m3, ov::element::f8e5m2})); + + return downconvert; +}; + class RemoveEmptyKVTensors : public ov::pass::MatcherPass { public: OPENVINO_MATCHER_PASS_RTTI("npuw::LLMCompiledModel::RemoveEmptyKVTensors"); @@ -54,7 +71,10 @@ class RemoveEmptyKVTensors : public ov::pass::MatcherPass { RemoveEmptyKVTensors(Context::Ref ctx) { auto param = opp::wrap_type(); - auto concat = opp::wrap_type({param, opp::any_input()}); + auto param_or = + std::make_shared(ov::OutputVector{param, match_down_up_convert_subgraph_after_lpt(param)}); + + auto concat = opp::wrap_type({param_or, opp::any_input()}); auto callback = [=](opp::Matcher& m) { auto& node_to_output = m.get_pattern_value_map(); @@ -63,15 +83,27 @@ class RemoveEmptyKVTensors : public ov::pass::MatcherPass { ctx.get().old_params.push_back(matched_param); - auto users = matched_param->get_users(); - if (users.size() == 2u) { - auto shapeof_node = ov::is_type(users[0]) ? users[0] : users[1]; - NPUW_ASSERT(ov::is_type(shapeof_node)); - auto cst_node = - ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, matched_param->get_shape()); - ov::replace_node(shapeof_node, cst_node); - } else { - NPUW_ASSERT(users.size() == 1u); + // Use concat's first input source node to find ShapeOf users. + // This works universally for both plain parameter and down_up_convert subgraph cases, + // because in the subgraph case matched_param->get_users() would return the Convert + // node (first node of the subgraph), not the ShapeOf. + auto concat_input0_node = matched_node_concat->input(0).get_source_output().get_node_shared_ptr(); + auto users = concat_input0_node->get_users(); + + // In subgraph case the parameter itself may also have a ShapeOf user, + // so check both the concat input node and the parameter. 
+ if (concat_input0_node != matched_param) { + auto param_users = matched_param->get_users(); + users.insert(users.end(), param_users.begin(), param_users.end()); + } + + // Find and replace ShapeOf nodes with constants + for (auto& user : users) { + if (ov::is_type(user)) { + auto cst_node = + ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, matched_param->get_shape()); + ov::replace_node(user, cst_node); + } } // Redirect second concat input to every node which reads from concat @@ -323,22 +355,6 @@ class GroupQueryAttentionDecomposition : public ov::pass::MatcherPass { class RedirectNewKvToOutput : public ov::pass::MatcherPass { public: RedirectNewKvToOutput() { - auto match_down_up_convert_subgraph = [](const ov::Output& input) { - auto upconvert = opp::wrap_type({input}, opp::type_matches(ov::element::f32)); - - auto upscale = opp::wrap_type(opp::rank_equals(0)); - auto upmul = opp::wrap_type({upconvert, upscale}); - - auto downscale = opp::wrap_type(opp::rank_equals(0)); - auto downmul = opp::wrap_type({upmul, downscale}); - - auto downconvert = - opp::wrap_type({downmul}, - opp::type_matches_any({ov::element::f8e4m3, ov::element::f8e5m2})); - - return downconvert; - }; - // example of fp8 inputs to concat // input0 : float8e4m3[1,32,1151,96] // input1 : float8e4m3[1,32,1,96] @@ -348,13 +364,13 @@ class RedirectNewKvToOutput : public ov::pass::MatcherPass { // TODO: this matcher logic better to cover with unit-tests auto input0 = opp::wrap_type(); auto input0_or = - std::make_shared(ov::OutputVector{input0, match_down_up_convert_subgraph(input0)}); + std::make_shared(ov::OutputVector{input0, match_down_up_convert_subgraph_after_lpt(input0)}); auto input1 = opp::any_input(); auto kv_concat = opp::wrap_type({input0_or, input1}); auto result1 = opp::wrap_type(kv_concat); - auto result2 = opp::wrap_type(match_down_up_convert_subgraph(kv_concat)); + auto result2 = opp::wrap_type(match_down_up_convert_subgraph_after_lpt(kv_concat)); auto result_or = 
std::make_shared(ov::OutputVector{result1, result2}); @@ -1162,6 +1178,7 @@ struct NPUDesc { std::string arch; int64_t max_tiles = 0; bool compiler_dq = false; + bool compiler_matmul_gate = false; int64_t compiler_ver = 0; bool support_flash_attention_tile = false; }; @@ -1199,6 +1216,19 @@ std::optional extract_npu_descriptor(const std::shared_ptr(); } + LOG_INFO("Compiler version: " << ONEAPI_VERSION_MAJOR(desc.compiler_ver) << "." + << ONEAPI_VERSION_MINOR(desc.compiler_ver)); + + constexpr std::string_view compiler_gate_support_msg = + "Compiler: accurate gated matmul (MatMul -> Divide -> Tanh -> Multiply -> Result) : "; + + if (desc.compiler_ver >= ONEAPI_MAKE_VERSION(7, 28)) { + // accuracy for gated matmul fixed at 7.28 + desc.compiler_matmul_gate = true; + LOG_INFO(compiler_gate_support_msg << "supported"); + } else { + LOG_WARN(compiler_gate_support_msg << "unsupported"); + } if (desc.arch == "5010" && desc.compiler_ver >= ONEAPI_MAKE_VERSION(7, 29)) { // Flash attention tile is supported starting from compiler version 7.29 on NPU5010 @@ -1247,6 +1277,13 @@ ov::AnyMap get_baseline_common_config(const std::optional& npudesc) { config.erase("NPUW_DCOFF_TYPE"); config.erase("NPUW_DCOFF_SCALE"); } + + // default value is ON + // for compiler versions >= 7.28 value is ON + // for other compiler versions value is OFF + if (npudesc.has_value()) { + config.emplace("NPUW_MM_GATED", (npudesc->compiler_matmul_gate ? "YES" : "NO")); + } return config; } @@ -1877,7 +1914,6 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr& m if (!m_is_embedding) { if (!m_use_chunk_prefill) { - // TODO: sometimes it is ok if we cannot find any empty inputs or not? 
NPUW_ASSERT(remove_empty_kv_inputs(prefill_model)); } else { LOG_DEBUG("Don't remove input key/values from prefill model."); diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp index 785232baeaedc0..8ce8e3211d2fcb 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp @@ -1494,9 +1494,12 @@ void Partitioner::saveTailDictConstants(const std::string& func_name) { using CPtr = std::shared_ptr; std::vector to_keep; + ov::npuw::patterns::opt::Context ctx; + ctx.mm_gate = cfg.get<::intel_npu::NPUW_MM_GATED>(); + ov::pass::GraphRewrite rewr; - rewr.add_matcher(std::ref(to_keep)); - rewr.add_matcher(std::ref(to_keep)); + rewr.add_matcher(std::ref(ctx), std::ref(to_keep)); + rewr.add_matcher(std::ref(ctx), std::ref(to_keep)); rewr.run_on_model(model_group.front()); for (auto&& const_to_keep : to_keep) { diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp index 709e81084690d7..a0a941f55e2209 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp @@ -1919,7 +1919,8 @@ CompressDictMatMulf32::CompressDictMatMulf32(Context::Ref ctx) { // Const(S) ---------------------> Multiply -> to(f32) -> MatMul -> Result // ???(Act) --------------------------------------------> -PreserveConstDictMatMulAsymm::PreserveConstDictMatMulAsymm(PreserveConstDictMatMulAsymm::Results to_keep) { +PreserveConstDictMatMulAsymm::PreserveConstDictMatMulAsymm(Context::Ref ctx, + PreserveConstDictMatMulAsymm::Results to_keep) { auto qweight = opp::wrap_type(); auto qcoeff = opp::wrap_type(); auto qzerop = opp::wrap_type(); @@ -1930,7 +1931,21 @@ 
PreserveConstDictMatMulAsymm::PreserveConstDictMatMulAsymm(PreserveConstDictMatM auto qcvtm = opp::wrap_type({qmuls}); auto qmmi = opp::any_input(); auto qmm = opp::wrap_type({qmmi, qcvtm}); - auto qres = opp::wrap_type({qmm}); + std::shared_ptr qres; + + // MatMul -> Divide -> Tanh -> Multiply -> Result + if (ctx.get().mm_gate) { + auto div = opp::wrap_type({qmm, opp::any_input()}); + auto tanh = opp::wrap_type({div}); + auto matmul_multiply = opp::wrap_type({tanh, opp::any_input()}); + + auto matmul_or = + std::make_shared(ov::OutputVector{qmm->output(0), matmul_multiply->output(0)}); + + qres = opp::wrap_type({matmul_or}); + } else { + qres = opp::wrap_type({qmm}); + } // Note: Use [=] to make sure the above objects stay alive in the callback auto callback = [=](ov::pass::pattern::Matcher& m) { @@ -1964,14 +1979,28 @@ PreserveConstDictMatMulAsymm::PreserveConstDictMatMulAsymm(PreserveConstDictMatM // Const(S) ----------------> Multiply -> MatMul -> Result // ???(Act) ----------------------------> -PreserveConstDictMatMulSymm::PreserveConstDictMatMulSymm(PreserveConstDictMatMulSymm::Results to_keep) { +PreserveConstDictMatMulFP8::PreserveConstDictMatMulFP8(Context::Ref ctx, PreserveConstDictMatMulFP8::Results to_keep) { auto qweight = opp::wrap_type(); auto qcoeff = opp::wrap_type(); auto qcvtw = opp::wrap_type({qweight}); auto qmuls = opp::wrap_type({qcvtw, qcoeff}); + auto optional_cvt = opp::optional({qmuls}); auto qmmi = opp::any_input(); - auto qmm = opp::wrap_type({qmmi, qmuls}); - auto qres = opp::wrap_type({qmm}); + auto qmm = opp::wrap_type({qmmi, optional_cvt}); + std::shared_ptr qres; + // MatMul -> Divide -> Tanh -> Multiply -> Result + if (ctx.get().mm_gate) { + auto div = opp::wrap_type({qmm, opp::any_input()}); + auto tanh = opp::wrap_type({div}); + auto matmul_multiply = opp::wrap_type({tanh, opp::any_input()}); + + auto matmul_or = + std::make_shared(ov::OutputVector{qmm->output(0), matmul_multiply->output(0)}); + + qres = 
opp::wrap_type({matmul_or}); + } else { + qres = opp::wrap_type({qmm}); + } // Note: Use [=] to make sure the above objects stay alive in the callback auto callback = [=](ov::pass::pattern::Matcher& m) { @@ -1997,7 +2026,7 @@ PreserveConstDictMatMulSymm::PreserveConstDictMatMulSymm(PreserveConstDictMatMul } return false; // root hasn't changed }; - register_matcher(std::make_shared(qres, "OptPreserveConstDictMatMulSymm"), std::move(callback)); + register_matcher(std::make_shared(qres, "OptPreserveConstDictMatMulFP8"), std::move(callback)); } SliceLastMatmul::SliceLastMatmul() { diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp index 83ae516dedccc9..c9463f249cb8cb 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp @@ -23,6 +23,7 @@ struct Context { std::string pmm_dims; bool is_spatial = false; bool mm_dq_full = true; + bool mm_gate = false; using PPtr = std::shared_ptr; using NPtr = std::shared_ptr; @@ -229,17 +230,17 @@ class PreserveConstDictMatMulAsymm : public ov::pass::MatcherPass { using CPtr = std::shared_ptr; using Results = std::reference_wrapper>; - PreserveConstDictMatMulAsymm(Results to_keep); + PreserveConstDictMatMulAsymm(Context::Ref ctx, Results to_keep); }; -class PreserveConstDictMatMulSymm : public ov::pass::MatcherPass { +class PreserveConstDictMatMulFP8 : public ov::pass::MatcherPass { public: - OPENVINO_MATCHER_PASS_RTTI("npuw::patterns::opt::PreserveConstDictMatMulSymm"); + OPENVINO_MATCHER_PASS_RTTI("npuw::patterns::opt::PreserveConstDictMatMulFP8"); using CPtr = std::shared_ptr; using Results = std::reference_wrapper>; - PreserveConstDictMatMulSymm(Results to_keep); + PreserveConstDictMatMulFP8(Context::Ref ctx, Results to_keep); }; // Slice last Matmul diff --git a/src/plugins/intel_npu/src/plugin/src/plugin.cpp 
b/src/plugins/intel_npu/src/plugin/src/plugin.cpp index 96847cfbffef5e..ed3257c524abf8 100644 --- a/src/plugins/intel_npu/src/plugin/src/plugin.cpp +++ b/src/plugins/intel_npu/src/plugin/src/plugin.cpp @@ -305,6 +305,7 @@ void init_config(const IEngineBackend* backend, OptionsDesc& options, FilteredCo REGISTER_OPTION(NPUW_DQ); REGISTER_OPTION(NPUW_DQ_FULL); REGISTER_OPTION(NPUW_PMM); + REGISTER_OPTION(NPUW_MM_GATED); REGISTER_OPTION(NPUW_SLICE_OUT); REGISTER_OPTION(NPUW_SPATIAL); REGISTER_OPTION(NPUW_SPATIAL_NWAY); diff --git a/src/plugins/intel_npu/src/plugin/src/properties.cpp b/src/plugins/intel_npu/src/plugin/src/properties.cpp index 16c9ae64190d4d..63f7f005bb4ad6 100644 --- a/src/plugins/intel_npu/src/plugin/src/properties.cpp +++ b/src/plugins/intel_npu/src/plugin/src/properties.cpp @@ -607,6 +607,7 @@ void Properties::registerPluginProperties() { TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::partitioning::dyn_quant, NPUW_DQ); TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::partitioning::dyn_quant_full, NPUW_DQ_FULL); TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::partitioning::par_matmul_merge_dims, NPUW_PMM); + TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::partitioning::matmul_gate_preserve_constants, NPUW_MM_GATED); TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::partitioning::slice_out, NPUW_SLICE_OUT); TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::partitioning::spatial, NPUW_SPATIAL); TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::partitioning::spatial_nway, NPUW_SPATIAL_NWAY); diff --git a/src/plugins/intel_npu/tests/functional/internal/plugin/test_properties.hpp b/src/plugins/intel_npu/tests/functional/internal/plugin/test_properties.hpp index 52142f6df2b2b1..e3522c19fa2874 100644 --- a/src/plugins/intel_npu/tests/functional/internal/plugin/test_properties.hpp +++ b/src/plugins/intel_npu/tests/functional/internal/plugin/test_properties.hpp @@ -200,6 +200,7 @@ class PropertiesManagerTests : public 
ov::test::behavior::OVPluginTestBase, REGISTER_OPTION(NPUW_DQ); REGISTER_OPTION(NPUW_DQ_FULL); REGISTER_OPTION(NPUW_PMM); + REGISTER_OPTION(NPUW_MM_GATED); REGISTER_OPTION(NPUW_SLICE_OUT); REGISTER_OPTION(NPUW_SPATIAL); REGISTER_OPTION(NPUW_SPATIAL_NWAY);