From 2864388055c9a24965c8fa1b91883db5745b7b71 Mon Sep 17 00:00:00 2001 From: Arseniy Obolenskiy Date: Tue, 10 Feb 2026 11:19:11 +0100 Subject: [PATCH 01/11] [CPU] Add ACL concat executor path - add generic concat executor/factory and ACL implementation for f16/f32 ncsp/nspc tensors up to 4D - register ACL concat descriptors, prefer ACL during PD selection, and run executor when available - keep impl_desc_type flexible for ref path to allow factory-provided implementations --- src/plugins/intel_cpu/src/nodes/concat.cpp | 109 +++++++++++- src/plugins/intel_cpu/src/nodes/concat.h | 4 + .../src/nodes/executors/acl/acl_concat.cpp | 157 ++++++++++++++++++ .../src/nodes/executors/acl/acl_concat.hpp | 44 +++++ .../intel_cpu/src/nodes/executors/concat.cpp | 15 ++ .../intel_cpu/src/nodes/executors/concat.hpp | 59 +++++++ .../src/nodes/executors/concat_list.cpp | 73 ++++++++ .../src/nodes/executors/concat_list.hpp | 42 +++++ 8 files changed, 502 insertions(+), 1 deletion(-) create mode 100644 src/plugins/intel_cpu/src/nodes/executors/acl/acl_concat.cpp create mode 100644 src/plugins/intel_cpu/src/nodes/executors/acl/acl_concat.hpp create mode 100644 src/plugins/intel_cpu/src/nodes/executors/concat.cpp create mode 100644 src/plugins/intel_cpu/src/nodes/executors/concat.hpp create mode 100644 src/plugins/intel_cpu/src/nodes/executors/concat_list.cpp create mode 100644 src/plugins/intel_cpu/src/nodes/executors/concat_list.hpp diff --git a/src/plugins/intel_cpu/src/nodes/concat.cpp b/src/plugins/intel_cpu/src/nodes/concat.cpp index 07970e6192bf3a..aa5431e681b3ba 100644 --- a/src/plugins/intel_cpu/src/nodes/concat.cpp +++ b/src/plugins/intel_cpu/src/nodes/concat.cpp @@ -34,6 +34,10 @@ #include "graph_context.h" #include "memory_desc/cpu_memory_desc.h" #include "node.h" +#if defined(OV_CPU_WITH_ACL) +# include "nodes/executors/concat_list.hpp" +# include "nodes/executors/executor.hpp" +#endif #include "nodes/node_config.h" #include "onednn/dnnl.h" #include "openvino/core/except.hpp" @@ -211,7 +215,7 @@ void Concat::initSupportedPrimitiveDescriptors() { auto desc = itr->second->createSharedDesc(inputPrecision, getInputShapeAtPort(i)); config.inConfs[i].setMemDesc(desc); } - supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref); + supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::undef); if (itr->first != LayoutType::nspc) { pdIndexesToReuse.push_back(supportedPrimitiveDescriptors.size() - 1); } else if (canBeInPlace) { @@ -248,6 +252,47 @@ void Concat::initSupportedPrimitiveDescriptors() { } supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown); } + +#if defined(OV_CPU_WITH_ACL) + // ACL executor (ncsp/nspc, f16/f32, rank<=4) + { + const auto& creatorsMap = BlockedDescCreator::getCommonCreators(); + auto pushAclDesc = [&](LayoutType lt) { + NodeConfig config; + config.outConfs.resize(1); + config.inConfs.resize(getParentEdges().size()); + + std::vector srcMemoryDescs; + for (size_t i = 0; i < getParentEdges().size(); ++i) { + auto desc = creatorsMap.at(lt)->createSharedDesc(inputPrecision, getInputShapeAtPort(i)); + config.inConfs[i].setMemDesc(desc); + config.inConfs[i].inPlace(-1); + config.inConfs[i].constant(false); + srcMemoryDescs.push_back(desc); + } + auto outDesc = creatorsMap.at(lt)->createSharedDesc(outputPrecision, getOutputShapeAtPort(0)); + config.outConfs[0].setMemDesc(outDesc); + config.outConfs[0].inPlace(-1); + config.outConfs[0].constant(false); + + concatAttrs = {axis}; + auto factory = + std::make_shared(concatAttrs, + srcMemoryDescs, + std::vector{outDesc}, + std::make_shared(context, getImplPriority())); + if (factory->hasSupportedDescs()) { + supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::acl, factory); + } + }; + + const auto rank = getInputShapeAtPort(0).getRank(); + if (rank <= 4 && (inputPrecision == ov::element::f16 || inputPrecision == ov::element::f32)) { + pushAclDesc(LayoutType::ncsp); + pushAclDesc(LayoutType::nspc); + } + } +#endif } void Concat::selectOptimalPrimitiveDescriptor() { @@ -354,6 +399,20 @@ void Concat::selectOptimalPrimitiveDescriptor() { return; } + for (auto indx : canSelectPrimitive) { + if (supportedPrimitiveDescriptors[indx].getImplementationType() == impl_desc_type::acl) { + selectPrimitiveDescriptorByIndex(static_cast(indx)); + return; + } + } + + for (auto indx : canSelectPrimitive) { + if (supportedPrimitiveDescriptors[indx].getImplementationType() == impl_desc_type::ref) { + selectPrimitiveDescriptorByIndex(static_cast(indx)); + return; + } + } + // if there are more than one PD with similar data layouts - select the optimized one for (auto indx : canSelectPrimitive) { if (supportedPrimitiveDescriptors[indx].getImplementationType() == impl_desc_type::unknown) { @@ -389,6 +448,35 @@ void Concat::prepareParams() { return; } +#if defined(OV_CPU_WITH_ACL) + if (useACL) { + auto* selected_pd = getSelectedPrimitiveDescriptor(); + CPU_NODE_ASSERT(selected_pd, "Preferable primitive descriptor is not set."); + + std::vector srcMemoryDescs; + for (size_t i = 0; i < getParentEdges().size(); ++i) { + srcMemoryDescs.push_back(getParentEdgeAt(i)->getMemory().getDescPtr()); + } + std::vector dstMemoryDescs; + dstMemoryDescs.push_back(getChildEdgeAt(0)->getMemory().getDescPtr()); + + try { + execPtrACL = + selected_pd->getExecutorFactoryAs()->makeExecutor(concatAttrs, + srcMemoryDescs, + dstMemoryDescs, + dnnl::primitive_attr()); + } catch (...) { + execPtrACL.reset(); + } + if (execPtrACL) { + selected_pd->setImplementationType(execPtrACL->getImplType()); + return; + } + useACL = false; // fallback to default path + } +#endif + const auto& dstMemPtr = getDstMemoryAtPort(0); CPU_NODE_ASSERT(dstMemPtr && dstMemPtr->isDefined(), "Destination memory is undefined."); auto dstMemDesc = dstMemPtr->getDescWithType(); @@ -525,6 +613,8 @@ void Concat::initOptimalPrimitiveDescriptor() { } } + useACL = selected_pd->getImplementationType() == impl_desc_type::acl; + // block layout may have axis greater than rank, disable ref_concat auto* primDesc = getSelectedPrimitiveDescriptor(); auto* memDesc = primDesc->getConfig().outConfs[0].getMemDesc()->as(); @@ -558,6 +648,23 @@ void Concat::execute(const dnnl::stream& strm) { return; } +#if defined(OV_CPU_WITH_ACL) + if (useACL && execPtrACL) { + std::vector srcMemory; + srcMemory.reserve(getOriginalInputsNumber()); + for (size_t i = 0; i < getOriginalInputsNumber(); ++i) { + srcMemory.push_back(getSrcMemoryAtPort(i)); + } + std::vector dstMemory; + dstMemory.reserve(getOriginalOutputsNumber()); + for (size_t i = 0; i < getOriginalOutputsNumber(); ++i) { + dstMemory.push_back(getDstMemoryAtPort(i)); + } + execPtrACL->exec(srcMemory, dstMemory); + return; + } +#endif + if (canOptimize1DCase) { exec1DCase(); return; diff --git a/src/plugins/intel_cpu/src/nodes/concat.h b/src/plugins/intel_cpu/src/nodes/concat.h index cfbd7aa2830466..83dad4494a8f9d 100644 --- a/src/plugins/intel_cpu/src/nodes/concat.h +++ b/src/plugins/intel_cpu/src/nodes/concat.h @@ -14,6 +14,7 @@ #include "edge.h" #include "graph_context.h" #include "node.h" +#include "nodes/executors/concat.hpp" #include "openvino/core/node.hpp" #include "openvino/core/type/element_type.hpp" @@ -70,6 +71,9 @@ class Concat : public Node { bool doFuseConvert = false; // whether to perform FP16 to FP32 conversion static constexpr size_t MAX_RANK_REF = 6; dnnl::primitive prim; + bool useACL = false; + ConcatAttrs concatAttrs; + ConcatExecutorPtr execPtrACL; }; } // namespace ov::intel_cpu::node diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_concat.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_concat.cpp new file mode 100644 index 00000000000000..3d1da57715dec7 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_concat.cpp @@ -0,0 +1,157 @@ +// Copyright (C) 2018-2026 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "acl_concat.hpp" + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "acl_utils.hpp" +#include "cpu_memory.h" +#include "memory_desc/cpu_memory_desc.h" +#include "nodes/executors/concat.hpp" +#include "openvino/core/type/element_type.hpp" +#include "utils/debug_capabilities.h" + +namespace ov::intel_cpu { + +bool AclConcatExecutor::init(const ConcatAttrs& concatAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs, + [[maybe_unused]] const dnnl::primitive_attr& attr) { + this->concatAttrs = concatAttrs; + const auto& firstDesc = srcDescs[0]; + const auto precision = firstDesc->getPrecision(); + const bool isNspc = firstDesc->hasLayoutType(LayoutType::nspc); + const bool isNcsp = firstDesc->hasLayoutType(LayoutType::ncsp); + const auto rank = firstDesc->getShape().getRank(); + + if ((!isNspc && !isNcsp) || rank > 4) { + return false; + } + // all inputs must share layout & precision + for (const auto& d : srcDescs) { + if (d->getPrecision() != precision || d->hasLayoutType(LayoutType::nspc) != isNspc || + d->hasLayoutType(LayoutType::ncsp) != isNcsp || d->getShape().getRank() != rank) { + return false; + } + } + if (dstDescs[0]->getPrecision() != precision || dstDescs[0]->hasLayoutType(LayoutType::nspc) != isNspc || + dstDescs[0]->hasLayoutType(LayoutType::ncsp) != isNcsp) { + return false; + } + if (precision != ov::element::f16 && precision != ov::element::f32) { + return false; + } + + auto aclLayout = getAclDataLayoutByMemoryDesc(firstDesc); + if (aclLayout == arm_compute::DataLayout::UNKNOWN) { + return false; + } + + const auto& dstDims = dstDescs[0]->getShape().getStaticDims(); + int aclAxis = axisCast(concatAttrs.axis, + rank, + isNspc ? ACLAxisCastMode::NHWC_TO_NCHW : ACLAxisCastMode::NO_LAYOUT_CONVERSION); + if (aclAxis < 0 || static_cast(aclAxis) >= rank) { + return false; + } + + auto dstShape = shapeCast(dstDims); + if (isNspc) { + changeLayoutToNH_C({&dstShape}); + } + arm_compute::TensorInfo dstInfo(dstShape, 1, precisionToAclDataType(precision), aclLayout); + + std::vector inputInfos; + srcTensors.resize(srcDescs.size()); + for (size_t i = 0; i < srcDescs.size(); ++i) { + const auto& dims = srcDescs[i]->getShape().getStaticDims(); + auto srcShape = shapeCast(dims); + if (isNspc) { + changeLayoutToNH_C({&srcShape}); + } + arm_compute::TensorInfo srcInfo(srcShape, 1, precisionToAclDataType(precision), aclLayout); + inputInfos.push_back(&srcInfo); + srcTensors[i].allocator()->init(srcInfo); + } + + dstTensor.allocator()->init(dstInfo); + + auto status = arm_compute::NEConcatenateLayer::validate(inputInfos, &dstInfo, static_cast(aclAxis)); + if (!status) { + DEBUG_LOG("NEConcatenateLayer validation failed: ", status.error_description()); + return false; + } + + configureThreadSafe([&] { + std::vector tensors; + tensors.reserve(srcTensors.size()); + for (auto& t : srcTensors) { + tensors.push_back(&t); + } + concatLayer.configure(tensors, &dstTensor, static_cast(aclAxis)); + }); + + return true; +} + +void AclConcatExecutor::exec(const std::vector& src, const std::vector& dst) { + for (size_t i = 0; i < src.size(); ++i) { + srcTensors[i].allocator()->import_memory(src[i]->getData()); + } + dstTensor.allocator()->import_memory(dst[0]->getData()); + + concatLayer.run(); + + for (auto& t : srcTensors) { + t.allocator()->free(); + } + dstTensor.allocator()->free(); +} + +bool AclConcatExecutorBuilder::isSupported(const ConcatAttrs& concatAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs) const { + if (srcDescs.empty() || dstDescs.empty()) { + return false; + } + const auto rank = srcDescs[0]->getShape().getRank(); + if (rank == Shape::UNDEFINED_DIM || rank == 0) { + return false; + } + if (concatAttrs.axis >= rank) { + return false; + } + const bool isNspc = srcDescs[0]->hasLayoutType(LayoutType::nspc); + const bool isNcsp = srcDescs[0]->hasLayoutType(LayoutType::ncsp); + if (!isNspc && !isNcsp) { + return false; + } + // ACL concat supports up to 4D tensors (NHWC/NCHW) + if (rank > 4) { + return false; + } + const auto prec = srcDescs[0]->getPrecision(); + if (prec != ov::element::f16 && prec != ov::element::f32) { + return false; + } + for (const auto& d : srcDescs) { + if (d->getPrecision() != prec || d->getShape().getRank() != rank || + d->hasLayoutType(LayoutType::nspc) != isNspc || d->hasLayoutType(LayoutType::ncsp) != isNcsp) { + return false; + } + } + return dstDescs[0]->getPrecision() == prec && dstDescs[0]->hasLayoutType(LayoutType::nspc) == isNspc && + dstDescs[0]->hasLayoutType(LayoutType::ncsp) == isNcsp; +} + +} // namespace ov::intel_cpu diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_concat.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_concat.hpp new file mode 100644 index 00000000000000..a24c0552c94a4a --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_concat.hpp @@ -0,0 +1,44 @@ +// Copyright (C) 2018-2026 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +#include "nodes/executors/concat.hpp" +#include "nodes/executors/executor.hpp" +#include "utils/debug_capabilities.h" + +namespace ov::intel_cpu { + +class AclConcatExecutor : public ConcatExecutor { +public: + using ConcatExecutor::ConcatExecutor; + bool init(const ConcatAttrs& concatAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr) override; + void exec(const std::vector& src, const std::vector& dst) override; + [[nodiscard]] impl_desc_type getImplType() const override { + return impl_desc_type::acl; + } + +private: + std::vector srcTensors; + arm_compute::Tensor dstTensor; + arm_compute::NEConcatenateLayer concatLayer; +}; + +class AclConcatExecutorBuilder : public ConcatExecutorBuilder { +public: + [[nodiscard]] bool isSupported(const ConcatAttrs& concatAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs) const override; + [[nodiscard]] ConcatExecutorPtr makeExecutor(ExecutorContext::CPtr context) const override { + return std::make_shared(context); + } +}; + +} // namespace ov::intel_cpu diff --git a/src/plugins/intel_cpu/src/nodes/executors/concat.cpp b/src/plugins/intel_cpu/src/nodes/executors/concat.cpp new file mode 100644 index 00000000000000..3f12340480ab07 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/concat.cpp @@ -0,0 +1,15 @@ +// Copyright (C) 2018-2026 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "concat.hpp" + +#include + +#include "nodes/executors/executor.hpp" + +namespace ov::intel_cpu { + +ConcatExecutor::ConcatExecutor(ExecutorContext::CPtr context) : context(std::move(context)) {} + +} // namespace ov::intel_cpu diff --git a/src/plugins/intel_cpu/src/nodes/executors/concat.hpp b/src/plugins/intel_cpu/src/nodes/executors/concat.hpp new file mode 100644 index 00000000000000..4ac21baa112796 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/concat.hpp @@ -0,0 +1,59 @@ +// Copyright (C) 2018-2026 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include + +#include "executor.hpp" +#include "memory_desc/cpu_memory_desc.h" + +namespace ov::intel_cpu { + +struct ConcatAttrs { + size_t axis = 0; +}; + +class ConcatExecutor { +public: + explicit ConcatExecutor(ExecutorContext::CPtr context); + virtual ~ConcatExecutor() = default; + + virtual bool init(const ConcatAttrs& concatAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr) = 0; + + virtual void exec(const std::vector& src, const std::vector& dst) = 0; + + [[nodiscard]] virtual impl_desc_type getImplType() const = 0; + +protected: + ConcatAttrs concatAttrs; + ExecutorContext::CPtr context; +}; + +using ConcatExecutorPtr = std::shared_ptr; +using ConcatExecutorCPtr = std::shared_ptr; + +class ConcatExecutorBuilder { +public: + virtual ~ConcatExecutorBuilder() = default; + [[nodiscard]] virtual bool isSupported(const ConcatAttrs& concatAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs) const = 0; + [[nodiscard]] virtual ConcatExecutorPtr makeExecutor(ExecutorContext::CPtr context) const = 0; +}; + +using ConcatExecutorBuilderPtr = std::shared_ptr; +using ConcatExecutorBuilderCPtr = std::shared_ptr; + +struct ConcatExecutorDesc { + ExecutorType executorType; + ConcatExecutorBuilderCPtr builder; +}; + +} // namespace ov::intel_cpu diff --git a/src/plugins/intel_cpu/src/nodes/executors/concat_list.cpp b/src/plugins/intel_cpu/src/nodes/executors/concat_list.cpp new file mode 100644 index 00000000000000..fffb6d0d26baa4 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/concat_list.cpp @@ -0,0 +1,73 @@ +// Copyright (C) 2018-2026 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "concat_list.hpp" + +#if defined(OV_CPU_WITH_ACL) +# include +#endif +#include +#include + +#include "memory_desc/cpu_memory_desc.h" +#include "nodes/executors/concat.hpp" +#include "nodes/executors/executor.hpp" +#include "openvino/core/except.hpp" + +#if defined(OV_CPU_WITH_ACL) +# include "nodes/executors/acl/acl_concat.hpp" +#endif + +namespace ov::intel_cpu { + +const std::vector& getConcatExecutorsList() { + static const std::vector descs = { +#if defined(OV_CPU_WITH_ACL) + {ExecutorType::Acl, std::make_shared()}, +#endif + }; + return descs; +} + +ConcatExecutorFactory::ConcatExecutorFactory(const ConcatAttrs& concatAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs, + const ExecutorContext::CPtr& context) + : ExecutorFactoryLegacy(context) { + for (const auto& desc : getConcatExecutorsList()) { + if (desc.builder->isSupported(concatAttrs, srcDescs, dstDescs)) { + supportedDescs.push_back(desc); + } + } +} + +ConcatExecutorPtr ConcatExecutorFactory::makeExecutor(const ConcatAttrs& concatAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr) { + auto build = [&](const ConcatExecutorDesc* desc) -> ConcatExecutorPtr { + auto executor = desc->builder->makeExecutor(context); + if (executor->init(concatAttrs, srcDescs, dstDescs, attr)) { + return executor; + } + return nullptr; + }; + + if (chosenDesc) { + if (auto executor = build(chosenDesc)) { + return executor; + } + } + + for (const auto& sd : supportedDescs) { + if (auto executor = build(&sd)) { + chosenDesc = &sd; + return executor; + } + } + + OPENVINO_THROW("Supported Concat executor is not found"); +} + +} // namespace ov::intel_cpu diff --git a/src/plugins/intel_cpu/src/nodes/executors/concat_list.hpp b/src/plugins/intel_cpu/src/nodes/executors/concat_list.hpp new file mode 100644 index 00000000000000..048c21beb85c4e --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/concat_list.hpp @@ -0,0 +1,42 @@ +// Copyright (C) 2018-2026 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#include "concat.hpp" +#include "executor.hpp" + +namespace ov::intel_cpu { + +const std::vector& getConcatExecutorsList(); + +class ConcatExecutorFactory : public ExecutorFactoryLegacy { +public: + ConcatExecutorFactory(const ConcatAttrs& concatAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs, + const ExecutorContext::CPtr& context); + + ~ConcatExecutorFactory() override = default; + + ConcatExecutorPtr makeExecutor(const ConcatAttrs& concatAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr); + + [[nodiscard]] bool hasSupportedDescs() const { + return !supportedDescs.empty(); + } + +private: + std::vector supportedDescs; + const ConcatExecutorDesc* chosenDesc = nullptr; +}; + +using ConcatExecutorFactoryPtr = std::shared_ptr; +using ConcatExecutorFactoryCPtr = std::shared_ptr; + +} // namespace ov::intel_cpu From e8da4090683d596fb02b2f3c3404ba3d102c0695 Mon Sep 17 00:00:00 2001 From: Arseniy Obolenskiy Date: Wed, 11 Feb 2026 07:37:57 +0100 Subject: [PATCH 02/11] refactor --- src/plugins/intel_cpu/src/nodes/concat.cpp | 167 +++++++------ src/plugins/intel_cpu/src/nodes/concat.h | 10 +- .../src/nodes/executors/acl/acl_concat.cpp | 231 +++++++++++------- .../src/nodes/executors/acl/acl_concat.hpp | 39 ++- .../intel_cpu/src/nodes/executors/concat.cpp | 15 -- .../intel_cpu/src/nodes/executors/concat.hpp | 46 +--- .../src/nodes/executors/concat_config.hpp | 14 ++ .../executors/concat_implementations.cpp | 48 ++++ .../src/nodes/executors/concat_list.cpp | 73 ------ .../src/nodes/executors/concat_list.hpp | 42 ---- .../src/nodes/executors/executor.hpp | 2 +- .../src/nodes/executors/implementations.hpp | 5 + 12 files changed, 330 insertions(+), 362 deletions(-) delete mode 100644 src/plugins/intel_cpu/src/nodes/executors/concat.cpp create mode 100644 src/plugins/intel_cpu/src/nodes/executors/concat_config.hpp create mode 100644 src/plugins/intel_cpu/src/nodes/executors/concat_implementations.cpp delete mode 100644 src/plugins/intel_cpu/src/nodes/executors/concat_list.cpp delete mode 100644 src/plugins/intel_cpu/src/nodes/executors/concat_list.hpp diff --git a/src/plugins/intel_cpu/src/nodes/concat.cpp b/src/plugins/intel_cpu/src/nodes/concat.cpp index aa5431e681b3ba..ec8a5c35461cf9 100644 --- a/src/plugins/intel_cpu/src/nodes/concat.cpp +++ b/src/plugins/intel_cpu/src/nodes/concat.cpp @@ -34,10 +34,11 @@ #include "graph_context.h" #include "memory_desc/cpu_memory_desc.h" #include "node.h" -#if defined(OV_CPU_WITH_ACL) -# include "nodes/executors/concat_list.hpp" -# include "nodes/executors/executor.hpp" -#endif +#include "nodes/executors/executor.hpp" +#include "nodes/executors/executor_config.hpp" +#include "nodes/executors/executor_factory.hpp" +#include "nodes/executors/implementations.hpp" +#include "nodes/executors/memory_arguments.hpp" #include "nodes/node_config.h" #include "onednn/dnnl.h" #include "openvino/core/except.hpp" @@ -96,6 +97,7 @@ Concat::Concat(const std::shared_ptr& op, const GraphContext::CPtr& co } CPU_NODE_ASSERT(axis < static_cast(inRank) && axis >= 0, "has invalid value of axis parameter: ", axis); this->axis = axis; + m_attrs.axis = axis; } void Concat::getSupportedDescriptors() { @@ -215,7 +217,7 @@ void Concat::initSupportedPrimitiveDescriptors() { auto desc = itr->second->createSharedDesc(inputPrecision, getInputShapeAtPort(i)); config.inConfs[i].setMemDesc(desc); } - supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::undef); + supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref); if (itr->first != LayoutType::nspc) { pdIndexesToReuse.push_back(supportedPrimitiveDescriptors.size() - 1); } else if (canBeInPlace) { @@ -246,53 +248,52 @@ void Concat::initSupportedPrimitiveDescriptors() { // Optimized inplace case for (auto refPdIndex : pdIndexesToReuse) { auto config = supportedPrimitiveDescriptors[refPdIndex].getConfig(); - ; for (auto& inConf : config.inConfs) { inConf.inPlace(0); } supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown); } -#if defined(OV_CPU_WITH_ACL) - // ACL executor (ncsp/nspc, f16/f32, rank<=4) - { + const auto& concatImplementations = getImplementations(); + if (!concatImplementations.empty()) { const auto& creatorsMap = BlockedDescCreator::getCommonCreators(); - auto pushAclDesc = [&](LayoutType lt) { - NodeConfig config; - config.outConfs.resize(1); - config.inConfs.resize(getParentEdges().size()); + auto pushExecutorDesc = [&](LayoutType layoutType) { + NodeConfig nodeConfig; + nodeConfig.outConfs.resize(1); + nodeConfig.inConfs.resize(getParentEdges().size()); - std::vector srcMemoryDescs; + MemoryDescArgs descs; + descs.reserve(getParentEdges().size() + 1); for (size_t i = 0; i < getParentEdges().size(); ++i) { - auto desc = creatorsMap.at(lt)->createSharedDesc(inputPrecision, getInputShapeAtPort(i)); - config.inConfs[i].setMemDesc(desc); - config.inConfs[i].inPlace(-1); - config.inConfs[i].constant(false); - srcMemoryDescs.push_back(desc); + auto srcDesc = creatorsMap.at(layoutType)->createSharedDesc(inputPrecision, getInputShapeAtPort(i)); + nodeConfig.inConfs[i].setMemDesc(srcDesc); + nodeConfig.inConfs[i].inPlace(-1); + nodeConfig.inConfs[i].constant(false); + descs[ARG_SRC + i] = srcDesc; } - auto outDesc = creatorsMap.at(lt)->createSharedDesc(outputPrecision, getOutputShapeAtPort(0)); - config.outConfs[0].setMemDesc(outDesc); - config.outConfs[0].inPlace(-1); - config.outConfs[0].constant(false); - - concatAttrs = {axis}; - auto factory = - std::make_shared(concatAttrs, - srcMemoryDescs, - std::vector{outDesc}, - std::make_shared(context, getImplPriority())); - if (factory->hasSupportedDescs()) { - supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::acl, factory); + + auto dstDesc = creatorsMap.at(layoutType)->createSharedDesc(outputPrecision, getOutputShapeAtPort(0)); + nodeConfig.outConfs[0].setMemDesc(dstDesc); + nodeConfig.outConfs[0].inPlace(-1); + nodeConfig.outConfs[0].constant(false); + descs[ARG_DST] = dstDesc; + + const executor::Config config{descs, m_attrs}; + const bool supported = + std::any_of(concatImplementations.begin(), concatImplementations.end(), [&](const auto& impl) { + return impl.supports(config, memoryFormatFilter); + }); + if (supported) { + supportedPrimitiveDescriptors.emplace_back(nodeConfig, impl_desc_type::acl); } }; const auto rank = getInputShapeAtPort(0).getRank(); if (rank <= 4 && (inputPrecision == ov::element::f16 || inputPrecision == ov::element::f32)) { - pushAclDesc(LayoutType::ncsp); - pushAclDesc(LayoutType::nspc); + pushExecutorDesc(LayoutType::ncsp); + pushExecutorDesc(LayoutType::nspc); } } -#endif } void Concat::selectOptimalPrimitiveDescriptor() { @@ -448,34 +449,24 @@ void Concat::prepareParams() { return; } -#if defined(OV_CPU_WITH_ACL) - if (useACL) { - auto* selected_pd = getSelectedPrimitiveDescriptor(); - CPU_NODE_ASSERT(selected_pd, "Preferable primitive descriptor is not set."); - - std::vector srcMemoryDescs; + if (useExecutor && m_executor) { for (size_t i = 0; i < getParentEdges().size(); ++i) { - srcMemoryDescs.push_back(getParentEdgeAt(i)->getMemory().getDescPtr()); - } - std::vector dstMemoryDescs; - dstMemoryDescs.push_back(getChildEdgeAt(0)->getMemory().getDescPtr()); - - try { - execPtrACL = - selected_pd->getExecutorFactoryAs()->makeExecutor(concatAttrs, - srcMemoryDescs, - dstMemoryDescs, - dnnl::primitive_attr()); - } catch (...) { - execPtrACL.reset(); - } - if (execPtrACL) { - selected_pd->setImplementationType(execPtrACL->getImplType()); + m_memory[ARG_SRC + i] = getSrcMemoryAtPort(i); + } + m_memory[ARG_DST] = getDstMemoryAtPort(0); + + auto* selectedPd = getSelectedPrimitiveDescriptor(); + CPU_NODE_ASSERT(selectedPd, "Preferable primitive descriptor is not set."); + + if (m_executor->update(m_memory)) { + selectedPd->setImplementationType(m_executor->implType()); return; } - useACL = false; // fallback to default path + + // Fallback to oneDNN/ref concat when executor update is not applicable for runtime shapes. + useExecutor = false; + m_executor.reset(); } -#endif const auto& dstMemPtr = getDstMemoryAtPort(0); CPU_NODE_ASSERT(dstMemPtr && dstMemPtr->isDefined(), "Destination memory is undefined."); @@ -590,6 +581,46 @@ size_t Concat::inverseOrder(const VectorDims& order, size_t axis) { return -1; } +void Concat::createPrimitive() { + auto* selectedPd = getSelectedPrimitiveDescriptor(); + CPU_NODE_ASSERT(selectedPd, "Preferable primitive descriptor is not set."); + + if (!isInPlace()) { + m_memory.clear(); + m_memory.reserve(getParentEdges().size() + 1); + for (size_t i = 0; i < getParentEdges().size(); ++i) { + m_memory[ARG_SRC + i] = getSrcMemoryAtPort(i); + } + m_memory[ARG_DST] = getDstMemoryAtPort(0); + + useExecutor = selectedPd->getImplementationType() == impl_desc_type::acl && !canOptimizeNspc; + m_executor.reset(); + + if (useExecutor) { + MemoryDescArgs descs; + descs.reserve(m_memory.size()); + for (const auto& [arg, mem] : m_memory) { + descs[arg] = mem->getDescPtr(); + } + + try { + auto executionContext = std::make_shared(context, getImplPriority()); + auto factory = std::make_shared>(m_attrs, + executionContext, + descs, + memoryFormatFilter); + m_executor = factory->make(m_memory); + selectedPd->setImplementationType(m_executor->implType()); + } catch (...) { + useExecutor = false; + m_executor.reset(); + } + } + } + + Node::createPrimitive(); +} + void Concat::initOptimalPrimitiveDescriptor() { auto* selected_pd = getSelectedPrimitiveDescriptor(); CPU_NODE_ASSERT(selected_pd, "Preferable primitive descriptor is not set."); @@ -613,7 +644,7 @@ void Concat::initOptimalPrimitiveDescriptor() { } } - useACL = selected_pd->getImplementationType() == impl_desc_type::acl; + useExecutor = selected_pd->getImplementationType() == impl_desc_type::acl; // block layout may have axis greater than rank, disable ref_concat auto* primDesc = getSelectedPrimitiveDescriptor(); @@ -648,22 +679,10 @@ void Concat::execute(const dnnl::stream& strm) { return; } -#if defined(OV_CPU_WITH_ACL) - if (useACL && execPtrACL) { - std::vector srcMemory; - srcMemory.reserve(getOriginalInputsNumber()); - for (size_t i = 0; i < getOriginalInputsNumber(); ++i) { - srcMemory.push_back(getSrcMemoryAtPort(i)); - } - std::vector dstMemory; - dstMemory.reserve(getOriginalOutputsNumber()); - for (size_t i = 0; i < getOriginalOutputsNumber(); ++i) { - dstMemory.push_back(getDstMemoryAtPort(i)); - } - execPtrACL->exec(srcMemory, dstMemory); + if (useExecutor && m_executor) { + m_executor->execute(m_memory); return; } -#endif if (canOptimize1DCase) { exec1DCase(); diff --git a/src/plugins/intel_cpu/src/nodes/concat.h b/src/plugins/intel_cpu/src/nodes/concat.h index 83dad4494a8f9d..bd67622de7c123 100644 --- a/src/plugins/intel_cpu/src/nodes/concat.h +++ b/src/plugins/intel_cpu/src/nodes/concat.h @@ -15,6 +15,8 @@ #include "graph_context.h" #include "node.h" #include "nodes/executors/concat.hpp" +#include "nodes/executors/executor.hpp" +#include "nodes/executors/memory_arguments.hpp" #include "openvino/core/node.hpp" #include "openvino/core/type/element_type.hpp" @@ -42,6 +44,7 @@ class Concat : public Node { [[nodiscard]] bool isExecutable() const override; [[nodiscard]] bool needPrepareParams() const override; void prepareParams() override; + void createPrimitive() override; // TODO: Move to base Node class when more nodes support fuse convert bool supportConvertFusion() const { return supportFuseConvert; @@ -71,9 +74,10 @@ class Concat : public Node { bool doFuseConvert = false; // whether to perform FP16 to FP32 conversion static constexpr size_t MAX_RANK_REF = 6; dnnl::primitive prim; - bool useACL = false; - ConcatAttrs concatAttrs; - ConcatExecutorPtr execPtrACL; + bool useExecutor = false; + ConcatAttrs m_attrs; + MemoryArgs m_memory; + ExecutorPtr m_executor = nullptr; }; } // namespace ov::intel_cpu::node diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_concat.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_concat.cpp index 3d1da57715dec7..5148c57ede122c 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_concat.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_concat.cpp @@ -10,148 +10,207 @@ #include #include +#include #include -#include #include #include "acl_utils.hpp" #include "cpu_memory.h" #include "memory_desc/cpu_memory_desc.h" -#include "nodes/executors/concat.hpp" +#include "nodes/executors/concat_config.hpp" +#include "nodes/executors/memory_arguments.hpp" +#include "openvino/core/except.hpp" #include "openvino/core/type/element_type.hpp" #include "utils/debug_capabilities.h" namespace ov::intel_cpu { -bool AclConcatExecutor::init(const ConcatAttrs& concatAttrs, - const std::vector& srcDescs, - const std::vector& dstDescs, - [[maybe_unused]] const dnnl::primitive_attr& attr) { - this->concatAttrs = concatAttrs; - const auto& firstDesc = srcDescs[0]; - const auto precision = firstDesc->getPrecision(); - const bool isNspc = firstDesc->hasLayoutType(LayoutType::nspc); - const bool isNcsp = firstDesc->hasLayoutType(LayoutType::ncsp); +namespace { + +template +std::vector getSourceArgIds(const T& args) { + std::vector sourceIds; + for (int argId = ARG_SRC;; ++argId) { + if (args.find(argId) == args.end()) { + break; + } + sourceIds.push_back(argId); + } + return sourceIds; +} + +bool isSupportedCommon(const ConcatAttrs& attrs, + const std::vector& srcDescs, + const MemoryDescPtr& dstDesc, + LayoutType expectedLayout) { + if (srcDescs.empty() || !dstDesc) { + return false; + } + + const auto& firstDesc = srcDescs.front(); const auto rank = firstDesc->getShape().getRank(); + if (rank == Shape::UNDEFINED_DIM || rank == 0 || rank > 4 || attrs.axis >= rank) { + return false; + } - if ((!isNspc && !isNcsp) || rank > 4) { + const auto precision = firstDesc->getPrecision(); + if (precision != ov::element::f16 && precision != ov::element::f32) { return false; } - // all inputs must share layout & precision - for (const auto& d : srcDescs) { - if (d->getPrecision() != precision || d->hasLayoutType(LayoutType::nspc) != isNspc || - d->hasLayoutType(LayoutType::ncsp) != isNcsp || d->getShape().getRank() != rank) { + + if (expectedLayout != LayoutType::ncsp && expectedLayout != LayoutType::nspc) { + return false; + } + + if (!firstDesc->hasLayoutType(expectedLayout)) { + return false; + } + + for (const auto& srcDesc : srcDescs) { + if (srcDesc->getPrecision() != precision || srcDesc->getShape().getRank() != rank || + !srcDesc->hasLayoutType(expectedLayout)) { return false; } } - if (dstDescs[0]->getPrecision() != precision || dstDescs[0]->hasLayoutType(LayoutType::nspc) != isNspc || - dstDescs[0]->hasLayoutType(LayoutType::ncsp) != isNcsp) { + + if (dstDesc->getPrecision() != precision || dstDesc->getShape().getRank() != rank || + !dstDesc->hasLayoutType(expectedLayout)) { return false; } - if (precision != ov::element::f16 && precision != ov::element::f32) { + + return true; +} + +} // namespace + +AclConcatExecutor::AclConcatExecutor(const ConcatAttrs& attrs, + const MemoryArgs& memory, + [[maybe_unused]] const ExecutorContext::CPtr& context) + : m_attrs(attrs) { + const auto srcArgIds = getSourceArgIds(memory); + OPENVINO_ASSERT(!srcArgIds.empty(), "AclConcatExecutor requires at least one source tensor"); + + const auto& firstSrcDesc = memory.at(srcArgIds.front())->getDescPtr(); + m_expectedLayout = firstSrcDesc->hasLayoutType(LayoutType::nspc) ? LayoutType::nspc : LayoutType::ncsp; +} + +bool AclConcatExecutor::supports(const ConcatConfig& config, LayoutType expectedLayout) { + const auto srcArgIds = getSourceArgIds(config.descs); + if (srcArgIds.empty()) { + return false; + } + auto dstIt = config.descs.find(ARG_DST); + if (dstIt == config.descs.end()) { + return false; + } + + std::vector srcDescs; + srcDescs.reserve(srcArgIds.size()); + for (const auto& srcArgId : srcArgIds) { + srcDescs.push_back(config.descs.at(srcArgId)); + } + + return isSupportedCommon(config.attrs, srcDescs, dstIt->second, expectedLayout); +} + +bool AclConcatExecutor::update(const MemoryArgs& memory) { + const auto srcArgIds = getSourceArgIds(memory); + if (srcArgIds.empty()) { + return false; + } + auto dstIt = memory.find(ARG_DST); + if (dstIt == memory.end()) { + return false; + } + + std::vector srcDescs; + srcDescs.reserve(srcArgIds.size()); + for (const auto& srcArgId : srcArgIds) { + srcDescs.push_back(memory.at(srcArgId)->getDescPtr()); + } + + const auto& dstDesc = dstIt->second->getDescPtr(); + if (!isSupportedCommon(m_attrs, srcDescs, dstDesc, m_expectedLayout)) { return false; } - auto aclLayout = getAclDataLayoutByMemoryDesc(firstDesc); + const bool isNspc = m_expectedLayout == LayoutType::nspc; + const auto rank = srcDescs.front()->getShape().getRank(); + const auto precision = srcDescs.front()->getPrecision(); + const auto aclDataType = precisionToAclDataType(precision); + + if (aclDataType == arm_compute::DataType::UNKNOWN) { + return false; + } + + const auto aclLayout = getAclDataLayoutByMemoryDesc(srcDescs.front()); if (aclLayout == arm_compute::DataLayout::UNKNOWN) { return false; } - const auto& dstDims = dstDescs[0]->getShape().getStaticDims(); - int aclAxis = axisCast(concatAttrs.axis, - rank, - isNspc ? ACLAxisCastMode::NHWC_TO_NCHW : ACLAxisCastMode::NO_LAYOUT_CONVERSION); + const int aclAxis = + axisCast(m_attrs.axis, rank, isNspc ? ACLAxisCastMode::NHWC_TO_NCHW : ACLAxisCastMode::NO_LAYOUT_CONVERSION); if (aclAxis < 0 || static_cast(aclAxis) >= rank) { return false; } - auto dstShape = shapeCast(dstDims); + auto dstShape = shapeCast(memory.at(ARG_DST)->getStaticDims()); if (isNspc) { changeLayoutToNH_C({&dstShape}); } - arm_compute::TensorInfo dstInfo(dstShape, 1, precisionToAclDataType(precision), aclLayout); + arm_compute::TensorInfo dstInfo(dstShape, 1, aclDataType, aclLayout); - std::vector inputInfos; - srcTensors.resize(srcDescs.size()); - for (size_t i = 0; i < srcDescs.size(); ++i) { - const auto& dims = srcDescs[i]->getShape().getStaticDims(); - auto srcShape = shapeCast(dims); + std::vector srcInfos; + srcInfos.reserve(srcArgIds.size()); + std::vector srcInfosPtrs; + srcInfosPtrs.reserve(srcArgIds.size()); + for (const auto& srcArgId : srcArgIds) { + auto srcShape = shapeCast(memory.at(srcArgId)->getStaticDims()); if (isNspc) { changeLayoutToNH_C({&srcShape}); } - arm_compute::TensorInfo srcInfo(srcShape, 1, precisionToAclDataType(precision), aclLayout); - inputInfos.push_back(&srcInfo); - srcTensors[i].allocator()->init(srcInfo); + srcInfos.emplace_back(srcShape, 1, aclDataType, aclLayout); + srcInfosPtrs.push_back(&srcInfos.back()); } - dstTensor.allocator()->init(dstInfo); - - auto status = arm_compute::NEConcatenateLayer::validate(inputInfos, &dstInfo, static_cast(aclAxis)); + auto status = arm_compute::NEConcatenateLayer::validate(srcInfosPtrs, &dstInfo, static_cast(aclAxis)); if (!status) { DEBUG_LOG("NEConcatenateLayer validation failed: ", status.error_description()); return false; } + m_srcArgIds = srcArgIds; + m_srcTensors = std::vector(srcArgIds.size()); + m_dstTensor = arm_compute::Tensor(); + for (size_t i = 0; i < srcInfos.size(); ++i) { + m_srcTensors[i].allocator()->init(srcInfos[i]); + } + m_dstTensor.allocator()->init(dstInfo); + configureThreadSafe([&] { - std::vector tensors; - tensors.reserve(srcTensors.size()); - for (auto& t : srcTensors) { - tensors.push_back(&t); + std::vector srcTensors; + srcTensors.reserve(m_srcTensors.size()); + for (const auto& srcTensor : m_srcTensors) { + srcTensors.push_back(&srcTensor); } - concatLayer.configure(tensors, &dstTensor, static_cast(aclAxis)); + m_concatLayer.configure(srcTensors, &m_dstTensor, static_cast(aclAxis)); }); return true; } -void AclConcatExecutor::exec(const std::vector& src, const std::vector& dst) { - for (size_t i = 0; i < src.size(); ++i) { - srcTensors[i].allocator()->import_memory(src[i]->getData()); +void AclConcatExecutor::execute(const MemoryArgs& memory) { + for (size_t i = 0; i < m_srcArgIds.size(); ++i) { + m_srcTensors[i].allocator()->import_memory(memory.at(m_srcArgIds[i])->getData()); } - dstTensor.allocator()->import_memory(dst[0]->getData()); + m_dstTensor.allocator()->import_memory(memory.at(ARG_DST)->getData()); - concatLayer.run(); + m_concatLayer.run(); - for (auto& t : srcTensors) { - t.allocator()->free(); - } - dstTensor.allocator()->free(); -} - -bool AclConcatExecutorBuilder::isSupported(const ConcatAttrs& concatAttrs, - const std::vector& srcDescs, - const std::vector& dstDescs) const { - if (srcDescs.empty() || dstDescs.empty()) { - return false; - } - const auto rank = srcDescs[0]->getShape().getRank(); - if (rank == Shape::UNDEFINED_DIM || rank == 0) { - return false; - } - if (concatAttrs.axis >= rank) { - return false; - } - const bool isNspc = srcDescs[0]->hasLayoutType(LayoutType::nspc); - const bool isNcsp = srcDescs[0]->hasLayoutType(LayoutType::ncsp); - if (!isNspc && !isNcsp) { - return false; - } - // ACL concat supports up to 4D tensors (NHWC/NCHW) - if (rank > 4) { - return false; - } - const auto prec = srcDescs[0]->getPrecision(); - if (prec != ov::element::f16 && prec != ov::element::f32) { - return false; - } - for (const auto& d : srcDescs) { - if (d->getPrecision() != prec || d->getShape().getRank() != rank || - d->hasLayoutType(LayoutType::nspc) != isNspc || d->hasLayoutType(LayoutType::ncsp) != isNcsp) { - return false; - } + for (auto& srcTensor : m_srcTensors) { + srcTensor.allocator()->free(); } - return dstDescs[0]->getPrecision() == prec && dstDescs[0]->hasLayoutType(LayoutType::nspc) == isNspc && - dstDescs[0]->hasLayoutType(LayoutType::ncsp) == isNcsp; + m_dstTensor.allocator()->free(); } } // namespace ov::intel_cpu diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_concat.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_concat.hpp index a24c0552c94a4a..f033cfe32b2690 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_concat.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_concat.hpp @@ -7,38 +7,31 @@ #include #include -#include "nodes/executors/concat.hpp" +#include "nodes/executors/concat_config.hpp" #include "nodes/executors/executor.hpp" -#include "utils/debug_capabilities.h" +#include "nodes/executors/memory_arguments.hpp" namespace ov::intel_cpu { -class AclConcatExecutor : public ConcatExecutor { +class AclConcatExecutor : public Executor { public: - using ConcatExecutor::ConcatExecutor; - bool init(const ConcatAttrs& concatAttrs, - const std::vector& srcDescs, - const std::vector& dstDescs, - const dnnl::primitive_attr& attr) override; - void exec(const std::vector& src, const std::vector& dst) override; - [[nodiscard]] impl_desc_type getImplType() const override { + AclConcatExecutor(const ConcatAttrs& attrs, const MemoryArgs& memory, const ExecutorContext::CPtr& context); + + static bool supports(const ConcatConfig& config, LayoutType expectedLayout); + + bool update(const MemoryArgs& memory) override; + void execute(const MemoryArgs& memory) override; + [[nodiscard]] impl_desc_type implType() const override { return impl_desc_type::acl; } private: - std::vector srcTensors; - arm_compute::Tensor dstTensor; - arm_compute::NEConcatenateLayer concatLayer; -}; - -class AclConcatExecutorBuilder : public ConcatExecutorBuilder { -public: - [[nodiscard]] bool isSupported(const ConcatAttrs& concatAttrs, - const std::vector& srcDescs, - const std::vector& dstDescs) const override; - [[nodiscard]] ConcatExecutorPtr makeExecutor(ExecutorContext::CPtr context) const override { - return std::make_shared(context); - } + ConcatAttrs m_attrs; + LayoutType m_expectedLayout; + std::vector m_srcArgIds; + std::vector m_srcTensors; + arm_compute::Tensor m_dstTensor; + arm_compute::NEConcatenateLayer m_concatLayer; }; } // namespace ov::intel_cpu diff --git a/src/plugins/intel_cpu/src/nodes/executors/concat.cpp b/src/plugins/intel_cpu/src/nodes/executors/concat.cpp deleted file mode 100644 index 3f12340480ab07..00000000000000 --- a/src/plugins/intel_cpu/src/nodes/executors/concat.cpp +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright (C) 2018-2026 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "concat.hpp" - -#include - -#include "nodes/executors/executor.hpp" - -namespace ov::intel_cpu { - -ConcatExecutor::ConcatExecutor(ExecutorContext::CPtr context) : context(std::move(context)) {} - -} // namespace ov::intel_cpu diff --git a/src/plugins/intel_cpu/src/nodes/executors/concat.hpp b/src/plugins/intel_cpu/src/nodes/executors/concat.hpp index 4ac21baa112796..efca7a198d1e5d 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/concat.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/concat.hpp @@ -4,12 +4,7 @@ #pragma once -#include -#include -#include - -#include "executor.hpp" -#include "memory_desc/cpu_memory_desc.h" +#include namespace ov::intel_cpu { @@ -17,43 +12,4 @@ struct ConcatAttrs { size_t axis = 0; }; -class ConcatExecutor { -public: - explicit ConcatExecutor(ExecutorContext::CPtr context); - virtual ~ConcatExecutor() = default; - - virtual bool init(const ConcatAttrs& concatAttrs, - const std::vector& srcDescs, - const std::vector& dstDescs, - const dnnl::primitive_attr& attr) = 0; - - virtual void exec(const std::vector& src, const std::vector& dst) = 0; - - [[nodiscard]] virtual impl_desc_type getImplType() const = 0; - -protected: - ConcatAttrs concatAttrs; - ExecutorContext::CPtr context; -}; - -using ConcatExecutorPtr = std::shared_ptr; -using ConcatExecutorCPtr = std::shared_ptr; - -class ConcatExecutorBuilder { -public: - virtual ~ConcatExecutorBuilder() = default; - [[nodiscard]] virtual bool isSupported(const ConcatAttrs& concatAttrs, - const std::vector& srcDescs, - const std::vector& dstDescs) const = 0; - [[nodiscard]] virtual ConcatExecutorPtr makeExecutor(ExecutorContext::CPtr context) const = 0; -}; - -using ConcatExecutorBuilderPtr = std::shared_ptr; -using ConcatExecutorBuilderCPtr = std::shared_ptr; - -struct ConcatExecutorDesc { - ExecutorType executorType; - ConcatExecutorBuilderCPtr builder; -}; - } // namespace ov::intel_cpu diff --git a/src/plugins/intel_cpu/src/nodes/executors/concat_config.hpp b/src/plugins/intel_cpu/src/nodes/executors/concat_config.hpp new file mode 100644 index 00000000000000..d4b04c4e39a9fb --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/concat_config.hpp @@ -0,0 +1,14 @@ +// Copyright (C) 2018-2026 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "nodes/executors/concat.hpp" +#include "nodes/executors/executor_config.hpp" + +namespace ov::intel_cpu { + +using ConcatConfig = executor::Config; + +} // namespace ov::intel_cpu diff --git a/src/plugins/intel_cpu/src/nodes/executors/concat_implementations.cpp b/src/plugins/intel_cpu/src/nodes/executors/concat_implementations.cpp new file mode 100644 index 00000000000000..d81df1664ea3e0 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/concat_implementations.cpp @@ -0,0 +1,48 @@ +// Copyright (C) 2018-2026 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "nodes/executors/concat.hpp" +#include "nodes/executors/executor_implementation.hpp" +#include "nodes/executors/implementations.hpp" + +#if defined(OV_CPU_WITH_ACL) +# include "nodes/executors/acl/acl_concat.hpp" +# include "nodes/executors/implementation_utils.hpp" +#endif + +namespace ov::intel_cpu { + +template <> +const std::vector>& getImplementations() { +#if defined(OV_CPU_WITH_ACL) + static const std::vector> concatImplementations{ + {"concat_acl_ncsp", + ExecutorType::Acl, + OperationType::Concat, + [](const executor::Config& config, + [[maybe_unused]] const MemoryFormatFilter& memoryFormatFilter) -> bool { + return AclConcatExecutor::supports(config, LayoutType::ncsp); + }, + HasNoOptimalConfig{}, + AcceptsAnyShape, + CreateDefault{}}, + {"concat_acl_nspc", + ExecutorType::Acl, + OperationType::Concat, + [](const executor::Config& config, + [[maybe_unused]] const MemoryFormatFilter& memoryFormatFilter) -> bool { + return AclConcatExecutor::supports(config, LayoutType::nspc); + }, + HasNoOptimalConfig{}, + AcceptsAnyShape, + CreateDefault{}}}; +#else + static const std::vector> concatImplementations{}; +#endif + return concatImplementations; +} + +} // namespace ov::intel_cpu diff --git a/src/plugins/intel_cpu/src/nodes/executors/concat_list.cpp b/src/plugins/intel_cpu/src/nodes/executors/concat_list.cpp deleted file mode 100644 index fffb6d0d26baa4..00000000000000 --- a/src/plugins/intel_cpu/src/nodes/executors/concat_list.cpp +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright (C) 2018-2026 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "concat_list.hpp" - -#if defined(OV_CPU_WITH_ACL) -# include -#endif -#include -#include - -#include "memory_desc/cpu_memory_desc.h" -#include "nodes/executors/concat.hpp" -#include "nodes/executors/executor.hpp" -#include "openvino/core/except.hpp" - -#if defined(OV_CPU_WITH_ACL) -# include "nodes/executors/acl/acl_concat.hpp" -#endif - -namespace ov::intel_cpu { - -const std::vector& getConcatExecutorsList() { - static const std::vector descs = { -#if defined(OV_CPU_WITH_ACL) - {ExecutorType::Acl, std::make_shared()}, -#endif - }; - return descs; -} - -ConcatExecutorFactory::ConcatExecutorFactory(const ConcatAttrs& concatAttrs, - const std::vector& srcDescs, - const std::vector& dstDescs, - const ExecutorContext::CPtr& context) - : ExecutorFactoryLegacy(context) { - for (const auto& desc : getConcatExecutorsList()) { - if (desc.builder->isSupported(concatAttrs, srcDescs, dstDescs)) { - supportedDescs.push_back(desc); - } - } -} - -ConcatExecutorPtr ConcatExecutorFactory::makeExecutor(const ConcatAttrs& concatAttrs, - const std::vector& srcDescs, - const std::vector& dstDescs, - const dnnl::primitive_attr& attr) { - auto build = [&](const ConcatExecutorDesc* desc) -> ConcatExecutorPtr { - auto executor = desc->builder->makeExecutor(context); - if (executor->init(concatAttrs, srcDescs, dstDescs, attr)) { - return executor; - } - return nullptr; - }; - - if (chosenDesc) { - if (auto executor = build(chosenDesc)) { - return executor; - } - } - - for (const auto& sd : supportedDescs) { - if (auto executor = build(&sd)) { - chosenDesc = &sd; - return executor; - } - } - - OPENVINO_THROW("Supported Concat executor is not found"); -} - -} // namespace ov::intel_cpu diff --git a/src/plugins/intel_cpu/src/nodes/executors/concat_list.hpp b/src/plugins/intel_cpu/src/nodes/executors/concat_list.hpp deleted file mode 100644 index 048c21beb85c4e..00000000000000 --- a/src/plugins/intel_cpu/src/nodes/executors/concat_list.hpp +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (C) 2018-2026 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include - -#include "concat.hpp" -#include "executor.hpp" - -namespace ov::intel_cpu { - -const std::vector& getConcatExecutorsList(); - -class ConcatExecutorFactory : public ExecutorFactoryLegacy { -public: - ConcatExecutorFactory(const ConcatAttrs& concatAttrs, - const std::vector& srcDescs, - const std::vector& dstDescs, - const ExecutorContext::CPtr& context); - - ~ConcatExecutorFactory() override = default; - - ConcatExecutorPtr makeExecutor(const ConcatAttrs& concatAttrs, - const std::vector& srcDescs, - const std::vector& dstDescs, - const dnnl::primitive_attr& attr); - - [[nodiscard]] bool hasSupportedDescs() const { - return !supportedDescs.empty(); - } - -private: - std::vector supportedDescs; - const ConcatExecutorDesc* chosenDesc = nullptr; -}; - -using ConcatExecutorFactoryPtr = std::shared_ptr; -using ConcatExecutorFactoryCPtr = std::shared_ptr; - -} // namespace ov::intel_cpu diff --git a/src/plugins/intel_cpu/src/nodes/executors/executor.hpp b/src/plugins/intel_cpu/src/nodes/executors/executor.hpp index 5346ffcf83e757..ac88e50ba3bb82 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/executor.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/executor.hpp @@ -41,7 +41,7 @@ enum class ExecutorType : uint8_t { Kleidiai, }; -enum class OperationType : uint8_t { FullyConnected, MatMul, Convolution, Eltwise }; +enum class OperationType : uint8_t { FullyConnected, MatMul, Convolution, Eltwise, Concat }; std::string ExecutorTypeToString(ExecutorType type); ExecutorType ExecutorTypeFromString(const std::string& typeStr); diff --git a/src/plugins/intel_cpu/src/nodes/executors/implementations.hpp b/src/plugins/intel_cpu/src/nodes/executors/implementations.hpp index c0ac6e9ac9ccc5..c6a9d8913b9b36 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/implementations.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/implementations.hpp @@ -7,6 +7,7 @@ #include #include +#include "nodes/executors/concat_config.hpp" #include "nodes/executors/convolution_config.hpp" #include "nodes/executors/eltwise_config.hpp" #include "nodes/executors/executor_implementation.hpp" @@ -37,6 +38,10 @@ const std::vector>& getImplementations(); template <> const std::vector>& getImplementations(); +// Concat +template <> +const std::vector>& getImplementations(); + // MatMul template <> const std::vector>& getImplementations(); From b437e8175b8b12a4c94d755d92cf8069ceb2ae82 Mon Sep 17 00:00:00 2001 From: Arseniy Obolenskiy Date: Wed, 11 Feb 2026 11:00:21 +0100 Subject: [PATCH 03/11] fix tests --- src/plugins/intel_cpu/src/nodes/concat.cpp | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/concat.cpp b/src/plugins/intel_cpu/src/nodes/concat.cpp index ec8a5c35461cf9..6553d7e86da267 100644 --- a/src/plugins/intel_cpu/src/nodes/concat.cpp +++ b/src/plugins/intel_cpu/src/nodes/concat.cpp @@ -408,15 +408,14 @@ void Concat::selectOptimalPrimitiveDescriptor() { } for (auto indx : canSelectPrimitive) { - if (supportedPrimitiveDescriptors[indx].getImplementationType() == impl_desc_type::ref) { + if (supportedPrimitiveDescriptors[indx].getImplementationType() == impl_desc_type::unknown) { selectPrimitiveDescriptorByIndex(static_cast(indx)); return; } } - // if there are more than one PD with similar data layouts - select the optimized one for (auto indx : canSelectPrimitive) { - if (supportedPrimitiveDescriptors[indx].getImplementationType() == impl_desc_type::unknown) { + if (supportedPrimitiveDescriptors[indx].getImplementationType() == impl_desc_type::ref) { selectPrimitiveDescriptorByIndex(static_cast(indx)); return; } @@ -449,15 +448,15 @@ void Concat::prepareParams() { return; } + auto* selectedPd = getSelectedPrimitiveDescriptor(); + CPU_NODE_ASSERT(selectedPd, "Preferable primitive descriptor is not set."); + if (useExecutor && m_executor) { for (size_t i = 0; i < getParentEdges().size(); ++i) { m_memory[ARG_SRC + i] = getSrcMemoryAtPort(i); } m_memory[ARG_DST] = getDstMemoryAtPort(0); - auto* selectedPd = getSelectedPrimitiveDescriptor(); - CPU_NODE_ASSERT(selectedPd, "Preferable primitive descriptor is not set."); - if (m_executor->update(m_memory)) { selectedPd->setImplementationType(m_executor->implType()); return; @@ -466,12 +465,12 @@ void Concat::prepareParams() { // Fallback to oneDNN/ref concat when executor update is not applicable for runtime shapes. useExecutor = false; m_executor.reset(); + selectedPd->setImplementationType(impl_desc_type::ref); } const auto& dstMemPtr = getDstMemoryAtPort(0); CPU_NODE_ASSERT(dstMemPtr && dstMemPtr->isDefined(), "Destination memory is undefined."); auto dstMemDesc = dstMemPtr->getDescWithType(); - CPU_NODE_ASSERT(getSelectedPrimitiveDescriptor(), "Preferable primitive descriptor is not set."); const auto& outputStrides = dstMemDesc->getStrides(); size_t curConcatOffset = 0; @@ -585,6 +584,10 @@ void Concat::createPrimitive() { auto* selectedPd = getSelectedPrimitiveDescriptor(); CPU_NODE_ASSERT(selectedPd, "Preferable primitive descriptor is not set."); + auto fallbackToRefImplType = [&]() { + selectedPd->setImplementationType(impl_desc_type::ref); + }; + if (!isInPlace()) { m_memory.clear(); m_memory.reserve(getParentEdges().size() + 1); @@ -614,7 +617,11 @@ void Concat::createPrimitive() { } catch (...) { useExecutor = false; m_executor.reset(); + fallbackToRefImplType(); } + } else if (selectedPd->getImplementationType() == impl_desc_type::undef || + selectedPd->getImplementationType() == impl_desc_type::acl) { + fallbackToRefImplType(); } } From dcabc53fc055cae914fd01a4b7a24fe2e7baca3c Mon Sep 17 00:00:00 2001 From: Arseniy Obolenskiy Date: Wed, 11 Feb 2026 12:32:37 +0100 Subject: [PATCH 04/11] tidy --- .../src/nodes/executors/acl/acl_concat.cpp | 13 +++++-------- .../src/nodes/executors/concat_implementations.cpp | 4 ++++ 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_concat.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_concat.cpp index 5148c57ede122c..ad076d5cd393e4 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_concat.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_concat.cpp @@ -9,15 +9,16 @@ #include #include #include +#include -#include #include #include #include "acl_utils.hpp" -#include "cpu_memory.h" #include "memory_desc/cpu_memory_desc.h" +#include "nodes/executors/concat.hpp" #include "nodes/executors/concat_config.hpp" +#include "nodes/executors/executor.hpp" #include "nodes/executors/memory_arguments.hpp" #include "openvino/core/except.hpp" #include "openvino/core/type/element_type.hpp" @@ -73,12 +74,8 @@ bool isSupportedCommon(const ConcatAttrs& attrs, } } - if (dstDesc->getPrecision() != precision || dstDesc->getShape().getRank() != rank || - !dstDesc->hasLayoutType(expectedLayout)) { - return false; - } - - return true; + return dstDesc->getPrecision() == precision && dstDesc->getShape().getRank() == rank && + dstDesc->hasLayoutType(expectedLayout); } } // namespace diff --git a/src/plugins/intel_cpu/src/nodes/executors/concat_implementations.cpp b/src/plugins/intel_cpu/src/nodes/executors/concat_implementations.cpp index d81df1664ea3e0..f3014f35b0dcf1 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/concat_implementations.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/concat_implementations.cpp @@ -9,7 +9,11 @@ #include "nodes/executors/implementations.hpp" #if defined(OV_CPU_WITH_ACL) +# include "memory_desc/cpu_memory_desc.h" +# include "memory_format_filter.hpp" # include "nodes/executors/acl/acl_concat.hpp" +# include "nodes/executors/executor.hpp" +# include "nodes/executors/executor_config.hpp" # include "nodes/executors/implementation_utils.hpp" #endif From 7a980770da7b36c87078feb9b8013e3b1dc9a891 Mon Sep 17 00:00:00 2001 From: Arseniy Obolenskiy Date: Wed, 18 Feb 2026 07:11:42 +0100 Subject: [PATCH 05/11] Address review comments --- src/plugins/intel_cpu/src/nodes/concat.cpp | 28 ++---- .../executors/concat_implementations.cpp | 87 ++++++++++++------- 2 files changed, 64 insertions(+), 51 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/concat.cpp b/src/plugins/intel_cpu/src/nodes/concat.cpp index 6553d7e86da267..53f5f24ad16c6f 100644 --- a/src/plugins/intel_cpu/src/nodes/concat.cpp +++ b/src/plugins/intel_cpu/src/nodes/concat.cpp @@ -284,15 +284,12 @@ void Concat::initSupportedPrimitiveDescriptors() { return impl.supports(config, memoryFormatFilter); }); if (supported) { - supportedPrimitiveDescriptors.emplace_back(nodeConfig, impl_desc_type::acl); + supportedPrimitiveDescriptors.emplace_back(nodeConfig, impl_desc_type::undef); } }; - const auto rank = getInputShapeAtPort(0).getRank(); - if (rank <= 4 && (inputPrecision == ov::element::f16 || inputPrecision == ov::element::f32)) { - pushExecutorDesc(LayoutType::ncsp); - pushExecutorDesc(LayoutType::nspc); - } + pushExecutorDesc(LayoutType::ncsp); + pushExecutorDesc(LayoutType::nspc); } } @@ -401,7 +398,7 @@ void Concat::selectOptimalPrimitiveDescriptor() { } for (auto indx : canSelectPrimitive) { - if (supportedPrimitiveDescriptors[indx].getImplementationType() == impl_desc_type::acl) { + if (supportedPrimitiveDescriptors[indx].getImplementationType() == impl_desc_type::undef) { selectPrimitiveDescriptorByIndex(static_cast(indx)); return; } @@ -461,11 +458,6 @@ void Concat::prepareParams() { selectedPd->setImplementationType(m_executor->implType()); return; } - - // Fallback to oneDNN/ref concat when executor update is not applicable for runtime shapes. - useExecutor = false; - m_executor.reset(); - selectedPd->setImplementationType(impl_desc_type::ref); } const auto& dstMemPtr = getDstMemoryAtPort(0); @@ -584,10 +576,6 @@ void Concat::createPrimitive() { auto* selectedPd = getSelectedPrimitiveDescriptor(); CPU_NODE_ASSERT(selectedPd, "Preferable primitive descriptor is not set."); - auto fallbackToRefImplType = [&]() { - selectedPd->setImplementationType(impl_desc_type::ref); - }; - if (!isInPlace()) { m_memory.clear(); m_memory.reserve(getParentEdges().size() + 1); @@ -596,7 +584,7 @@ void Concat::createPrimitive() { } m_memory[ARG_DST] = getDstMemoryAtPort(0); - useExecutor = selectedPd->getImplementationType() == impl_desc_type::acl && !canOptimizeNspc; + useExecutor = selectedPd->getImplementationType() == impl_desc_type::undef && !canOptimizeNspc; m_executor.reset(); if (useExecutor) { @@ -617,11 +605,7 @@ void Concat::createPrimitive() { } catch (...) { useExecutor = false; m_executor.reset(); - fallbackToRefImplType(); } - } else if (selectedPd->getImplementationType() == impl_desc_type::undef || - selectedPd->getImplementationType() == impl_desc_type::acl) { - fallbackToRefImplType(); } } @@ -651,7 +635,7 @@ void Concat::initOptimalPrimitiveDescriptor() { } } - useExecutor = selected_pd->getImplementationType() == impl_desc_type::acl; + useExecutor = selected_pd->getImplementationType() == impl_desc_type::undef; // block layout may have axis greater than rank, disable ref_concat auto* primDesc = getSelectedPrimitiveDescriptor(); diff --git a/src/plugins/intel_cpu/src/nodes/executors/concat_implementations.cpp b/src/plugins/intel_cpu/src/nodes/executors/concat_implementations.cpp index f3014f35b0dcf1..332cdc87a5fdbb 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/concat_implementations.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/concat_implementations.cpp @@ -4,48 +4,77 @@ #include +#include "memory_format_filter.hpp" #include "nodes/executors/concat.hpp" #include "nodes/executors/executor_implementation.hpp" +#include "nodes/executors/implementation_utils.hpp" #include "nodes/executors/implementations.hpp" +#include "utils/arch_macros.h" -#if defined(OV_CPU_WITH_ACL) +#if defined(OPENVINO_ARCH_RISCV64) # include "memory_desc/cpu_memory_desc.h" -# include "memory_format_filter.hpp" +#endif +#if defined(OV_CPU_WITH_ACL) # include "nodes/executors/acl/acl_concat.hpp" -# include "nodes/executors/executor.hpp" -# include "nodes/executors/executor_config.hpp" -# include "nodes/executors/implementation_utils.hpp" #endif namespace ov::intel_cpu { template <> const std::vector>& getImplementations() { -#if defined(OV_CPU_WITH_ACL) static const std::vector> concatImplementations{ - {"concat_acl_ncsp", - ExecutorType::Acl, - OperationType::Concat, - [](const executor::Config& config, - [[maybe_unused]] const MemoryFormatFilter& memoryFormatFilter) -> bool { - return AclConcatExecutor::supports(config, LayoutType::ncsp); - }, - HasNoOptimalConfig{}, - AcceptsAnyShape, - CreateDefault{}}, - {"concat_acl_nspc", - ExecutorType::Acl, - OperationType::Concat, - [](const executor::Config& config, - [[maybe_unused]] const MemoryFormatFilter& memoryFormatFilter) -> bool { - return AclConcatExecutor::supports(config, LayoutType::nspc); - }, - HasNoOptimalConfig{}, - AcceptsAnyShape, - CreateDefault{}}}; -#else - static const std::vector> concatImplementations{}; -#endif + OV_CPU_INSTANCE_ACL( + "concat_acl_ncsp", + ExecutorType::Acl, + OperationType::Concat, + [](const executor::Config& config, + [[maybe_unused]] const MemoryFormatFilter& memoryFormatFilter) -> bool { + return AclConcatExecutor::supports(config, LayoutType::ncsp); + }, + HasNoOptimalConfig{}, + AcceptsAnyShape, + CreateDefault{}) + OV_CPU_INSTANCE_ACL( + "concat_acl_nspc", + ExecutorType::Acl, + OperationType::Concat, + [](const executor::Config& config, + [[maybe_unused]] const MemoryFormatFilter& memoryFormatFilter) -> bool { + return AclConcatExecutor::supports(config, LayoutType::nspc); + }, + HasNoOptimalConfig{}, + AcceptsAnyShape, + CreateDefault{}) + OV_CPU_INSTANCE_COMMON( + "concat_ref_ncsp", + ExecutorType::Reference, + OperationType::Concat, + []([[maybe_unused]] const executor::Config& config, + [[maybe_unused]] const MemoryFormatFilter& memoryFormatFilter) -> bool { + return false; + }, + HasNoOptimalConfig{}, + AcceptsAnyShape, + []([[maybe_unused]] const ConcatAttrs& attrs, + [[maybe_unused]] const MemoryArgs& memory, + [[maybe_unused]] const ExecutorContext::CPtr& context) -> ExecutorPtr { + return nullptr; + }) + OV_CPU_INSTANCE_COMMON( + "concat_ref_nspc", + ExecutorType::Reference, + OperationType::Concat, + []([[maybe_unused]] const executor::Config& config, + [[maybe_unused]] const MemoryFormatFilter& memoryFormatFilter) -> bool { + return false; + }, + HasNoOptimalConfig{}, + AcceptsAnyShape, + []([[maybe_unused]] const ConcatAttrs& attrs, + [[maybe_unused]] const MemoryArgs& memory, + [[maybe_unused]] const ExecutorContext::CPtr& context) -> ExecutorPtr { + return nullptr; + })}; return concatImplementations; } From 7fa7b36be87dbcd5846a9f0bc2566a50c5a65c21 Mon Sep 17 00:00:00 2001 From: Arseniy Obolenskiy Date: Wed, 18 Feb 2026 12:01:13 +0100 Subject: [PATCH 06/11] fix arm concat issue --- src/plugins/intel_cpu/src/nodes/executors/acl/acl_concat.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_concat.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_concat.cpp index ad076d5cd393e4..4cf31fb514e957 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_concat.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_concat.cpp @@ -44,7 +44,7 @@ bool isSupportedCommon(const ConcatAttrs& attrs, const std::vector& srcDescs, const MemoryDescPtr& dstDesc, LayoutType expectedLayout) { - if (srcDescs.empty() || !dstDesc) { + if (srcDescs.size() < 2 || !dstDesc) { return false; } From 1854ceae992814f1b5f7a59c3eac4401c3db7aea Mon Sep 17 00:00:00 2001 From: Arseniy Obolenskiy Date: Wed, 18 Feb 2026 12:55:51 +0100 Subject: [PATCH 07/11] ref --- src/plugins/intel_cpu/src/nodes/concat.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/plugins/intel_cpu/src/nodes/concat.cpp b/src/plugins/intel_cpu/src/nodes/concat.cpp index 53f5f24ad16c6f..e10cb37dcecc4d 100644 --- a/src/plugins/intel_cpu/src/nodes/concat.cpp +++ b/src/plugins/intel_cpu/src/nodes/concat.cpp @@ -458,6 +458,11 @@ void Concat::prepareParams() { selectedPd->setImplementationType(m_executor->implType()); return; } + + // Fallback to oneDNN/ref concat when executor update is not applicable for runtime shapes. + useExecutor = false; + m_executor.reset(); + selectedPd->setImplementationType(impl_desc_type::ref); } const auto& dstMemPtr = getDstMemoryAtPort(0); From 07fad5678be75853d84bc392b0145e58eabd07e2 Mon Sep 17 00:00:00 2001 From: Arseniy Obolenskiy Date: Wed, 18 Feb 2026 16:00:05 +0100 Subject: [PATCH 08/11] tidy --- .../src/nodes/executors/concat_implementations.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/executors/concat_implementations.cpp b/src/plugins/intel_cpu/src/nodes/executors/concat_implementations.cpp index 332cdc87a5fdbb..30e94ca5f33926 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/concat_implementations.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/concat_implementations.cpp @@ -6,14 +6,14 @@ #include "memory_format_filter.hpp" #include "nodes/executors/concat.hpp" +#include "nodes/executors/executor.hpp" +#include "nodes/executors/executor_config.hpp" #include "nodes/executors/executor_implementation.hpp" #include "nodes/executors/implementation_utils.hpp" #include "nodes/executors/implementations.hpp" +#include "nodes/executors/memory_arguments.hpp" #include "utils/arch_macros.h" -#if defined(OPENVINO_ARCH_RISCV64) -# include "memory_desc/cpu_memory_desc.h" -#endif #if defined(OV_CPU_WITH_ACL) # include "nodes/executors/acl/acl_concat.hpp" #endif From fb9dd83e41c24709eda49da45f414ee185a41a6a Mon Sep 17 00:00:00 2001 From: Arseniy Obolenskiy Date: Thu, 19 Feb 2026 11:07:57 +0100 Subject: [PATCH 09/11] tidy arm --- .../intel_cpu/src/nodes/executors/concat_implementations.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/plugins/intel_cpu/src/nodes/executors/concat_implementations.cpp b/src/plugins/intel_cpu/src/nodes/executors/concat_implementations.cpp index 30e94ca5f33926..dd0a9e67b0a997 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/concat_implementations.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/concat_implementations.cpp @@ -4,6 +4,7 @@ #include +#include "memory_desc/cpu_memory_desc.h" #include "memory_format_filter.hpp" #include "nodes/executors/concat.hpp" #include "nodes/executors/executor.hpp" From d02ecb55aea58691ed785da749c5e4f939528c1e Mon Sep 17 00:00:00 2001 From: Arseniy Obolenskiy Date: Thu, 19 Feb 2026 13:45:00 +0100 Subject: [PATCH 10/11] tidy fin --- .../intel_cpu/src/nodes/executors/concat_implementations.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_cpu/src/nodes/executors/concat_implementations.cpp b/src/plugins/intel_cpu/src/nodes/executors/concat_implementations.cpp index dd0a9e67b0a997..81e5d20a22e08a 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/concat_implementations.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/concat_implementations.cpp @@ -4,7 +4,6 @@ #include -#include "memory_desc/cpu_memory_desc.h" #include "memory_format_filter.hpp" #include "nodes/executors/concat.hpp" #include "nodes/executors/executor.hpp" @@ -16,6 +15,7 @@ #include "utils/arch_macros.h" #if defined(OV_CPU_WITH_ACL) +# include "memory_desc/cpu_memory_desc.h" # include "nodes/executors/acl/acl_concat.hpp" #endif From b05dbc03c9154d5a9d081bb07a9547c040c6a65b Mon Sep 17 00:00:00 2001 From: Arseniy Obolenskiy Date: Wed, 25 Feb 2026 07:01:40 +0100 Subject: [PATCH 11/11] Address review comments --- src/plugins/intel_cpu/src/nodes/concat.cpp | 61 ++++++------- .../executors/concat_implementations.cpp | 85 ++++++++++--------- .../src/nodes/executors/executor_factory.hpp | 7 ++ 3 files changed, 76 insertions(+), 77 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/concat.cpp b/src/plugins/intel_cpu/src/nodes/concat.cpp index e10cb37dcecc4d..a0a4591376e2ec 100644 --- a/src/plugins/intel_cpu/src/nodes/concat.cpp +++ b/src/plugins/intel_cpu/src/nodes/concat.cpp @@ -35,9 +35,7 @@ #include "memory_desc/cpu_memory_desc.h" #include "node.h" #include "nodes/executors/executor.hpp" -#include "nodes/executors/executor_config.hpp" #include "nodes/executors/executor_factory.hpp" -#include "nodes/executors/implementations.hpp" #include "nodes/executors/memory_arguments.hpp" #include "nodes/node_config.h" #include "onednn/dnnl.h" @@ -254,43 +252,34 @@ void Concat::initSupportedPrimitiveDescriptors() { supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown); } - const auto& concatImplementations = getImplementations(); - if (!concatImplementations.empty()) { - const auto& creatorsMap = BlockedDescCreator::getCommonCreators(); - auto pushExecutorDesc = [&](LayoutType layoutType) { - NodeConfig nodeConfig; - nodeConfig.outConfs.resize(1); - nodeConfig.inConfs.resize(getParentEdges().size()); + auto pushExecutorDesc = [&](LayoutType layoutType) { + NodeConfig nodeConfig; + nodeConfig.outConfs.resize(1); + nodeConfig.inConfs.resize(getParentEdges().size()); - MemoryDescArgs descs; - descs.reserve(getParentEdges().size() + 1); - for (size_t i = 0; i < getParentEdges().size(); ++i) { - auto srcDesc = creatorsMap.at(layoutType)->createSharedDesc(inputPrecision, getInputShapeAtPort(i)); - nodeConfig.inConfs[i].setMemDesc(srcDesc); - nodeConfig.inConfs[i].inPlace(-1); - nodeConfig.inConfs[i].constant(false); - descs[ARG_SRC + i] = srcDesc; - } + MemoryDescArgs descs; + descs.reserve(getParentEdges().size() + 1); + for (size_t i = 0; i < getParentEdges().size(); ++i) { + auto srcDesc = creatorsMap.at(layoutType)->createSharedDesc(inputPrecision, getInputShapeAtPort(i)); + nodeConfig.inConfs[i].setMemDesc(srcDesc); + nodeConfig.inConfs[i].inPlace(-1); + nodeConfig.inConfs[i].constant(false); + descs[ARG_SRC + i] = srcDesc; + } - auto dstDesc = creatorsMap.at(layoutType)->createSharedDesc(outputPrecision, getOutputShapeAtPort(0)); - nodeConfig.outConfs[0].setMemDesc(dstDesc); - nodeConfig.outConfs[0].inPlace(-1); - nodeConfig.outConfs[0].constant(false); - descs[ARG_DST] = dstDesc; - - const executor::Config config{descs, m_attrs}; - const bool supported = - std::any_of(concatImplementations.begin(), concatImplementations.end(), [&](const auto& impl) { - return impl.supports(config, memoryFormatFilter); - }); - if (supported) { - supportedPrimitiveDescriptors.emplace_back(nodeConfig, impl_desc_type::undef); - } - }; + auto dstDesc = creatorsMap.at(layoutType)->createSharedDesc(outputPrecision, getOutputShapeAtPort(0)); + nodeConfig.outConfs[0].setMemDesc(dstDesc); + nodeConfig.outConfs[0].inPlace(-1); + nodeConfig.outConfs[0].constant(false); + descs[ARG_DST] = dstDesc; - pushExecutorDesc(LayoutType::ncsp); - pushExecutorDesc(LayoutType::nspc); - } + if (ExecutorFactory::hasSuitableImplementation(m_attrs, descs, memoryFormatFilter)) { + supportedPrimitiveDescriptors.emplace_back(nodeConfig, impl_desc_type::undef); + } + }; + + pushExecutorDesc(LayoutType::ncsp); + pushExecutorDesc(LayoutType::nspc); } void Concat::selectOptimalPrimitiveDescriptor() { diff --git a/src/plugins/intel_cpu/src/nodes/executors/concat_implementations.cpp b/src/plugins/intel_cpu/src/nodes/executors/concat_implementations.cpp index 81e5d20a22e08a..fba1c75968a8fa 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/concat_implementations.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/concat_implementations.cpp @@ -24,6 +24,7 @@ namespace ov::intel_cpu { template <> const std::vector>& getImplementations() { static const std::vector> concatImplementations{ + // clang-format off OV_CPU_INSTANCE_ACL( "concat_acl_ncsp", ExecutorType::Acl, @@ -35,47 +36,49 @@ const std::vector>& getImplementations() { HasNoOptimalConfig{}, AcceptsAnyShape, CreateDefault{}) - OV_CPU_INSTANCE_ACL( - "concat_acl_nspc", - ExecutorType::Acl, - OperationType::Concat, - [](const executor::Config& config, - [[maybe_unused]] const MemoryFormatFilter& memoryFormatFilter) -> bool { - return AclConcatExecutor::supports(config, LayoutType::nspc); - }, - HasNoOptimalConfig{}, - AcceptsAnyShape, - CreateDefault{}) - OV_CPU_INSTANCE_COMMON( - "concat_ref_ncsp", - ExecutorType::Reference, - OperationType::Concat, - []([[maybe_unused]] const executor::Config& config, - [[maybe_unused]] const MemoryFormatFilter& memoryFormatFilter) -> bool { - return false; - }, - HasNoOptimalConfig{}, - AcceptsAnyShape, - []([[maybe_unused]] const ConcatAttrs& attrs, - [[maybe_unused]] const MemoryArgs& memory, - [[maybe_unused]] const ExecutorContext::CPtr& context) -> ExecutorPtr { - return nullptr; - }) - OV_CPU_INSTANCE_COMMON( - "concat_ref_nspc", - ExecutorType::Reference, - OperationType::Concat, - []([[maybe_unused]] const executor::Config& config, - [[maybe_unused]] const MemoryFormatFilter& memoryFormatFilter) -> bool { - return false; - }, - HasNoOptimalConfig{}, - AcceptsAnyShape, - []([[maybe_unused]] const ConcatAttrs& attrs, - [[maybe_unused]] const MemoryArgs& memory, - [[maybe_unused]] const ExecutorContext::CPtr& context) -> ExecutorPtr { - return nullptr; - })}; + OV_CPU_INSTANCE_ACL( + "concat_acl_nspc", + ExecutorType::Acl, + OperationType::Concat, + [](const executor::Config& config, + [[maybe_unused]] const MemoryFormatFilter& memoryFormatFilter) -> bool { + return AclConcatExecutor::supports(config, LayoutType::nspc); + }, + HasNoOptimalConfig{}, + AcceptsAnyShape, + CreateDefault{}) + OV_CPU_INSTANCE_COMMON( + "concat_ref_ncsp", + ExecutorType::Reference, + OperationType::Concat, + []([[maybe_unused]] const executor::Config& config, + [[maybe_unused]] const MemoryFormatFilter& memoryFormatFilter) -> bool { + return false; + }, + HasNoOptimalConfig{}, + AcceptsAnyShape, + []([[maybe_unused]] const ConcatAttrs& attrs, + [[maybe_unused]] const MemoryArgs& memory, + [[maybe_unused]] const ExecutorContext::CPtr& context) -> ExecutorPtr { + return nullptr; + }) + OV_CPU_INSTANCE_COMMON( + "concat_ref_nspc", + ExecutorType::Reference, + OperationType::Concat, + []([[maybe_unused]] const executor::Config& config, + [[maybe_unused]] const MemoryFormatFilter& memoryFormatFilter) -> bool { + return false; + }, + HasNoOptimalConfig{}, + AcceptsAnyShape, + []([[maybe_unused]] const ConcatAttrs& attrs, + [[maybe_unused]] const MemoryArgs& memory, + [[maybe_unused]] const ExecutorContext::CPtr& context) -> ExecutorPtr { + return nullptr; + }) + // clang-format on + }; return concatImplementations; } diff --git a/src/plugins/intel_cpu/src/nodes/executors/executor_factory.hpp b/src/plugins/intel_cpu/src/nodes/executors/executor_factory.hpp index b12b752e5a2c52..547fb627b1ff94 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/executor_factory.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/executor_factory.hpp @@ -30,6 +30,13 @@ class ExecutorFactory { public: using ExecutorImplementationRef = std::reference_wrapper>; + [[nodiscard]] static bool hasSuitableImplementation(const Attrs& attrs, + const MemoryDescArgs& descriptors, + const MemoryFormatFilter& memoryFormatFilter = {}, + const std::string& implementationPriority = {}) { + return !filter(attrs, descriptors, memoryFormatFilter, implementationPriority).empty(); + } + ExecutorFactory(Attrs attrs, ExecutorContext::CPtr context, const MemoryDescArgs& descriptors,