openvinotoolkit · abhijain1204fujitsu · Jan 12, 2026 · Feb 19, 2026 · Feb 24, 2026 · maxnick
@@ -100,7 +100,12 @@ MatMulKleidiAIExecutor::MatMulKleidiAIExecutor(const FCAttrs& attrs,
     if (!useDynamicQuant) {
         auto dstDesc = originalWeightsDesc->cloneWithNewPrecision(memory.at(ARG_SRC)->getDescPtr()->getPrecision());
         auto dnnlDstDesc = MemoryDescUtils::convertToDnnlMemoryDesc(dstDesc);
-        packedWeights = acl_fc_executor::reorderWeights(memory, context, aclfcAttrs, dnnlSrcDesc, dnnlDstDesc);
+        if (!attrs.weightsNonTransposed) {
+            dnnlDstDesc = acl_fc_executor::makeTransposedWeightDescriptor(dnnlDstDesc, dnnlSrcDesc);
+            aclfcAttrs.isWeightsRepacked = true;
+        }
+        MemoryCPtr packedWeights =
+            acl_fc_executor::reorderWeights(memory, context, aclfcAttrs, dnnlSrcDesc, dnnlDstDesc);
 
         const size_t rhsPackedSize = kai_get_rhs_packed_size_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon(N, K);
         auto rhsPackedDesc = std::make_shared<CpuBlockedMemoryDesc>(u8, Shape({rhsPackedSize}));

@@ -114,7 +114,6 @@ class MatMulKleidiAIExecutor : public Executor {
     MemoryPtr biasMem;
     MemoryPtr rhsPackedMem;
     MemoryPtr lhsPackedMem;
-    MemoryCPtr packedWeights;
     size_t M = 0UL, N = 0UL, K = 0UL;
     size_t mr, nr, kr, sr;
     // F32 Kernel block size

@@ -1,7 +1,6 @@
 // Copyright (C) 2018-2026 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
-
 #include "gathermatmul.h"
 
 #include <oneapi/dnnl/dnnl_common_types.h>
@@ -46,7 +45,7 @@
 #include "transformations/cpu_opset/common/op/batch_gather_matmul_compressed.hpp"
 #include "transformations/utils/utils.hpp"
 #include "utils/general_utils.h"
-
+#ifdef OPENVINO_ARCH_X86_64
 namespace ov::intel_cpu::node {
 
 struct onednn_matmul_key {
@@ -299,7 +298,7 @@ bool GatherMatmul::isSupportedCompressedOperation([[maybe_unused]] const std::sh
                                                   [[maybe_unused]] size_t OC,
                                                   [[maybe_unused]] size_t G,
                                                   [[maybe_unused]] const Config& config) noexcept {
-#ifdef OPENVINO_ARCH_X86_64
+#    ifdef OPENVINO_ARCH_X86_64
     // copy paste from FullyConnected
     try {
         std::string errorMessage;
@@ -341,19 +340,19 @@ bool GatherMatmul::isSupportedCompressedOperation([[maybe_unused]] const std::sh
         return false;
     }
     return true;
-#else
+#    else
     return false;
-#endif
+#    endif
 }
 
 ov::element::TypeVector GatherMatmul::getSupportedCompressedWeightsTypes([[maybe_unused]] bool apply_fp8) {
     using ov::element::Type_t;
 
-#ifdef OPENVINO_ARCH_X86_64
+#    ifdef OPENVINO_ARCH_X86_64  // NOTE(review): always true here; the enclosing namespace is already under this guard
     return {Type_t::u8, Type_t::i8, Type_t::u4, Type_t::i4};
-#else
+#    else  // NOTE(review): dead branch while the outer OPENVINO_ARCH_X86_64 guard is in place
     return {};
-#endif
+#    endif  // OPENVINO_ARCH_X86_64 (inner guard)
 }
 
 ov::element::TypeVector GatherMatmul::getSupportedCompressedActivationsTypes() {
@@ -860,3 +859,4 @@ bool GatherMatmul::created() const {
 }
 
 }  // namespace ov::intel_cpu::node
+#endif  // OPENVINO_ARCH_X86_64
@@ -7,10 +7,13 @@
 #include <memory>
 #include <oneapi/dnnl/dnnl.hpp>
 #include <string>
+#include <vector>
 
 #include "cpu_memory.h"
 #include "graph_context.h"
 #include "node.h"
+#include "nodes/executors/executor.hpp"
+#include "nodes/executors/executor_factory.hpp"
 #include "nodes/executors/memory_arguments.hpp"
 #include "openvino/core/node.hpp"
 
@@ -54,6 +57,8 @@ class GatherMatmul : public Node {
         WEIGHT_ZERO_POINTS,
     };
 
+#ifdef OPENVINO_ARCH_X86_64
+
     class onednn_matmul;
 
     using GemvImplPtr = std::shared_ptr<onednn_matmul>;
@@ -72,6 +77,21 @@ class GatherMatmul : public Node {
     MemoryDescPtr m_tmpOutputDesc = nullptr;
 
     bool bf16_amx_mode = false;
+#else  // !OPENVINO_ARCH_X86_64
+
+    ov::element::Type getRuntimePrecision() const override;  // NOTE(review): defined outside the x86-only .cpp? confirm
+    Algorithm algorithm = Algorithm::GatherMatmulDefault;  // NOTE(review): check this does not shadow a Node member
+    size_t numExperts = 0;
+
+    std::vector<ExecutorPtr> executor;  // presumably one entry per expert (see numExperts) - TODO confirm
+    std::vector<MemoryArgs> memArgsFC;  // presumably parallel to `executor` - TODO confirm
+
+    MemoryPtr m_weightsMemory = nullptr;
+    MemoryPtr m_tmpInpBuffer = nullptr;
+    MemoryDescPtr m_tmpInputDesc = nullptr;
+    MemoryDescPtr m_tmpOutputDesc = nullptr;  // same declaration also appears in the x86-64 branch above
+
+#endif  // OPENVINO_ARCH_X86_64
 };
 
 }  // namespace ov::intel_cpu::node