4 changes: 2 additions & 2 deletions cmake/developer_package/compile_flags/functions.cmake
@@ -283,7 +283,7 @@ macro(ov_arm_sve_optimization_flags flags)
endif()

# Check for compiler SVE support
-    ov_check_compiler_supports_sve("-march=armv8-a+sve")
+    ov_check_compiler_supports_sve("-march=armv8-a+sve+fp16")
if(OV_COMPILER_IS_INTEL_LLVM)
message(WARNING "Unsupported CXX compiler ${CMAKE_CXX_COMPILER_ID}")
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
@@ -305,7 +305,7 @@ macro(ov_arm_sve_optimization_flags flags)

# Add flag for SVE if supported
if(CXX_SVE_FOUND)
-        list(APPEND ${flags} -march=armv8-a+sve)
+        list(APPEND ${flags} -march=armv8-a+sve+fp16)
endif()
if(NOT CMAKE_CL_64)
list(APPEND ${flags} -ftree-vectorize)
4 changes: 2 additions & 2 deletions cmake/developer_package/features.cmake
@@ -56,7 +56,7 @@ ov_dependent_option (ENABLE_AVX512F "Enable AVX512 optimizations" ON "X86_64 OR

ov_dependent_option (ENABLE_NEON_FP16 "Enable ARM FP16 optimizations" ON "AARCH64" OFF)

-ov_dependent_option (ENABLE_SVE "Enable SVE optimizations" ON "AARCH64" OFF)
+ov_dependent_option (ENABLE_SVE "Enable SVE optimizations" ON "AARCH64 AND NOT APPLE" OFF)

# Type of build, we add this as an explicit option to default it to ON
get_property(BUILD_SHARED_LIBS_DEFAULT GLOBAL PROPERTY TARGET_SUPPORTS_SHARED_LIBS)
@@ -115,7 +115,7 @@ if(ENABLE_AVX512F)
endif()

if(ENABLE_SVE)
-    ov_check_compiler_supports_sve("-march=armv8-a+sve")
+    ov_check_compiler_supports_sve("-march=armv8-a+sve+fp16")

if(NOT CXX_HAS_SVE)
set(ENABLE_SVE OFF CACHE BOOL "Enables ARM64 SVE support" FORCE)
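For reference, when -march=armv8-a+sve+fp16 is in effect the compiler defines the ACLE feature macros __ARM_FEATURE_SVE and __ARM_FEATURE_FP16_VECTOR_ARITHMETIC, so kernel sources can guard their SVE FP16 paths at compile time. A minimal sketch of such a guard (sum_f16 is a hypothetical helper, not part of this change):

#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
#    include <arm_sve.h>
#    include <cstddef>

// Sums a float16 buffer with a whilelt-predicated SVE loop; hypothetical helper.
static float sum_f16(const float16_t* data, size_t n) {
    svfloat16_t acc = svdup_f16(0);
    for (size_t i = 0; i < n; i += svcnth()) {
        svbool_t pg = svwhilelt_b16(i, n);
        acc = svadd_f16_m(pg, acc, svld1_f16(pg, data + i));
    }
    return svaddv_f16(svptrue_b16(), acc);
}
#endif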
85 changes: 85 additions & 0 deletions src/plugins/intel_cpu/src/nodes/kernels/aarch64/sve_utils.hpp
@@ -0,0 +1,85 @@
// Copyright (C) 2024 FUJITSU LIMITED
// SPDX-License-Identifier: Apache-2.0
//
#pragma once

#include <arm_sve.h>

#include <cstddef>
#include <type_traits>

#include "openvino/core/type/float16.hpp"

namespace ov::intel_cpu::sve_utils {

template <typename T, typename... Args>
constexpr bool one_of(T val, Args... args) {
return ((val == args) || ...);
}

template <size_t T_SIZE>
svbool_t sve_predicate() {
static_assert(one_of(T_SIZE, 8, 16, 32, 64), "Unexpected parameter size");
    if constexpr (8 == T_SIZE) {
        return svptrue_b8();
    } else if constexpr (16 == T_SIZE) {
        return svptrue_b16();
    } else if constexpr (32 == T_SIZE) {
        return svptrue_b32();
    } else {
        return svptrue_b64();
    }
}

template <typename T_TYPE, size_t T_SIZE>
svbool_t sve_predicate(T_TYPE lower, T_TYPE higher) {
static_assert(one_of(T_SIZE, 8, 16, 32, 64), "Unexpected parameter size");
    if constexpr (8 == T_SIZE) {
        return svwhilelt_b8(lower, higher);
    } else if constexpr (16 == T_SIZE) {
        return svwhilelt_b16(lower, higher);
    } else if constexpr (32 == T_SIZE) {
        return svwhilelt_b32(lower, higher);
    } else {
        return svwhilelt_b64(lower, higher);
    }
}

template <size_t T_SIZE>
size_t sve_vlen() {
static_assert(one_of(T_SIZE, 8, 16, 32, 64), "Unexpected parameter size");
    if constexpr (8 == T_SIZE) {
        return svcntb();
    } else if constexpr (16 == T_SIZE) {
        return svcnth();
    } else if constexpr (32 == T_SIZE) {
        return svcntw();
    } else {
        return svcntd();
    }
}

template <typename TA, typename TB>
static void cvt_copy(TA* dst, TB* src, size_t n) {
    size_t i = 0;
    if constexpr (std::is_same<TA, TB>::value) {
        // Plain copy: full vectors first, then a whilelt-predicated tail.
        // The predicate/vlen helpers take the element size in bits.
        auto pg_dst = sve_predicate<sizeof(TA) * 8>();
        auto vlen = sve_vlen<sizeof(TA) * 8>();
        for (; i + vlen <= n; i += vlen) {
            auto vb = svld1(pg_dst, src + i);
            svst1(pg_dst, dst + i, vb);
        }
        auto pgt = sve_predicate<size_t, sizeof(TA) * 8>(i, n);
        auto vb = svld1(pgt, src + i);
        svst1(pgt, dst + i, vb);
        return;
    } else if constexpr (std::is_same<TA, float>::value && std::is_same<TB, ov::float16>::value) {
        // f16 -> f32: load half a vector of f16, duplicate each lane with zip1,
        // then widen the even lanes to a full vector of f32.
        auto src_ptr = reinterpret_cast<float16_t*>(src);
        auto pg_vl2 = svwhilelt_b16(svcnth() / 2, svcnth());
        auto vlen = svcnth() / 2;
        auto pg_dst = svptrue_b32();
        for (; i + vlen <= n; i += vlen) {
            auto load_src = svld1_f16(pg_vl2, src_ptr + i);
            auto src_interleave = svzip1_f16(load_src, load_src);
            auto cvt_dst = svcvt_f32_f16_z(pg_dst, src_interleave);
            svst1(pg_dst, dst + i, cvt_dst);
        }
        // Scalar tail for the remaining (n % vlen) elements.
        for (; i < n; i++) {
            dst[i] = static_cast<float>(src[i]);
        }
    }
}

} // namespace ov::intel_cpu::sve_utils
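For context, a hedged usage sketch of cvt_copy (not part of the diff; widen_scores and the buffers are illustrative): same-type calls reduce to a predicated copy, while float <- ov::float16 takes the widening path.

#include <vector>

// Illustrative only: widens a float16 buffer into a float buffer via cvt_copy.
void widen_scores(const std::vector<ov::float16>& half, std::vector<float>& full) {
    full.resize(half.size());
    // cvt_copy takes a non-const source pointer, hence the const_cast here.
    ov::intel_cpu::sve_utils::cvt_copy(full.data(), const_cast<ov::float16*>(half.data()), half.size());
}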
104 changes: 104 additions & 0 deletions src/plugins/intel_cpu/src/nodes/kernels/kai/kleidi_kernel.hpp
@@ -0,0 +1,104 @@
// Copyright (C) 2025 FUJITSU LIMITED
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <arm_neon.h>
#include <kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla.h>
#include <kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/kai_matmul_clamp_f16_f16_f16p_interface.h>
#include <kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon.h>

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <openvino/core/type/element_type.hpp>

namespace ov::intel_cpu {

class KleidiGemm {
public:
KleidiGemm(size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc);
void executeGemm(const void* a, const void* b, void* c);
void packB(const float16_t* inp, const float16_t* bias, float16_t* packed_out);
    size_t get_packed_rhs_size() const;

private:
static constexpr kai_matmul_clamp_f16_f16_f16p_ukernel ukernel{
kai_get_m_step_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla,
kai_get_n_step_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla,
kai_get_nr_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla,
kai_get_kr_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla,
kai_get_sr_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla,
kai_get_lhs_offset_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla,
kai_get_rhs_packed_offset_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla,
kai_get_dst_offset_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla,
kai_get_dst_size_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla,
kai_run_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla};
size_t M, N, K;
size_t lda, ldb, ldc;
size_t nr, kr, sr;
size_t packedRHSsize;
};

inline KleidiGemm::KleidiGemm(size_t _M, size_t _N, size_t _K, size_t _lda, size_t _ldb, size_t _ldc)
    : M(_M),
      N(_N),
      K(_K),
      lda(_lda),
      ldb(_ldb),
      ldc(_ldc),
      nr(ukernel.get_nr()),
      kr(ukernel.get_kr()),
      sr(ukernel.get_sr()),
      packedRHSsize(kai_get_rhs_packed_size_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon(_N, _K)) {}

inline size_t KleidiGemm::get_packed_rhs_size() const {
return packedRHSsize;
}

inline void KleidiGemm::packB(const float16_t* inp, const float16_t* bias, float16_t* packed_out) {
// Packing only needs to be performed once if the contents of the bias and RHS matrices are expected to be constant.
kai_run_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon(1,
N,
K,
nr,
kr,
sr, // Packing arguments
ldb * sizeof(float16_t), // RHS stride
inp, // RHS
bias, // Bias
NULL, // Scale
packed_out, // RHS packed
0,
NULL);
}

inline void KleidiGemm::executeGemm(const void* a, const void* b, void* c) {
const size_t m_step = ukernel.get_m_step();
const size_t n_step = ukernel.get_n_step();
for (size_t i_m_step = 0; i_m_step < M; i_m_step += m_step) {
for (size_t i_n_step = 0; i_n_step < N; i_n_step += n_step) {
const uint8_t* lhs_ptr =
static_cast<const uint8_t*>(a) + (ukernel.get_lhs_packed_offset(i_m_step, lda * sizeof(float16_t)));
const uint8_t* rhs_ptr = static_cast<const uint8_t*>(b) + (ukernel.get_rhs_packed_offset(i_n_step, K));
uint8_t* dst_ptr =
static_cast<uint8_t*>(c) + (ukernel.get_dst_offset(i_m_step, i_n_step, ldc * sizeof(float16_t)));
const size_t actual_m = std::min(M - i_m_step, m_step);
const size_t actual_n = std::min(N - i_n_step, n_step);

ukernel.run_matmul(actual_m,
actual_n,
K, // Dimensions
lhs_ptr, // LHS
lda * sizeof(float16_t), // LHS stride
rhs_ptr, // RHS packed
dst_ptr, // DST
ldc * sizeof(float16_t), // DST stride (row)
sizeof(float16_t), // DST stride (col)
-std::numeric_limits<float>::max(),
std::numeric_limits<float>::max() // Min and max for the clamp operation
);
}
}
}

} // namespace ov::intel_cpu
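A usage sketch (not from the diff; run_f16_matmul, the row-major shapes, and the leading dimensions lda=K, ldb=N, ldc=N are assumptions): the RHS and bias are packed once with packB, and executeGemm can then be reused for each LHS.

#include <arm_neon.h>
#include <cstddef>
#include <cstdint>
#include <vector>

// Illustrative driver: pack the constant RHS once, then run the f16 GEMM.
void run_f16_matmul(const float16_t* A, const float16_t* B, const float16_t* bias,
                    float16_t* C, size_t M, size_t N, size_t K) {
    ov::intel_cpu::KleidiGemm gemm(M, N, K, /*lda=*/K, /*ldb=*/N, /*ldc=*/N);
    std::vector<uint8_t> packedB(gemm.get_packed_rhs_size());  // packed size is in bytes
    gemm.packB(B, bias, reinterpret_cast<float16_t*>(packedB.data()));
    gemm.executeGemm(A, packedB.data(), C);
}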