From 5ae646992158026c66b36a2d6c54b14814bd0904 Mon Sep 17 00:00:00 2001 From: mangguo Date: Mon, 26 Jan 2026 16:01:37 +0800 Subject: [PATCH 1/4] Fix sparse mask handling in softmax kernel --- .../kernels/scaled_attn/softmax_kernel.hpp | 39 ++++++++++++------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax_kernel.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax_kernel.hpp index 8f39e9b9f5c59b..15ad8350c01b89 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax_kernel.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax_kernel.hpp @@ -318,8 +318,9 @@ inline void scale_add2_reduce_max(float* a, if (has_sparse_mask) { \ size_t mask_idx = (i + n * vec_len_f32_avx512) / sparse_block_size; \ uint8_t mask_val = sparse_mask[mask_idx]; \ - __m512 v_mask_block = _mm512_set1_ps(mask_val ? 0.f : -FLT_MAX); \ - v_a = _mm512_add_ps(v_a, v_mask_block); \ + if (!mask_val) { \ + v_a = v_nfltmax; \ + } \ } \ if (has_causal_mask) { \ auto v_maski8 = \ @@ -355,8 +356,9 @@ inline void scale_add2_reduce_max(float* a, if (has_sparse_mask) { size_t mask_idx = i / sparse_block_size; uint8_t mask_val = sparse_mask[mask_idx]; - __m512 v_mask_block = _mm512_set1_ps(mask_val ? 0.f : -FLT_MAX); - v_a = _mm512_add_ps(v_a, v_mask_block); + if (!mask_val) { + v_a = v_nfltmax; + } } if (has_causal_mask) { @@ -390,8 +392,9 @@ inline void scale_add2_reduce_max(float* a, if (has_sparse_mask) { size_t mask_idx = i / sparse_block_size; uint8_t mask_val = sparse_mask[mask_idx]; - __m512 v_mask_block = _mm512_set1_ps(mask_val ? 0.f : -FLT_MAX); - v_a = _mm512_add_ps(v_a, v_mask_block); + if (!mask_val) { + v_a = v_nfltmax; + } } if (has_causal_mask) { @@ -439,8 +442,9 @@ inline void scale_add2_reduce_max(float* a, if (has_sparse_mask) { \ size_t mask_idx = (i + n * vec_len_f32_avx2) / sparse_block_size; \ uint8_t mask_val = sparse_mask[mask_idx]; \ - __m256 v_mask_block = _mm256_set1_ps(mask_val ? 0.f : -FLT_MAX); \ - v_a = _mm256_add_ps(v_a, v_mask_block); \ + if (!mask_val) { \ + v_a = v_nfltmax; \ + } \ } \ if (has_causal_mask) { \ auto v_maski8 = _mm_loadu_si128(reinterpret_cast<__m128i const*>(causal_mask + i + n * vec_len_f32_avx2)); \ @@ -476,8 +480,9 @@ inline void scale_add2_reduce_max(float* a, if (has_sparse_mask) { size_t mask_idx = i / sparse_block_size; uint8_t mask_val = sparse_mask[mask_idx]; - __m256 v_mask_block = _mm256_set1_ps(mask_val ? 0.f : -FLT_MAX); - v_a = _mm256_add_ps(v_a, v_mask_block); + if (!mask_val) { + v_a = v_nfltmax; + } } if (has_causal_mask) { @@ -512,8 +517,9 @@ inline void scale_add2_reduce_max(float* a, if (has_sparse_mask) { size_t mask_idx = i / sparse_block_size; uint8_t mask_val = sparse_mask[mask_idx]; - __m256 v_mask_block = _mm256_set1_ps(mask_val ? 0.f : -FLT_MAX); - v_a = _mm256_add_ps(v_a, v_mask_block); + if (!mask_val) { + v_a = v_nfltmax; + } } if (has_causal_mask) { @@ -560,8 +566,9 @@ inline void scale_add2_reduce_max(float* a, if (has_sparse_mask) { size_t mask_idx = i / sparse_block_size; uint8_t mask_val = sparse_mask[mask_idx]; - float32x4_t v_mask_block = vdupq_n_f32(mask_val ? 0.0F : -FLT_MAX); - v_a = vaddq_f32(v_a, v_mask_block); + if (!mask_val) { + v_a = v_nfltmax; + } } if (has_causal_mask) { @@ -596,7 +603,9 @@ inline void scale_add2_reduce_max(float* a, if (has_sparse_mask) { size_t mask_idx = i / sparse_block_size; uint8_t mask_val = sparse_mask[mask_idx]; - a[i] += (mask_val ? 0.0F : -FLT_MAX); + if (!mask_val) { + a[i] = -FLT_MAX; + } } if (has_causal_mask) { From cdd68e6509b2eef51a2684d3b1825b89a0442442 Mon Sep 17 00:00:00 2001 From: mangguo Date: Fri, 30 Jan 2026 15:44:49 +0800 Subject: [PATCH 2/4] Add softmax kernel unit test with sparse mask --- .../tests/unit/softmax_kernel_test.cpp | 87 +++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 src/plugins/intel_cpu/tests/unit/softmax_kernel_test.cpp diff --git a/src/plugins/intel_cpu/tests/unit/softmax_kernel_test.cpp b/src/plugins/intel_cpu/tests/unit/softmax_kernel_test.cpp new file mode 100644 index 00000000000000..4c0ae86b755453 --- /dev/null +++ b/src/plugins/intel_cpu/tests/unit/softmax_kernel_test.cpp @@ -0,0 +1,87 @@ +// Copyright (C) 2018-2026 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "nodes/kernels/scaled_attn/softmax_kernel.hpp" + +#include +#include + +#include "gtest/gtest.h" + +namespace { +TEST(SoftmaxKernelTest, AttnSoftmaxKernelWithSparseMask) { + std::vector input = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}; + std::vector output(input.size(), 0.0f); + std::vector sparse_mask = {1, 0, 1, 0}; // Masking some elements, block size 2 + float scale = 1.0f; + float* alibi = nullptr; + void* attn_mask = nullptr; + uint8_t* causal_mask = nullptr; + bool select_nfltmax_at_0 = false; + size_t len = input.size(); + size_t total_size = input.size(); + ov::element::Type attn_mask_prec = ov::element::f32; + ov::element::Type dst_precision = ov::element::f32; + const float* sink = nullptr; + float alibi_slope = 0.0f; + size_t sparse_block_size = 2; + ov::Extensions::Cpu::XARCH::attn_softmax_kernel(input.data(), + output.data(), + scale, + alibi, + attn_mask, + causal_mask, + select_nfltmax_at_0, + len, + total_size, + attn_mask_prec, + dst_precision, + sink, + alibi_slope, + sparse_mask.data(), + sparse_block_size); + std::vector expect_output = {0.00483724f, 0.013149f, 0.0f, 0.0f, 0.264104, 0.71791f, 0.0f, 0.0f}; + for (size_t i = 0; i < output.size(); ++i) { + EXPECT_NEAR(output[i], expect_output[i], 1e-4f); + } +} + +TEST(SoftmaxKernelTest, AttnSoftmaxKernelWithNaNInputAndSparseMask) { + std::vector input = {1.0f, 2.0f, std::nanf(""), 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}; + std::vector output(input.size(), 0.0f); + std::vector sparse_mask = {1, 0, 1, 0}; // Masking some elements, block size 2 + float scale = 1.0f; + float* alibi = nullptr; + void* attn_mask = nullptr; + uint8_t* causal_mask = nullptr; + bool select_nfltmax_at_0 = false; + size_t len = input.size(); + size_t total_size = input.size(); + ov::element::Type attn_mask_prec = ov::element::f32; + ov::element::Type dst_precision = ov::element::f32; + const float* sink = nullptr; + float alibi_slope = 0.0f; + size_t sparse_block_size = 2; + ov::Extensions::Cpu::XARCH::attn_softmax_kernel(input.data(), + output.data(), + scale, + alibi, + attn_mask, + causal_mask, + select_nfltmax_at_0, + len, + total_size, + attn_mask_prec, + dst_precision, + sink, + alibi_slope, + sparse_mask.data(), + sparse_block_size); + std::vector expect_output = {0.00483724f, 0.013149f, 0.0f, 0.0f, 0.264104, 0.71791f, 0.0f, 0.0f}; + for (size_t i = 0; i < output.size(); ++i) { + EXPECT_NEAR(output[i], expect_output[i], 1e-5f); + } +} + +} // namespace From 2bcf7f8792b9b923e5e06b6c87d54daa72ead8a7 Mon Sep 17 00:00:00 2001 From: mangguo Date: Fri, 30 Jan 2026 16:00:43 +0800 Subject: [PATCH 3/4] Fix typo --- src/plugins/intel_cpu/tests/unit/softmax_kernel_test.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/plugins/intel_cpu/tests/unit/softmax_kernel_test.cpp b/src/plugins/intel_cpu/tests/unit/softmax_kernel_test.cpp index 4c0ae86b755453..fa26d480c5ecd2 100644 --- a/src/plugins/intel_cpu/tests/unit/softmax_kernel_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/softmax_kernel_test.cpp @@ -41,9 +41,9 @@ TEST(SoftmaxKernelTest, AttnSoftmaxKernelWithSparseMask) { alibi_slope, sparse_mask.data(), sparse_block_size); - std::vector expect_output = {0.00483724f, 0.013149f, 0.0f, 0.0f, 0.264104, 0.71791f, 0.0f, 0.0f}; + std::vector expect_output = {0.00483724f, 0.013149f, 0.0f, 0.0f, 0.264104f, 0.71791f, 0.0f, 0.0f}; for (size_t i = 0; i < output.size(); ++i) { - EXPECT_NEAR(output[i], expect_output[i], 1e-4f); + EXPECT_NEAR(output[i], expect_output[i], 1e-5f); } } @@ -78,7 +78,7 @@ TEST(SoftmaxKernelTest, AttnSoftmaxKernelWithNaNInputAndSparseMask) { alibi_slope, sparse_mask.data(), sparse_block_size); - std::vector expect_output = {0.00483724f, 0.013149f, 0.0f, 0.0f, 0.264104, 0.71791f, 0.0f, 0.0f}; + std::vector expect_output = {0.00483724f, 0.013149f, 0.0f, 0.0f, 0.264104f, 0.71791f, 0.0f, 0.0f}; for (size_t i = 0; i < output.size(); ++i) { EXPECT_NEAR(output[i], expect_output[i], 1e-5f); } From 6301843efc9a94c177be4120ff0c3afa9b743f33 Mon Sep 17 00:00:00 2001 From: mangguo Date: Sun, 1 Feb 2026 20:51:35 +0800 Subject: [PATCH 4/4] Skip the test on non-x86_64 platforms as xattention is supported only on Intel CPUs. --- src/plugins/intel_cpu/tests/unit/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_cpu/tests/unit/CMakeLists.txt b/src/plugins/intel_cpu/tests/unit/CMakeLists.txt index aea4278036a4ed..ac7a88858c6766 100644 --- a/src/plugins/intel_cpu/tests/unit/CMakeLists.txt +++ b/src/plugins/intel_cpu/tests/unit/CMakeLists.txt @@ -32,7 +32,8 @@ if(NOT X86_64) ${CMAKE_CURRENT_SOURCE_DIR}/snippets_transformations/x64 ${CMAKE_CURRENT_SOURCE_DIR}/nodes/eltwise_node_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/brgemm_executor_test.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/xattention_test.cpp) + ${CMAKE_CURRENT_SOURCE_DIR}/xattention_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/softmax_kernel_test.cpp) endif() if (NOT ENABLE_MLAS_FOR_CPU)