Skip to content

Commit 65b105a

Browse files
authored
Fix sparse mask handling in softmax kernel (#33814)
### Details: - *Fix sparse mask handling in the softmax kernel. In the sparse attention path, the sparse mask caused some blocks to be skipped, so those blocks were never written by the GEMM kernel; as a result, the corresponding regions in the output buffer remained uninitialized and their contents could decode to NaN/Inf values.* - *In this PR, we overwrite the skipped regions with -FLT_MAX to prevent NaN propagation and avoid incorrect computations in downstream kernels.* ### Tickets: - *[CVS-179625](https://jira.devtools.intel.com/browse/CVS-179625)*
1 parent 51a9edd commit 65b105a

File tree

3 files changed

+113
-16
lines changed

3 files changed

+113
-16
lines changed

src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax_kernel.hpp

Lines changed: 24 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -318,8 +318,9 @@ inline void scale_add2_reduce_max(float* a,
318318
if (has_sparse_mask) { \
319319
size_t mask_idx = (i + n * vec_len_f32_avx512) / sparse_block_size; \
320320
uint8_t mask_val = sparse_mask[mask_idx]; \
321-
__m512 v_mask_block = _mm512_set1_ps(mask_val ? 0.f : -FLT_MAX); \
322-
v_a = _mm512_add_ps(v_a, v_mask_block); \
321+
if (!mask_val) { \
322+
v_a = v_nfltmax; \
323+
} \
323324
} \
324325
if (has_causal_mask) { \
325326
auto v_maski8 = \
@@ -355,8 +356,9 @@ inline void scale_add2_reduce_max(float* a,
355356
if (has_sparse_mask) {
356357
size_t mask_idx = i / sparse_block_size;
357358
uint8_t mask_val = sparse_mask[mask_idx];
358-
__m512 v_mask_block = _mm512_set1_ps(mask_val ? 0.f : -FLT_MAX);
359-
v_a = _mm512_add_ps(v_a, v_mask_block);
359+
if (!mask_val) {
360+
v_a = v_nfltmax;
361+
}
360362
}
361363

362364
if (has_causal_mask) {
@@ -390,8 +392,9 @@ inline void scale_add2_reduce_max(float* a,
390392
if (has_sparse_mask) {
391393
size_t mask_idx = i / sparse_block_size;
392394
uint8_t mask_val = sparse_mask[mask_idx];
393-
__m512 v_mask_block = _mm512_set1_ps(mask_val ? 0.f : -FLT_MAX);
394-
v_a = _mm512_add_ps(v_a, v_mask_block);
395+
if (!mask_val) {
396+
v_a = v_nfltmax;
397+
}
395398
}
396399

397400
if (has_causal_mask) {
@@ -439,8 +442,9 @@ inline void scale_add2_reduce_max(float* a,
439442
if (has_sparse_mask) { \
440443
size_t mask_idx = (i + n * vec_len_f32_avx2) / sparse_block_size; \
441444
uint8_t mask_val = sparse_mask[mask_idx]; \
442-
__m256 v_mask_block = _mm256_set1_ps(mask_val ? 0.f : -FLT_MAX); \
443-
v_a = _mm256_add_ps(v_a, v_mask_block); \
445+
if (!mask_val) { \
446+
v_a = v_nfltmax; \
447+
} \
444448
} \
445449
if (has_causal_mask) { \
446450
auto v_maski8 = _mm_loadu_si128(reinterpret_cast<__m128i const*>(causal_mask + i + n * vec_len_f32_avx2)); \
@@ -476,8 +480,9 @@ inline void scale_add2_reduce_max(float* a,
476480
if (has_sparse_mask) {
477481
size_t mask_idx = i / sparse_block_size;
478482
uint8_t mask_val = sparse_mask[mask_idx];
479-
__m256 v_mask_block = _mm256_set1_ps(mask_val ? 0.f : -FLT_MAX);
480-
v_a = _mm256_add_ps(v_a, v_mask_block);
483+
if (!mask_val) {
484+
v_a = v_nfltmax;
485+
}
481486
}
482487

483488
if (has_causal_mask) {
@@ -512,8 +517,9 @@ inline void scale_add2_reduce_max(float* a,
512517
if (has_sparse_mask) {
513518
size_t mask_idx = i / sparse_block_size;
514519
uint8_t mask_val = sparse_mask[mask_idx];
515-
__m256 v_mask_block = _mm256_set1_ps(mask_val ? 0.f : -FLT_MAX);
516-
v_a = _mm256_add_ps(v_a, v_mask_block);
520+
if (!mask_val) {
521+
v_a = v_nfltmax;
522+
}
517523
}
518524

519525
if (has_causal_mask) {
@@ -560,8 +566,9 @@ inline void scale_add2_reduce_max(float* a,
560566
if (has_sparse_mask) {
561567
size_t mask_idx = i / sparse_block_size;
562568
uint8_t mask_val = sparse_mask[mask_idx];
563-
float32x4_t v_mask_block = vdupq_n_f32(mask_val ? 0.0F : -FLT_MAX);
564-
v_a = vaddq_f32(v_a, v_mask_block);
569+
if (!mask_val) {
570+
v_a = v_nfltmax;
571+
}
565572
}
566573

567574
if (has_causal_mask) {
@@ -596,7 +603,9 @@ inline void scale_add2_reduce_max(float* a,
596603
if (has_sparse_mask) {
597604
size_t mask_idx = i / sparse_block_size;
598605
uint8_t mask_val = sparse_mask[mask_idx];
599-
a[i] += (mask_val ? 0.0F : -FLT_MAX);
606+
if (!mask_val) {
607+
a[i] = -FLT_MAX;
608+
}
600609
}
601610

602611
if (has_causal_mask) {

src/plugins/intel_cpu/tests/unit/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@ if(NOT X86_64)
3232
${CMAKE_CURRENT_SOURCE_DIR}/snippets_transformations/x64
3333
${CMAKE_CURRENT_SOURCE_DIR}/nodes/eltwise_node_test.cpp
3434
${CMAKE_CURRENT_SOURCE_DIR}/brgemm_executor_test.cpp
35-
${CMAKE_CURRENT_SOURCE_DIR}/xattention_test.cpp)
35+
${CMAKE_CURRENT_SOURCE_DIR}/xattention_test.cpp
36+
${CMAKE_CURRENT_SOURCE_DIR}/softmax_kernel_test.cpp)
3637
endif()
3738

3839
if (NOT ENABLE_MLAS_FOR_CPU)
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
// Copyright (C) 2018-2026 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
//
4+
5+
#include "nodes/kernels/scaled_attn/softmax_kernel.hpp"
6+
7+
#include <cmath>
8+
#include <vector>
9+
10+
#include "gtest/gtest.h"
11+
12+
namespace {
13+
TEST(SoftmaxKernelTest, AttnSoftmaxKernelWithSparseMask) {
14+
std::vector<float> input = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
15+
std::vector<float> output(input.size(), 0.0f);
16+
std::vector<uint8_t> sparse_mask = {1, 0, 1, 0}; // Masking some elements, block size 2
17+
float scale = 1.0f;
18+
float* alibi = nullptr;
19+
void* attn_mask = nullptr;
20+
uint8_t* causal_mask = nullptr;
21+
bool select_nfltmax_at_0 = false;
22+
size_t len = input.size();
23+
size_t total_size = input.size();
24+
ov::element::Type attn_mask_prec = ov::element::f32;
25+
ov::element::Type dst_precision = ov::element::f32;
26+
const float* sink = nullptr;
27+
float alibi_slope = 0.0f;
28+
size_t sparse_block_size = 2;
29+
ov::Extensions::Cpu::XARCH::attn_softmax_kernel<float>(input.data(),
30+
output.data(),
31+
scale,
32+
alibi,
33+
attn_mask,
34+
causal_mask,
35+
select_nfltmax_at_0,
36+
len,
37+
total_size,
38+
attn_mask_prec,
39+
dst_precision,
40+
sink,
41+
alibi_slope,
42+
sparse_mask.data(),
43+
sparse_block_size);
44+
std::vector<float> expect_output = {0.00483724f, 0.013149f, 0.0f, 0.0f, 0.264104f, 0.71791f, 0.0f, 0.0f};
45+
for (size_t i = 0; i < output.size(); ++i) {
46+
EXPECT_NEAR(output[i], expect_output[i], 1e-5f);
47+
}
48+
}
49+
50+
TEST(SoftmaxKernelTest, AttnSoftmaxKernelWithNaNInputAndSparseMask) {
51+
std::vector<float> input = {1.0f, 2.0f, std::nanf(""), 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
52+
std::vector<float> output(input.size(), 0.0f);
53+
std::vector<uint8_t> sparse_mask = {1, 0, 1, 0}; // Masking some elements, block size 2
54+
float scale = 1.0f;
55+
float* alibi = nullptr;
56+
void* attn_mask = nullptr;
57+
uint8_t* causal_mask = nullptr;
58+
bool select_nfltmax_at_0 = false;
59+
size_t len = input.size();
60+
size_t total_size = input.size();
61+
ov::element::Type attn_mask_prec = ov::element::f32;
62+
ov::element::Type dst_precision = ov::element::f32;
63+
const float* sink = nullptr;
64+
float alibi_slope = 0.0f;
65+
size_t sparse_block_size = 2;
66+
ov::Extensions::Cpu::XARCH::attn_softmax_kernel<float>(input.data(),
67+
output.data(),
68+
scale,
69+
alibi,
70+
attn_mask,
71+
causal_mask,
72+
select_nfltmax_at_0,
73+
len,
74+
total_size,
75+
attn_mask_prec,
76+
dst_precision,
77+
sink,
78+
alibi_slope,
79+
sparse_mask.data(),
80+
sparse_block_size);
81+
std::vector<float> expect_output = {0.00483724f, 0.013149f, 0.0f, 0.0f, 0.264104f, 0.71791f, 0.0f, 0.0f};
82+
for (size_t i = 0; i < output.size(); ++i) {
83+
EXPECT_NEAR(output[i], expect_output[i], 1e-5f);
84+
}
85+
}
86+
87+
} // namespace

0 commit comments

Comments
 (0)