Fix SDPA decode watcher errors with half-tile (16x32) CBs

Pavle Josipovic · claude · Pavle Josipovic · commit ff8edefa4125 · 2026-02-13T09:21:21.000Z
`generate_reduce_scaler` hard-coded a 2048-byte zero-fill and 4 faces, assuming full 32x32 bf16 tiles. When circular buffers use half tiles (16x32, 1024 B, 2 faces), the zero-fill and the face writes run past the reserved tile and overwrite adjacent L1 memory, causing watcher-detected corruption. Restore the `half_tile` template parameter (previously removed in cleanup) so the zero-fill size and face iteration adapt to the actual tile dimensions. Also fix the idle-core reader runtime args count mismatch in sdpa_decode_program_factory (14 -> 15 args). Fixes: #37631 Fixes: #29225 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/models/demos/deepseek_v3_b1/tests/unit_tests/test_flash_mla.py b/models/demos/deepseek_v3_b1/tests/unit_tests/test_flash_mla.py
@@ -12,7 +12,7 @@
 from loguru import logger
 
 import ttnn
-from models.common.utility_functions import comp_pcc, is_blackhole, is_watcher_enabled
+from models.common.utility_functions import comp_pcc
 from models.demos.deepseek_v3_b1.micro_ops.flash_mla.op import FlashMLADecode
 
 
@@ -22,8 +22,6 @@
 @pytest.mark.parametrize("max_seq_len", [32 * 1024])  # 32k max sequence length per chip
 def test_flash_mla_decode(device, batch_size, num_chunks, k_chunk_size, max_seq_len):
     """Test FlashMLADecode op."""
-    if is_blackhole() and is_watcher_enabled():
-        pytest.skip("Skipping test on Blackhole with watcher enabled, see issue #37631")
 
     # Calculate decode_position from num_chunks and k_chunk_size
     decode_position = num_chunks * k_chunk_size - 1
diff --git a/ttnn/cpp/ttnn/kernel/dataflow/generate_reduce_scaler.hpp b/ttnn/cpp/ttnn/kernel/dataflow/generate_reduce_scaler.hpp
@@ -8,10 +8,11 @@
 
 // Tile is assumed to have 16-bit elements
 // Scaler is assumed to be a 16-bit value double packed into a u32
+template <bool half_tile = false>
 FORCE_INLINE void generate_reduce_scaler(const uint32_t cb_id, const uint32_t scaler) {
     cb_reserve_back(cb_id, 1);
 
-    constexpr uint32_t num_zeros_reads = 2048 / MEM_ZEROS_SIZE;
+    constexpr uint32_t num_zeros_reads = (half_tile ? 1024 : 2048) / MEM_ZEROS_SIZE;
     static_assert(num_zeros_reads > 0, "num_zeros_reads must be greater than 0");
     uint64_t zeros_noc_addr = get_noc_addr(MEM_ZEROS_BASE);
     uint32_t write_addr = get_write_ptr(cb_id);
@@ -27,7 +28,7 @@ FORCE_INLINE void generate_reduce_scaler(const uint32_t cb_id, const uint32_t sc
     noc_async_read_barrier();
 
     if (scaler != 0) {
-        for (int k = 0; k < 4; ++k) {
+        for (int k = 0; k < (half_tile ? 2 : 4); ++k) {
             uint32_t idx = k << 7;
             for (int j = 0; j < 8; ++j) {
                 ptr[idx + j] = scaler;
diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/dataflow/writer_decode_all.cpp b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/dataflow/writer_decode_all.cpp
@@ -134,9 +134,9 @@ void kernel_main() {
     constexpr uint32_t cb_out_l = tt::CBIndex::c_18;
 
     // generate and send scaler to compute
-    // These helper functions respect tile size of CBs (ie. no need for special handling of tiny tiles)
-    generate_reduce_scaler(cb_identity_scale_in, identity_scalar_packed);
-    generate_reduce_scaler(cb_zero_in, zero_scalar_packed);
+    constexpr bool is_half_tile = (get_tile_size(cb_identity_scale_in) < 2 * tt::constants::TILE_HW);
+    generate_reduce_scaler<is_half_tile>(cb_identity_scale_in, identity_scalar_packed);
+    generate_reduce_scaler<is_half_tile>(cb_zero_in, zero_scalar_packed);
     generate_bcast_col_scalar(cb_col_identity, identity_scalar_packed);
 
     if (k_chunk_start == window_start_chunk && window_start_unaligned > 0) {
diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/sdpa_decode_program_factory.cpp b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/sdpa_decode_program_factory.cpp
@@ -1011,7 +1011,7 @@ SdpaDecodeProgramFactory::cached_program_t SdpaDecodeProgramFactory::create(
         for (auto core : core_group_idle) {
             log_debug(tt::LogOp, "Setting core {} to idle", core);
             // reader runtime args
-            std::vector<uint32_t> reader_rt_args = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+            std::vector<uint32_t> reader_rt_args = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
 
             // writer runtime args
             std::vector<uint32_t> writer_rt_args = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};