Commit 3ddcbb3

32 Bit Indexing Support for MPWI (#35491)
### Ticket
#27845

### Problem description
MPWI (max_pool2d_with_indices) currently only supports HW < 2^16.

### What's changed
- MPWI has been updated to support sizes up to HW < 2^32 (a minimal sketch of the size thresholds follows the checklist).
- FP32 DST is used for these cases.
- A check has been added to sliding window to ensure the per-core HW does not exceed the uint16 limit, since sliding window still uses uint16 config tensors.

### Checklist
- [x] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes: https://github.com/tenstorrent/tt-metal/actions/runs/21884522066
- [x] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes: https://github.com/tenstorrent/tt-metal/actions/runs/21884517960
- [x] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes: https://github.com/tenstorrent/tt-metal/actions/runs/21884549209 (same failure as main)
- [x] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes: https://github.com/tenstorrent/tt-metal/actions/runs/21884545637 (same failure as main)
- [x] [Nightly L2](https://github.com/tenstorrent/tt-metal/actions/workflows/tt-metal-l2-nightly.yaml) CI passes: https://github.com/tenstorrent/tt-metal/actions/runs/21884525760
- [x] [Nightly Blackhole](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-nightly-tests.yaml) CI passes: https://github.com/tenstorrent/tt-metal/actions/runs/21884532202 (same failures as main)
- [x] [Frequent model](https://github.com/tenstorrent/tt-metal/actions/workflows/fast-dispatch-full-regressions-and-models.yaml) CI passes: https://github.com/tenstorrent/tt-metal/actions/runs/21884537542 (same failures as main)
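The size thresholds above can be summarized with a small sketch (illustrative only; `pick_index_dtype` is a hypothetical helper, not part of the ttnn API — the real selection happens inside the MPWI op):

```python
# Illustrative sketch of the index-width choice described in this PR.
def pick_index_dtype(input_h: int, input_w: int) -> str:
    hw = input_h * input_w
    if hw < 2**16:
        return "uint16"  # original path: 16-bit indices
    if hw < 2**32:
        return "uint32"  # new path added by this PR: 32-bit indices, FP32 DST
    raise ValueError(f"HW={hw} is not supported (must be < 2^32)")
```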
1 parent bc6d286 commit 3ddcbb3

File tree

11 files changed: +299, -174 lines

tests/sweep_framework/sweep_utils/max_pool2d_with_indices_common.py

Lines changed: 12 additions & 3 deletions
@@ -15,6 +15,7 @@ def validate_indices(input_tensor, torch_indices, ttnn_indices, kernel_size, str
     """
     Validate indices using logic from test_mpwi.py
     Note input tensors should be in [N, H, W, C] format
+    Supports both uint16 and uint32 index tensors (indices should be converted to int64 before calling)
     Returns (indices_valid, tie_breaking_differences, actual_errors, value_differences, window_violations)
     """
     batch_size, input_h, input_w, channels = input_tensor.shape
@@ -215,10 +216,18 @@ def run_max_pool2d_with_indices(
     )
 
     ttnn_output_torch = ttnn.to_torch(ttnn_output)
-    # convert indexes to int64 for compatability with torch
+
+    # convert indexes to int64 for compatibility with torch
     ttnn_indices_torch = ttnn.to_torch(ttnn_indices, dtype=torch.int64)
-    # manually fix the wrapping since TTNN uint16 tensors get converted to int16 torch tensors, even when data type is specified as int64
-    ttnn_indices_torch = torch.where(ttnn_indices_torch < 0, ttnn_indices_torch + 65536, ttnn_indices_torch)
+
+    # manually fix the wrapping since TTNN uint16/uint32 tensors get converted to int16/int32 torch tensors
+    # even when data type is specified as int64
+    if ttnn_indices.dtype == ttnn.uint16:
+        # uint16: wraps at 65536 (2^16)
+        ttnn_indices_torch = torch.where(ttnn_indices_torch < 0, ttnn_indices_torch + 65536, ttnn_indices_torch)
+    elif ttnn_indices.dtype == ttnn.uint32:
+        # uint32: wraps at 4294967296 (2^32)
+        ttnn_indices_torch = torch.where(ttnn_indices_torch < 0, ttnn_indices_torch + 4294967296, ttnn_indices_torch)
 
     torch_output, torch_indices = torch.nn.functional.max_pool2d(
         torch_input,
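The wrap-around correction in the hunk above exists because the unsigned index tensors come back from `ttnn.to_torch` reinterpreted as signed tensors of the same width, even when int64 is requested. A standalone sketch of the arithmetic (plain torch, not the ttnn conversion itself):

```python
import torch

# Simulate a uint16 index (e.g. 40000) being reinterpreted as int16: it goes negative,
# and adding 2^16 restores the original value. The uint32/int32 case is analogous with 2^32.
raw = torch.tensor([40000, 7000], dtype=torch.int64)         # true flat indices
as_int16 = ((raw + 2**15) % 2**16) - 2**15                   # reinterpretation: 40000 -> -25536
fixed = torch.where(as_int16 < 0, as_int16 + 2**16, as_int16)
assert torch.equal(fixed, raw)
```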

tests/ttnn/unit_tests/operations/pool/test_mpwi.py

Lines changed: 76 additions & 21 deletions
@@ -82,9 +82,9 @@ def test_mpwi_20_core_C_dims(device, in_c):
         [4, 64, 30, 40, 4, 8, 1, 1, 2, 4, 1, 1, False],
     ],
 )
-@pytest.mark.parametrize("ttnn_dtype", [ttnn.bfloat16])
+@pytest.mark.parametrize("ttnn_dtype", [ttnn.bfloat16, ttnn.bfloat8_b])
 @skip_with_watcher("Test is not passing with watcher enabled github issue #37195")
-def test_mpwi_kernel_sizes(device, ttnn_dtype, input_spec):
+def test_mpwi_small_kernel_sizes(device, ttnn_dtype, input_spec):
     (
         in_n,
         in_c,
@@ -129,11 +129,6 @@ def test_mpwi_kernel_sizes(device, ttnn_dtype, input_spec):
     [
         # Contains following parameters
         # [batch_size, input_channels, input_height, input_width, kernel_height, kernel_width, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, ceil_mode]
-        # DILATION / MULTI-BATCH CASES
-        [2, 40, 100, 100, 3, 3, 2, 2, 0, 1, 2, 2, True],
-        [3, 56, 85, 85, 3, 3, 3, 3, 1, 0, 2, 2, False],
-        [4, 24, 56, 64, 3, 3, 2, 1, 1, 1, 3, 2, True],
-        # LARGE KERNEL CASES
         [2, 64, 159, 159, 13, 13, 2, 2, 6, 6, 2, 2, True],
         [2, 40, 100, 100, 9, 9, 2, 2, 0, 1, 2, 2, True],
         [3, 56, 85, 85, 8, 8, 3, 3, 1, 0, 2, 2, False],
@@ -146,7 +141,7 @@ def test_mpwi_kernel_sizes(device, ttnn_dtype, input_spec):
 )
 @pytest.mark.parametrize("ttnn_dtype", [ttnn.bfloat16, ttnn.bfloat8_b])
 @skip_with_watcher("Test is not passing with watcher enabled github issue #37195")
-def test_mpwi_general(device, ttnn_dtype, input_spec):
+def test_mpwi_large_kernel_sizes(device, ttnn_dtype, input_spec):
     (
         in_n,
         in_c,
@@ -198,21 +193,28 @@ def test_mpwi_general(device, ttnn_dtype, input_spec):
     )
 
 
-@pytest.mark.skip(reason="DRAM slicing with return_indices is not yet supported")
 @pytest.mark.parametrize(
     "input_spec",
     [
         # Contains following parameters
-        # [batch_size, input_channels, input_height, input_width, kernel_height, kernel_width, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, ceil_mode, num_slices]
-        # DILATION / MULTI-BATCH CASES
-        [2, 40, 1024, 1024, 3, 3, 2, 2, 0, 1, 2, 2, True, 8],
-        [3, 56, 512, 512, 3, 3, 3, 3, 1, 0, 2, 2, False, 8],
-        [4, 24, 768, 768, 3, 3, 2, 1, 1, 1, 3, 2, True, 8],
+        # [batch_size, input_channels, input_height, input_width, kernel_height, kernel_width, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, ceil_mode]
+        [3, 16, 80, 80, 9, 9, 3, 3, 3, 1, 2, 2, True],
+        [2, 48, 60, 60, 6, 6, 2, 2, 2, 0, 2, 2, False],
+        [4, 56, 65, 55, 5, 5, 1, 2, 1, 1, 1, 2, False],
+        [4, 24, 56, 64, 3, 3, 2, 1, 0, 1, 3, 2, True],
     ],
 )
 @pytest.mark.parametrize("ttnn_dtype", [ttnn.bfloat16, ttnn.bfloat8_b])
+@pytest.mark.parametrize(
+    "sharding_scheme",
+    [
+        ttnn.TensorMemoryLayout.HEIGHT_SHARDED,
+        ttnn.TensorMemoryLayout.WIDTH_SHARDED,
+        ttnn.TensorMemoryLayout.BLOCK_SHARDED,
+    ],
+)
 @skip_with_watcher("Test is not passing with watcher enabled github issue #37195")
-def test_mpwi_dram_slice(device, ttnn_dtype, input_spec):
+def test_mpwi_general(device, ttnn_dtype, sharding_scheme, input_spec):
     (
         in_n,
         in_c,
@@ -227,9 +229,11 @@ def test_mpwi_dram_slice(device, ttnn_dtype, input_spec):
         dilation_h,
         dilation_w,
         ceil_mode,
-        num_slices,
     ) = input_spec
-    dram_slice_config = ttnn.Conv2dSliceConfig(num_slices=num_slices, slice_type=ttnn.Conv2dDRAMSliceWidth)
+
+    if sharding_scheme == ttnn.TensorMemoryLayout.WIDTH_SHARDED and ttnn_dtype == ttnn.bfloat8_b:
+        pytest.skip("this case runs OOM")
+
     run_max_pool2d_with_indices(
         in_n,
         in_c,
@@ -245,10 +249,61 @@ def test_mpwi_dram_slice(device, ttnn_dtype, input_spec):
         dilation_w,
         ttnn_dtype,
         device,
-        ttnn.TensorMemoryLayout.HEIGHT_SHARDED,
+        sharding=sharding_scheme,
+        ceil_mode=ceil_mode,
+        memory_config=None,
+        run_twice=True,
+        config_tensor_in_dram=True,
+    )
+
+
+@pytest.mark.parametrize(
+    "input_spec",
+    [
+        [1, 32, 384, 384, 3, 3, 1, 1, 1, 1, 1, 1, False],
+        [1, 48, 350, 350, 5, 5, 1, 1, 2, 2, 1, 1, False],
+        [1, 64, 350, 350, 6, 6, 1, 1, 3, 3, 1, 1, False],
+        [3, 32, 300, 300, 7, 7, 1, 1, 3, 3, 1, 1, False],
+        [2, 48, 300, 300, 9, 9, 2, 2, 4, 4, 1, 1, False],
+    ],
+)
+@pytest.mark.parametrize("ttnn_dtype", [ttnn.bfloat16])
+@skip_with_watcher("Test is not passing with watcher enabled github issue #37195")
+def test_mpwi_32_bit_index(device, ttnn_dtype, input_spec):
+    (
+        in_n,
+        in_c,
+        in_h,
+        in_w,
+        kernel_h,
+        kernel_w,
+        stride_h,
+        stride_w,
+        pad_h,
+        pad_w,
+        dilation_h,
+        dilation_w,
         ceil_mode,
-        None, # no memory_config
-        False, # not in place
-        dram_slice_config=dram_slice_config,
+    ) = input_spec
+
+    run_max_pool2d_with_indices(
+        in_n,
+        in_c,
+        in_h,
+        in_w,
+        kernel_h,
+        kernel_w,
+        stride_h,
+        stride_w,
+        pad_h,
+        pad_w,
+        dilation_h,
+        dilation_w,
+        ttnn_dtype,
+        device,
+        sharding=ttnn.TensorMemoryLayout.HEIGHT_SHARDED,
+        ceil_mode=ceil_mode,
+        memory_config=None,
+        run_twice=True,
         config_tensor_in_dram=True,
     )
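The `test_mpwi_32_bit_index` shapes above are chosen so that the flattened input H*W no longer fits in uint16, which is what exercises the new 32-bit index path (assuming, per the PR description, that the index range is governed by HW):

```python
# All test_mpwi_32_bit_index input shapes exceed the uint16 index range (2^16 - 1 = 65535).
for h, w in [(384, 384), (350, 350), (300, 300)]:
    print(h, w, h * w)   # 147456, 122500, 90000
    assert h * w >= 2**16
```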

tt_metal/hw/inc/api/compute/compute_kernel_api.h

Lines changed: 1 addition & 3 deletions
@@ -595,14 +595,12 @@ ALWI void topk_tile_init() { MATH((llk_math_eltwise_unary_sfpu_topk_init<true>()
  * acquired state via *acquire_dst* call. This call is blocking and is only
  * available on the compute engine.
  *
- * Only a reduction of 9 rows is supported at this time.
- *
  * | Argument  | Description                                                                  | Type       | Valid Range                                            | Required |
  * |-----------|------------------------------------------------------------------------------|------------|--------------------------------------------------------|----------|
  * | idst      | The index of the tile in DST register containing the data to be reduced     | uint32_t   | Must be less than the size of the DST register buffer | True     |
  * | idst_idx  | The index of the tile in DST register containing the indices of the data    | uint32_t   | Must be less than the size of the DST register buffer | True     |
  * | chunk     | The index of the intra-kernel "chunk" of data for large kernel accumulation | uint32_t   | 0 to UINT_MAX                                          | False    |
- * | num_rows  | The number of rows to use for the MaxPool operation                         | uint32_t   | {9}                                                    | False    |
+ * | num_rows  | The number of rows to use for the MaxPool operation                         | uint32_t   | <= 32, but note either 9 or 32 rows will be reduced    | False    |
  * | layout    | The data layout of the data in DST                                          | DataLayout | TILE or ROW_MAJOR                                      | False    |
  * | accumulate| Whether to accumulate results for large kernels                              | bool       | true, false                                            | False    |
  * | ITERATIONS| The number of iterations to perform (unused)                                | int        | 1 to 8                                                 | False    |

ttnn/cpp/ttnn/operations/pool/generic/device/kernels/compute/compute_mpwi.cpp

Lines changed: 16 additions & 10 deletions
@@ -73,6 +73,9 @@ void kernel_main() {
     constexpr uint32_t kernel_h = get_compile_time_arg_val(34);
     constexpr uint32_t kernel_w = get_compile_time_arg_val(35);
     constexpr uint32_t clear_value_cb_id = get_compile_time_arg_val(36);
+    constexpr uint32_t indexes_32_bit = get_compile_time_arg_val(37);
+
+    constexpr DataFormat copy_format = indexes_32_bit ? DataFormat::UInt32 : DataFormat::UInt16;
 
     constexpr uint32_t mpwi_cb_tile_idx = 0;
     constexpr uint32_t data_dst_idx = 0;
@@ -111,8 +114,8 @@ void kernel_main() {
 
     uint32_t current_idx_col;
     uint32_t current_idx_row;
-    const uint16_t start_row = (uint16_t)get_arg_val<uint32_t>(2);
-    const uint16_t start_col = (uint16_t)get_arg_val<uint32_t>(3);
+    const uint32_t start_row = get_arg_val<uint32_t>(2);
+    const uint32_t start_col = get_arg_val<uint32_t>(3);
     current_idx_col = start_col;
     current_idx_row = start_row;
 
@@ -127,7 +130,6 @@ void kernel_main() {
     }
 
     unary_op_init_common(in_cb_id_0, in_cb_id_0);
-    copy_tile_to_dst_init_short(in_cb_id_0);
     max_reduce_with_indices_init<ckernel::DataLayout::ROW_MAJOR>();
 
     // if max out sticks is non-zero then this will be used as the number of out sticks for every core
@@ -144,6 +146,7 @@ void kernel_main() {
         tile_regs_acquire();
         uint32_t intra_kernel_h = 0;
         uint32_t intra_kernel_w = 0;
+        copy_tile_to_dst_init_short(compute_tmp_idx_cb_id);
         reconfig_data_format_srca(compute_tmp_idx_cb_id);
         if (first_iteration) { // move the initial indexes from the reader to DST
             cb_wait_front(in_idx_cb_id, 1);
@@ -159,24 +162,27 @@ void kernel_main() {
             // clear the accumulation tiles since they will contain garbage data which is partially loaded
             // since max SFPU offset if 62 DST rows, but 4 rows are loaded each time so we load 2 rows of
            // DST tiles 1 and 3 during the reduction of tiles 0 and 2
+            copy_tile_to_dst_init_short(clear_value_cb_id);
             reconfig_data_format_srca(clear_value_cb_id);
             copy_tile(clear_value_cb_id, mpwi_cb_tile_idx, data_accum_dst_idx);
 
             // make a copy of the initial indexes to be used for restoring between C blocks
-            copy_dest_values<DataFormat::UInt16>(index_dst_idx, index_temp_dst_idx);
+            copy_dest_values<copy_format>(index_dst_idx, index_temp_dst_idx);
         }
 
         for (uint32_t chunk = 0; chunk < interm_reduction_chunks; chunk++) {
            bool last_chunk = chunk == interm_reduction_chunks - 1;
 
            cb_wait_front(curr_in_cb_id, 1);
+            copy_tile_to_dst_init_short(curr_in_cb_id);
            reconfig_data_format_srca(curr_in_cb_id);
            copy_tile(curr_in_cb_id, mpwi_cb_tile_idx, data_dst_idx);
 
            // increments happen between every chunk within a C block, and between C blocks
            bool increment_needed = false;
            if (last_c_block && last_chunk) { // increment for the next kernel position
                increment_needed = true;
+                copy_tile_to_dst_init_short(compute_tmp_idx_cb_id);
                reconfig_data_format_srca(compute_tmp_idx_cb_id);
                // update the current index column
                if (current_idx_col + stride_w + eff_kernel_w > in_w_padded) {
@@ -198,6 +204,7 @@ void kernel_main() {
            } else if (is_large_kernel) { // only need to increment within C block if multiple chunks
                if (!last_chunk) { // increment for the next chunk within the same C block
                    increment_needed = true;
+                    copy_tile_to_dst_init_short(compute_tmp_idx_cb_id);
                    reconfig_data_format_srca(compute_tmp_idx_cb_id);
                    if (intra_kernel_w + sticks_per_chunk < kernel_w) { // move right in this row
                        intra_kernel_w += sticks_per_chunk;
@@ -210,23 +217,22 @@ void kernel_main() {
                }
            }
            if (!increment_needed) {
-                copy_dest_values<DataFormat::UInt16>(index_dst_idx, index_scratch_out_dst_idx);
+                copy_dest_values<copy_format>(index_dst_idx, index_scratch_out_dst_idx);
            } else {
                // we allow overflow here for negative values as this only occurs in padding regions
                add_int_tile_init();
-                add_int_tile<DataFormat::UInt16>(index_dst_idx, inc_dst_idx, index_scratch_out_dst_idx);
+                add_int_tile<copy_format>(index_dst_idx, inc_dst_idx, index_scratch_out_dst_idx);
                max_reduce_with_indices_init<ckernel::DataLayout::ROW_MAJOR>();
            }
 
-            // TODO # 27845: implement accumulation for <=9 MPWI SFPU so we can use this version for large kernels
-            // as well
+            // TODO implement accumulation for <=9 MPWI SFPU so we can use this version for large kernels as well
            constexpr uint32_t max_mpwi_kernel_size = window_size_hw <= 9 ? 9 : 32;
            max_reduce_with_indices<max_mpwi_kernel_size, ckernel::DataLayout::ROW_MAJOR, is_large_kernel>(
                data_dst_idx, index_dst_idx, chunk);
 
            if constexpr (is_large_kernel) {
                if (!last_chunk) {
-                    copy_dest_values<DataFormat::UInt16>(index_scratch_out_dst_idx, index_dst_idx);
+                    copy_dest_values<copy_format>(index_scratch_out_dst_idx, index_dst_idx);
                }
            }
 
@@ -236,7 +242,7 @@ void kernel_main() {
        // After all chunks: if not last C block, restore base indices for next C block
        if constexpr (is_large_kernel) {
            if (!last_c_block) {
-                copy_dest_values<DataFormat::UInt16>(index_temp_dst_idx, index_scratch_out_dst_idx);
+                copy_dest_values<copy_format>(index_temp_dst_idx, index_scratch_out_dst_idx);
            }
        }
 
