
Commit 7c065e0

#22781: Fixes to support uneven ND sharding (#22782)
### Ticket
#22781

### Problem description
ND sharding currently has issues with uneven shards.

### What's changed
- Added more tests to check uneven shards
- Fixed row-major tensor alignment for ND sharding
- Used the padded tensor shape for the DistributionSpec calculation
- Avoided converting ND sharding to a single bank, because single-bank layouts don't have much support in OPs

### Checklist
- [x] [All post commit CI passes](https://github.com/tenstorrent/tt-metal/actions/runs/15343571295)
- [x] New/Existing tests provide coverage for changes
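To make "uneven shards" concrete before the diffs below: ND sharding splits each dimension independently, and when a dimension is not a multiple of the shard extent the trailing shard is only partially filled. Below is a minimal, illustrative sketch of that arithmetic for one of the new row-major test cases (shape {35, 45, 55}, shard shape {10, 10, 10}); the helper code and output format are hypothetical, only the shapes come from the new tests.

```cpp
#include <array>
#include <cstddef>
#include <cstdio>

int main() {
    // Uneven ROW_MAJOR case added by this change: tensor shape {35, 45, 55}, shard shape {10, 10, 10}.
    std::array<int, 3> shape = {35, 45, 55};
    std::array<int, 3> shard = {10, 10, 10};
    for (std::size_t d = 0; d < shape.size(); ++d) {
        int num_shards = (shape[d] + shard[d] - 1) / shard[d];    // ceil division: 4, 5, 6 shards
        int last_extent = shape[d] - (num_shards - 1) * shard[d]; // the trailing shard only covers 5 of 10
        std::printf("dim %zu: %d shards of %d, last shard covers %d\n", d, num_shards, shard[d], last_extent);
    }
    return 0;
}
```

Every dimension ends in a partial shard, which is exactly the situation the alignment and DistributionSpec changes below have to handle.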
1 parent 214fed5 commit 7c065e0

6 files changed (+124, -52 lines)

tests/ttnn/unit_tests/gtests/tensor/test_tensor_nd_sharding.cpp

Lines changed: 80 additions & 0 deletions

@@ -200,6 +200,41 @@ INSTANTIATE_TEST_SUITE_P(
             .shard_shape = Shape({32, 32, 32}),
             .layout = Layout::TILE,
         },
+        NDShardingParams{
+            .shape = Shape({3 * 32 + 5, 4 * 32, 5 * 32}),
+            .shard_shape = Shape({32, 4 * 32, 5 * 32}),
+            .layout = Layout::TILE,
+        },
+        NDShardingParams{
+            .shape = Shape({3 * 32, 4 * 32 + 5, 5 * 32}),
+            .shard_shape = Shape({3 * 32, 32, 5 * 32}),
+            .layout = Layout::TILE,
+        },
+        NDShardingParams{
+            .shape = Shape({3 * 32, 4 * 32, 5 * 32 + 5}),
+            .shard_shape = Shape({3 * 32, 4 * 32, 32}),
+            .layout = Layout::TILE,
+        },
+        NDShardingParams{
+            .shape = Shape({3 * 32, 4 * 32 + 5, 5 * 32 + 5}),
+            .shard_shape = Shape({3 * 32, 32, 32}),
+            .layout = Layout::TILE,
+        },
+        NDShardingParams{
+            .shape = Shape({3 * 32 + 5, 4 * 32, 5 * 32 + 5}),
+            .shard_shape = Shape({32, 4 * 32, 32}),
+            .layout = Layout::TILE,
+        },
+        NDShardingParams{
+            .shape = Shape({3 * 32 + 5, 4 * 32 + 5, 5 * 32}),
+            .shard_shape = Shape({32, 32, 5 * 32}),
+            .layout = Layout::TILE,
+        },
+        NDShardingParams{
+            .shape = Shape({3 * 32 + 5, 4 * 32 + 5, 5 * 32 + 5}),
+            .shard_shape = Shape({32, 32, 32}),
+            .layout = Layout::TILE,
+        },
         NDShardingParams{
             .shape = Shape({30, 40, 50}),
             .shard_shape = Shape({30, 40, 50}),

@@ -239,6 +274,51 @@ INSTANTIATE_TEST_SUITE_P(
             .shape = Shape({30, 40, 50}),
             .shard_shape = Shape({10, 10, 10}),
             .layout = Layout::ROW_MAJOR,
+        },
+        NDShardingParams{
+            .shape = Shape({3, 4, 5}),
+            .shard_shape = Shape({1, 1, 1}),
+            .layout = Layout::ROW_MAJOR,
+        },
+        NDShardingParams{
+            .shape = Shape({35, 40, 50}),
+            .shard_shape = Shape({10, 40, 50}),
+            .layout = Layout::ROW_MAJOR,
+        },
+        NDShardingParams{
+            .shape = Shape({30, 45, 50}),
+            .shard_shape = Shape({30, 10, 50}),
+            .layout = Layout::ROW_MAJOR,
+        },
+        NDShardingParams{
+            .shape = Shape({30, 40, 55}),
+            .shard_shape = Shape({30, 40, 10}),
+            .layout = Layout::ROW_MAJOR,
+        },
+        NDShardingParams{
+            .shape = Shape({35, 45, 50}),
+            .shard_shape = Shape({10, 10, 50}),
+            .layout = Layout::ROW_MAJOR,
+        },
+        NDShardingParams{
+            .shape = Shape({35, 40, 55}),
+            .shard_shape = Shape({10, 40, 10}),
+            .layout = Layout::ROW_MAJOR,
+        },
+        NDShardingParams{
+            .shape = Shape({30, 45, 55}),
+            .shard_shape = Shape({30, 10, 10}),
+            .layout = Layout::ROW_MAJOR,
+        },
+        NDShardingParams{
+            .shape = Shape({35, 45, 55}),
+            .shard_shape = Shape({10, 10, 10}),
+            .layout = Layout::ROW_MAJOR,
+        },
+        NDShardingParams{
+            .shape = Shape({3, 5, 7}),
+            .shard_shape = Shape({2, 2, 2}),
+            .layout = Layout::ROW_MAJOR,
         }),
         ::testing::Values(BufferType::L1, BufferType::DRAM),
         ::testing::Values(ShardOrientation::ROW_MAJOR, ShardOrientation::COL_MAJOR)));

tests/ttnn/unit_tests/tensor/test_tensor_nd_sharding.py

Lines changed: 25 additions & 26 deletions

@@ -15,36 +15,35 @@
         ([3, 4, 5], [3, 4, 5], ttnn.ROW_MAJOR_LAYOUT),  # All data on a single core
         ([3, 4, 5], [3, 4, 1], ttnn.ROW_MAJOR_LAYOUT),  # Each core gets full batch and height dimension
         ([3, 4, 5], [3, 1, 5], ttnn.ROW_MAJOR_LAYOUT),  # Each core gets full batch and width dimension
-        (
-            [3, 4, 5],
-            [1, 4, 5],
-            ttnn.ROW_MAJOR_LAYOUT,
-        ),  # Each core gets full height and width dimension, aka 1 batch per core
+        ([3, 4, 5], [1, 4, 5], ttnn.ROW_MAJOR_LAYOUT),  # Each core gets full height and width dimension
         ([3, 4, 5], [3, 1, 1], ttnn.ROW_MAJOR_LAYOUT),  # Each core gets full batch dimension
         ([3, 4, 5], [1, 4, 1], ttnn.ROW_MAJOR_LAYOUT),  # Each core gets full height dimension
         ([3, 4, 5], [1, 1, 5], ttnn.ROW_MAJOR_LAYOUT),  # Each core gets full width dimension
-        (
-            [3, 4, 5],
-            [1, 1, 1],
-            ttnn.ROW_MAJOR_LAYOUT,
-        ),  # Data is distributed equally across all cores, no dimenions preserved
+        ([3, 4, 5], [1, 1, 1], ttnn.ROW_MAJOR_LAYOUT),  # Data is distributed equally across all cores
         # Tile Layout
-        ([3, 4 * 32, 5 * 32], [3, 4 * 32, 5 * 32], ttnn.TILE_LAYOUT),  # All data on a single core
-        ([3, 4 * 32, 5 * 32], [3, 4 * 32, 32], ttnn.TILE_LAYOUT),  # Each core gets full batch and height dimension
-        ([3, 4 * 32, 5 * 32], [3, 32, 5 * 32], ttnn.TILE_LAYOUT),  # Each core gets full batch and width dimension
-        (
-            [3, 4 * 32, 5 * 32],
-            [1, 4 * 32, 5 * 32],
-            ttnn.TILE_LAYOUT,
-        ),  # Each core gets full height and width dimension, aka 1 batch per core
-        ([3, 4 * 32, 5 * 32], [3, 32, 32], ttnn.TILE_LAYOUT),  # Each core gets full batch dimension
-        ([3, 4 * 32, 5 * 32], [1, 4 * 32, 32], ttnn.TILE_LAYOUT),  # Each core gets full height dimension
-        ([3, 4 * 32, 5 * 32], [1, 32, 5 * 32], ttnn.TILE_LAYOUT),  # Each core gets full width dimension
-        (
-            [3, 4 * 32, 5 * 32],
-            [1, 32, 32],
-            ttnn.TILE_LAYOUT,
-        ),  # Data is distributed equally across all cores, no dimenions preserved
+        ([3, 128, 160], [3, 128, 160], ttnn.TILE_LAYOUT),  # All data on a single core
+        ([3, 128, 160], [3, 128, 32], ttnn.TILE_LAYOUT),  # Each core gets full batch and height dimension
+        ([3, 128, 160], [3, 32, 160], ttnn.TILE_LAYOUT),  # Each core gets full batch and width dimension
+        ([3, 128, 160], [1, 128, 160], ttnn.TILE_LAYOUT),  # Each core gets full height and width dimension
+        ([3, 128, 160], [3, 32, 32], ttnn.TILE_LAYOUT),  # Each core gets full batch dimension
+        ([3, 128, 160], [1, 128, 32], ttnn.TILE_LAYOUT),  # Each core gets full height dimension
+        ([3, 128, 160], [1, 32, 160], ttnn.TILE_LAYOUT),  # Each core gets full width dimension
+        ([3, 128, 160], [1, 32, 32], ttnn.TILE_LAYOUT),  # Data is distributed equally across all cores
+        # Uneven shards
+        ([30, 40, 55], [30, 40, 10], ttnn.ROW_MAJOR_LAYOUT),
+        ([30, 45, 50], [30, 10, 50], ttnn.ROW_MAJOR_LAYOUT),
+        ([35, 40, 50], [10, 40, 50], ttnn.ROW_MAJOR_LAYOUT),
+        ([30, 45, 50], [30, 10, 50], ttnn.ROW_MAJOR_LAYOUT),
+        ([35, 40, 50], [10, 40, 50], ttnn.ROW_MAJOR_LAYOUT),
+        ([35, 45, 50], [10, 10, 50], ttnn.ROW_MAJOR_LAYOUT),
+        ([35, 45, 55], [10, 10, 10], ttnn.ROW_MAJOR_LAYOUT),
+        ([3, 128, 165], [3, 128, 32], ttnn.TILE_LAYOUT),
+        ([3, 130, 160], [3, 32, 160], ttnn.TILE_LAYOUT),
+        ([5, 128, 160], [2, 128, 160], ttnn.TILE_LAYOUT),
+        ([3, 130, 165], [3, 32, 32], ttnn.TILE_LAYOUT),
+        ([5, 128, 165], [2, 128, 32], ttnn.TILE_LAYOUT),
+        ([5, 130, 160], [2, 32, 160], ttnn.TILE_LAYOUT),
+        ([5, 130, 165], [2, 32, 32], ttnn.TILE_LAYOUT),
     ],
 )
 @pytest.mark.parametrize("buffer_type", [ttnn.BufferType.L1, ttnn.BufferType.DRAM])

ttnn/core/tensor/layout/page_config.cpp

Lines changed: 14 additions & 13 deletions

@@ -134,21 +134,22 @@ const Tile& TilePageConfig::get_tile() const { return tile_; }
 RowMajorPageConfig::RowMajorPageConfig(const Tile& tile) : tile_(tile) {}
 
 Alignment RowMajorPageConfig::create_default_alignment(DataType dtype, const MemoryConfig& memory_config) const {
-    {
-        if (memory_config.shard_spec().has_value()) {
-            const auto& shard_spec = memory_config.shard_spec().value();
-            if (shard_spec.mode == ShardMode::LOGICAL) {
-                return shard_spec.physical_shard_shape.has_value() ? Alignment(shard_spec.physical_shard_shape.value())
-                                                                   : Alignment({shard_spec.shape[1]});
-            }
-            // TODO: Investigate why we need guard against HEIGHT_SHARDED and merge logic with LOGICAL sharding
-            if (shard_spec.mode == ShardMode::PHYSICAL &&
-                memory_config.memory_layout() != TensorMemoryLayout::HEIGHT_SHARDED) {
-                return Alignment({shard_spec.shape[1]});
-            }
+    if (memory_config.shard_spec().has_value()) {
+        const auto& shard_spec = memory_config.shard_spec().value();
+        if (shard_spec.mode == ShardMode::LOGICAL) {
+            return shard_spec.physical_shard_shape.has_value() ? Alignment(shard_spec.physical_shard_shape.value())
+                                                               : Alignment({shard_spec.shape[1]});
         }
-        return Alignment({1});
+        // TODO: Investigate why we need guard against HEIGHT_SHARDED and merge logic with LOGICAL sharding
+        if (shard_spec.mode == ShardMode::PHYSICAL &&
+            memory_config.memory_layout() != TensorMemoryLayout::HEIGHT_SHARDED) {
+            return Alignment({shard_spec.shape[1]});
+        }
+    } else if (memory_config.nd_shard_spec().has_value()) {
+        const auto& nd_shard_spec = *memory_config.nd_shard_spec();
+        return Alignment({nd_shard_spec.shard_shape[-1]});
     }
+    return Alignment({1});
 }
 
 void RowMajorPageConfig::validate_alignment(
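The new `else if` branch makes the default row-major alignment follow the ND shard width (`nd_shard_spec.shard_shape[-1]`), mirroring what the legacy path above already does with `shard_spec.shape[1]`. A rough sketch of the intended effect, assuming the alignment rounds the innermost (width) dimension of the physical shape up to that value; the helper below is a stand-in, not the real Alignment machinery:

```cpp
#include <cstdio>

// Stand-in for aligning the innermost dimension to the ND shard width.
static int align_up(int value, int alignment) { return ((value + alignment - 1) / alignment) * alignment; }

int main() {
    // Uneven ROW_MAJOR case from the new tests: shape {30, 40, 55}, shard shape {30, 40, 10}.
    int logical_width = 55;
    int shard_width = 10;                                       // nd_shard_spec.shard_shape[-1]
    int aligned_width = align_up(logical_width, shard_width);   // 60, so rows pack into whole shard-width pages
    std::printf("width %d aligned to shard width %d -> %d (last shard carries %d columns of data)\n",
                logical_width, shard_width, aligned_width, logical_width % shard_width);
    return 0;
}
```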

ttnn/core/tensor/layout/tensor_layout.cpp

Lines changed: 2 additions & 1 deletion

@@ -214,8 +214,9 @@ std::optional<std::variant<ShardSpecBuffer, BufferDistributionSpec>> TensorLayou
     }
 
     auto& nd_shard_spec = memory_config_.nd_shard_spec().value();
+    auto padded_shape = compute_padded_shape(shape);
     return BufferDistributionSpec::from_shard_spec(
-        shape, nd_shard_spec.shard_shape, page_shape, nd_shard_spec.grid, nd_shard_spec.orientation);
+        padded_shape, nd_shard_spec.shard_shape, page_shape, nd_shard_spec.grid, nd_shard_spec.orientation);
 }
 
 size_t TensorLayout::compute_packed_buffer_size_bytes(const ttnn::Shape& shape) const {
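Passing `compute_padded_shape(shape)` means the distribution is computed over the physical (padded) extent of the buffer rather than the logical shape. A sketch of why that matters for one of the new TILE cases, assuming the usual rounding of the height/width dims up to the 32-element tile; the helper name is illustrative:

```cpp
#include <cstdio>

// Illustrative: round a height/width dimension up to the 32-element tile extent.
static int pad_to_tile(int dim, int tile = 32) { return ((dim + tile - 1) / tile) * tile; }

int main() {
    // TILE case from the new tests: shape {3 * 32, 4 * 32 + 5, 5 * 32}, shard shape {3 * 32, 32, 5 * 32}.
    int logical_height = 4 * 32 + 5;                  // 133 rows of real data
    int padded_height = pad_to_tile(logical_height);  // 160 rows once tilized
    int shard_height = 32;
    int num_shards = padded_height / shard_height;    // 5 shards over the padded extent
    // Distributing the logical 133 rows instead would mis-size the last shard's pages,
    // since the buffer physically holds the padded 160 rows.
    std::printf("logical %d -> padded %d rows: %d shards, last one holds %d real rows\n",
                logical_height, padded_height, num_shards,
                logical_height - (num_shards - 1) * shard_height);
    return 0;
}
```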

ttnn/core/tensor/tensor_impl.cpp

Lines changed: 3 additions & 2 deletions

@@ -940,8 +940,9 @@ std::array<Shape2D, 2> get_logical_and_physical_shard_shapes(const TensorSpec& t
 
     // TODO: get_logical_shard_shape always returns shard shape from shard spec, which is not correct in physical mode
     // if there is padding
-    if (tensor_spec.memory_config().is_sharded() and
-        (tensor_spec.memory_config().shard_spec().value().mode == ShardMode::LOGICAL or
+    if (tensor_spec.memory_config().is_sharded() &&
+        ((tensor_spec.memory_config().shard_spec().has_value() &&
+          tensor_spec.memory_config().shard_spec().value().mode == ShardMode::LOGICAL) ||
          logical_shape == padded_shape)) {
         return {
             tensor_spec.tensor_layout().get_logical_shard_shape(),
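The reworked condition also guards the `shard_spec().value()` access: an ND-sharded tensor reports `is_sharded()` while the legacy `shard_spec()` optional is empty, and the previous `and`/`or` expression dereferenced it unconditionally. A minimal standalone illustration of the pattern with plain `std::optional` (not the real MemoryConfig API):

```cpp
#include <cstdio>
#include <optional>

enum class ShardMode { LOGICAL, PHYSICAL };

int main() {
    std::optional<ShardMode> shard_spec;  // empty, like an ND-sharded tensor with no legacy shard spec
    bool logical_equals_padded = true;

    // Old shape of the check: shard_spec.value() runs even when the optional is empty and throws.
    // bool old_check = shard_spec.value() == ShardMode::LOGICAL || logical_equals_padded;

    // New shape of the check: value access only happens after has_value() short-circuits to true.
    bool new_check = (shard_spec.has_value() && *shard_spec == ShardMode::LOGICAL) || logical_equals_padded;
    std::printf("guarded check: %s\n", new_check ? "true" : "false");
    return 0;
}
```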

ttnn/core/tensor/tensor_spec.cpp

Lines changed: 0 additions & 10 deletions

@@ -246,16 +246,6 @@ std::optional<MemoryConfig> TensorSpec::populate_legacy_shard_spec_from_nd() con
         return std::nullopt;
     }
 
-    // Detect single bank case
-    if (nd_shard_shape == padded_shape()) {
-        return MemoryConfig::create_with_prepopulated_shard_specs(
-            TensorMemoryLayout::SINGLE_BANK,
-            mem_config.buffer_type(),
-            ShardSpec(nd_shard_spec.grid, physical_shape(), nd_shard_spec.orientation),
-            mem_config.nd_shard_spec(),
-            mem_config.created_with_nd_shard_spec());
-    }
-
     ShardSpec shard_spec(
         nd_shard_spec.grid,
         {nd_shard_shape.volume() / nd_shard_shape[-1], nd_shard_shape[-1]},
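With the single-bank special case removed, an ND shard spec that covers the whole tensor now falls through to the same collapse-to-2D path as every other ND spec: the shard shape is flattened into a legacy {height, width} shard of {volume / last_dim, last_dim}, as the surviving ShardSpec construction above shows. A small sketch of that flattening, using stand-in types rather than the real ShardSpec:

```cpp
#include <array>
#include <cstdio>

int main() {
    // ND shard shape from one of the new tests, e.g. {2, 32, 32}.
    std::array<long, 3> nd_shard_shape = {2, 32, 32};
    long volume = 1;
    for (long d : nd_shard_shape) volume *= d;  // 2048 elements per shard
    long width = nd_shard_shape.back();         // 32
    long height = volume / width;               // 64, i.e. 2 * 32 rows stacked
    std::printf("nd shard {2, 32, 32} -> legacy 2D shard {%ld, %ld}\n", height, width);
    return 0;
}
```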