tenstorrent
diff --git a/tests/tt_metal/distributed/test_mesh_socket.cpp b/tests/tt_metal/distributed/test_mesh_socket.cpp
Lines changed: 4 additions & 4 deletions
diff --git a/tests/tt_metal/tt_metal/api/CMakeLists.txt b/tests/tt_metal/tt_metal/api/CMakeLists.txt
Lines changed: 0 additions & 1 deletion
diff --git a/tests/tt_metal/tt_metal/api/distribution_spec/test_buffer_distribution_spec.cpp b/tests/tt_metal/tt_metal/api/distribution_spec/test_buffer_distribution_spec.cpp
Lines changed: 81 additions & 125 deletions
diff --git a/tests/tt_metal/tt_metal/api/distribution_spec/test_distribution_spec.cpp b/tests/tt_metal/tt_metal/api/distribution_spec/test_distribution_spec.cpp
Lines changed: 0 additions & 360 deletions
diff --git a/tests/ttnn/unit_tests/gtests/tensor/test_tensor_nd_sharding.cpp b/tests/ttnn/unit_tests/gtests/tensor/test_tensor_nd_sharding.cpp
Lines changed: 48 additions & 141 deletions
diff --git a/tt_metal/CMakeLists.txt b/tt_metal/CMakeLists.txt
Lines changed: 1 addition & 1 deletion
diff --git a/tt_metal/api/tt-metalium/buffer.hpp b/tt_metal/api/tt-metalium/buffer.hpp
Lines changed: 2 additions & 33 deletions
diff --git a/tt_metal/api/tt-metalium/buffer_distribution_spec.hpp b/tt_metal/api/tt-metalium/buffer_distribution_spec.hpp
Lines changed: 24 additions & 17 deletions
@@ -1887,10 +1887,10 @@ TEST_F(MeshSocketTest, MultiConnectionSingleDeviceConfig) {
     EXPECT_EQ(recv_configs.size(), recv_logical_coords.size());
 
     const auto& sender_core_to_core_id =
-        send_socket.get_config_buffer()->get_backing_buffer()->get_buffer_page_mapping()->core_to_core_id_;
+        send_socket.get_config_buffer()->get_backing_buffer()->get_buffer_page_mapping()->core_to_core_id;
 
     const auto& recv_core_to_core_id =
-        recv_socket.get_config_buffer()->get_backing_buffer()->get_buffer_page_mapping()->core_to_core_id_;
+        recv_socket.get_config_buffer()->get_backing_buffer()->get_buffer_page_mapping()->core_to_core_id;
 
     for (const auto& connection : socket_connections) {
         const auto& sender = connection.sender_core;
@@ -1987,10 +1987,10 @@ TEST_F(MeshSocketTest2DFabric, MultiConnectionMultiDeviceTest) {
     auto [send_socket_dram, recv_socket_dram] = MeshSocket::create_sockets(md0, md1, socket_config_dram);
 
     const auto& sender_core_to_core_id =
-        send_socket_l1.get_config_buffer()->get_backing_buffer()->get_buffer_page_mapping()->core_to_core_id_;
+        send_socket_l1.get_config_buffer()->get_backing_buffer()->get_buffer_page_mapping()->core_to_core_id;
 
     const auto& recv_core_to_core_id =
-        recv_socket_l1.get_config_buffer()->get_backing_buffer()->get_buffer_page_mapping()->core_to_core_id_;
+        recv_socket_l1.get_config_buffer()->get_backing_buffer()->get_buffer_page_mapping()->core_to_core_id;
 
     std::unordered_map<MeshCoordinate, std::vector<sender_socket_md>> sender_configs_per_dev_coord;
     std::unordered_map<MeshCoordinate, std::vector<receiver_socket_md>> recv_configs_per_dev_coord;
 
@@ -14,7 +14,6 @@ set(UNIT_TESTS_API_SRC
     ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRangeSet_contains.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRangeSet_intersects.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRangeSet_merge.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/distribution_spec/test_distribution_spec.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/distribution_spec/test_buffer_distribution_spec.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/test_banked.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/test_bit_utils.cpp
 
@@ -5,8 +5,6 @@
 
 #include "ttnn/tensor/tensor.hpp"
 #include "ttnn/operations/eltwise/binary/binary.hpp"
-#include "ttnn/core/tensor/nd_sharding_utils.hpp"
-
 #include "ttnn_test_fixtures.hpp"
 
 namespace {
@@ -38,13 +36,6 @@ struct NDShardingOpCompatParams {
     Shape shard_shape;
     CoreCoord grid_size;
 };
-struct PrepareShardedDataParams {
-    Shape shape;
-    Shape shard_shape;
-    uint32_t num_cores;
-
-    std::vector<uint8_t> expected_data;
-};
 }  // namespace
 
 class NDShardingTests
@@ -158,35 +149,59 @@ TEST_P(NdShardingOpCompatTests, TestAdd) {
     }
 }
 
-class PrepareNdShardedDataTests : public ::testing::TestWithParam<PrepareShardedDataParams> {};
-
-TEST_P(PrepareNdShardedDataTests, PrepareNdShardedData) {
-    const auto& params = GetParam();
+class NDShardingPerfTests : public ttnn::TTNNFixtureWithDevice {};
 
-    CoreRangeSet cores(CoreRange(CoreCoord{0, 0}, CoreCoord{0, params.num_cores - 1}));
-    NdShardSpec nd_shard_spec{params.shard_shape, cores, ShardOrientation::ROW_MAJOR};
-    MemoryConfig memory_config{BufferType::L1, nd_shard_spec};
-    TensorLayout tensor_layout(DataType::UINT8, PageConfig(Layout::ROW_MAJOR), memory_config);
-    TensorSpec tensor_spec(params.shape, tensor_layout);
+TEST_F(NDShardingPerfTests, TestBatchShardingPerf) {
+    CoreRangeSet cores(CoreRange(CoreCoord{0, 0}, CoreCoord{6, 6}));
 
-    std::vector<uint8_t> data(params.shape.volume());
-    for (size_t i = 0; i < data.size(); i++) {
-        data[i] = static_cast<uint8_t>(i);
-    }
-    auto tensor = Tensor::from_vector(data, tensor_spec);
-    auto tensor_data = std::get<HostStorage>(tensor.get_storage()).buffer.view_as<uint8_t>();
+    Shape tensor_shape{16, 1024, 1024};
+    Shape shard_shape_nd_batch{16, 160, 160};
+    Shape shard_shape_nd_small{1, 64, 64};
+    Shape2D shard_shape_2d{2368, 160};
 
-    auto sharded_data = pack_nd_sharded_data<uint8_t>(tensor_data, tensor_spec);
-    EXPECT_EQ(sharded_data.size(), params.expected_data.size());
-    for (size_t i = 0; i < sharded_data.size(); i++) {
-        EXPECT_EQ(sharded_data[i], static_cast<std::byte>(params.expected_data[i]));
+    size_t volume = tensor_shape.volume();
+    std::vector<uint16_t> data(volume);
+    for (size_t i = 0; i < volume; i++) {
+        data[i] = static_cast<uint16_t>(i);
     }
 
-    auto unpacked_data = unpack_nd_sharded_data<std::byte>(sharded_data, tensor_spec);
-    EXPECT_EQ(unpacked_data.size(), tensor_data.size());
-    for (size_t i = 0; i < unpacked_data.size(); i++) {
-        EXPECT_EQ(unpacked_data[i], static_cast<std::byte>(tensor_data[i]));
-    }
+    // Returns the wall-clock duration, in nanoseconds, of the `to_device` call for `data` laid out per `tensor_spec`.
+    auto measure_to_device_time_ns = [&](const TensorSpec& tensor_spec) -> double {
+        auto tensor = Tensor::from_vector(data, tensor_spec);  // host tensor is built outside the timed region
+        // steady_clock is monotonic; high_resolution_clock may alias the adjustable system clock.
+        const auto start = std::chrono::steady_clock::now();
+        auto device_tensor = tensor.to_device(device_, tensor_spec.memory_config());
+        const auto end = std::chrono::steady_clock::now();
+        return std::chrono::duration<double, std::nano>(end - start).count();
+    };
+
+    // ND shards that each cover the whole batch dimension (shard_shape_nd_batch).
+    double batch_nd_sharding_time_ns = [&] {
+        const MemoryConfig nd_batch_config{BufferType::L1, NdShardSpec{shard_shape_nd_batch, cores}};
+        const TensorLayout tiled_layout(DataType::UINT16, PageConfig(Layout::TILE), nd_batch_config);
+        return measure_to_device_time_ns(TensorSpec(tensor_shape, tiled_layout));
+    }();
+
+    // ND shards that each cover a single batch entry (shard_shape_nd_small).
+    double small_shards_nd_sharding_time_ns = [&] {
+        const MemoryConfig nd_small_config{BufferType::L1, NdShardSpec{shard_shape_nd_small, cores}};
+        const TensorLayout tiled_layout(DataType::UINT16, PageConfig(Layout::TILE), nd_small_config);
+        return measure_to_device_time_ns(TensorSpec(tensor_shape, tiled_layout));
+    }();
+
+    // Legacy 2D block sharding over the same cores; serves as the timing baseline below.
+    double block_2d_sharding_time_ns = [&] {
+        const MemoryConfig block_config{TensorMemoryLayout::BLOCK_SHARDED, BufferType::L1, ShardSpec{cores, shard_shape_2d}};
+        const TensorLayout tiled_layout(DataType::UINT16, PageConfig(Layout::TILE), block_config);
+        return measure_to_device_time_ns(TensorSpec(tensor_shape, tiled_layout));
+    }();
+
+    tt::log_info("Batch ND sharding time: {} ns", batch_nd_sharding_time_ns);
+    tt::log_info("Small shards ND sharding time: {} ns", small_shards_nd_sharding_time_ns);
+    tt::log_info("Block 2D sharding time: {} ns", block_2d_sharding_time_ns);
+    // Both ND layouts must stay under 4x the 2D block-sharded time; EXPECT_LT prints both operands on failure.
+    EXPECT_LT(batch_nd_sharding_time_ns, block_2d_sharding_time_ns * 4);
+    EXPECT_LT(small_shards_nd_sharding_time_ns, block_2d_sharding_time_ns * 4);
 }
 
 INSTANTIATE_TEST_SUITE_P(
@@ -739,111 +754,3 @@ INSTANTIATE_TEST_SUITE_P(
             .shard_shape = Shape({1, 1, 32 * 2, 32 * 2}),
             .grid_size = CoreCoord{3, 4},
         }));
-
-INSTANTIATE_TEST_SUITE_P(
-    TensorShardingTests,
-    PrepareNdShardedDataTests,
-    ::testing::Values(
-        PrepareShardedDataParams{
-            .shape = Shape({2, 2, 2}),
-            .shard_shape = Shape({2, 2, 2}),
-            .num_cores = 1,
-            .expected_data = {0, 1, 2, 3, 4, 5, 6, 7},
-        },
-        PrepareShardedDataParams{
-            .shape = Shape({2, 2, 2}),
-            .shard_shape = Shape({2, 2, 2}),
-            .num_cores = 2,
-            .expected_data = {0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0},
-        },
-        PrepareShardedDataParams{
-            .shape = Shape({2, 2, 2}),
-            .shard_shape = Shape({1, 2, 2}),
-            .num_cores = 2,
-            .expected_data = {0, 1, 2, 3, 4, 5, 6, 7},
-        },
-        PrepareShardedDataParams{
-            .shape = Shape({2, 2, 2}),
-            .shard_shape = Shape({2, 1, 2}),
-            .num_cores = 2,
-            .expected_data = {0, 1, 4, 5, 2, 3, 6, 7},
-        },
-        PrepareShardedDataParams{
-            .shape = Shape({2, 2, 2}),
-            .shard_shape = Shape({2, 2, 1}),
-            .num_cores = 2,
-            .expected_data = {0, 2, 4, 6, 1, 3, 5, 7},
-        },
-        PrepareShardedDataParams{
-            .shape = Shape({2, 2, 2}),
-            .shard_shape = Shape({2, 1, 1}),
-            .num_cores = 2,
-            .expected_data = {0, 4, 2, 6, 1, 5, 3, 7},
-        },
-        PrepareShardedDataParams{
-            .shape = Shape({2, 2, 2}),
-            .shard_shape = Shape({2, 1, 1}),
-            .num_cores = 3,
-            .expected_data = {0, 4, 3, 7, 1, 5, 0, 0, 2, 6, 0, 0},
-        },
-        PrepareShardedDataParams{
-            .shape = Shape({2, 2, 2}),
-            .shard_shape = Shape({1, 2, 1}),
-            .num_cores = 2,
-            .expected_data = {0, 2, 4, 6, 1, 3, 5, 7},
-        },
-        PrepareShardedDataParams{
-            .shape = Shape({2, 2, 2}),
-            .shard_shape = Shape({1, 2, 1}),
-            .num_cores = 3,
-            .expected_data = {0, 2, 5, 7, 1, 3, 0, 0, 4, 6, 0, 0},
-        },
-        PrepareShardedDataParams{
-            .shape = Shape({2, 2, 2}),
-            .shard_shape = Shape({1, 1, 2}),
-            .num_cores = 2,
-            .expected_data = {0, 1, 4, 5, 2, 3, 6, 7},
-        },
-        PrepareShardedDataParams{
-            .shape = Shape({2, 2, 2}),
-            .shard_shape = Shape({1, 1, 2}),
-            .num_cores = 3,
-            .expected_data = {0, 1, 6, 7, 2, 3, 0, 0, 4, 5, 0, 0},
-        },
-        PrepareShardedDataParams{
-            .shape = Shape({2, 2, 2}),
-            .shard_shape = Shape({1, 1, 1}),
-            .num_cores = 2,
-            .expected_data = {0, 2, 4, 6, 1, 3, 5, 7},
-        },
-        PrepareShardedDataParams{
-            .shape = Shape({2, 2, 2}),
-            .shard_shape = Shape({1, 1, 1}),
-            .num_cores = 3,
-            .expected_data = {0, 3, 6, 1, 4, 7, 2, 5, 0},
-        },
-        PrepareShardedDataParams{
-            .shape = Shape({2, 2, 2}),
-            .shard_shape = Shape({1, 1, 1}),
-            .num_cores = 4,
-            .expected_data = {0, 4, 1, 5, 2, 6, 3, 7},
-        },
-        PrepareShardedDataParams{
-            .shape = Shape({2, 2, 2}),
-            .shard_shape = Shape({1, 1, 1}),
-            .num_cores = 5,
-            .expected_data = {0, 5, 1, 6, 2, 7, 3, 0, 4, 0},
-        },
-        PrepareShardedDataParams{
-            .shape = Shape({3, 3, 3}),
-            .shard_shape = Shape({2, 2, 2}),
-            .num_cores = 8,
-            .expected_data = {/* core 0 */ 0,  1,  3,  4,  9,  10, 12, 13,
-                              /* core 1 */ 2,  0,  5,  0,  11, 0,  14, 0,
-                              /* core 2 */ 6,  7,  0,  0,  15, 16, 0,  0,
-                              /* core 3 */ 8,  0,  0,  0,  17, 0,  0,  0,
-                              /* core 4 */ 18, 19, 21, 22, 0,  0,  0,  0,
-                              /* core 5 */ 20, 0,  23, 0,  0,  0,  0,  0,
-                              /* core 6 */ 24, 25, 0,  0,  0,  0,  0,  0,
-                              /* core 7 */ 26, 0,  0,  0,  0,  0,  0,  0},
-        }));
@@ -40,6 +40,7 @@ target_sources(
             api/tt-metalium/blockfloat_common.hpp
             api/tt-metalium/buffer.hpp
             api/tt-metalium/buffer_distribution_spec.hpp
+            api/tt-metalium/buffer_page_mapping.hpp
             api/tt-metalium/buffer_types.hpp
             api/tt-metalium/circular_buffer.hpp
             api/tt-metalium/circular_buffer_constants.h
@@ -54,7 +55,6 @@ target_sources(
             api/tt-metalium/device.hpp
             api/tt-metalium/device_pool.hpp
             api/tt-metalium/dispatch_core_common.hpp
-            api/tt-metalium/distribution_spec.hpp
             api/tt-metalium/event.hpp
             api/tt-metalium/fabric_host_interface.h
             api/tt-metalium/fabric_edm_packet_header.hpp
 
@@ -29,6 +29,7 @@
 #include <tt-metalium/core_coord.hpp>
 #include <tt-metalium/hal_types.hpp>
 #include <tt-metalium/sub_device_types.hpp>
+#include <tt-metalium/buffer_page_mapping.hpp>
 #include <umd/device/tt_core_coordinates.h>
 #include <umd/device/tt_soc_descriptor.h>
 #include <umd/device/types/xy_pair.h>
@@ -162,20 +163,6 @@ struct ShardedBufferConfig {
 
 bool is_sharded(const TensorMemoryLayout& layout);
 
-struct BufferPageMapping {
-    std::vector<CoreCoord> all_cores_;
-    std::vector<uint32_t> core_bank_indices_;
-    std::vector<std::vector<uint32_t>> core_host_page_indices_;
-    std::vector<uint32_t> dev_page_to_core_mapping_;
-
-    // some dev pages don't have mapping to host (in case of padding)
-    std::vector<std::optional<uint32_t>> dev_page_to_host_page_mapping_;
-    std::vector<uint32_t> host_page_to_dev_page_mapping_;
-    std::unordered_map<CoreCoord, uint32_t> core_to_core_id_;
-    std::vector<uint32_t> host_page_to_local_shard_page_mapping_;
-    std::vector<std::array<uint32_t, 2>> core_shard_shape_;
-};
-
 struct BufferRegion {
     DeviceAddr offset = 0;
     DeviceAddr size = 0;
@@ -267,24 +254,7 @@ class Buffer final {
     // SHARDED API STARTS HERE
     // If buffer contains BufferDistributionSpec, it is considered ND sharded
     bool is_nd_sharded() const;
-
-    /* BankDataMapping is a struct that provides an explicit mapping of data per bank:
-     * - banks: Logical coordinates of banks to use
-     * - bank_mapping_in_bytes: Mapping of data in bytes for each bank; it is a list of ChunkMapping which contains:
-     *   - src: host address offset in bytes
-     *   - dst: bank address offset in bytes
-     *   - size: size of data in bytes
-     * Some notes:
-     * - Size of banks and bank_mapping_in_bytes must be equal, with each bank having a corresponding mapping
-     * - Each TargetData is a list of ChunkMapping which fully describes all data relevant to that bank
-     * - In Buffer, all ChunkMapping are in bytes and takes into account page size and aligned page size
-     * - Also see DistributionSpec class for more details about TargetData and ChunkMapping
-     */
-    struct BankDataMapping {
-        std::vector<CoreCoord> banks;
-        std::vector<DistributionSpec::TargetData> bank_mapping_in_bytes;
-    };
-    BankDataMapping get_bank_data_mapping();
+    const std::optional<BufferDistributionSpec>& buffer_distribution_spec() const;
 
     // TODO: WILL SEPARATE INTO SHARDED BUFFER CLASS
 
@@ -354,7 +324,6 @@ class Buffer final {
     std::shared_ptr<const BufferPageMapping> buffer_page_mapping_;
 
     std::optional<BufferDistributionSpec> buffer_distribution_spec_;
-    std::optional<std::vector<DistributionSpec::TargetData>> bank_mapping_in_bytes_ = std::nullopt;
 
     size_t unique_id_ = 0;
     static std::atomic<size_t> next_unique_id;
 
@@ -6,36 +6,43 @@
 
 #include <tt-metalium/buffer_types.hpp>
 #include <tt-metalium/core_coord.hpp>
-#include <tt-metalium/distribution_spec.hpp>
+#include <tt-metalium/shape.hpp>
 #include <tt-metalium/shape2d.hpp>
+#include <tt-metalium/buffer_page_mapping.hpp>
 
 namespace tt::tt_metal {
 
 class BufferDistributionSpec {
 public:
     static BufferDistributionSpec from_shard_spec(
-        const tt::tt_metal::Shape& tensor_shape,
-        const tt::tt_metal::Shape& physical_shard_shape,
-        const Shape2D& page_shape,
-        const CoreRangeSet& corerangeset,
-        const ShardOrientation shard_orientation);
-
-    tt::tt_metal::Shape get_tensor_shape_in_pages() const { return page_distribution_spec_.get_tensor_shape(); }
-    tt::tt_metal::Shape get_shard_shape_in_pages() const { return page_distribution_spec_.get_shard_shape(); }
-
-    size_t num_dev_pages_per_core() const {
-        return page_distribution_spec_.get_shard_shape().volume() *
-               page_distribution_spec_.get_max_num_shards_per_target();
-    }
+        tt::tt_metal::Shape tensor_shape,
+        tt::tt_metal::Shape shard_shape,
+        tt::tt_metal::Shape2D page_shape,
+        CoreRangeSet core_range_set,
+        ShardOrientation shard_orientation);
+
+    BufferDistributionSpec(
+        tt::tt_metal::Shape tensor_shape_in_pages,
+        tt::tt_metal::Shape shard_shape_in_pages,
+        CoreRangeSet core_range_set,
+        ShardOrientation shard_orientation);
+
+    tt::tt_metal::Shape get_tensor_shape_in_pages() const { return tensor_shape_in_pages_; }
+    tt::tt_metal::Shape get_shard_shape_in_pages() const { return shard_shape_in_pages_; }
+
+    size_t num_shards() const;
+    size_t num_shards_per_core() const;
+    size_t num_dev_pages_per_core() const;
     size_t num_cores() const { return cores_.size(); }
     const std::vector<CoreCoord>& get_cores() const { return cores_; }
 
-    const std::vector<DistributionSpec::TargetData>& get_page_mapping(DistributionSpec::MappingMode mapping_mode);
+    BufferPageMapping compute_page_mapping() const;
 
 private:
-    BufferDistributionSpec(const DistributionSpec& page_distribution_spec, const std::vector<CoreCoord>& cores);
+    tt::tt_metal::Shape tensor_shape_in_pages_;
+    tt::tt_metal::Shape shard_shape_in_pages_;
+    ShardOrientation shard_orientation_ = ShardOrientation::ROW_MAJOR;
 
-    DistributionSpec page_distribution_spec_;
     std::vector<CoreCoord> cores_;
 };