Skip to content

Commit 2914f22

Browse files
authored
#23224: Rework sharded buffer dispatch, added BufferRegion support for ND sharding (#23462)
### Ticket #23224 ### Problem description We need to achieve parity between 2D and ND sharding by adding support for specifying BufferRegion for ND sharded reads / writes. Currently buffer dispatch internally handles BufferRegion logic, which relies on getting 2D ShardSpec and utilizing the fact that BufferPageMapping isn't arbitrary. Moreover, buffer dispatch logic utilizes ShardSpec to perform some optimizations for sharded buffer dispatch instead of relying on BufferPageMapping. This PR drastically changes sharded buffer dispatch by making it rely only on the provided page mapping (not on the kind of sharding or shard specs), and making BufferRegion handling completely universal and separate from the dispatch. ### What's changed BufferPageMapping used to contain a lot of duplicated and redundant fields with different kinds of mappings; after this PR it only has a single mapping, and it was renamed to UncompressedBufferPageMapping. This PR introduces a new BufferPageMapping which can be created from UncompressedBufferPageMapping, and it basically groups consecutive runs of pages into ranges. This allows performing the same kind of optimizations in buffer dispatch in a generic manner. Now BufferPageMapping handles the logic to apply a BufferRegion and do the appropriate filtering. Buffer now holds and returns BufferPageMapping instead of UncompressedBufferPageMapping. Added a `Buffer::view(BufferRegion)` method which returns a Buffer corresponding to a region of the underlying root buffer while keeping the root buffer alive. `bank_local_page_address` and `sharded_page_address` were removed as obscure and not needed. 
Updated the buffer dispatch logic, drastically simplifying it and removing lots of edge cases. Updated all of the related usages to accommodate the new BufferPageMapping API. Added new tests for BufferRegion reads/writes for ND sharding. ### Checklist - [x] [All post commit CI passes](https://github.com/tenstorrent/tt-metal/actions/runs/15697895877) - [x] [Model regression CI passes](https://github.com/tenstorrent/tt-metal/actions/runs/15661742169) - [x] [Device performance regression CI passes](https://github.com/tenstorrent/tt-metal/actions/runs/15661743686) - [x] [Single-card demo tests CI passes](https://github.com/tenstorrent/tt-metal/actions/runs/15661745385) - [x] New/Existing tests provide coverage for changes
1 parent d61c4d2 commit 2914f22

File tree

21 files changed

+790
-631
lines changed

21 files changed

+790
-631
lines changed

tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_non_blocking.cpp

Lines changed: 1 addition & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ TEST_F(DeviceFixture, TensixTestCircularBufferNonBlockingAPIs) {
118118

119119
std::vector<uint32_t> out_buf(data_buffer_size);
120120
for (size_t i = 0; i < n_cbs; i++) {
121-
tt::tt_metal::detail::ReadFromBuffer(master_data_buffers[i], out_buf, false);
121+
tt::tt_metal::detail::ReadFromBuffer(master_data_buffers[i], out_buf);
122122

123123
uint8_t const* raw_data = reinterpret_cast<uint8_t*>(out_buf.data());
124124
for (size_t pages_pushed = 0; pages_pushed < cb_n_pages; pages_pushed++) {
@@ -129,17 +129,4 @@ TEST_F(DeviceFixture, TensixTestCircularBufferNonBlockingAPIs) {
129129
}
130130
}
131131
}
132-
133-
for (size_t i = 0; i < n_cbs; i++) {
134-
tt::tt_metal::detail::ReadFromBuffer(subordinate_data_buffers[i], out_buf, true);
135-
136-
uint8_t const* raw_data = reinterpret_cast<uint8_t*>(out_buf.data());
137-
for (size_t pages_pushed = 0; pages_pushed < cb_n_pages; pages_pushed++) {
138-
for (size_t filled_pages_requested = 0; filled_pages_requested < cb_n_pages; filled_pages_requested++) {
139-
ASSERT_EQ(
140-
static_cast<bool>(raw_data[pages_pushed * cb_n_pages + filled_pages_requested]),
141-
filled_pages_requested <= pages_pushed);
142-
}
143-
}
144-
}
145132
}

tests/tt_metal/tt_metal/api/distribution_spec/test_buffer_distribution_spec.cpp

Lines changed: 29 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
namespace distribution_spec_tests {
1616
using tt::tt_metal::BufferDistributionSpec;
17+
constexpr uint32_t PADDING = tt::tt_metal::UncompressedBufferPageMapping::PADDING;
1718

1819
struct BufferDistributionSpecInputs {
1920
tt::tt_metal::Shape physical_tensor_shape;
@@ -299,7 +300,9 @@ TEST_P(MeshBufferReadWriteTests, WriteReadLoopback) {
299300
}
300301
for (size_t empty_core_idx = expected_page_mapping.size(); empty_core_idx < page_mapping.size();
301302
empty_core_idx++) {
302-
EXPECT_EQ(page_mapping[empty_core_idx], std::vector<uint32_t>(page_mapping[empty_core_idx].size()));
303+
EXPECT_EQ(
304+
page_mapping[empty_core_idx],
305+
std::vector<uint32_t>(page_mapping[empty_core_idx].size(), UncompressedBufferPageMapping::PADDING));
303306
}
304307

305308
for (size_t i = 0; i < cores.size(); i++) {
@@ -313,7 +316,7 @@ TEST_P(MeshBufferReadWriteTests, WriteReadLoopback) {
313316

314317
const auto* result_per_core_ptr = reinterpret_cast<const uint8_t*>(result_per_core.data());
315318
for (size_t core_page = 0; core_page < page_mapping[i].size(); core_page++) {
316-
if (!page_mapping[i][core_page]) {
319+
if (page_mapping[i][core_page] == UncompressedBufferPageMapping::PADDING) {
317320
continue;
318321
}
319322
const auto host_page = page_mapping[i][core_page];
@@ -370,13 +373,13 @@ INSTANTIATE_TEST_SUITE_P(
370373
MeshBufferReadWriteExpected{
371374
.explicit_core_page_mapping = {
372375
{0, 1},
373-
{2, 0},
376+
{2, PADDING},
374377
{3, 4},
375-
{5, 0},
378+
{5, PADDING},
376379
{6, 7},
377-
{8, 0},
380+
{8, PADDING},
378381
{9, 10},
379-
{11, 0},
382+
{11, PADDING},
380383
},
381384
},
382385
},
@@ -395,10 +398,10 @@ INSTANTIATE_TEST_SUITE_P(
395398
},
396399
MeshBufferReadWriteExpected{
397400
.explicit_core_page_mapping = {
398-
{0, 1, 0, 2, 3, 0},
399-
{4, 5, 0, 6, 7, 0},
400-
{8, 9, 0, 10, 11, 0},
401-
{12, 13, 0, 14, 15, 0},
401+
{0, 1, PADDING, 2, 3, PADDING},
402+
{4, 5, PADDING, 6, 7, PADDING},
403+
{8, 9, PADDING, 10, 11, PADDING},
404+
{12, 13, PADDING, 14, 15, PADDING},
402405
},
403406
},
404407
},
@@ -416,8 +419,8 @@ INSTANTIATE_TEST_SUITE_P(
416419
},
417420
MeshBufferReadWriteExpected{
418421
.explicit_core_page_mapping = {
419-
{0, 2, 4, 0, 6, 8, 10, 0},
420-
{1, 3, 5, 0, 7, 9, 11, 0},
422+
{0, 2, 4, PADDING, 6, 8, 10, PADDING},
423+
{1, 3, 5, PADDING, 7, 9, 11, PADDING},
421424
},
422425
},
423426
},
@@ -437,15 +440,15 @@ INSTANTIATE_TEST_SUITE_P(
437440
MeshBufferReadWriteExpected{
438441
.explicit_core_page_mapping = {
439442
{0, 1, 3, 4, 30, 31, 33, 34},
440-
{2, 0, 5, 0, 32, 0, 35, 0},
441-
{6, 7, 9, 10, 0, 0, 0, 0},
442-
{8, 0, 11, 0, 0, 0, 0, 0},
443-
{12, 13, 15, 16, 0, 0, 0, 0},
444-
{14, 0, 17, 0, 0, 0, 0, 0},
445-
{18, 19, 21, 22, 0, 0, 0, 0},
446-
{20, 0, 23, 0, 0, 0, 0, 0},
447-
{24, 25, 27, 28, 0, 0, 0, 0},
448-
{26, 0, 29, 0, 0, 0, 0, 0}
443+
{2, PADDING, 5, PADDING, 32, PADDING, 35, PADDING},
444+
{6, 7, 9, 10, PADDING, PADDING, PADDING, PADDING},
445+
{8, PADDING, 11, PADDING, PADDING, PADDING, PADDING, PADDING},
446+
{12, 13, 15, 16, PADDING, PADDING, PADDING, PADDING},
447+
{14, PADDING, 17, PADDING, PADDING, PADDING, PADDING, PADDING},
448+
{18, 19, 21, 22, PADDING, PADDING, PADDING, PADDING},
449+
{20, PADDING, 23, PADDING, PADDING, PADDING, PADDING, PADDING},
450+
{24, 25, 27, 28, PADDING, PADDING, PADDING, PADDING},
451+
{26, PADDING, 29, PADDING, PADDING, PADDING, PADDING, PADDING}
449452
},
450453
},
451454
},
@@ -464,11 +467,11 @@ INSTANTIATE_TEST_SUITE_P(
464467
},
465468
MeshBufferReadWriteExpected{
466469
.explicit_core_page_mapping = {
467-
{0, 1, 3, 4, 12, 13, 15, 16, 26, 0, 29, 0, 38, 0, 41, 0, 54, 55, 57, 58, 0, 0, 0, 0},
468-
{2, 0, 5, 0, 14, 0, 17, 0, 30, 31, 33, 34, 42, 43, 45, 46, 56, 0, 59, 0, 0, 0, 0, 0},
469-
{6, 7, 9, 10, 18, 19, 21, 22, 32, 0, 35, 0, 44, 0, 47, 0, 0, 0, 0, 0, 0, 0, 0, 0},
470-
{8, 0, 11, 0, 20, 0, 23, 0, 48, 49, 51, 52, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
471-
{24, 25, 27, 28, 36, 37, 39, 40, 50, 0, 53, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
470+
{0, 1, 3, 4, 12, 13, 15, 16, 26, PADDING, 29, PADDING, 38, PADDING, 41, PADDING, 54, 55, 57, 58, PADDING, PADDING, PADDING, PADDING},
471+
{2, PADDING, 5, PADDING, 14, PADDING, 17, PADDING, 30, 31, 33, 34, 42, 43, 45, 46, 56, PADDING, 59, PADDING, PADDING, PADDING, PADDING, PADDING},
472+
{6, 7, 9, 10, 18, 19, 21, 22, 32, PADDING, 35, PADDING, 44, PADDING, 47, PADDING, PADDING, PADDING, PADDING, PADDING, PADDING, PADDING, PADDING, PADDING},
473+
{8, PADDING, 11, PADDING, 20, PADDING, 23, PADDING, 48, 49, 51, 52, PADDING, PADDING, PADDING, PADDING, PADDING, PADDING, PADDING, PADDING, PADDING, PADDING, PADDING, PADDING},
474+
{24, 25, 27, 28, 36, 37, 39, 40, 50, PADDING, 53, PADDING, PADDING, PADDING, PADDING, PADDING, PADDING, PADDING, PADDING, PADDING, PADDING, PADDING, PADDING, PADDING}
472475
},
473476
},
474477
}) // Values

tests/ttnn/unit_tests/gtests/tensor/test_tensor_nd_sharding.cpp

Lines changed: 63 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -45,25 +45,29 @@ struct NDShardingBufferSizeParams {
4545
size_t expected_num_dev_pages = 0;
4646
size_t expected_aligned_size_per_bank = 0;
4747
};
48-
} // namespace
49-
50-
class NDShardingTests
51-
: public ttnn::TTNNFixtureWithDevice,
52-
public ::testing::WithParamInterface<std::tuple<NDShardingParams, BufferType, ShardOrientation>> {};
53-
54-
TEST_P(NDShardingTests, LoopbackTest) {
55-
const auto& [params, buffer_type, orientation] = GetParam();
5648

49+
TensorSpec get_nd_sharding_tensor_spec(
50+
const NDShardingParams& params, BufferType buffer_type, ShardOrientation orientation, IDevice* device) {
5751
CoreRangeSet cores;
5852
if (buffer_type == BufferType::L1) {
5953
cores = CoreRangeSet(CoreRange(CoreCoord{0, 0}, CoreCoord{6, 6}));
6054
} else {
61-
auto dram_grid_size = device_->dram_grid_size();
55+
auto dram_grid_size = device->dram_grid_size();
6256
cores = CoreRangeSet(CoreRange(CoreCoord{0, 0}, CoreCoord{dram_grid_size.x - 1, dram_grid_size.y - 1}));
6357
}
6458
MemoryConfig memory_config{buffer_type, NdShardSpec{params.shard_shape, cores, orientation}};
6559
TensorLayout tensor_layout(DataType::UINT16, PageConfig(params.layout), memory_config);
66-
TensorSpec tensor_spec(params.shape, tensor_layout);
60+
return TensorSpec(params.shape, tensor_layout);
61+
}
62+
} // namespace
63+
64+
class NDShardingTests
65+
: public ttnn::TTNNFixtureWithDevice,
66+
public ::testing::WithParamInterface<std::tuple<NDShardingParams, BufferType, ShardOrientation>> {};
67+
68+
TEST_P(NDShardingTests, LoopbackTest) {
69+
const auto& [params, buffer_type, orientation] = GetParam();
70+
auto tensor_spec = get_nd_sharding_tensor_spec(params, buffer_type, orientation, device_);
6771

6872
size_t volume = params.shape.volume();
6973
std::vector<uint16_t> data(volume);
@@ -79,6 +83,55 @@ TEST_P(NDShardingTests, LoopbackTest) {
7983
}
8084
}
8185

86+
TEST_P(NDShardingTests, RegionWriteReadTest) {
87+
const auto& [params, buffer_type, orientation] = GetParam();
88+
auto tensor_spec = get_nd_sharding_tensor_spec(params, buffer_type, orientation, device_);
89+
90+
size_t volume = params.shape.volume();
91+
std::vector<uint16_t> data(volume);
92+
for (size_t i = 0; i < data.size(); i++) {
93+
data[i] = static_cast<uint16_t>(i);
94+
}
95+
auto data_tensor = Tensor::from_vector(data, tensor_spec);
96+
auto tensor_data_span = host_buffer::get_as<uint16_t>(data_tensor);
97+
auto tensor_data = std::vector<uint16_t>(tensor_data_span.begin(), tensor_data_span.end());
98+
99+
std::vector<uint16_t> empty_data(volume);
100+
auto tensor = Tensor::from_vector(empty_data, tensor_spec, device_);
101+
102+
auto& storage = std::get<DeviceStorage>(tensor.storage());
103+
auto buffer = storage.get_buffer();
104+
auto page_size = buffer->page_size();
105+
auto device = buffer->device();
106+
107+
size_t region_size = buffer->page_size();
108+
while (buffer->size() % (region_size * 2) == 0) {
109+
region_size *= 2;
110+
}
111+
112+
std::vector<uint16_t> partial_readback_data(tensor_data.size());
113+
std::vector<uint16_t> full_readback_data(tensor_data.size());
114+
115+
for (size_t region = 0; region < buffer->size() / region_size; region++) {
116+
size_t region_offset = region * region_size;
117+
auto buffer_view = buffer->view(BufferRegion{region_offset, region_size});
118+
EnqueueWriteBuffer(
119+
device->command_queue(),
120+
buffer_view,
121+
reinterpret_cast<const std::byte*>(tensor_data.data()) + region_offset,
122+
true);
123+
EnqueueReadBuffer(
124+
device->command_queue(),
125+
buffer_view,
126+
reinterpret_cast<std::byte*>(partial_readback_data.data()) + region_offset,
127+
true);
128+
}
129+
EXPECT_EQ(tensor_data, partial_readback_data);
130+
131+
EnqueueReadBuffer(device->command_queue(), *buffer, full_readback_data.data(), true);
132+
EXPECT_EQ(tensor_data, full_readback_data);
133+
}
134+
82135
class LegacyToNdShardingTests : public ::testing::TestWithParam<LegacyToNdShardingParams> {};
83136

84137
TEST_P(LegacyToNdShardingTests, LegacyToNdSharding) {

tt_metal/api/tt-metalium/buffer.hpp

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ struct BufferRegion {
171171
BufferRegion(DeviceAddr offset, DeviceAddr size) : offset(offset), size(size) {}
172172
};
173173

174-
class Buffer final {
174+
class Buffer final : public std::enable_shared_from_this<Buffer> {
175175
// Used in public Buffer constructors so they are only callable within Buffer
176176
// Buffer constructors are public so we can call std::make_shared on Buffer
177177
struct Private {
@@ -208,6 +208,11 @@ class Buffer final {
208208
std::optional<bool> bottom_up = std::nullopt,
209209
std::optional<SubDeviceId> sub_device_id = std::nullopt);
210210

211+
// Creates a view of the region of the buffer.
212+
// The view is a new buffer (unless the region is the entire buffer) that shares the same underlying device memory.
213+
// The view keeps the underlying buffer alive as long as the view is alive.
214+
std::shared_ptr<Buffer> view(const BufferRegion& region);
215+
211216
Buffer(const Buffer& other) = delete;
212217
Buffer& operator=(const Buffer& other) = delete;
213218
Buffer(Buffer&& other) = delete;
@@ -245,29 +250,23 @@ class Buffer final {
245250

246251
DeviceAddr page_address(uint32_t bank_id, uint32_t page_index) const;
247252

248-
DeviceAddr bank_local_page_address(uint32_t bank_id, uint32_t page_index) const;
249253
uint32_t alignment() const;
250254
DeviceAddr aligned_page_size() const;
251255
DeviceAddr aligned_size() const;
252256
DeviceAddr aligned_size_per_bank() const;
253257

254258
// SHARDED API STARTS HERE
255-
// If buffer contains BufferDistributionSpec, it is considered ND sharded
256-
bool is_nd_sharded() const;
257259
const std::optional<BufferDistributionSpec>& buffer_distribution_spec() const;
258-
259-
// TODO: WILL SEPARATE INTO SHARDED BUFFER CLASS
260-
261-
DeviceAddr sharded_page_address(uint32_t bank_id, uint32_t page_index) const;
262-
263260
ShardSpecBuffer shard_spec() const;
264261
void set_shard_spec(const ShardSpecBuffer& shard_spec);
265-
266-
// TODO: Consolidate with interleaved and delete this (maybe get from BufferDistributionSpec)
267262
std::optional<uint32_t> num_cores() const;
268-
269263
const std::shared_ptr<const BufferPageMapping>& get_buffer_page_mapping();
270264

265+
// Returns the buffer that owns the underlying device memory.
266+
// Typically returns itself unless the buffer was created with a view method.
267+
std::shared_ptr<Buffer> root_buffer();
268+
BufferRegion root_buffer_region() const { return BufferRegion(root_buffer_offset_, size_); }
269+
271270
std::optional<SubDeviceId> sub_device_id() const { return sub_device_id_; }
272271

273272
size_t unique_id() const { return unique_id_; }
@@ -325,11 +324,17 @@ class Buffer final {
325324

326325
std::optional<BufferDistributionSpec> buffer_distribution_spec_;
327326

327+
// The root buffer is the buffer that owns the underlying device memory.
328+
// The root buffer is populated only when the buffer was created with a view method.
329+
std::shared_ptr<Buffer> root_buffer_;
330+
// Offset of the current view buffer in the root buffer
331+
DeviceAddr root_buffer_offset_ = 0;
332+
328333
size_t unique_id_ = 0;
329334
static std::atomic<size_t> next_unique_id;
330335
};
331336

332-
BufferPageMapping generate_buffer_page_mapping(const Buffer& buffer);
337+
UncompressedBufferPageMapping generate_buffer_page_mapping(const Buffer& buffer);
333338

334339
using HostDataType = std::variant<
335340
const std::shared_ptr<std::vector<uint8_t>>,

tt_metal/api/tt-metalium/buffer_distribution_spec.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ class BufferDistributionSpec {
3636
size_t num_cores() const { return cores_.size(); }
3737
const std::vector<CoreCoord>& get_cores() const { return cores_; }
3838

39-
BufferPageMapping compute_page_mapping() const;
39+
UncompressedBufferPageMapping compute_page_mapping() const;
4040

4141
private:
4242
tt::tt_metal::Shape tensor_shape_in_pages_;

0 commit comments

Comments
 (0)