Commit 29282fa

CopyFromBuffer Device2Device BufferInstance transfer workaround (#2277)
### Ticket
Revealed by #2258.

### Problem description
After #1657, output BufferInstances no longer have a host runtime tensor. If we try to do a (PJRT) device-to-device transfer (even though that is not really meaningful in the way we currently model the MeshDevice/PJRT Device Instance), the copyFromBuffer path assumes an existing host runtime tensor on the source buffer instance.

### What's changed
In copyFromBuffer, if a device runtime tensor exists on the copy source buffer instance, transfer it to host and use that as the source of truth.

### Checklist
- [x] New/Existing tests provide coverage for changes
1 parent ab2d8a8 commit 29282fa
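
To make the fallback easier to follow before reading the diff, here is a rough restatement of the source-selection logic that copyFromBuffer now performs. It is only an illustrative sketch of the C++ change in buffer_instance.cc below; the function and parameter names (`pick_source_host_tensor`, `to_host`, etc.) are hypothetical and not part of the tt-xla codebase.

```python
# Illustrative sketch only; the authoritative version is the C++ change in
# buffer_instance.cc below. All names here are hypothetical.
def pick_source_host_tensor(prepared_device_tensor, host_tensor, to_host):
    """Choose the host tensor that a newly constructed buffer is memcpy'd from."""
    if prepared_device_tensor is not None:
        # Device-to-device case: the source only lives on device, so round-trip
        # it through host (the destination device is unknown at this call site).
        host_tensors = to_host(prepared_device_tensor, untilize=True)
        assert len(host_tensors) == 1, "expected a single host tensor"
        return host_tensors[0]
    if host_tensor is not None:
        # The source already has a host runtime tensor; use it directly.
        return host_tensor
    raise AssertionError("source buffer has no data to copy from")
```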

File tree

4 files changed: +96 −9 lines changed

pjrt_implementation/inc/api/buffer_instance.h

Lines changed: 2 additions & 0 deletions
@@ -181,6 +181,8 @@ class BufferInstance {
       std::optional<uint32_t> device_id = std::nullopt);
 
   // Copies the tensor inside the src_buffer to the tensor of this buffer.
+  // Currently only used for device to device transfer in copy construction
+  // of new buffer instance.
   void copyFromBuffer(const BufferInstance *src_buffer);
 
   // Calculates required tensor shape.

pjrt_implementation/src/api/buffer_instance.cc

Lines changed: 36 additions & 7 deletions
@@ -249,21 +249,49 @@ void BufferInstance::copyFromHost(
 }
 
 void BufferInstance::copyFromBuffer(const BufferInstance *src_buffer) {
+  DLOG_F(LOG_DEBUG, "BufferInstance::copyFromBuffer");
   ::tt::target::DataType runtime_data_type =
       tt::pjrt::data_type_utils::convertPJRTToRuntimeDataType(
          src_buffer->m_data_type);
+
   std::uint32_t element_size =
       tt::runtime::utils::dataTypeElementSize(runtime_data_type);
   std::vector<std::uint32_t> shape = calculateShape(
       src_buffer->getDimensionsRaw(), src_buffer->getNumberOfDimensions());
   std::vector<std::uint32_t> strides = calculateStrides(
       src_buffer->getNumberOfDimensions(), nullptr, 0, element_size);
 
+  // This function is expected to be used for device-to-device buffer
+  // initialization of a new buffer instance, so destination buffer must not
+  // have data yet, or it will be overwritten.
+  assert((!m_host_runtime_tensor.has_value() &&
+          !m_prepared_runtime_tensor.has_value()) &&
+         "Destination buffer already has data");
+
+  tt::runtime::Tensor source_host_runtime_tensor;
+
+  if (src_buffer->m_prepared_runtime_tensor != std::nullopt) {
+    DLOG_F(WARNING,
+           "BufferInstance::copyFromBuffer: Device-Device transfer is "
+           "inefficient due to PJRT device modeling limitations. This will "
+           "actually copy src to host, and fill dst host tensor, because at "
+           "this callsite we do not know what dst device is.");
+    std::vector<tt::runtime::Tensor> host_runtime_tensors = tt::runtime::toHost(
+        src_buffer->m_prepared_runtime_tensor.value(), /*untilize=*/true);
+
+    assert(host_runtime_tensors.size() == 1 &&
+           "Expected single host tensor when copying from device buffer");
+
+    source_host_runtime_tensor = host_runtime_tensors[0];
+  } else if (src_buffer->m_host_runtime_tensor != std::nullopt) {
+    source_host_runtime_tensor = *src_buffer->m_host_runtime_tensor;
+  } else {
+    assert(false && "Source buffer has no data to copy from");
+  }
+
   m_host_runtime_tensor = tt::runtime::createOwnedHostTensor(
       /* data= */ nullptr, shape, strides, element_size, runtime_data_type);
-
-  tt::runtime::memcpy(*m_host_runtime_tensor,
-                      *src_buffer->m_host_runtime_tensor);
+  tt::runtime::memcpy(*m_host_runtime_tensor, source_host_runtime_tensor);
   tt::runtime::setTensorRetain(*m_host_runtime_tensor, /*retain=*/true);
 
   markAsDataReady();
@@ -347,7 +375,8 @@ tt_pjrt_status BufferInstance::copyToHost(void *host_buffer,
       [](std::unique_lock<std::mutex> copy_lock, void *host_buffer,
          tt::runtime::Tensor runtime_tensor, EventInstance *event,
          PJRT_Buffer_Type data_type, size_t host_buffer_size,
-         std::optional<uint32_t> device_id, bool already_on_host) {
+         std::optional<uint32_t> device_id, bool already_on_host,
+         uint64_t buffer_uid) {
         // Acquire lock to serialize all copy-to-host operations across all
         // BufferInstances since any metal dispatch in this async thread will
         // cause ND segfaults as metal is not thread safe.
@@ -368,9 +397,9 @@ tt_pjrt_status BufferInstance::copyToHost(void *host_buffer,
         }
         DLOG_F(LOG_DEBUG,
                "Returning tensor to host with host_runtime_tensors ct = %ld "
-               "from device %d",
+               "from device %d with buffer UID %zu",
               host_runtime_tensors.size(),
-              device_id.has_value() ? device_id.value() : 0);
+              device_id.has_value() ? device_id.value() : 0, buffer_uid);
 
         // If device_id is not set, we are returning a replicated input
         // buffer instance to host (eg. cache position for update). This means
@@ -411,7 +440,7 @@ tt_pjrt_status BufferInstance::copyToHost(void *host_buffer,
       },
       std::move(copy_lock), host_buffer, runtime_tensor_to_retrieve,
       event.get(), m_data_type, host_buffer_size, m_device_id,
-      is_tensor_on_host);
+      is_tensor_on_host, m_uid);
 
   // responsible for calling `PJRT_Event_Destroy` on the event.
   *out_copy_done_event = event.release();

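For context, the copyFromBuffer change above is only reached when the framework requests a PJRT device-to-device copy; the regression test added at the end of this commit shows the JAX-level call pattern (device_put with a sharding while "tt" is listed first in jax_platforms) that triggers it.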
pjrt_implementation/src/api/flatbuffer_loaded_executable_instance.cc

Lines changed: 4 additions & 2 deletions
@@ -199,8 +199,10 @@ void FlatbufferLoadedExecutableInstance::fillPJRTOutputLists(
         m_addressable_devices[device_index]->getDefaultMemory(),
         expected_output_data_types[output_index], device_index);
     DLOG_F(LOG_DEBUG,
-           "Filled output at output_index %zu device_index %d with shape %s",
-           output_index, device_index, output_buffer->toShapeStr().c_str());
+           "Filled output at output_index %zu device_index %d with shape %s "
+           "and UID %zu",
+           output_index, device_index, output_buffer->toShapeStr().c_str(),
+           output_buffer->getUID());
 
     output_buffer->markAsDataReady();
 
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
+# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import jax
+import jax.numpy as jnp
+import numpy as np
+import pytest
+from infra.connectors.device_connector import DeviceType
+from jax.sharding import Mesh, NamedSharding, PartitionSpec
+
+
+@pytest.mark.nightly
+@pytest.mark.push
+def test_sharded_copyFromBuffer():
+    """
+    Test basic tensor sharding with device_put - no operations.
+
+    This requires a revert of the jax_platforms test config set in the autouse
+    initialize_device_connectors conftest fixture, which is monkeypatched around the test.
+    This results in the sharding happening on-device and induces a copyFromBuffer call by the framework.
+
+    This is not the expected usage pattern for tt-xla users, but is instead a backup check that the
+    copyFromBuffer path works correctly, as there is no legitimate use case for it right now.
+    Users will encounter this path if they don't set the jax platforms config to CPU **first**, as is done in the conftest fixture.
+
+    Expected log when running locally:
+    > [...] buffer_instance.cc:295 WARN| BufferInstance::copyFromBuffer: Device-Device transfer
+    is inefficient due to PJRT device modeling limitations. This will actually copy src to host,
+    and fill dst host tensor, because at this callsite we do not know what dst device is.
+    """
+    original_platforms = jax.config.jax_platforms
+
+    try:
+        jax.config.update(
+            "jax_platforms",
+            ",".join([device.value for device in [DeviceType.TT, DeviceType.CPU]]),
+        )
+
+        devices = jax.devices("tt")
+        mesh = Mesh(np.array(devices), axis_names=("data",))
+
+        # Create tensor on CPU
+        with jax.default_device(jax.devices("cpu")[0]):
+            a = jax.random.normal(jax.random.PRNGKey(0), (4, 4))
+
+        # Shard tensor across data dimension
+        a_tt = jax.device_put(a, NamedSharding(mesh, PartitionSpec("data")))
+
+        # Verify sharding
+        assert a_tt.sharding is not None
+    finally:
+        # Restore original config
+        jax.config.update("jax_platforms", original_platforms)
