
Commit 95a3534

Author: Yifu Wang (committed)
[IntraNodeComm] fix an issue where input check fails when running all-reduce on sub groups
ghstack-source-id: 218c718
Pull Request resolved: pytorch#130492
1 parent 46c5266 commit 95a3534
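
Note (not part of the commit): the previous checkInput validated that the input tensor's CUDA device index equals rank_, which only holds when the communicator's ranks map one-to-one onto devices starting at device 0. For a sub group, say global ranks 4-7 on an 8-GPU node, the rank within the group is 0-3 while the inputs live on devices 4-7, so the check always failed. The fix records at::cuda::current_device() as deviceIdx_ during rendezvous and validates the input against that. A minimal C++ sketch of the mismatch, with the sub-group layout assumed purely for illustration:

// Hedged illustration (assumed group layout, not from the commit): group rank
// vs. device index on a sub group of an 8-GPU node.
#include <cassert>
#include <vector>

int main() {
  // Sub group formed from global ranks 4..7; each member's input tensor
  // lives on CUDA device 4..7, but its rank within the group is 0..3.
  std::vector<int> subGroupDevices = {4, 5, 6, 7};
  for (int groupRank = 0; groupRank < 4; ++groupRank) {
    int inputDevice = subGroupDevices[groupRank];
    // Pre-fix check: input device must equal the rank within the group.
    bool oldCheck = (inputDevice == groupRank); // fails: 4 != 0, 5 != 1, ...
    // Post-fix check: input device must equal the device recorded at
    // rendezvous time (at::cuda::current_device() on this process).
    bool newCheck = (inputDevice == subGroupDevices[groupRank]);
    assert(!oldCheck && newCheck);
  }
  return 0;
}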

File tree

3 files changed: +13, -7 lines

torch/csrc/distributed/c10d/intra_node_comm.cpp

Lines changed: 4 additions & 4 deletions
@@ -280,8 +280,8 @@ bool IntraNodeComm::rendezvous() {
     return false;
   }
 
-  auto deviceIdx = at::cuda::current_device();
-  c10::cuda::CUDAGuard guard(deviceIdx);
+  deviceIdx_ = at::cuda::current_device();
+  c10::cuda::CUDAGuard guard(deviceIdx_);
 
   // First hand shake: exchange hostname and device bus ID
   struct DevInfo {
@@ -292,7 +292,7 @@ bool IntraNodeComm::rendezvous() {
   DevInfo devInfo{};
   gethostname(devInfo.hostname, sizeof(devInfo.hostname));
   cudaDeviceProp prop{};
-  AT_CUDA_CHECK(cudaGetDeviceProperties(&prop, deviceIdx));
+  AT_CUDA_CHECK(cudaGetDeviceProperties(&prop, deviceIdx_));
   snprintf(
       devInfo.busId,
       sizeof(devInfo.busId),
@@ -334,7 +334,7 @@ bool IntraNodeComm::rendezvous() {
   auto groupName = "IntraNodeComm" + std::to_string(intraNodeCommIdx++);
   set_group_info(groupName, rank_, worldSize_, store_);
   auto allocator = get_allocator(c10::DeviceType::CUDA);
-  symmetricMemoryPtr_ = allocator->alloc(bufferSize_, deviceIdx, groupName);
+  symmetricMemoryPtr_ = allocator->alloc(bufferSize_, deviceIdx_, groupName);
   symmetricMemory_ = allocator->rendezvous(symmetricMemoryPtr_);
   TORCH_CHECK(symmetricMemory_->get_signal_pad_size() >= kP2pStateSize);

torch/csrc/distributed/c10d/intra_node_comm.cu

Lines changed: 8 additions & 3 deletions
@@ -441,13 +441,18 @@ static inline size_t alignUp(uint32_t a, uint32_t b) {
   return divUp(a, b) * b;
 }
 
-static void checkInput(const at::Tensor& input, size_t rank) {
+static void checkInput(const at::Tensor& input, int deviceIdx) {
   TORCH_CHECK(
       input.dtype() == at::kBFloat16,
       "oneShotAllReduce only supports bf16 for now");
   TORCH_CHECK(input.is_non_overlapping_and_dense());
   TORCH_CHECK(input.device().is_cuda());
-  TORCH_CHECK(static_cast<size_t>(input.get_device()) == rank);
+  TORCH_CHECK(
+      input.get_device() == deviceIdx,
+      "IntraNodeComm: expect input to be on device ",
+      deviceIdx,
+      ", got device ",
+      input.get_device());
 }
 
 static void getLaunchConfig(
@@ -507,7 +512,7 @@ void* initTopoInfo(Topology topology, NvlMesh nvlMesh, size_t rank) {
 at::Tensor IntraNodeComm::oneShotAllReduce(
     const at::Tensor& input,
     at::cuda::CUDAStream& stream) {
-  checkInput(input, rank_);
+  checkInput(input, deviceIdx_);
 
   const size_t numelPerWarp =
       kBytesPerThread / input.element_size() * kWarpSize;

torch/csrc/distributed/c10d/intra_node_comm.hpp

Lines changed: 1 addition & 0 deletions
@@ -101,6 +101,7 @@ class TORCH_API IntraNodeComm : public c10::intrusive_ptr_target {
    * Members initialized after rendezvous
    */
   bool isInitialized_ = false;
+  int deviceIdx_;
   Topology topology_ = Topology::UNKNOWN;
   void* symmetricMemoryPtr_ = nullptr;
   c10::intrusive_ptr<SymmetricMemory> symmetricMemory_ = nullptr;
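
Outside the diff, a self-contained sketch of the corrected flow (FakeTensor and IntraNodeCommSketch are assumed stand-ins for at::Tensor and the real class, not PyTorch API): the device index is captured once during rendezvous, and the all-reduce input is later validated against it rather than against the group rank.

// Hedged sketch; names and types below are illustrative only.
#include <iostream>
#include <sstream>
#include <stdexcept>

struct FakeTensor {
  int device; // CUDA device index the data lives on
};

class IntraNodeCommSketch {
 public:
  // Mirrors rendezvous(): remember which device this process is bound to.
  void rendezvous(int currentDevice) { deviceIdx_ = currentDevice; }

  // Mirrors the fixed checkInput(): compare against the recorded device,
  // not against the rank within the group.
  void checkInput(const FakeTensor& input) const {
    if (input.device != deviceIdx_) {
      std::ostringstream oss;
      oss << "IntraNodeComm: expect input to be on device " << deviceIdx_
          << ", got device " << input.device;
      throw std::runtime_error(oss.str());
    }
  }

 private:
  int deviceIdx_ = -1;
};

int main() {
  // Sub-group member with group rank 0 but bound to CUDA device 4 (assumed).
  IntraNodeCommSketch comm;
  comm.rendezvous(/*currentDevice=*/4);
  comm.checkInput(FakeTensor{4}); // passes; a rank-based check (0) would throw
  std::cout << "input check passed\n";
  return 0;
}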
