Skip to content

Commit 2fea146

Browse files
pragupta and haoyuz
authored
[ROCm] Reland: Enable expandable segments (pytorch#173330) (pytorch#177974) (#3106)
Summary: Original pull request: pytorch#173330 Fixes pytorch#168737. Fixes pytorch#168736. The original diff enabled expandable segments for ROCm by adding `#ifdef USE_ROCM` guards throughout CUDACachingAllocator.cpp to use HIP APIs (hipMemAddressReserve, hipMemCreate, hipMemMap, etc.) instead of CUDA driver APIs when building for ROCm. Root cause: In HIP/ROCm 6.2.1, the field name for memory allocation properties is `requestedHandleType` (singular), not `requestedHandleTypes` (plural) as in CUDA. Additionally, `hipMemHandleTypeFabric` does not exist in HIP, so the `CU_MEM_HANDLE_TYPE_FABRIC` assignment must be skipped on ROCm. Fix applied on top of the original diff (from D96652342): - Use `prop.requestedHandleType = hipMemHandleTypePosixFileDescriptor` under `#ifdef USE_ROCM` (singular field name, HIP constant) - Use `prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR` for CUDA (plural field name, CUDA constant) - Skip the `CU_MEM_HANDLE_TYPE_FABRIC` assignment entirely on ROCm under `#ifndef USE_ROCM`, as `hipMemHandleTypeFabric` does not exist in HIP Co-authored-by: Prachi Gupta prachi.gupta@amd.com Co-authored-by: Jeff Daily jeff.daily@amd.com Co-authored-by: moonshadow-25 moonshadow-25@users.noreply.github.com Co-authored-by: Vighanesh Sharma vighaneshsharma@gmail.com Test Plan: ``` fbpkg build //aps_models/ads/ecosystem/eval/cogwheel_tests/amd:cogwheel_aps_ads_icvr_kd_eval_amd_test_harness --build-remote ``` https://www.internalfb.com/sandcastle/workflow/1049338713192153464 Differential Revision: D97211385 Pull Request resolved: pytorch#177974 Approved by: https://github.com/jeffdaily, https://github.com/echen4096 (cherry picked from commit 5792701) ## Motivation <!-- Explain the purpose of this PR and the goals it aims to achieve. --> ## Technical Details <!-- Explain the changes along with any relevant GitHub links. --> ## Test Plan <!-- Explain any relevant testing done to verify this PR. 
--> ## Test Result <!-- Briefly summarize test outcomes. --> ## Submission Checklist - [ ] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests. Co-authored-by: Haoyu Zhang <haoyuz@meta.com>
1 parent 752cc24 commit 2fea146

File tree

8 files changed

+142
-12
lines changed

8 files changed

+142
-12
lines changed

c10/cuda/CUDAAllocatorConfig.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ class C10_CUDA_API CUDAAllocatorConfig {
3434
static bool expandable_segments() {
3535
bool enabled = c10::CachingAllocator::AcceleratorAllocatorConfig::
3636
use_expandable_segments();
37-
#ifndef PYTORCH_C10_DRIVER_API_SUPPORTED
37+
#if !defined(PYTORCH_C10_DRIVER_API_SUPPORTED) && !defined(USE_ROCM)
3838
if (enabled) {
3939
TORCH_WARN_ONCE("expandable_segments not supported on this platform")
4040
}

c10/cuda/CUDACachingAllocator.cpp

Lines changed: 120 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,17 @@
1717
#include <c10/util/llvmMathExtras.h>
1818
#include <c10/util/static_tracepoint.h>
1919

20-
#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
20+
#if defined(PYTORCH_C10_DRIVER_API_SUPPORTED) || defined(USE_ROCM)
21+
#if defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
2122
#include <c10/cuda/driver_api.h>
23+
#endif
24+
#ifndef _WIN32
2225
#include <sys/syscall.h>
2326
#include <sys/types.h>
2427
#include <unistd.h>
28+
#else
29+
#include <process.h>
30+
#endif
2531
#endif
2632

2733
#include <c10/util/Exception.h>
@@ -269,7 +275,8 @@ struct SegmentRange {
269275
SegmentRange(void* p, size_t s) : ptr(static_cast<char*>(p)), size(s) {}
270276
};
271277

272-
#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
278+
#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) || \
279+
defined(USE_ROCM)
273280

274281
/*
275282
Note [Expandable Segments]
@@ -383,8 +390,13 @@ struct ExpandableSegment {
383390
// This allows for some cases where we have to unmap pages earlier in the
384391
// segment to put them at the end.
385392
max_handles_ = numSegments(prop.totalGlobalMem + prop.totalGlobalMem / 8);
393+
#ifdef USE_ROCM
394+
C10_CUDA_CHECK(hipMemAddressReserve(
395+
&ptr_, segment_size_ * max_handles_, 0ULL, 0, 0ULL));
396+
#else
386397
C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemAddressReserve_(
387398
&ptr_, segment_size_ * max_handles_, 0ULL, 0, 0ULL));
399+
#endif
388400
}
389401
ExpandableSegment(const ExpandableSegment&) = delete;
390402
ExpandableSegment(ExpandableSegment&&) = delete;
@@ -408,12 +420,14 @@ struct ExpandableSegment {
408420
// if it fails, use posix file handle
409421
if (CUDAAllocatorConfig::expandable_segments_handle_type() ==
410422
Expandable_Segments_Handle_Type::UNSPECIFIED) {
423+
#ifndef USE_ROCM
411424
CUDAAllocatorConfig::set_expandable_segments_handle_type(
412425
Expandable_Segments_Handle_Type::FABRIC_HANDLE);
413426
auto output = map(range);
414427
if (output.ptr != nullptr) {
415428
return output;
416429
}
430+
#endif
417431
// if fabric handle is not supported, use posix file handle.
418432
CUDAAllocatorConfig::set_expandable_segments_handle_type(
419433
Expandable_Segments_Handle_Type::POSIX_FD);
@@ -445,33 +459,60 @@ struct ExpandableSegment {
445459
if (enable_ipc_handles) {
446460
if (CUDAAllocatorConfig::expandable_segments_handle_type() !=
447461
Expandable_Segments_Handle_Type::FABRIC_HANDLE) {
462+
#ifdef USE_ROCM
463+
prop.requestedHandleType = hipMemHandleTypePosixFileDescriptor;
464+
#else
448465
prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
466+
#endif
449467
} else {
468+
#ifndef USE_ROCM
450469
prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
470+
#endif
451471
}
452472
}
453473
int flag = 0;
474+
#ifndef USE_ROCM
454475
C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuDeviceGetAttribute_(
455476
&flag,
456477
CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED,
457478
device_));
479+
#endif
458480
if (flag)
459481
prop.allocFlags.gpuDirectRDMACapable = 1;
460482
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
461483
// NOLINTNEXTLINE(bugprone-signed-char-misuse)
462484
prop.location.id = static_cast<int>(device_);
485+
#ifdef USE_ROCM
486+
auto status = hipMemCreate(&handle, segment_size_, &prop, 0);
487+
#else
463488
auto status =
464489
DriverAPI::get()->cuMemCreate_(&handle, segment_size_, &prop, 0);
490+
#endif
465491
if (status != CUDA_SUCCESS) {
466492
if (status == CUDA_ERROR_OUT_OF_MEMORY) {
493+
#ifdef USE_ROCM
494+
// hipMemCreate above returned hipErrorOutOfMemory and treated it
495+
// like a sticky runtime error. Which means we need to clear it.
496+
// Unlike the corresponding CUDA Driver API.
497+
(void)hipGetLastError();
498+
#endif
467499
for (auto j : c10::irange(begin, i)) {
468500
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
469501
auto h = handles_.at(j).value();
470502
handles_.at(j) = std::nullopt;
503+
#ifdef USE_ROCM
504+
C10_CUDA_CHECK(hipMemRelease(h.handle));
505+
#else
471506
C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemRelease_(h.handle));
507+
#endif
472508
}
473509
trimHandles();
474510
return rangeFromHandles(begin, begin);
511+
#ifdef USE_ROCM
512+
} else {
513+
C10_CUDA_CHECK(status);
514+
}
515+
#else
475516
} else if (
476517
CUDAAllocatorConfig::expandable_segments_handle_type() ==
477518
Expandable_Segments_Handle_Type::FABRIC_HANDLE) {
@@ -487,6 +528,7 @@ struct ExpandableSegment {
487528
} else {
488529
C10_CUDA_DRIVER_CHECK(status);
489530
}
531+
#endif
490532
}
491533
handles_.at(i) = Handle{handle, std::nullopt};
492534
}
@@ -522,7 +564,11 @@ struct ExpandableSegment {
522564
// thereby ensuring that the handle can be correctly matched in
523565
// ipcMemHandle_to_devptr.
524566
ShareHeader header{};
567+
#ifdef _WIN32
568+
header.pid = _getpid();
569+
#else
525570
header.pid = getpid();
571+
#endif
526572
header.segment_size = segment_size_;
527573
header.num_handles = end - begin;
528574

@@ -534,8 +580,13 @@ struct ExpandableSegment {
534580
Expandable_Segments_Handle_Type::FABRIC_HANDLE) {
535581
if (!handle.shareable_handle) {
536582
int fd = 0;
583+
#ifdef USE_ROCM
584+
C10_CUDA_CHECK(hipMemExportToShareableHandle(
585+
&fd, handle.handle, hipMemHandleTypePosixFileDescriptor, 0));
586+
#else
537587
C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemExportToShareableHandle_(
538588
&fd, handle.handle, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0));
589+
#endif
539590
handle.shareable_handle = fd;
540591
LOG(INFO) << "use posix fd to share expandable segments.";
541592
}
@@ -546,6 +597,10 @@ struct ExpandableSegment {
546597
reinterpret_cast<const char*>(&*handle.shareable_handle),
547598
sizeof(int));
548599
} else {
600+
#ifdef USE_ROCM
601+
TORCH_INTERNAL_ASSERT(
602+
false, "expandable segment with fabric handle not supported");
603+
#else
549604
if (!handle.shareable_handle) {
550605
CUmemFabricHandle fabric_handle;
551606
C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemExportToShareableHandle_(
@@ -559,6 +614,7 @@ struct ExpandableSegment {
559614
buf.write(
560615
reinterpret_cast<const char*>(&*handle.shareable_handle),
561616
sizeof(CUmemFabricHandle));
617+
#endif
562618
}
563619
}
564620
return rangeFromHandles(begin, end);
@@ -574,14 +630,20 @@ struct ExpandableSegment {
574630
device, std::nullopt, header.segment_size, std::move(peers));
575631
// older build setups (e.g. multiwheels) do not have this syscall, added 2020
576632
// but the kernel on the system might still support it.
633+
#ifndef _WIN32
577634
#ifndef SYS_pidfd_open
578635
#define SYS_pidfd_open 434
579636
#endif
580637
#ifndef SYS_pidfd_getfd
581638
#define SYS_pidfd_getfd 438
582639
#endif
640+
#endif // !_WIN32
583641
if (CUDAAllocatorConfig::expandable_segments_handle_type() !=
584642
Expandable_Segments_Handle_Type::FABRIC_HANDLE) {
643+
#ifdef _WIN32
644+
TORCH_CHECK(
645+
false, "IPC expandable segments are not supported on Windows");
646+
#else
585647
auto pidfd = syscall(SYS_pidfd_open, header.pid, 0);
586648
TORCH_CHECK(
587649
pidfd != -1 || errno != ENOSYS,
@@ -597,9 +659,13 @@ struct ExpandableSegment {
597659
auto err = errno;
598660
close(static_cast<int>(pidfd));
599661
for (auto& h : segment->handles_) {
662+
#ifdef USE_ROCM
663+
C10_CUDA_CHECK(hipMemRelease(h.value().handle));
664+
#else
600665
C10_CUDA_DRIVER_CHECK(
601666
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
602667
DriverAPI::get()->cuMemRelease_(h.value().handle));
668+
#endif
603669
h = std::nullopt;
604670
}
605671
TORCH_CHECK(
@@ -609,17 +675,33 @@ struct ExpandableSegment {
609675
TORCH_CHECK(false, "pidfd_getfd: ", c10::utils::str_error(err));
610676
}
611677
CUmemGenericAllocationHandle handle = 0;
678+
#ifdef USE_ROCM
679+
#if ROCM_VERSION >= 70100
680+
void* myfd_handle =
681+
reinterpret_cast<void*>(static_cast<uintptr_t>(myfd));
682+
#else
683+
void* myfd_handle = (void*)(uintptr_t)&myfd;
684+
#endif
685+
C10_CUDA_CHECK(hipMemImportFromShareableHandle(
686+
&handle, myfd_handle, hipMemHandleTypePosixFileDescriptor));
687+
#else
612688
C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemImportFromShareableHandle_(
613689
&handle,
614690
// NOLINTNEXTLINE(performance-no-int-to-ptr)
615691
(void*)(uintptr_t)myfd,
616692
CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR));
693+
#endif
617694
LOG(INFO) << "use posix fd to import expandable segments.";
618695
close(static_cast<int>(myfd));
619696
segment->handles_.emplace_back(Handle{handle, std::nullopt});
620697
}
621698
close(static_cast<int>(pidfd));
699+
#endif // !_WIN32
622700
} else {
701+
#ifdef USE_ROCM
702+
TORCH_INTERNAL_ASSERT(
703+
false, "expandable segment with fabric handle not supported");
704+
#else
623705
for (auto i : c10::irange(header.num_handles)) {
624706
(void)i;
625707
CUmemFabricHandle fabric_handle;
@@ -634,6 +716,7 @@ struct ExpandableSegment {
634716
LOG(INFO) << "use fabric handle to import expandable segments.";
635717
segment->handles_.emplace_back(Handle{handle, std::nullopt});
636718
}
719+
#endif
637720
}
638721
segment->mapAndSetAccess(0, header.num_handles);
639722
return segment;
@@ -669,8 +752,12 @@ struct ExpandableSegment {
669752
~ExpandableSegment() {
670753
forEachAllocatedRange(
671754
[&](size_t begin, size_t end) { unmapHandles(begin, end); });
755+
#ifdef USE_ROCM
756+
C10_CUDA_CHECK(hipMemAddressFree(ptr_, segment_size_ * max_handles_));
757+
#else
672758
C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemAddressFree_(
673759
ptr_, segment_size_ * max_handles_));
760+
#endif
674761
}
675762

676763
private:
@@ -680,19 +767,36 @@ struct ExpandableSegment {
680767
// NOLINTNEXTLINE(bugprone-signed-char-misuse)
681768
desc.location.id = static_cast<int>(device);
682769
desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
770+
#ifdef USE_ROCM
771+
C10_CUDA_CHECK(hipMemSetAccess(
772+
ptr() + begin * segment_size_,
773+
(end - begin) * segment_size_,
774+
&desc,
775+
1));
776+
#else
683777
C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemSetAccess_(
684778
ptr_ + begin * segment_size_, (end - begin) * segment_size_, &desc, 1));
779+
#endif
685780
}
686781

687782
void mapAndSetAccess(size_t begin, size_t end) {
688783
for (auto i : c10::irange(begin, end)) {
784+
#ifdef USE_ROCM
785+
C10_CUDA_CHECK(hipMemMap(
786+
ptr() + i * segment_size_,
787+
segment_size_,
788+
0,
789+
handles_.at(i).value().handle,
790+
0ULL));
791+
#else
689792
C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemMap_(
690793
ptr_ + i * segment_size_,
691794
segment_size_,
692795
0,
693796
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
694797
handles_.at(i).value().handle,
695798
0ULL));
799+
#endif
696800
}
697801
mapped_size_ += (end - begin) * segment_size_;
698802
setAccess(device_, begin, end);
@@ -719,12 +823,22 @@ struct ExpandableSegment {
719823
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
720824
Handle h = handles_.at(i).value();
721825
handles_.at(i) = std::nullopt;
826+
#ifdef USE_ROCM
827+
C10_CUDA_CHECK(hipMemUnmap(ptr() + segment_size_ * i, segment_size_));
828+
#else
722829
C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemUnmap_(
723830
ptr_ + segment_size_ * i, segment_size_));
831+
#endif
724832
if (h.shareable_handle) {
833+
#ifndef _WIN32
725834
close(std::get<int>(*h.shareable_handle));
835+
#endif
726836
}
837+
#ifdef USE_ROCM
838+
C10_CUDA_CHECK(hipMemRelease(h.handle));
839+
#else
727840
C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemRelease_(h.handle));
841+
#endif
728842
}
729843
trimHandles();
730844
}
@@ -770,7 +884,11 @@ struct ExpandableSegment {
770884
std::optional<std::variant<int, CUmemFabricHandle>> shareable_handle;
771885
};
772886
struct ShareHeader {
887+
#ifdef _WIN32
888+
int pid;
889+
#else
773890
pid_t pid;
891+
#endif
774892
size_t segment_size;
775893
size_t num_handles;
776894
};

test/distributed/test_cupy_as_tensor.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,10 @@
88
import torch
99
from torch.multiprocessing.reductions import reduce_tensor
1010
from torch.testing._internal.common_cuda import SM100OrLater
11-
from torch.testing._internal.common_distributed import MultiProcContinuousTest
11+
from torch.testing._internal.common_distributed import (
12+
MultiProcContinuousTest,
13+
skip_if_rocm_multiprocess,
14+
)
1215
from torch.testing._internal.common_utils import (
1316
requires_cuda_p2p_access,
1417
run_tests,
@@ -64,6 +67,7 @@ def _init_device(self) -> None:
6467
def device(self) -> torch.device:
6568
return torch.device(device_type, self.rank)
6669

70+
@skip_if_rocm_multiprocess # RuntimeError: pidfd_getfd Operation not permitted"
6771
@skip_but_pass_in_sandcastle_if(
6872
SM100OrLater,
6973
"Fails if ran in docker environment without privileged access (https://github.com/pytorch/pytorch/issues/165170)",

test/test_cuda.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4990,6 +4990,14 @@ def cb(device, alloc, device_alloc, device_free):
49904990

49914991
def test_allocator_fuzz(self):
49924992
# fuzz
4993+
if (
4994+
torch.version.hip
4995+
and "expandable_segments:True"
4996+
in torch._C._accelerator_getAllocatorSettings()
4997+
):
4998+
raise unittest.SkipTest(
4999+
"ROCm needs https://github.com/ROCm/rocm-systems/pull/3023"
5000+
)
49935001
state = random.getstate()
49945002
random.seed(123)
49955003
N = 10000
@@ -6448,7 +6456,6 @@ def test_graph_capture_reclaim_4_streams(self):
64486456
"graph_capture_record_stream_reuse:False"
64496457
)
64506458

6451-
@skipIfRocm(msg="expandable_segments mode is not supported on ROCm")
64526459
@unittest.skipIf(IS_FBCODE or IS_SANDCASTLE, "Load_inline doesn't work in fbcode")
64536460
def test_mempool_expandable(self):
64546461
torch.cuda.empty_cache()

0 commit comments

Comments
 (0)