Skip to content

Commit 2fea146

Browse files
pragupta and haoyuz
authored
[ROCm] Reland: Enable expandable segments (pytorch#173330) (pytorch#177974) (#3106)
Summary: Original pull request: pytorch#173330 Fixes pytorch#168737. Fixes pytorch#168736. The original diff enabled expandable segments for ROCm by adding `#ifdef USE_ROCM` guards throughout CUDACachingAllocator.cpp to use HIP APIs (hipMemAddressReserve, hipMemCreate, hipMemMap, etc.) instead of CUDA driver APIs when building for ROCm. Root cause: In HIP/ROCm 6.2.1, the field name for memory allocation properties is `requestedHandleType` (singular), not `requestedHandleTypes` (plural) as in CUDA. Additionally, `hipMemHandleTypeFabric` does not exist in HIP, so the `CU_MEM_HANDLE_TYPE_FABRIC` assignment must be skipped on ROCm. Fix applied on top of the original diff (from D96652342): - Use `prop.requestedHandleType = hipMemHandleTypePosixFileDescriptor` under `#ifdef USE_ROCM` (singular field name, HIP constant) - Use `prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR` for CUDA (plural field name, CUDA constant) - Skip the `CU_MEM_HANDLE_TYPE_FABRIC` assignment entirely on ROCm under `#ifndef USE_ROCM`, as `hipMemHandleTypeFabric` does not exist in HIP Co-authored-by: Prachi Gupta prachi.gupta@amd.com Co-authored-by: Jeff Daily jeff.daily@amd.com Co-authored-by: moonshadow-25 moonshadow-25@users.noreply.github.com Co-authored-by: Vighanesh Sharma vighaneshsharma@gmail.com Test Plan: ``` fbpkg build //aps_models/ads/ecosystem/eval/cogwheel_tests/amd:cogwheel_aps_ads_icvr_kd_eval_amd_test_harness --build-remote ``` https://www.internalfb.com/sandcastle/workflow/1049338713192153464 Differential Revision: D97211385 Pull Request resolved: pytorch#177974 Approved by: https://github.com/jeffdaily, https://github.com/echen4096 (cherry picked from commit 5792701) ## Motivation <!-- Explain the purpose of this PR and the goals it aims to achieve. --> ## Technical Details <!-- Explain the changes along with any relevant GitHub links. --> ## Test Plan <!-- Explain any relevant testing done to verify this PR. 
--> ## Test Result <!-- Briefly summarize test outcomes. --> ## Submission Checklist - [ ] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests. Co-authored-by: Haoyu Zhang <haoyuz@meta.com>
1 parent 752cc24 commit 2fea146

File tree

8 files changed

+142
-12
lines changed

8 files changed

+142
-12
lines changed

c10/cuda/CUDAAllocatorConfig.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ class C10_CUDA_API CUDAAllocatorConfig {
3434
static bool expandable_segments() {
3535
bool enabled = c10::CachingAllocator::AcceleratorAllocatorConfig::
3636
use_expandable_segments();
37-
#ifndef PYTORCH_C10_DRIVER_API_SUPPORTED
37+
#if !defined(PYTORCH_C10_DRIVER_API_SUPPORTED) && !defined(USE_ROCM)
3838
if (enabled) {
3939
TORCH_WARN_ONCE("expandable_segments not supported on this platform")
4040
}

c10/cuda/CUDACachingAllocator.cpp

Lines changed: 120 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,17 @@
1717
#include <c10/util/llvmMathExtras.h>
1818
#include <c10/util/static_tracepoint.h>
1919

20-
#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
20+
#if defined(PYTORCH_C10_DRIVER_API_SUPPORTED) || defined(USE_ROCM)
21+
#if defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
2122
#include <c10/cuda/driver_api.h>
23+
#endif
24+
#ifndef _WIN32
2225
#include <sys/syscall.h>
2326
#include <sys/types.h>
2427
#include <unistd.h>
28+
#else
29+
#include <process.h>
30+
#endif
2531
#endif
2632

2733
#include <c10/util/Exception.h>
@@ -269,7 +275,8 @@ struct SegmentRange {
269275
SegmentRange(void* p, size_t s) : ptr(static_cast<char*>(p)), size(s) {}
270276
};
271277

272-
#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
278+
#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) || \
279+
defined(USE_ROCM)
273280

274281
/*
275282
Note [Expandable Segments]
@@ -383,8 +390,13 @@ struct ExpandableSegment {
383390
// This allows for some cases where we have to unmap pages earlier in the
384391
// segment to put them at the end.
385392
max_handles_ = numSegments(prop.totalGlobalMem + prop.totalGlobalMem / 8);
393+
#ifdef USE_ROCM
394+
C10_CUDA_CHECK(hipMemAddressReserve(
395+
&ptr_, segment_size_ * max_handles_, 0ULL, 0, 0ULL));
396+
#else
386397
C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemAddressReserve_(
387398
&ptr_, segment_size_ * max_handles_, 0ULL, 0, 0ULL));
399+
#endif
388400
}
389401
ExpandableSegment(const ExpandableSegment&) = delete;
390402
ExpandableSegment(ExpandableSegment&&) = delete;
@@ -408,12 +420,14 @@ struct ExpandableSegment {
408420
// if it fails, use posix file handle
409421
if (CUDAAllocatorConfig::expandable_segments_handle_type() ==
410422
Expandable_Segments_Handle_Type::UNSPECIFIED) {
423+
#ifndef USE_ROCM
411424
CUDAAllocatorConfig::set_expandable_segments_handle_type(
412425
Expandable_Segments_Handle_Type::FABRIC_HANDLE);
413426
auto output = map(range);
414427
if (output.ptr != nullptr) {
415428
return output;
416429
}
430+
#endif
417431
// if fabric handle is not supported, use posix file handle.
418432
CUDAAllocatorConfig::set_expandable_segments_handle_type(
419433
Expandable_Segments_Handle_Type::POSIX_FD);
@@ -445,33 +459,60 @@ struct ExpandableSegment {
445459
if (enable_ipc_handles) {
446460
if (CUDAAllocatorConfig::expandable_segments_handle_type() !=
447461
Expandable_Segments_Handle_Type::FABRIC_HANDLE) {
462+
#ifdef USE_ROCM
463+
prop.requestedHandleType = hipMemHandleTypePosixFileDescriptor;
464+
#else
448465
prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
466+
#endif
449467
} else {
468+
#ifndef USE_ROCM
450469
prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
470+
#endif
451471
}
452472
}
453473
int flag = 0;
474+
#ifndef USE_ROCM
454475
C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuDeviceGetAttribute_(
455476
&flag,
456477
CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED,
457478
device_));
479+
#endif
458480
if (flag)
459481
prop.allocFlags.gpuDirectRDMACapable = 1;
460482
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
461483
// NOLINTNEXTLINE(bugprone-signed-char-misuse)
462484
prop.location.id = static_cast<int>(device_);
485+
#ifdef USE_ROCM
486+
auto status = hipMemCreate(&handle, segment_size_, &prop, 0);
487+
#else
463488
auto status =
464489
DriverAPI::get()->cuMemCreate_(&handle, segment_size_, &prop, 0);
490+
#endif
465491
if (status != CUDA_SUCCESS) {
466492
if (status == CUDA_ERROR_OUT_OF_MEMORY) {
493+
#ifdef USE_ROCM
494+
// hipMemCreate above returned hipErrorOutOfMemory and treated it
495+
// like a sticky runtime error. Which means we need to clear it.
496+
// Unlike the corresponding CUDA Driver API.
497+
(void)hipGetLastError();
498+
#endif
467499
for (auto j : c10::irange(begin, i)) {
468500
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
469501
auto h = handles_.at(j).value();
470502
handles_.at(j) = std::nullopt;
503+
#ifdef USE_ROCM
504+
C10_CUDA_CHECK(hipMemRelease(h.handle));
505+
#else
471506
C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemRelease_(h.handle));
507+
#endif
472508
}
473509
trimHandles();
474510
return rangeFromHandles(begin, begin);
511+
#ifdef USE_ROCM
512+
} else {
513+
C10_CUDA_CHECK(status);
514+
}
515+
#else
475516
} else if (
476517
CUDAAllocatorConfig::expandable_segments_handle_type() ==
477518
Expandable_Segments_Handle_Type::FABRIC_HANDLE) {
@@ -487,6 +528,7 @@ struct ExpandableSegment {
487528
} else {
488529
C10_CUDA_DRIVER_CHECK(status);
489530
}
531+
#endif
490532
}
491533
handles_.at(i) = Handle{handle, std::nullopt};
492534
}
@@ -522,7 +564,11 @@ struct ExpandableSegment {
522564
// thereby ensuring that the handle can be correctly matched in
523565
// ipcMemHandle_to_devptr.
524566
ShareHeader header{};
567+
#ifdef _WIN32
568+
header.pid = _getpid();
569+
#else
525570
header.pid = getpid();
571+
#endif
526572
header.segment_size = segment_size_;
527573
header.num_handles = end - begin;
528574

@@ -534,8 +580,13 @@ struct ExpandableSegment {
534580
Expandable_Segments_Handle_Type::FABRIC_HANDLE) {
535581
if (!handle.shareable_handle) {
536582
int fd = 0;
583+
#ifdef USE_ROCM
584+
C10_CUDA_CHECK(hipMemExportToShareableHandle(
585+
&fd, handle.handle, hipMemHandleTypePosixFileDescriptor, 0));
586+
#else
537587
C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemExportToShareableHandle_(
538588
&fd, handle.handle, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0));
589+
#endif
539590
handle.shareable_handle = fd;
540591
LOG(INFO) << "use posix fd to share expandable segments.";
541592
}
@@ -546,6 +597,10 @@ struct ExpandableSegment {
546597
reinterpret_cast<const char*>(&*handle.shareable_handle),
547598
sizeof(int));
548599
} else {
600+
#ifdef USE_ROCM
601+
TORCH_INTERNAL_ASSERT(
602+
false, "expandable segment with fabric handle not supported");
603+
#else
549604
if (!handle.shareable_handle) {
550605
CUmemFabricHandle fabric_handle;
551606
C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemExportToShareableHandle_(
@@ -559,6 +614,7 @@ struct ExpandableSegment {
559614
buf.write(
560615
reinterpret_cast<const char*>(&*handle.shareable_handle),
561616
sizeof(CUmemFabricHandle));
617+
#endif
562618
}
563619
}
564620
return rangeFromHandles(begin, end);
@@ -574,14 +630,20 @@ struct ExpandableSegment {
574630
device, std::nullopt, header.segment_size, std::move(peers));
575631
// older build setups (e.g. multiwheels) do not have this syscall, added 2020
576632
// but the kernel on the system might still support it.
633+
#ifndef _WIN32
577634
#ifndef SYS_pidfd_open
578635
#define SYS_pidfd_open 434
579636
#endif
580637
#ifndef SYS_pidfd_getfd
581638
#define SYS_pidfd_getfd 438
582639
#endif
640+
#endif // !_WIN32
583641
if (CUDAAllocatorConfig::expandable_segments_handle_type() !=
584642
Expandable_Segments_Handle_Type::FABRIC_HANDLE) {
643+
#ifdef _WIN32
644+
TORCH_CHECK(
645+
false, "IPC expandable segments are not supported on Windows");
646+
#else
585647
auto pidfd = syscall(SYS_pidfd_open, header.pid, 0);
586648
TORCH_CHECK(
587649
pidfd != -1 || errno != ENOSYS,
@@ -597,9 +659,13 @@ struct ExpandableSegment {
597659
auto err = errno;
598660
close(static_cast<int>(pidfd));
599661
for (auto& h : segment->handles_) {
662+
#ifdef USE_ROCM
663+
C10_CUDA_CHECK(hipMemRelease(h.value().handle));
664+
#else
600665
C10_CUDA_DRIVER_CHECK(
601666
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
602667
DriverAPI::get()->cuMemRelease_(h.value().handle));
668+
#endif
603669
h = std::nullopt;
604670
}
605671
TORCH_CHECK(
@@ -609,17 +675,33 @@ struct ExpandableSegment {
609675
TORCH_CHECK(false, "pidfd_getfd: ", c10::utils::str_error(err));
610676
}
611677
CUmemGenericAllocationHandle handle = 0;
678+
#ifdef USE_ROCM
679+
#if ROCM_VERSION >= 70100
680+
void* myfd_handle =
681+
reinterpret_cast<void*>(static_cast<uintptr_t>(myfd));
682+
#else
683+
void* myfd_handle = (void*)(uintptr_t)&myfd;
684+
#endif
685+
C10_CUDA_CHECK(hipMemImportFromShareableHandle(
686+
&handle, myfd_handle, hipMemHandleTypePosixFileDescriptor));
687+
#else
612688
C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemImportFromShareableHandle_(
613689
&handle,
614690
// NOLINTNEXTLINE(performance-no-int-to-ptr)
615691
(void*)(uintptr_t)myfd,
616692
CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR));
693+
#endif
617694
LOG(INFO) << "use posix fd to import expandable segments.";
618695
close(static_cast<int>(myfd));
619696
segment->handles_.emplace_back(Handle{handle, std::nullopt});
620697
}
621698
close(static_cast<int>(pidfd));
699+
#endif // !_WIN32
622700
} else {
701+
#ifdef USE_ROCM
702+
TORCH_INTERNAL_ASSERT(
703+
false, "expandable segment with fabric handle not supported");
704+
#else
623705
for (auto i : c10::irange(header.num_handles)) {
624706
(void)i;
625707
CUmemFabricHandle fabric_handle;
@@ -634,6 +716,7 @@ struct ExpandableSegment {
634716
LOG(INFO) << "use fabric handle to import expandable segments.";
635717
segment->handles_.emplace_back(Handle{handle, std::nullopt});
636718
}
719+
#endif
637720
}
638721
segment->mapAndSetAccess(0, header.num_handles);
639722
return segment;
@@ -669,8 +752,12 @@ struct ExpandableSegment {
669752
~ExpandableSegment() {
670753
forEachAllocatedRange(
671754
[&](size_t begin, size_t end) { unmapHandles(begin, end); });
755+
#ifdef USE_ROCM
756+
C10_CUDA_CHECK(hipMemAddressFree(ptr_, segment_size_ * max_handles_));
757+
#else
672758
C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemAddressFree_(
673759
ptr_, segment_size_ * max_handles_));
760+
#endif
674761
}
675762

676763
private:
@@ -680,19 +767,36 @@ struct ExpandableSegment {
680767
// NOLINTNEXTLINE(bugprone-signed-char-misuse)
681768
desc.location.id = static_cast<int>(device);
682769
desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
770+
#ifdef USE_ROCM
771+
C10_CUDA_CHECK(hipMemSetAccess(
772+
ptr() + begin * segment_size_,
773+
(end - begin) * segment_size_,
774+
&desc,
775+
1));
776+
#else
683777
C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemSetAccess_(
684778
ptr_ + begin * segment_size_, (end - begin) * segment_size_, &desc, 1));
779+
#endif
685780
}
686781

687782
void mapAndSetAccess(size_t begin, size_t end) {
688783
for (auto i : c10::irange(begin, end)) {
784+
#ifdef USE_ROCM
785+
C10_CUDA_CHECK(hipMemMap(
786+
ptr() + i * segment_size_,
787+
segment_size_,
788+
0,
789+
handles_.at(i).value().handle,
790+
0ULL));
791+
#else
689792
C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemMap_(
690793
ptr_ + i * segment_size_,
691794
segment_size_,
692795
0,
693796
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
694797
handles_.at(i).value().handle,
695798
0ULL));
799+
#endif
696800
}
697801
mapped_size_ += (end - begin) * segment_size_;
698802
setAccess(device_, begin, end);
@@ -719,12 +823,22 @@ struct ExpandableSegment {
719823
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
720824
Handle h = handles_.at(i).value();
721825
handles_.at(i) = std::nullopt;
826+
#ifdef USE_ROCM
827+
C10_CUDA_CHECK(hipMemUnmap(ptr() + segment_size_ * i, segment_size_));
828+
#else
722829
C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemUnmap_(
723830
ptr_ + segment_size_ * i, segment_size_));
831+
#endif
724832
if (h.shareable_handle) {
833+
#ifndef _WIN32
725834
close(std::get<int>(*h.shareable_handle));
835+
#endif
726836
}
837+
#ifdef USE_ROCM
838+
C10_CUDA_CHECK(hipMemRelease(h.handle));
839+
#else
727840
C10_CUDA_DRIVER_CHECK(DriverAPI::get()->cuMemRelease_(h.handle));
841+
#endif
728842
}
729843
trimHandles();
730844
}
@@ -770,7 +884,11 @@ struct ExpandableSegment {
770884
std::optional<std::variant<int, CUmemFabricHandle>> shareable_handle;
771885
};
772886
struct ShareHeader {
887+
#ifdef _WIN32
888+
int pid;
889+
#else
773890
pid_t pid;
891+
#endif
774892
size_t segment_size;
775893
size_t num_handles;
776894
};

test/distributed/test_cupy_as_tensor.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,10 @@
88
import torch
99
from torch.multiprocessing.reductions import reduce_tensor
1010
from torch.testing._internal.common_cuda import SM100OrLater
11-
from torch.testing._internal.common_distributed import MultiProcContinuousTest
11+
from torch.testing._internal.common_distributed import (
12+
MultiProcContinuousTest,
13+
skip_if_rocm_multiprocess,
14+
)
1215
from torch.testing._internal.common_utils import (
1316
requires_cuda_p2p_access,
1417
run_tests,
@@ -64,6 +67,7 @@ def _init_device(self) -> None:
6467
def device(self) -> torch.device:
6568
return torch.device(device_type, self.rank)
6669

70+
@skip_if_rocm_multiprocess # RuntimeError: pidfd_getfd Operation not permitted"
6771
@skip_but_pass_in_sandcastle_if(
6872
SM100OrLater,
6973
"Fails if ran in docker environment without privileged access (https://github.com/pytorch/pytorch/issues/165170)",

test/test_cuda.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4990,6 +4990,14 @@ def cb(device, alloc, device_alloc, device_free):
49904990

49914991
def test_allocator_fuzz(self):
49924992
# fuzz
4993+
if (
4994+
torch.version.hip
4995+
and "expandable_segments:True"
4996+
in torch._C._accelerator_getAllocatorSettings()
4997+
):
4998+
raise unittest.SkipTest(
4999+
"ROCm needs https://github.com/ROCm/rocm-systems/pull/3023"
5000+
)
49935001
state = random.getstate()
49945002
random.seed(123)
49955003
N = 10000
@@ -6448,7 +6456,6 @@ def test_graph_capture_reclaim_4_streams(self):
64486456
"graph_capture_record_stream_reuse:False"
64496457
)
64506458

6451-
@skipIfRocm(msg="expandable_segments mode is not supported on ROCm")
64526459
@unittest.skipIf(IS_FBCODE or IS_SANDCASTLE, "Load_inline doesn't work in fbcode")
64536460
def test_mempool_expandable(self):
64546461
torch.cuda.empty_cache()

0 commit comments

Comments
 (0)