1717#include < c10/util/llvmMathExtras.h>
1818#include < c10/util/static_tracepoint.h>
1919
20- #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
20+ #if defined(PYTORCH_C10_DRIVER_API_SUPPORTED) || defined(USE_ROCM)
21+ #if defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
2122#include < c10/cuda/driver_api.h>
23+ #endif
24+ #ifndef _WIN32
2225#include < sys/syscall.h>
2326#include < sys/types.h>
2427#include < unistd.h>
28+ #else
29+ #include < process.h>
30+ #endif
2531#endif
2632
2733#include < c10/util/Exception.h>
@@ -269,7 +275,8 @@ struct SegmentRange {
269275 SegmentRange (void * p, size_t s) : ptr(static_cast <char *>(p)), size(s) {}
270276};
271277
272- #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
278+ #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) || \
279+ defined (USE_ROCM)
273280
274281/*
275282Note [Expandable Segments]
@@ -383,8 +390,13 @@ struct ExpandableSegment {
383390 // This allows for some cases where we have to unmap pages earlier in the
384391 // segment to put them at the end.
385392 max_handles_ = numSegments (prop.totalGlobalMem + prop.totalGlobalMem / 8 );
393+ #ifdef USE_ROCM
394+ C10_CUDA_CHECK (hipMemAddressReserve (
395+ &ptr_, segment_size_ * max_handles_, 0ULL , 0 , 0ULL ));
396+ #else
386397 C10_CUDA_DRIVER_CHECK (DriverAPI::get ()->cuMemAddressReserve_ (
387398 &ptr_, segment_size_ * max_handles_, 0ULL , 0 , 0ULL ));
399+ #endif
388400 }
389401 ExpandableSegment (const ExpandableSegment&) = delete ;
390402 ExpandableSegment (ExpandableSegment&&) = delete ;
@@ -408,12 +420,14 @@ struct ExpandableSegment {
408420 // if it fails, use posix file handle
409421 if (CUDAAllocatorConfig::expandable_segments_handle_type () ==
410422 Expandable_Segments_Handle_Type::UNSPECIFIED) {
423+ #ifndef USE_ROCM
411424 CUDAAllocatorConfig::set_expandable_segments_handle_type (
412425 Expandable_Segments_Handle_Type::FABRIC_HANDLE);
413426 auto output = map (range);
414427 if (output.ptr != nullptr ) {
415428 return output;
416429 }
430+ #endif
417431 // if fabric handle is not supported, use posix file handle.
418432 CUDAAllocatorConfig::set_expandable_segments_handle_type (
419433 Expandable_Segments_Handle_Type::POSIX_FD);
@@ -445,33 +459,60 @@ struct ExpandableSegment {
445459 if (enable_ipc_handles) {
446460 if (CUDAAllocatorConfig::expandable_segments_handle_type () !=
447461 Expandable_Segments_Handle_Type::FABRIC_HANDLE) {
462+ #ifdef USE_ROCM
463+ prop.requestedHandleType = hipMemHandleTypePosixFileDescriptor;
464+ #else
448465 prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
466+ #endif
449467 } else {
468+ #ifndef USE_ROCM
450469 prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
470+ #endif
451471 }
452472 }
453473 int flag = 0 ;
474+ #ifndef USE_ROCM
454475 C10_CUDA_DRIVER_CHECK (DriverAPI::get ()->cuDeviceGetAttribute_ (
455476 &flag,
456477 CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED,
457478 device_));
479+ #endif
458480 if (flag)
459481 prop.allocFlags .gpuDirectRDMACapable = 1 ;
460482 prop.location .type = CU_MEM_LOCATION_TYPE_DEVICE;
461483 // NOLINTNEXTLINE(bugprone-signed-char-misuse)
462484 prop.location .id = static_cast <int >(device_);
485+ #ifdef USE_ROCM
486+ auto status = hipMemCreate (&handle, segment_size_, &prop, 0 );
487+ #else
463488 auto status =
464489 DriverAPI::get ()->cuMemCreate_ (&handle, segment_size_, &prop, 0 );
490+ #endif
465491 if (status != CUDA_SUCCESS) {
466492 if (status == CUDA_ERROR_OUT_OF_MEMORY) {
493+ #ifdef USE_ROCM
494+ // hipMemCreate above returned hipErrorOutOfMemory and treated it
495+ // like a sticky runtime error. Which means we need to clear it.
496+ // Unlike the corresponding CUDA Driver API.
497+ (void )hipGetLastError ();
498+ #endif
467499 for (auto j : c10::irange (begin, i)) {
468500 // NOLINTNEXTLINE(bugprone-unchecked-optional-access)
469501 auto h = handles_.at (j).value ();
470502 handles_.at (j) = std::nullopt ;
503+ #ifdef USE_ROCM
504+ C10_CUDA_CHECK (hipMemRelease (h.handle ));
505+ #else
471506 C10_CUDA_DRIVER_CHECK (DriverAPI::get ()->cuMemRelease_ (h.handle ));
507+ #endif
472508 }
473509 trimHandles ();
474510 return rangeFromHandles (begin, begin);
511+ #ifdef USE_ROCM
512+ } else {
513+ C10_CUDA_CHECK (status);
514+ }
515+ #else
475516 } else if (
476517 CUDAAllocatorConfig::expandable_segments_handle_type () ==
477518 Expandable_Segments_Handle_Type::FABRIC_HANDLE) {
@@ -487,6 +528,7 @@ struct ExpandableSegment {
487528 } else {
488529 C10_CUDA_DRIVER_CHECK (status);
489530 }
531+ #endif
490532 }
491533 handles_.at (i) = Handle{handle, std::nullopt };
492534 }
@@ -522,7 +564,11 @@ struct ExpandableSegment {
522564 // thereby ensuring that the handle can be correctly matched in
523565 // ipcMemHandle_to_devptr.
524566 ShareHeader header{};
567+ #ifdef _WIN32
568+ header.pid = _getpid ();
569+ #else
525570 header.pid = getpid ();
571+ #endif
526572 header.segment_size = segment_size_;
527573 header.num_handles = end - begin;
528574
@@ -534,8 +580,13 @@ struct ExpandableSegment {
534580 Expandable_Segments_Handle_Type::FABRIC_HANDLE) {
535581 if (!handle.shareable_handle ) {
536582 int fd = 0 ;
583+ #ifdef USE_ROCM
584+ C10_CUDA_CHECK (hipMemExportToShareableHandle (
585+ &fd, handle.handle , hipMemHandleTypePosixFileDescriptor, 0 ));
586+ #else
537587 C10_CUDA_DRIVER_CHECK (DriverAPI::get ()->cuMemExportToShareableHandle_ (
538588 &fd, handle.handle , CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0 ));
589+ #endif
539590 handle.shareable_handle = fd;
540591 LOG (INFO) << " use posix fd to share expandable segments." ;
541592 }
@@ -546,6 +597,10 @@ struct ExpandableSegment {
546597 reinterpret_cast <const char *>(&*handle.shareable_handle ),
547598 sizeof (int ));
548599 } else {
600+ #ifdef USE_ROCM
601+ TORCH_INTERNAL_ASSERT (
602+ false , " expandable segment with fabric handle not supported" );
603+ #else
549604 if (!handle.shareable_handle ) {
550605 CUmemFabricHandle fabric_handle;
551606 C10_CUDA_DRIVER_CHECK (DriverAPI::get ()->cuMemExportToShareableHandle_ (
@@ -559,6 +614,7 @@ struct ExpandableSegment {
559614 buf.write (
560615 reinterpret_cast <const char *>(&*handle.shareable_handle ),
561616 sizeof (CUmemFabricHandle));
617+ #endif
562618 }
563619 }
564620 return rangeFromHandles (begin, end);
@@ -574,14 +630,20 @@ struct ExpandableSegment {
574630 device, std::nullopt , header.segment_size , std::move (peers));
575631// older build setups (e.g. multiwheels) do not have this syscall, added 2020
576632// but the kernel on the system might still support it.
633+ #ifndef _WIN32
577634#ifndef SYS_pidfd_open
578635#define SYS_pidfd_open 434
579636#endif
580637#ifndef SYS_pidfd_getfd
581638#define SYS_pidfd_getfd 438
582639#endif
640+ #endif // !_WIN32
583641 if (CUDAAllocatorConfig::expandable_segments_handle_type () !=
584642 Expandable_Segments_Handle_Type::FABRIC_HANDLE) {
643+ #ifdef _WIN32
644+ TORCH_CHECK (
645+ false , " IPC expandable segments are not supported on Windows" );
646+ #else
585647 auto pidfd = syscall (SYS_pidfd_open, header.pid , 0 );
586648 TORCH_CHECK (
587649 pidfd != -1 || errno != ENOSYS,
@@ -597,9 +659,13 @@ struct ExpandableSegment {
597659 auto err = errno;
598660 close (static_cast <int >(pidfd));
599661 for (auto & h : segment->handles_ ) {
662+ #ifdef USE_ROCM
663+ C10_CUDA_CHECK (hipMemRelease (h.value ().handle ));
664+ #else
600665 C10_CUDA_DRIVER_CHECK (
601666 // NOLINTNEXTLINE(bugprone-unchecked-optional-access)
602667 DriverAPI::get ()->cuMemRelease_ (h.value ().handle ));
668+ #endif
603669 h = std::nullopt ;
604670 }
605671 TORCH_CHECK (
@@ -609,17 +675,33 @@ struct ExpandableSegment {
609675 TORCH_CHECK (false , " pidfd_getfd: " , c10::utils::str_error (err));
610676 }
611677 CUmemGenericAllocationHandle handle = 0 ;
678+ #ifdef USE_ROCM
679+ #if ROCM_VERSION >= 70100
680+ void * myfd_handle =
681+ reinterpret_cast <void *>(static_cast <uintptr_t >(myfd));
682+ #else
683+ void * myfd_handle = (void *)(uintptr_t )&myfd;
684+ #endif
685+ C10_CUDA_CHECK (hipMemImportFromShareableHandle (
686+ &handle, myfd_handle, hipMemHandleTypePosixFileDescriptor));
687+ #else
612688 C10_CUDA_DRIVER_CHECK (DriverAPI::get ()->cuMemImportFromShareableHandle_ (
613689 &handle,
614690 // NOLINTNEXTLINE(performance-no-int-to-ptr)
615691 (void *)(uintptr_t )myfd,
616692 CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR));
693+ #endif
617694 LOG (INFO) << " use posix fd to import expandable segments." ;
618695 close (static_cast <int >(myfd));
619696 segment->handles_ .emplace_back (Handle{handle, std::nullopt });
620697 }
621698 close (static_cast <int >(pidfd));
699+ #endif // !_WIN32
622700 } else {
701+ #ifdef USE_ROCM
702+ TORCH_INTERNAL_ASSERT (
703+ false , " expandable segment with fabric handle not supported" );
704+ #else
623705 for (auto i : c10::irange (header.num_handles )) {
624706 (void )i;
625707 CUmemFabricHandle fabric_handle;
@@ -634,6 +716,7 @@ struct ExpandableSegment {
634716 LOG (INFO) << " use fabric handle to import expandable segments." ;
635717 segment->handles_ .emplace_back (Handle{handle, std::nullopt });
636718 }
719+ #endif
637720 }
638721 segment->mapAndSetAccess (0 , header.num_handles );
639722 return segment;
@@ -669,8 +752,12 @@ struct ExpandableSegment {
// Destructor: first calls unmapHandles(begin, end) for every (begin, end)
// range that forEachAllocatedRange reports, then frees the address range of
// segment_size_ * max_handles_ bytes starting at ptr_. That is the same size
// expression that is passed to the address-reserve call visible elsewhere in
// this struct.
669752 ~ExpandableSegment () {
670753 forEachAllocatedRange (
671754 [&](size_t begin, size_t end) { unmapHandles (begin, end); });
// ROCm builds call hipMemAddressFree directly and check it with
// C10_CUDA_CHECK; all other builds call cuMemAddressFree_ through
// DriverAPI::get() and check it with C10_CUDA_DRIVER_CHECK.
755+ #ifdef USE_ROCM
756+ C10_CUDA_CHECK (hipMemAddressFree (ptr_, segment_size_ * max_handles_));
757+ #else
672758 C10_CUDA_DRIVER_CHECK (DriverAPI::get ()->cuMemAddressFree_ (
673759 ptr_, segment_size_ * max_handles_));
760+ #endif
674761 }
675762
676763 private:
@@ -680,19 +767,36 @@ struct ExpandableSegment {
680767 // NOLINTNEXTLINE(bugprone-signed-char-misuse)
681768 desc.location .id = static_cast <int >(device);
682769 desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
770+ #ifdef USE_ROCM
771+ C10_CUDA_CHECK (hipMemSetAccess (
772+ ptr () + begin * segment_size_,
773+ (end - begin) * segment_size_,
774+ &desc,
775+ 1 ));
776+ #else
683777 C10_CUDA_DRIVER_CHECK (DriverAPI::get ()->cuMemSetAccess_ (
684778 ptr_ + begin * segment_size_, (end - begin) * segment_size_, &desc, 1 ));
779+ #endif
685780 }
686781
687782 void mapAndSetAccess (size_t begin, size_t end) {
688783 for (auto i : c10::irange (begin, end)) {
784+ #ifdef USE_ROCM
785+ C10_CUDA_CHECK (hipMemMap (
786+ ptr () + i * segment_size_,
787+ segment_size_,
788+ 0 ,
789+ handles_.at (i).value ().handle ,
790+ 0ULL ));
791+ #else
689792 C10_CUDA_DRIVER_CHECK (DriverAPI::get ()->cuMemMap_ (
690793 ptr_ + i * segment_size_,
691794 segment_size_,
692795 0 ,
693796 // NOLINTNEXTLINE(bugprone-unchecked-optional-access)
694797 handles_.at (i).value ().handle ,
695798 0ULL ));
799+ #endif
696800 }
697801 mapped_size_ += (end - begin) * segment_size_;
698802 setAccess (device_, begin, end);
@@ -719,12 +823,22 @@ struct ExpandableSegment {
719823 // NOLINTNEXTLINE(bugprone-unchecked-optional-access)
720824 Handle h = handles_.at (i).value ();
721825 handles_.at (i) = std::nullopt ;
826+ #ifdef USE_ROCM
827+ C10_CUDA_CHECK (hipMemUnmap (ptr () + segment_size_ * i, segment_size_));
828+ #else
722829 C10_CUDA_DRIVER_CHECK (DriverAPI::get ()->cuMemUnmap_ (
723830 ptr_ + segment_size_ * i, segment_size_));
831+ #endif
724832 if (h.shareable_handle ) {
833+ #ifndef _WIN32
725834 close (std::get<int >(*h.shareable_handle ));
835+ #endif
726836 }
837+ #ifdef USE_ROCM
838+ C10_CUDA_CHECK (hipMemRelease (h.handle ));
839+ #else
727840 C10_CUDA_DRIVER_CHECK (DriverAPI::get ()->cuMemRelease_ (h.handle ));
841+ #endif
728842 }
729843 trimHandles ();
730844 }
@@ -770,7 +884,11 @@ struct ExpandableSegment {
770884 std::optional<std::variant<int , CUmemFabricHandle>> shareable_handle;
771885 };
// Small header describing a shared range of an expandable segment.
// The exporting code in this struct fills it with the current process id,
// segment_size_, and the handle count (end - begin). The importing code
// passes pid to pidfd_open, passes segment_size on when building the new
// segment, and calls mapAndSetAccess(0, num_handles).
772886 struct ShareHeader {
// Process id of the exporter. Declared as int on Windows, where it is filled
// from _getpid(); elsewhere it is pid_t, filled from getpid().
// NOTE(review): presumably int is used because pid_t is not available on
// Windows — confirm.
887+ #ifdef _WIN32
888+ int pid;
889+ #else
773890 pid_t pid;
891+ #endif
// Set from segment_size_ by the exporter.
774892 size_t segment_size;
// Number of handles in the shared range (end - begin on the exporting side).
775893 size_t num_handles;
776894 };
0 commit comments