Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions paddle/phi/api/include/compat/ATen/core/TensorBody.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,9 @@
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/memory/malloc.h"

#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#elif defined(PADDLE_WITH_CUDA)
#include <cuda_runtime_api.h>
#endif

Expand Down Expand Up @@ -724,9 +726,13 @@ class Tensor : public TensorBase {
void record_stream(at::Stream s) const;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void record_stream(at::cuda::CUDAStream s) const;
// TODO(youge325): Remove after DeepEP paddle branch is updated to use
// at::Stream
// TODO(youge325): Remove after DeepEP paddle branch is updated to use
// at::Stream
#ifdef PADDLE_WITH_HIP
void record_stream(hipStream_t s) const;
#else
void record_stream(cudaStream_t s) const;
#endif
#endif

Tensor var(int dim) const { return var(at::IntArrayRef{dim}, true, false); }
Expand Down
3 changes: 2 additions & 1 deletion paddle/phi/api/include/compat/ATen/cuda/CUDAContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@

#pragma once

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include <ATen/cuda/CUDAContextLight.h>

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include <ATen/cuda/Exceptions.h>
#include <c10/cuda/CUDAStream.h>
#endif
16 changes: 10 additions & 6 deletions paddle/phi/api/include/compat/ATen/cuda/CUDAContextLight.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,14 @@
// cublasLT was introduced in CUDA 10.1 but we enable only for 11.1 that also
// added bf16 support

#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \
defined(USE_CUDSS)
#if defined(PADDLE_WITH_HIP)
#include <hip/hip_runtime.h>
#elif defined(PADDLE_WITH_CUDA)
#if defined(USE_CUDSS)
#include <cudss.h>
#endif
#include <driver_types.h>
#endif

#include <c10/core/Allocator.h>
#include <c10/cuda/CUDAFunctions.h>
Expand Down Expand Up @@ -118,13 +122,13 @@ void* getCUDABlasLtWorkspace();

CUDAContextSolverHandle getCurrentCUDASolverDnHandle();

#if defined(USE_CUDSS)
cudssHandle_t getCurrentCudssHandle();
#endif

// Get the CUDA device allocator for the current device.
// Returns a pointer to a c10::Allocator that allocates GPU memory.
c10::Allocator* getCUDADeviceAllocator();
#endif

#if defined(USE_CUDSS)
cudssHandle_t getCurrentCudssHandle();
#endif

} // namespace at::cuda
34 changes: 31 additions & 3 deletions paddle/phi/api/include/compat/ATen/cuda/CUDADataType.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,36 @@

#include <c10/core/ScalarType.h>

#if defined(PADDLE_WITH_HIP)
#include <hip/hip_runtime.h>
#include <hip/library_types.h>
#elif defined(PADDLE_WITH_CUDA)
#include <cuda.h>
#include <library_types.h>
#endif

namespace at::cuda {

#if defined(PADDLE_WITH_HIP)
using cudaDataType = hipDataType;
#define CUDA_R_16F HIP_R_16F
#define CUDA_R_32F HIP_R_32F
#define CUDA_R_64F HIP_R_64F
#define CUDA_C_16F HIP_C_16F
#define CUDA_C_32F HIP_C_32F
#define CUDA_C_64F HIP_C_64F
#define CUDA_R_8U HIP_R_8U
#define CUDA_R_8I HIP_R_8I
#define CUDA_R_32I HIP_R_32I
#define CUDA_R_16I HIP_R_16I
#define CUDA_R_64I HIP_R_64I
#define CUDA_R_16BF HIP_R_16BF
#define CUDA_R_8F_E4M3 HIP_R_8F_E4M3
#define CUDA_R_8F_E5M2 HIP_R_8F_E5M2
#elif defined(PADDLE_WITH_CUDA)
using cudaDataType = cudaDataType;
Copy link

Copilot AI Apr 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The CUDA branch uses using cudaDataType = cudaDataType;, which relies on unqualified lookup to find the global ::cudaDataType and is easy to misread as a self-referential alias. Qualifying the RHS (e.g., ::cudaDataType) or removing the alias entirely on the CUDA path would make the intent clearer and avoid confusion for tools/readers.

Suggested change
using cudaDataType = cudaDataType;
using cudaDataType = ::cudaDataType;

Copilot uses AI. Check for mistakes.
#endif

template <typename scalar_t>
cudaDataType getCudaDataType() {
static_assert(false && sizeof(scalar_t),
Expand Down Expand Up @@ -110,17 +135,20 @@ inline cudaDataType ScalarTypeToCudaDataType(
return CUDA_R_64I;
case c10::ScalarType::BFloat16:
return CUDA_R_16BF;
#if !defined(USE_ROCM) || ROCM_VERSION >= 60300
#if defined(PADDLE_WITH_HIP)
case c10::ScalarType::Float8_e4m3fn:
return CUDA_R_8F_E4M3;
case c10::ScalarType::Float8_e5m2:
return CUDA_R_8F_E5M2;
#endif
#if defined(USE_ROCM)
case c10::ScalarType::Float8_e4m3fnuz:
return HIP_R_8F_E4M3_FNUZ;
case c10::ScalarType::Float8_e5m2fnuz:
return HIP_R_8F_E5M2_FNUZ;
#elif !defined(USE_ROCM) || ROCM_VERSION >= 60300
case c10::ScalarType::Float8_e4m3fn:
return CUDA_R_8F_E4M3;
case c10::ScalarType::Float8_e5m2:
return CUDA_R_8F_E5M2;
Comment on lines +138 to +151
Copy link

Copilot AI Apr 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In ScalarTypeToCudaDataType(), the HIP branch now only handles Float8_e4m3fnuz/Float8_e5m2fnuz, but not Float8_e4m3fn/Float8_e5m2. These scalar types exist in c10::ScalarType, and on HIP builds this function will hit TORCH_INTERNAL_ASSERT(false, ...) for them. Consider restoring handling for Float8_e4m3fn/Float8_e5m2 in the HIP branch (mapping to HIP_R_8F_E4M3 / HIP_R_8F_E5M2) to avoid a runtime assert regression.

Copilot uses AI. Check for mistakes.
#endif
// #if (defined(CUDA_VERSION) && CUDA_VERSION >= 12080) ||
// (defined(USE_ROCM) && ROCM_VERSION >= 70000)
Expand Down
52 changes: 52 additions & 0 deletions paddle/phi/api/include/compat/ATen/cuda/CUDAEvent.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,11 @@

#pragma once

#if defined(PADDLE_WITH_HIP)
#include <hip/hip_runtime.h>
#elif defined(PADDLE_WITH_CUDA)
#include <cuda_runtime_api.h>
#endif

#include <c10/core/Device.h>
#include <c10/cuda/CUDAException.h>
Expand All @@ -41,7 +45,11 @@ struct CUDAEvent {

// Destroy the lazily-created backend event, if one exists. The destroy
// status is deliberately not checked: destructors must not throw.
~CUDAEvent() {
if (is_created_) {
#ifdef PADDLE_WITH_HIP
hipEventDestroy(event_);
#else
cudaEventDestroy(event_);
#endif
}
}

Expand All @@ -56,19 +64,31 @@ struct CUDAEvent {
return *this;
}

#ifdef PADDLE_WITH_HIP
operator hipEvent_t() const { return event(); }

hipEvent_t event() const { return event_; }
#else
operator cudaEvent_t() const { return event(); }

cudaEvent_t event() const { return event_; }
#endif

bool isCreated() const { return is_created_; }

c10::DeviceIndex device_index() const { return device_index_; }

// Non-blocking completion check. An event that was never created trivially
// reports complete. "Not ready" is the expected pending state and yields
// false; any other error status is surfaced through C10_CUDA_CHECK.
bool query() const {
if (!is_created_) return true;
#ifdef PADDLE_WITH_HIP
hipError_t err = hipEventQuery(event_);
if (err == hipSuccess) return true;
if (err != hipErrorNotReady) C10_CUDA_CHECK(err);
#else
cudaError_t err = cudaEventQuery(event_);
if (err == cudaSuccess) return true;
if (err != cudaErrorNotReady) C10_CUDA_CHECK(err);
#endif
return false;
}

Expand All @@ -85,7 +105,11 @@ struct CUDAEvent {
stream.unwrap().device_index(),
".");
c10::cuda::CUDAGuard guard(device_index_);
#ifdef PADDLE_WITH_HIP
C10_CUDA_CHECK(hipEventRecord(event_, stream.stream()));
#else
C10_CUDA_CHECK(cudaEventRecord(event_, stream.stream()));
#endif
}

void recordOnce(const CUDAStream& stream) {
Expand All @@ -98,13 +122,21 @@ struct CUDAEvent {
// Make work subsequently submitted to `stream` wait until this event
// completes (a stream-ordered wait; the host thread is not blocked).
// No-op when the event was never created.
void block(const CUDAStream& stream) {
if (is_created_) {
// Switch to the stream's device for the wait call.
c10::cuda::CUDAGuard guard(stream.unwrap().device_index());
#ifdef PADDLE_WITH_HIP
C10_CUDA_CHECK(hipStreamWaitEvent(stream.stream(), event_, 0));
#else
C10_CUDA_CHECK(cudaStreamWaitEvent(stream.stream(), event_, 0));
#endif
}
}

// Block the calling host thread until the event has completed.
// No-op when the event was never created.
void synchronize() const {
if (is_created_) {
#ifdef PADDLE_WITH_HIP
C10_CUDA_CHECK(hipEventSynchronize(event_));
#else
C10_CUDA_CHECK(cudaEventSynchronize(event_));
#endif
}
}

Expand All @@ -117,21 +149,37 @@ struct CUDAEvent {
"Both events must be completed before calculating elapsed time.");
float time_ms = 0;
c10::cuda::CUDAGuard guard(device_index_);
#ifdef PADDLE_WITH_HIP
C10_CUDA_CHECK(hipEventElapsedTime(&time_ms, event_, other.event_));
#else
C10_CUDA_CHECK(cudaEventElapsedTime(&time_ms, event_, other.event_));
#endif
return time_ms;
}

private:
#ifdef PADDLE_WITH_HIP
unsigned int flags_ = hipEventDisableTiming;
#else
unsigned int flags_ = cudaEventDisableTiming;
#endif
bool is_created_ = false;
bool was_recorded_ = false;
c10::DeviceIndex device_index_ = -1;
#ifdef PADDLE_WITH_HIP
hipEvent_t event_{};
#else
cudaEvent_t event_{};
#endif

// Lazily create the backend event on `device_index` using `flags_`
// (timing disabled by default — see the flags_ member initializer).
void createEvent(c10::DeviceIndex device_index) {
device_index_ = device_index;
c10::cuda::CUDAGuard guard(device_index_);
#ifdef PADDLE_WITH_HIP
C10_CUDA_CHECK(hipEventCreateWithFlags(&event_, flags_));
#else
C10_CUDA_CHECK(cudaEventCreateWithFlags(&event_, flags_));
#endif
is_created_ = true;
}

Expand All @@ -140,7 +188,11 @@ struct CUDAEvent {
is_created_ = std::exchange(other.is_created_, false);
was_recorded_ = other.was_recorded_;
device_index_ = other.device_index_;
#ifdef PADDLE_WITH_HIP
event_ = std::exchange(other.event_, hipEvent_t{});
#else
event_ = std::exchange(other.event_, cudaEvent_t{});
#endif
}
};

Expand Down
13 changes: 13 additions & 0 deletions paddle/phi/api/include/compat/ATen/ops/record_stream.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,18 @@ inline void Tensor::record_stream(at::cuda::CUDAStream s) const {

// TODO(youge325): Remove after DeepEP paddle branch is updated to use
// at::Stream
#ifdef PADDLE_WITH_HIP
// Record the given raw HIP stream against this tensor's allocation,
// presumably so the allocator delays reuse of the memory until work queued
// on `s` finishes — TODO confirm against paddle::memory::RecordStream.
// Raw-stream overload kept only for DeepEP compatibility (see TODO above).
inline void Tensor::record_stream(hipStream_t s) const {
// Only DenseTensor carries a memory Holder that RecordStream can track.
auto dense_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(tensor_.impl());
PD_CHECK(dense_tensor != nullptr,
"record_stream only supports DenseTensor, but got a non-dense "
"tensor implementation.");
// Stream recording is meaningful only for device allocations.
PD_CHECK(dense_tensor->place().GetType() != phi::AllocationType::CPU,
"record_stream is not supported for CPU tensors.");
paddle::memory::RecordStream(dense_tensor->Holder(), s);
}
#else
inline void Tensor::record_stream(cudaStream_t s) const {
auto dense_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(tensor_.impl());
Expand All @@ -68,4 +80,5 @@ inline void Tensor::record_stream(cudaStream_t s) const {
reinterpret_cast<gpuStream_t>(s));
}
#endif
#endif
} // namespace at
16 changes: 14 additions & 2 deletions paddle/phi/api/include/compat/c10/core/Event.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,15 +95,23 @@ struct Event final {
void record(const c10::cuda::CUDAStream& stream) { record(stream.unwrap()); }
#endif

#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// TODO(youge325): Remove after DeepEP paddle branch is updated to use
// c10::Stream
// Raw-stream overload: record this event on the given backend stream on the
// current device. NOTE(review): the HIP branch also requires
// device_type_ == DeviceType::CUDA — presumably HIP devices are reported as
// CUDA within this compat layer; confirm.
#ifdef PADDLE_WITH_HIP
void record(const hipStream_t& stream) {
TORCH_CHECK(device_type_ == DeviceType::CUDA,
"Raw hipStream_t recording is only supported for CUDA events.");
RecordBackendEvent(stream, phi::backends::gpu::GetCurrentDeviceId());
}
#else
void record(const cudaStream_t& stream) {
TORCH_CHECK(
device_type_ == DeviceType::CUDA,
"Raw cudaStream_t recording is only supported for CUDA events.");
RecordBackendEvent(stream, phi::backends::gpu::GetCurrentDeviceId());
}
#endif
#endif

void block(const Stream& stream) const {
Expand Down Expand Up @@ -243,8 +251,12 @@ struct Event final {
TORCH_CHECK(false, "Backend doesn't support events.");
}

#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifdef PADDLE_WITH_HIP
hipEvent_t cuda_event() const { return backend_event_; }
#else
cudaEvent_t cuda_event() const { return backend_event_; }
#endif
#endif

private:
Expand Down
Loading
Loading