Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions paddle/phi/api/include/compat/ATen/core/TensorBody.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,9 @@
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/memory/malloc.h"

#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#elif defined(PADDLE_WITH_CUDA)
#include <cuda_runtime_api.h>
#endif

Expand Down Expand Up @@ -724,9 +726,13 @@ class Tensor : public TensorBase {
void record_stream(at::Stream s) const;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void record_stream(at::cuda::CUDAStream s) const;
// TODO(youge325): Remove after DeepEP paddle branch is updated to use
// at::Stream
// TODO(youge325): Remove after DeepEP paddle branch is updated to use
// at::Stream
#ifdef PADDLE_WITH_HIP
void record_stream(hipStream_t s) const;
#else
void record_stream(cudaStream_t s) const;
#endif
#endif

Tensor var(int dim) const { return var(at::IntArrayRef{dim}, true, false); }
Expand Down
3 changes: 2 additions & 1 deletion paddle/phi/api/include/compat/ATen/cuda/CUDAContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@

#pragma once

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include <ATen/cuda/CUDAContextLight.h>

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include <ATen/cuda/Exceptions.h>
#include <c10/cuda/CUDAStream.h>
#endif
16 changes: 10 additions & 6 deletions paddle/phi/api/include/compat/ATen/cuda/CUDAContextLight.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,14 @@
// cublasLT was introduced in CUDA 10.1 but we enable only for 11.1 that also
// added bf16 support

#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \
defined(USE_CUDSS)
#if defined(PADDLE_WITH_HIP)
#include <hip/hip_runtime.h>
#elif defined(PADDLE_WITH_CUDA)
#if defined(USE_CUDSS)
#include <cudss.h>
#endif
#include <driver_types.h>
#endif

#include <c10/core/Allocator.h>
#include <c10/cuda/CUDAFunctions.h>
Expand Down Expand Up @@ -118,13 +122,13 @@ void* getCUDABlasLtWorkspace();

CUDAContextSolverHandle getCurrentCUDASolverDnHandle();

#if defined(USE_CUDSS)
cudssHandle_t getCurrentCudssHandle();
#endif

// Get the CUDA device allocator for the current device.
// Returns a pointer to a c10::Allocator that allocates GPU memory.
c10::Allocator* getCUDADeviceAllocator();
#endif

#if defined(USE_CUDSS)
cudssHandle_t getCurrentCudssHandle();
#endif

} // namespace at::cuda
34 changes: 31 additions & 3 deletions paddle/phi/api/include/compat/ATen/cuda/CUDADataType.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,36 @@

#include <c10/core/ScalarType.h>

#if defined(PADDLE_WITH_HIP)
#include <hip/hip_runtime.h>
#include <hip/library_types.h>
#elif defined(PADDLE_WITH_CUDA)
#include <cuda.h>
#include <library_types.h>
#endif

namespace at::cuda {

#if defined(PADDLE_WITH_HIP)
using cudaDataType = hipDataType;
#define CUDA_R_16F HIP_R_16F
#define CUDA_R_32F HIP_R_32F
#define CUDA_R_64F HIP_R_64F
#define CUDA_C_16F HIP_C_16F
#define CUDA_C_32F HIP_C_32F
#define CUDA_C_64F HIP_C_64F
#define CUDA_R_8U HIP_R_8U
#define CUDA_R_8I HIP_R_8I
#define CUDA_R_32I HIP_R_32I
#define CUDA_R_16I HIP_R_16I
#define CUDA_R_64I HIP_R_64I
#define CUDA_R_16BF HIP_R_16BF
#define CUDA_R_8F_E4M3 HIP_R_8F_E4M3
#define CUDA_R_8F_E5M2 HIP_R_8F_E5M2
#elif defined(PADDLE_WITH_CUDA)
using cudaDataType = cudaDataType;
Copy link

Copilot AI Apr 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The CUDA branch uses using cudaDataType = cudaDataType;, which relies on unqualified lookup to find the global ::cudaDataType and is easy to misread as a self-referential alias. Qualifying the RHS (e.g., ::cudaDataType) or removing the alias entirely on the CUDA path would make the intent clearer and avoid confusion for tools/readers.

Suggested change
using cudaDataType = cudaDataType;
using cudaDataType = ::cudaDataType;

Copilot uses AI. Check for mistakes.
#endif

template <typename scalar_t>
cudaDataType getCudaDataType() {
static_assert(false && sizeof(scalar_t),
Expand Down Expand Up @@ -110,17 +135,20 @@ inline cudaDataType ScalarTypeToCudaDataType(
return CUDA_R_64I;
case c10::ScalarType::BFloat16:
return CUDA_R_16BF;
#if !defined(USE_ROCM) || ROCM_VERSION >= 60300
#if defined(PADDLE_WITH_HIP)
case c10::ScalarType::Float8_e4m3fn:
return CUDA_R_8F_E4M3;
case c10::ScalarType::Float8_e5m2:
return CUDA_R_8F_E5M2;
#endif
#if defined(USE_ROCM)
case c10::ScalarType::Float8_e4m3fnuz:
return HIP_R_8F_E4M3_FNUZ;
case c10::ScalarType::Float8_e5m2fnuz:
return HIP_R_8F_E5M2_FNUZ;
#elif !defined(USE_ROCM) || ROCM_VERSION >= 60300
case c10::ScalarType::Float8_e4m3fn:
return CUDA_R_8F_E4M3;
case c10::ScalarType::Float8_e5m2:
return CUDA_R_8F_E5M2;
Comment on lines +138 to +151
Copy link

Copilot AI Apr 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In ScalarTypeToCudaDataType(), the HIP branch now only handles Float8_e4m3fnuz/Float8_e5m2fnuz, but not Float8_e4m3fn/Float8_e5m2. These scalar types exist in c10::ScalarType, and on HIP builds this function will hit TORCH_INTERNAL_ASSERT(false, ...) for them. Consider restoring handling for Float8_e4m3fn/Float8_e5m2 in the HIP branch (mapping to HIP_R_8F_E4M3 / HIP_R_8F_E5M2) to avoid a runtime assert regression.

Copilot uses AI. Check for mistakes.
#endif
// #if (defined(CUDA_VERSION) && CUDA_VERSION >= 12080) ||
// (defined(USE_ROCM) && ROCM_VERSION >= 70000)
Expand Down
52 changes: 52 additions & 0 deletions paddle/phi/api/include/compat/ATen/cuda/CUDAEvent.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,11 @@

#pragma once

#if defined(PADDLE_WITH_HIP)
#include <hip/hip_runtime.h>
#elif defined(PADDLE_WITH_CUDA)
#include <cuda_runtime_api.h>
#endif

#include <c10/core/Device.h>
#include <c10/cuda/CUDAException.h>
Expand All @@ -41,7 +45,11 @@ struct CUDAEvent {

// Destroy the lazily-created backend event, if one exists. The destroy
// status is deliberately not checked: destructors must not throw.
~CUDAEvent() {
if (is_created_) {
#ifdef PADDLE_WITH_HIP
hipEventDestroy(event_);
#else
cudaEventDestroy(event_);
#endif
}
}

Expand All @@ -56,19 +64,31 @@ struct CUDAEvent {
return *this;
}

#ifdef PADDLE_WITH_HIP
operator hipEvent_t() const { return event(); }

hipEvent_t event() const { return event_; }
#else
operator cudaEvent_t() const { return event(); }

cudaEvent_t event() const { return event_; }
#endif

bool isCreated() const { return is_created_; }

c10::DeviceIndex device_index() const { return device_index_; }

// Non-blocking completion check. An event that was never created trivially
// reports complete. "Not ready" is the expected pending state and yields
// false; any other error status is surfaced through C10_CUDA_CHECK.
bool query() const {
if (!is_created_) return true;
#ifdef PADDLE_WITH_HIP
hipError_t err = hipEventQuery(event_);
if (err == hipSuccess) return true;
if (err != hipErrorNotReady) C10_CUDA_CHECK(err);
#else
cudaError_t err = cudaEventQuery(event_);
if (err == cudaSuccess) return true;
if (err != cudaErrorNotReady) C10_CUDA_CHECK(err);
#endif
return false;
}

Expand All @@ -85,7 +105,11 @@ struct CUDAEvent {
stream.unwrap().device_index(),
".");
c10::cuda::CUDAGuard guard(device_index_);
#ifdef PADDLE_WITH_HIP
C10_CUDA_CHECK(hipEventRecord(event_, stream.stream()));
#else
C10_CUDA_CHECK(cudaEventRecord(event_, stream.stream()));
#endif
}

void recordOnce(const CUDAStream& stream) {
Expand All @@ -98,13 +122,21 @@ struct CUDAEvent {
// Make work subsequently submitted to `stream` wait until this event
// completes (a stream-ordered wait; the host thread is not blocked).
// No-op when the event was never created.
void block(const CUDAStream& stream) {
if (is_created_) {
// Switch to the stream's device for the wait call.
c10::cuda::CUDAGuard guard(stream.unwrap().device_index());
#ifdef PADDLE_WITH_HIP
C10_CUDA_CHECK(hipStreamWaitEvent(stream.stream(), event_, 0));
#else
C10_CUDA_CHECK(cudaStreamWaitEvent(stream.stream(), event_, 0));
#endif
}
}

// Block the calling host thread until the event has completed.
// No-op when the event was never created.
void synchronize() const {
if (is_created_) {
#ifdef PADDLE_WITH_HIP
C10_CUDA_CHECK(hipEventSynchronize(event_));
#else
C10_CUDA_CHECK(cudaEventSynchronize(event_));
#endif
}
}

Expand All @@ -117,21 +149,37 @@ struct CUDAEvent {
"Both events must be completed before calculating elapsed time.");
float time_ms = 0;
c10::cuda::CUDAGuard guard(device_index_);
#ifdef PADDLE_WITH_HIP
C10_CUDA_CHECK(hipEventElapsedTime(&time_ms, event_, other.event_));
#else
C10_CUDA_CHECK(cudaEventElapsedTime(&time_ms, event_, other.event_));
#endif
return time_ms;
}

private:
#ifdef PADDLE_WITH_HIP
unsigned int flags_ = hipEventDisableTiming;
#else
unsigned int flags_ = cudaEventDisableTiming;
#endif
bool is_created_ = false;
bool was_recorded_ = false;
c10::DeviceIndex device_index_ = -1;
#ifdef PADDLE_WITH_HIP
hipEvent_t event_{};
#else
cudaEvent_t event_{};
#endif

// Lazily create the backend event on `device_index` using `flags_`
// (timing disabled by default — see the flags_ member initializer).
void createEvent(c10::DeviceIndex device_index) {
device_index_ = device_index;
c10::cuda::CUDAGuard guard(device_index_);
#ifdef PADDLE_WITH_HIP
C10_CUDA_CHECK(hipEventCreateWithFlags(&event_, flags_));
#else
C10_CUDA_CHECK(cudaEventCreateWithFlags(&event_, flags_));
#endif
is_created_ = true;
}

Expand All @@ -140,7 +188,11 @@ struct CUDAEvent {
is_created_ = std::exchange(other.is_created_, false);
was_recorded_ = other.was_recorded_;
device_index_ = other.device_index_;
#ifdef PADDLE_WITH_HIP
event_ = std::exchange(other.event_, hipEvent_t{});
#else
event_ = std::exchange(other.event_, cudaEvent_t{});
#endif
}
};

Expand Down
13 changes: 13 additions & 0 deletions paddle/phi/api/include/compat/ATen/ops/record_stream.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,18 @@ inline void Tensor::record_stream(at::cuda::CUDAStream s) const {

// TODO(youge325): Remove after DeepEP paddle branch is updated to use
// at::Stream
#ifdef PADDLE_WITH_HIP
// Record the given raw HIP stream against this tensor's allocation,
// presumably so the allocator delays reuse of the memory until work queued
// on `s` finishes — TODO confirm against paddle::memory::RecordStream.
// Raw-stream overload kept only for DeepEP compatibility (see TODO above).
inline void Tensor::record_stream(hipStream_t s) const {
// Only DenseTensor carries a memory Holder that RecordStream can track.
auto dense_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(tensor_.impl());
PD_CHECK(dense_tensor != nullptr,
"record_stream only supports DenseTensor, but got a non-dense "
"tensor implementation.");
// Stream recording is meaningful only for device allocations.
PD_CHECK(dense_tensor->place().GetType() != phi::AllocationType::CPU,
"record_stream is not supported for CPU tensors.");
paddle::memory::RecordStream(dense_tensor->Holder(), s);
}
#else
inline void Tensor::record_stream(cudaStream_t s) const {
auto dense_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(tensor_.impl());
Expand All @@ -68,4 +80,5 @@ inline void Tensor::record_stream(cudaStream_t s) const {
reinterpret_cast<gpuStream_t>(s));
}
#endif
#endif
} // namespace at
16 changes: 14 additions & 2 deletions paddle/phi/api/include/compat/c10/core/Event.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,15 +95,23 @@ struct Event final {
void record(const c10::cuda::CUDAStream& stream) { record(stream.unwrap()); }
#endif

#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// TODO(youge325): Remove after DeepEP paddle branch is updated to use
// c10::Stream
// Raw-stream overload: record this event on the given backend stream on the
// current device. NOTE(review): the HIP branch also requires
// device_type_ == DeviceType::CUDA — presumably HIP devices are reported as
// CUDA within this compat layer; confirm.
#ifdef PADDLE_WITH_HIP
void record(const hipStream_t& stream) {
TORCH_CHECK(device_type_ == DeviceType::CUDA,
"Raw hipStream_t recording is only supported for CUDA events.");
RecordBackendEvent(stream, phi::backends::gpu::GetCurrentDeviceId());
}
#else
void record(const cudaStream_t& stream) {
TORCH_CHECK(
device_type_ == DeviceType::CUDA,
"Raw cudaStream_t recording is only supported for CUDA events.");
RecordBackendEvent(stream, phi::backends::gpu::GetCurrentDeviceId());
}
#endif
#endif

void block(const Stream& stream) const {
Expand Down Expand Up @@ -243,8 +251,12 @@ struct Event final {
TORCH_CHECK(false, "Backend doesn't support events.");
}

#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifdef PADDLE_WITH_HIP
hipEvent_t cuda_event() const { return backend_event_; }
#else
cudaEvent_t cuda_event() const { return backend_event_; }
#endif
#endif

private:
Expand Down
Loading
Loading