diff --git a/paddle/phi/api/include/compat/ATen/core/TensorBody.h b/paddle/phi/api/include/compat/ATen/core/TensorBody.h index 549f18ea6ce906..ba6fe4bad9f017 100644 --- a/paddle/phi/api/include/compat/ATen/core/TensorBody.h +++ b/paddle/phi/api/include/compat/ATen/core/TensorBody.h @@ -726,13 +726,6 @@ class Tensor : public TensorBase { void record_stream(at::Stream s) const; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void record_stream(at::cuda::CUDAStream s) const; -// TODO(youge325): Remove after DeepEP paddle branch is updated to use -// at::Stream -#ifdef PADDLE_WITH_HIP - void record_stream(hipStream_t s) const; -#else - void record_stream(cudaStream_t s) const; -#endif #endif Tensor var(int dim) const { return var(at::IntArrayRef{dim}, true, false); } diff --git a/paddle/phi/api/include/compat/ATen/ops/record_stream.h b/paddle/phi/api/include/compat/ATen/ops/record_stream.h index aa526e1b8a1b97..9977e662b76b25 100644 --- a/paddle/phi/api/include/compat/ATen/ops/record_stream.h +++ b/paddle/phi/api/include/compat/ATen/ops/record_stream.h @@ -53,32 +53,6 @@ inline void Tensor::record_stream(at::Stream s) const { inline void Tensor::record_stream(at::cuda::CUDAStream s) const { record_stream(static_cast<at::Stream>(s)); } - -// TODO(youge325): Remove after DeepEP paddle branch is updated to use -// at::Stream -#ifdef PADDLE_WITH_HIP -inline void Tensor::record_stream(hipStream_t s) const { - auto dense_tensor = - std::dynamic_pointer_cast<phi::DenseTensor>(tensor_.impl()); - PD_CHECK(dense_tensor != nullptr, - "record_stream only supports DenseTensor, but got a non-dense " - "tensor implementation."); - PD_CHECK(dense_tensor->place().GetType() != phi::AllocationType::CPU, - "record_stream is not supported for CPU tensors."); - paddle::memory::RecordStream(dense_tensor->Holder(), s); -} -#else -inline void Tensor::record_stream(cudaStream_t s) const { - auto dense_tensor = - std::dynamic_pointer_cast<phi::DenseTensor>(tensor_.impl()); - PD_CHECK(dense_tensor != nullptr, - "record_stream only 
supports DenseTensor, but got a non-dense " - "tensor implementation."); - PD_CHECK(dense_tensor->place().GetType() != phi::AllocationType::CPU, - "record_stream is not supported for CPU tensors."); - paddle::memory::RecordStream(dense_tensor->Holder(), - reinterpret_cast(s)); -} -#endif #endif + } // namespace at diff --git a/paddle/phi/api/include/compat/c10/core/Event.h b/paddle/phi/api/include/compat/c10/core/Event.h index 77f521c5a202bc..4e6f7919d32bc8 100644 --- a/paddle/phi/api/include/compat/c10/core/Event.h +++ b/paddle/phi/api/include/compat/c10/core/Event.h @@ -95,25 +95,6 @@ struct Event final { void record(const c10::cuda::CUDAStream& stream) { record(stream.unwrap()); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - // TODO(youge325): Remove after DeepEP paddle branch is updated to use - // c10::Stream -#ifdef PADDLE_WITH_HIP - void record(const hipStream_t& stream) { - TORCH_CHECK(device_type_ == DeviceType::CUDA, - "Raw hipStream_t recording is only supported for CUDA events."); - RecordBackendEvent(stream, phi::backends::gpu::GetCurrentDeviceId()); - } -#else - void record(const cudaStream_t& stream) { - TORCH_CHECK( - device_type_ == DeviceType::CUDA, - "Raw cudaStream_t recording is only supported for CUDA events."); - RecordBackendEvent(stream, phi::backends::gpu::GetCurrentDeviceId()); - } -#endif -#endif - void block(const Stream& stream) const { if (!was_marked_for_recording_) { return; diff --git a/paddle/phi/api/include/compat/c10/core/Stream.h b/paddle/phi/api/include/compat/c10/core/Stream.h index e9bcbc939d9215..f68e863eb931dd 100644 --- a/paddle/phi/api/include/compat/c10/core/Stream.h +++ b/paddle/phi/api/include/compat/c10/core/Stream.h @@ -105,3 +105,7 @@ struct hash { } }; } // namespace std + +namespace at { +using c10::Stream; +} diff --git a/paddle/phi/api/include/compat/c10/cuda/CUDAStream.cpp b/paddle/phi/api/include/compat/c10/cuda/CUDAStream.cpp index c7bab19d5bddfa..4ede1f07d16d1f 100644 --- 
a/paddle/phi/api/include/compat/c10/cuda/CUDAStream.cpp +++ b/paddle/phi/api/include/compat/c10/cuda/CUDAStream.cpp @@ -115,6 +115,16 @@ inline void initTLSCurrentStreams() { } // namespace +inline CUDAStream make_cuda_stream(cudaStream_t raw, + c10::DeviceIndex device_index) { + c10::StreamId sid = + static_cast<c10::StreamId>(reinterpret_cast<uintptr_t>(raw)); + return CUDAStream( + c10::Stream(c10::Stream::UNSAFE, + c10::Device(c10::DeviceType::CUDA, device_index), + sid)); +} + CUDAStream getStreamFromPool(const int priority, c10::DeviceIndex device_index) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h b/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h index 085ff44d97e881..8d9a7312a5bc06 100644 --- a/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h +++ b/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h @@ -100,12 +100,6 @@ class CUDAStream { Device device() const { return Device(DeviceType::CUDA, device_index()); } -#ifdef PADDLE_WITH_HIP - hipStream_t raw_stream() const { return stream(); } -#else - cudaStream_t raw_stream() const { return stream(); } -#endif - struct c10::StreamData3 pack3() const { return stream_.pack3(); } @@ -139,28 +133,6 @@ class CUDAStream { Stream stream_; }; -#ifdef PADDLE_WITH_HIP -inline CUDAStream make_cuda_stream(hipStream_t raw, - c10::DeviceIndex device_index) { - c10::StreamId sid = - static_cast<c10::StreamId>(reinterpret_cast<uintptr_t>(raw)); - return CUDAStream( - c10::Stream(c10::Stream::UNSAFE, - c10::Device(c10::DeviceType::CUDA, device_index), - sid)); -} -#else -inline CUDAStream make_cuda_stream(cudaStream_t raw, - c10::DeviceIndex device_index) { - c10::StreamId sid = - static_cast<c10::StreamId>(reinterpret_cast<uintptr_t>(raw)); - return CUDAStream( - c10::Stream(c10::Stream::UNSAFE, - c10::Device(c10::DeviceType::CUDA, device_index), - sid)); -} -#endif - /** * Get the current CUDA stream for the passed CUDA device, or for the * current device if no device index is passed. 
diff --git a/test/cpp/compat/ATen_record_stream_test.cc b/test/cpp/compat/ATen_record_stream_test.cc index 71cdd6ac32bfa1..7971e8b3bbd17e 100644 --- a/test/cpp/compat/ATen_record_stream_test.cc +++ b/test/cpp/compat/ATen_record_stream_test.cc @@ -51,17 +51,6 @@ using RecordCudaStreamMethod = void (at::Tensor::*)(at::cuda::CUDAStream) const; [[maybe_unused]] static RecordCudaStreamMethod g_record_cuda_stream_method = &at::Tensor::record_stream; -// Raw stream type is platform-specific: -// - CUDA: cudaStream_t (CUstream_st*) -// - HIP: hipStream_t (ihipStream_t*) -// Only test the raw stream overload on CUDA builds where cudaStream_t is -// consistently defined. HIP builds use hipStream_t which is a different type. -#if defined(PADDLE_WITH_CUDA) -using RecordRawCudaStreamMethod = void (at::Tensor::*)(cudaStream_t) const; -[[maybe_unused]] static RecordRawCudaStreamMethod - g_record_raw_cuda_stream_method = &at::Tensor::record_stream; -#endif - TEST_F(RecordStreamTest, CudaTensorCurrentCudaStream) { if (!at::cuda::is_available()) { return; @@ -80,13 +69,6 @@ TEST_F(RecordStreamTest, CudaTensorDefaultCudaStream) { EXPECT_NO_THROW(cuda_tensor.record_stream(default_stream)); } -TEST_F(RecordStreamTest, CudaTensorRawCudaStream) { - if (!at::cuda::is_available()) { - return; - } - auto stream = at::cuda::getCurrentCUDAStream(); - EXPECT_NO_THROW(cuda_tensor.record_stream(stream.raw_stream())); -} #endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP // --- Error path: CPU tensor + CPU stream (record_stream does not support CPU diff --git a/test/cpp/compat/c10_Event_test.cc b/test/cpp/compat/c10_Event_test.cc index 9669894f38671f..fb3c594f66982a 100644 --- a/test/cpp/compat/c10_Event_test.cc +++ b/test/cpp/compat/c10_Event_test.cc @@ -40,24 +40,6 @@ TEST(EventTest, CpuEventRecordThrows) { EXPECT_THROW(event.recordOnce(stream), std::exception); } -// Test device_count() works in both CPU and CUDA builds -TEST(EventTest, DeviceCount) { - c10::DeviceIndex count = 
c10::cuda::device_count(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - // In CUDA builds, should return actual device count (>= 0) - EXPECT_GE(count, 0); -#else - // In CPU-only builds, should return 0 - EXPECT_EQ(count, 0); -#endif -} - -#ifdef PADDLE_WITH_CUDA -using RawEventRecordMethod = void (c10::Event::*)(const cudaStream_t&); -[[maybe_unused]] static RawEventRecordMethod g_raw_event_record_method = - &c10::Event::record; -#endif - #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(EventTest, CudaEventLazyCreateAndRecord) { if (!at::cuda::is_available()) { @@ -111,19 +93,6 @@ TEST(EventTest, CudaEventElapsedTimeWithTimingEnabled) { EXPECT_GE(elapsed_ms, 0.0); } -#ifdef PADDLE_WITH_CUDA -TEST(EventTest, CudaEventRawStreamRecordCompatibility) { - if (!at::cuda::is_available()) { - return; - } - auto stream = c10::cuda::getCurrentCUDAStream(); - c10::Event event(c10::DeviceType::CUDA); - EXPECT_NO_THROW(event.record(stream.raw_stream())); - EXPECT_EQ(event.device_index(), stream.device_index()); - EXPECT_TRUE(event.was_marked_for_recording()); -} -#endif - TEST(EventTest, CudaEventRejectsDifferentDeviceRecord) { if (c10::cuda::device_count() < 2) { return;