diff --git a/.gitignore b/.gitignore index abd75c85cc55ab..7df4d0d77fc8f9 100644 --- a/.gitignore +++ b/.gitignore @@ -71,6 +71,7 @@ third_party/ bazel-* .humanize .codex +.paddle-agent build_* # clion workspace. diff --git a/paddle/phi/api/include/compat/ATen/core/TensorBody.h b/paddle/phi/api/include/compat/ATen/core/TensorBody.h index a12dfba80b5bec..7593bd8840b0eb 100644 --- a/paddle/phi/api/include/compat/ATen/core/TensorBody.h +++ b/paddle/phi/api/include/compat/ATen/core/TensorBody.h @@ -678,12 +678,47 @@ class Tensor : public TensorBase { at::Tensor& absolute_() const { return abs_(); } Tensor operator[](int64_t index) const { - return paddle::experimental::slice(tensor_, - /*axes=*/{0}, - /*starts=*/{index}, - /*ends=*/{index + 1}, - /*infer_flags=*/{1}, - /*decrease_axis=*/{0}); + // Use as_strided to create a view (shares storage with original tensor) + // This allows fill_ to modify the original tensor + int64_t numel = tensor_.numel(); + if (numel == 0) { + PD_THROW("operator[]: cannot index empty tensor"); + } + + // Handle negative index + if (index < 0) { + index += tensor_.dims()[0]; + } + + // Check bounds + if (index < 0 || index >= tensor_.dims()[0]) { + PD_THROW("operator[]: index ", + index, + " out of range for tensor of size ", + tensor_.dims(), + " at dimension 0"); + } + + // For 1D tensor: create a scalar view (0-dim tensor) with proper offset + // For multi-D tensor: create a view of the row at index + std::vector new_sizes; + std::vector new_strides; + + auto dims = tensor_.dims(); + auto stride = tensor_.strides(); + + // Skip the first dimension (dim 0) + for (int i = 1; i < dims.size(); ++i) { + new_sizes.push_back(dims[i]); + new_strides.push_back(stride[i]); + } + + // Calculate storage offset + int64_t storage_offset = index * stride[0]; + + return as_strided(c10::IntArrayRef(new_sizes), + c10::IntArrayRef(new_strides), + storage_offset); } void record_stream(at::Stream s) const; diff --git a/paddle/phi/api/include/compat/ATen/cuda/CUDAContextLight.h b/paddle/phi/api/include/compat/ATen/cuda/CUDAContextLight.h index fbd825981a38c3..5388dc00ffb0e9 100644 --- a/paddle/phi/api/include/compat/ATen/cuda/CUDAContextLight.h +++ b/paddle/phi/api/include/compat/ATen/cuda/CUDAContextLight.h @@ -49,7 +49,7 @@ using CUDAContextSparseHandle = phi::sparseHandle_t; using CUDAContextBlasHandle = phi::blasHandle_t; using CUDAContextBlasLtHandle = phi::blasLtHandle_t; using CUDAContextSolverHandle = phi::solverHandle_t; -#else +#elif defined(PADDLE_WITH_CUDA) using CUDAContextDeviceProp = cudaDeviceProp; using CUDAContextSparseHandle = cusparseHandle_t; using CUDAContextBlasHandle = cublasHandle_t; @@ -90,6 +90,7 @@ inline int64_t getNumGPUs() { return c10::cuda::device_count(); } */ inline bool is_available() { return c10::cuda::device_count() > 0; } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) CUDAContextDeviceProp* getCurrentDeviceProperties(); int warp_size(); @@ -115,7 +116,6 @@ size_t getChosenWorkspaceSize(); size_t getCUDABlasLtWorkspaceSize(); void* getCUDABlasLtWorkspace(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) CUDAContextSolverHandle getCurrentCUDASolverDnHandle(); // Get the CUDA device allocator for the current device. diff --git a/paddle/phi/api/include/compat/ATen/cuda/PhiloxUtils.cuh b/paddle/phi/api/include/compat/ATen/cuda/PhiloxUtils.cuh index ae9ff1868579c6..3be67efa2ccc8d 100644 --- a/paddle/phi/api/include/compat/ATen/cuda/PhiloxUtils.cuh +++ b/paddle/phi/api/include/compat/ATen/cuda/PhiloxUtils.cuh @@ -26,8 +26,12 @@ namespace at::cuda::philox { // In-kernel call to retrieve philox seed and offset from a PhiloxCudaState // instance whether that instance was created with graph capture underway or // not. See Note [CUDA Graph-safe RNG states]. +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) __host__ __device__ __forceinline__ std::tuple unpack( at::PhiloxCudaState arg) { +#else +inline std::tuple unpack(at::PhiloxCudaState arg) { +#endif if (arg.captured_) { // static_cast avoids "warning: invalid narrowing conversion from "long" to // "unsigned long". diff --git a/paddle/phi/api/include/compat/ATen/ops/as_strided.h b/paddle/phi/api/include/compat/ATen/ops/as_strided.h index 185786b70c6dee..0823642849d31a 100644 --- a/paddle/phi/api/include/compat/ATen/ops/as_strided.h +++ b/paddle/phi/api/include/compat/ATen/ops/as_strided.h @@ -35,20 +35,28 @@ inline at::Tensor Tensor::as_strided( if (!src_tensor) { PD_THROW("as_strided: tensor must be a DenseTensor"); } - auto new_tensor = std::make_shared(); - new_tensor->ShareDataWith(*src_tensor); + // Create new meta with desired shape and strides first std::vector size_vec(size.begin(), size.end()); std::vector stride_vec(stride.begin(), stride.end()); - new_tensor->Resize(common::make_ddim(size_vec)); - new_tensor->set_strides(common::make_ddim(stride_vec)); + + // Create new DenseTensor with correct meta, then share data + // We need to create a temporary DenseTensor with the right meta + // because ShareDataWith copies the source meta which we don't want + auto new_tensor = std::make_shared(); + + // First, set up the holder by sharing data (this copies src meta, we'll + // override) + new_tensor->ShareDataWith(*src_tensor); + + // Now create the correct meta with new shape/strides + phi::DenseTensorMeta meta(src_tensor->dtype(), + common::make_ddim(size_vec), + common::make_ddim(stride_vec)); + // Calculate offset in bytes int64_t offset = storage_offset.has_value() ? storage_offset.value() : 0; - if (offset != 0) { - auto meta = phi::DenseTensorMeta(new_tensor->meta()); - // meta.offset is in bytes; storage_offset is in elements - meta.offset = - static_cast(offset) * phi::SizeOf(new_tensor->dtype()); - new_tensor->set_meta(meta); - } + meta.offset = src_tensor->meta().offset + + static_cast(offset) * phi::SizeOf(src_tensor->dtype()); + new_tensor->set_meta(meta); PaddleTensor result; result.set_impl(new_tensor); return Tensor(result); @@ -67,16 +75,15 @@ inline const at::Tensor& Tensor::as_strided_( } std::vector size_vec(size.begin(), size.end()); std::vector stride_vec(stride.begin(), stride.end()); - src_tensor->Resize(common::make_ddim(size_vec)); - src_tensor->set_strides(common::make_ddim(stride_vec)); + // Use set_meta instead of Resize + set_strides to avoid contiguous check + phi::DenseTensorMeta meta(src_tensor->dtype(), + common::make_ddim(size_vec), + common::make_ddim(stride_vec)); + meta.layout = src_tensor->layout(); int64_t offset = storage_offset.has_value() ? storage_offset.value() : 0; - if (offset != 0) { - auto meta = phi::DenseTensorMeta(src_tensor->meta()); - // meta.offset is in bytes; storage_offset is in elements - meta.offset = - static_cast(offset) * phi::SizeOf(src_tensor->dtype()); - src_tensor->set_meta(meta); - } + meta.offset = src_tensor->meta().offset + + static_cast(offset) * phi::SizeOf(src_tensor->dtype()); + src_tensor->set_meta(meta); return *this; } diff --git a/paddle/phi/api/include/compat/ATen/ops/record_stream.h b/paddle/phi/api/include/compat/ATen/ops/record_stream.h index 73cb5dd4b2247c..1a2644d7e1a360 100644 --- a/paddle/phi/api/include/compat/ATen/ops/record_stream.h +++ b/paddle/phi/api/include/compat/ATen/ops/record_stream.h @@ -64,7 +64,8 @@ inline void Tensor::record_stream(cudaStream_t s) const { "tensor implementation."); PD_CHECK(dense_tensor->place().GetType() != phi::AllocationType::CPU, "record_stream is not supported for CPU tensors."); - paddle::memory::RecordStream(dense_tensor->Holder(), s); + paddle::memory::RecordStream(dense_tensor->Holder(), + reinterpret_cast(s)); } #endif } // namespace at diff --git a/paddle/phi/api/include/compat/c10/core/TensorOptions.h b/paddle/phi/api/include/compat/c10/core/TensorOptions.h index 30c06c95005b8c..be94794d95206f 100644 --- a/paddle/phi/api/include/compat/c10/core/TensorOptions.h +++ b/paddle/phi/api/include/compat/c10/core/TensorOptions.h @@ -373,7 +373,3 @@ inline std::string toString(const TensorOptions& options) { namespace at { using namespace c10; // NOLINT } // namespace at - -namespace torch { -using namespace c10; // NOLINT -} // namespace torch diff --git a/test/cpp/compat/ATen_CUDABlas_test.cc b/test/cpp/compat/ATen_CUDABlas_test.cc index 03e49e1576cf45..8a8bf7ebfcebea 100644 --- a/test/cpp/compat/ATen_CUDABlas_test.cc +++ b/test/cpp/compat/ATen_CUDABlas_test.cc @@ -24,7 +24,6 @@ #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/common/float16.h" -#include "test/cpp/compat/cuda_test_utils.h" // Helper: allocate three same-sized device buffers, copy host data in, // invoke a kernel via |fn|, copy results back, synchronize, then free. @@ -73,7 +72,6 @@ class GemmTester { static double toDouble(T val) { return static_cast(val); } void Run() { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); std::vector h_a = {T(1), T(3), T(2), T(4)}; std::vector h_b = {T(5), T(7), T(6), T(8)}; std::vector h_c(N * N, T(0)); @@ -95,7 +93,6 @@ class GemmTester { // transA='T': C = alpha * A^T * B + beta * C // A^T = [[1,3],[2,4]], A^T * B = [[26,30],[38,44]] void RunTransA() { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); std::vector h_a = {T(1), T(3), T(2), T(4)}; std::vector h_b = {T(5), T(7), T(6), T(8)}; std::vector h_c(N * N, T(0)); @@ -136,7 +133,6 @@ TEST(CUDABlasTest, GemmFloatTransA) { } TEST(CUDABlasTest, GemmFloatTransALowercase) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); constexpr int64_t N = 2; std::vector h_a = {1.F, 3.F, 2.F, 4.F}; @@ -181,7 +177,6 @@ TEST(CUDABlasTest, GemmBFloat16) { // A stored col-major: col0={1+i,2+2i}, col1={3+3i,4+4i} // A^H stored col-major: col0={1-i,3-3i}, col1={2-2i,4-4i} TEST(CUDABlasTest, GemmComplexFloatConjTrans) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); constexpr int64_t N = 2; using T = c10::complex; @@ -209,7 +204,6 @@ TEST(CUDABlasTest, GemmComplexFloatConjTrans) { // Same as above but uses lowercase 'c'/'n' to exercise that switch-case branch. TEST(CUDABlasTest, GemmComplexDoubleConjTransLower) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); constexpr int64_t N = 2; using T = c10::complex; diff --git a/test/cpp/compat/ATen_CUDAContext_test.cc b/test/cpp/compat/ATen_CUDAContext_test.cc index 3eb86c96301fa4..245548f0aaf729 100644 --- a/test/cpp/compat/ATen_CUDAContext_test.cc +++ b/test/cpp/compat/ATen_CUDAContext_test.cc @@ -12,16 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - #include #include #include -#include #include "gtest/gtest.h" + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include #include "paddle/phi/backends/gpu/gpu_info.h" -#include "test/cpp/compat/cuda_test_utils.h" +#endif // --------------------------------------------------------------------------- // CUDAFunctions.h — covers the 2 missing lines: @@ -29,34 +29,45 @@ // --------------------------------------------------------------------------- TEST(CUDAFunctionsTest, DeviceSynchronize) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // Exercises the PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()) branch ASSERT_NO_THROW(c10::cuda::device_synchronize()); +#else + // In CPU-only builds, device_synchronize throws + ASSERT_THROW(c10::cuda::device_synchronize(), std::exception); +#endif } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(CUDAFunctionsTest, StreamSynchronize) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); // Exercises phi::backends::gpu::GpuStreamSync() auto stream = c10::cuda::getCurrentCUDAStream(); ASSERT_NO_THROW(c10::cuda::stream_synchronize(stream)); } +#endif +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(CUDAFunctionsTest, AtNamespaceAliases) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); // Exercises the using aliases in at::cuda namespace ASSERT_NO_THROW(at::cuda::device_synchronize()); auto stream = c10::cuda::getCurrentCUDAStream(); ASSERT_NO_THROW(at::cuda::stream_synchronize(stream)); } +#endif // --------------------------------------------------------------------------- // CUDAContextLight.h — covers the 1 missing line: is_available() // --------------------------------------------------------------------------- TEST(CUDAContextLightTest, IsAvailable) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // With GPU compilation and at least one device, this must be true. int gpu_count = phi::backends::gpu::GetGPUDeviceCount(); ASSERT_EQ(at::cuda::is_available(), gpu_count > 0); +#else + // In CPU-only builds, is_available() should return false + ASSERT_FALSE(at::cuda::is_available()); +#endif } // --------------------------------------------------------------------------- @@ -65,14 +76,21 @@ TEST(CUDAContextLightTest, IsAvailable) { // getNumGPUs() delegages to c10::cuda::device_count() TEST(CUDAContextLightTest, GetNumGPUs) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); int64_t n = at::cuda::getNumGPUs(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ASSERT_GE(n, 1); +#else + // In CPU-only builds, device_count() returns 0 + ASSERT_EQ(n, 0); +#endif } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +// The following tests require CUDA runtime and can only run in CUDA builds + // getCurrentDeviceProperties() / getDeviceProperties() TEST(CUDAContextLightTest, DeviceProperties) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); ASSERT_NE(prop, nullptr); // Sanity-check a few well-known fields @@ -87,7 +105,6 @@ TEST(CUDAContextLightTest, DeviceProperties) { // warp_size() TEST(CUDAContextLightTest, WarpSize) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); int ws = at::cuda::warp_size(); // All NVIDIA and AMD GPU architectures have warp size of 32 or 64 ASSERT_TRUE(ws == 32 || ws == 64); @@ -95,7 +112,6 @@ TEST(CUDAContextLightTest, WarpSize) { // canDeviceAccessPeer() — a device cannot peer-access itself TEST(CUDAContextLightTest, CanDeviceAccessPeer) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); int device_id = phi::backends::gpu::GetCurrentDeviceId(); // Self-to-self peer access is always false per CUDA spec bool self_peer = at::cuda::canDeviceAccessPeer(device_id, device_id); @@ -104,26 +120,22 @@ TEST(CUDAContextLightTest, CanDeviceAccessPeer) { // Handle accessors — all must return non-null handles TEST(CUDAContextLightTest, GetCurrentCUDABlasHandle) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); cublasHandle_t h = at::cuda::getCurrentCUDABlasHandle(); ASSERT_NE(h, nullptr); } TEST(CUDAContextLightTest, GetCurrentCUDABlasLtHandle) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); cublasLtHandle_t h = at::cuda::getCurrentCUDABlasLtHandle(); ASSERT_NE(h, nullptr); } TEST(CUDAContextLightTest, GetCurrentCUDASparseHandle) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); cusparseHandle_t h = at::cuda::getCurrentCUDASparseHandle(); ASSERT_NE(h, nullptr); } #if defined(CUDART_VERSION) || defined(USE_ROCM) TEST(CUDAContextLightTest, GetCurrentCUDASolverDnHandle) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); cusolverDnHandle_t h = at::cuda::getCurrentCUDASolverDnHandle(); ASSERT_NE(h, nullptr); } @@ -160,7 +172,6 @@ TEST(CUDAContextLightTest, GetChosenWorkspaceSize) { // getCUDABlasLtWorkspaceSize() / getCUDABlasLtWorkspace() TEST(CUDAContextLightTest, CUDABlasLtWorkspace) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); size_t sz = at::cuda::getCUDABlasLtWorkspaceSize(); ASSERT_GT(sz, 0UL); @@ -176,7 +187,6 @@ TEST(CUDAContextLightTest, CUDADeviceAllocatorSingleton) { } TEST(CUDAContextLightTest, CUDADeviceAllocatorCloneAndCopyData) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); c10::Allocator* alloc = at::cuda::getCUDADeviceAllocator(); ASSERT_NE(alloc, nullptr); @@ -207,7 +217,6 @@ TEST(CUDAContextLightTest, CUDADeviceAllocatorCloneAndCopyData) { } TEST(CUDAContextLightTest, CUDADeviceAllocatorCloneZeroBytes) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); c10::Allocator* alloc = at::cuda::getCUDADeviceAllocator(); ASSERT_NE(alloc, nullptr); @@ -220,7 +229,6 @@ TEST(CUDAContextLightTest, CUDADeviceAllocatorCloneZeroBytes) { } TEST(CUDAContextLightTest, AllocatorZeroSizeAndNoopCopyBranches) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); c10::Allocator* alloc = at::cuda::getCUDADeviceAllocator(); ASSERT_NE(alloc, nullptr); diff --git a/test/cpp/compat/ATen_Utils_test.cc b/test/cpp/compat/ATen_Utils_test.cc index 5666dc56b69310..a3815c5a85ebfa 100644 --- a/test/cpp/compat/ATen_Utils_test.cc +++ b/test/cpp/compat/ATen_Utils_test.cc @@ -28,7 +28,6 @@ #include "ATen/ATen.h" #include "gtest/gtest.h" #include "paddle/phi/common/float16.h" -#include "test/cpp/compat/cuda_test_utils.h" #include "torch/all.h" // ============================================================ @@ -153,7 +152,6 @@ TEST(ATenUtilsTest, TensorBackend_CPUDevice_MatchesTensorCPU) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(ATenUtilsTest, TensorBackend_GPUDevice) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); std::vector data = {7.0f, 8.0f}; at::TensorOptions opts = at::TensorOptions().dtype(at::kFloat).device(c10::Device(c10::kCUDA, 0)); @@ -164,7 +162,6 @@ TEST(ATenUtilsTest, TensorBackend_GPUDevice) { } TEST(ATenUtilsTest, TensorComplexBackend_GPUDevice) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); std::vector> data = {{1.0f, 0.0f}}; at::TensorOptions opts = at::TensorOptions() .dtype(at::kComplexFloat) diff --git a/test/cpp/compat/ATen_as_strided_test.cc b/test/cpp/compat/ATen_as_strided_test.cc index 02f6a03524aba8..fe2003167f827e 100644 --- a/test/cpp/compat/ATen_as_strided_test.cc +++ b/test/cpp/compat/ATen_as_strided_test.cc @@ -20,8 +20,11 @@ #include "ATen/ATen.h" #include "gtest/gtest.h" +#include "paddle/common/macros.h" #include "torch/all.h" +COMMON_DECLARE_bool(use_stride_kernel); + namespace { class TensorAsStridedTest : public ::testing::Test {}; @@ -84,6 +87,7 @@ TEST_F(TensorAsStridedTest, AsStridedInplaceWithOffset) { t.as_strided_({2, 3}, {3, 1}, 1); ASSERT_EQ(t.sizes(), c10::IntArrayRef({2, 3})); + ASSERT_NE(t.data_ptr(), original_data_ptr); float* data = t.data_ptr(); ASSERT_FLOAT_EQ(data[0], 1.0f); @@ -134,6 +138,9 @@ TEST_F(TensorAsStridedTest, AsStridedScatterWithOffset) { } TEST_F(TensorAsStridedTest, AsStridedTranspose) { + if (!FLAGS_use_stride_kernel) { + return; + } // Transpose: shape {2,3} -> {3,2}, stride {1,2} // [[0,1,2],[3,4,5]] -> [[0,3],[1,4],[2,5]] at::Tensor t = at::arange(6, at::kFloat).view({2, 3}); @@ -146,6 +153,9 @@ TEST_F(TensorAsStridedTest, AsStridedTranspose) { } TEST_F(TensorAsStridedTest, AsStridedContiguous) { + if (!FLAGS_use_stride_kernel) { + return; + } at::Tensor t = at::arange(12, at::kFloat); // Contiguous: {2,6}, stride {6,1} diff --git a/test/cpp/compat/compat_basic_test.cc b/test/cpp/compat/ATen_basic_test.cc similarity index 98% rename from test/cpp/compat/compat_basic_test.cc rename to test/cpp/compat/ATen_basic_test.cc index 232a9fd66e8f7c..20f22c96b591f4 100644 --- a/test/cpp/compat/compat_basic_test.cc +++ b/test/cpp/compat/ATen_basic_test.cc @@ -27,9 +27,12 @@ #endif #include "ATen/ATen.h" #include "gtest/gtest.h" +#include "paddle/common/macros.h" #include "paddle/phi/common/float16.h" #include "torch/all.h" +COMMON_DECLARE_bool(use_stride_kernel); + TEST(TensorBaseTest, DataPtrAPIs) { // Test data_ptr() and const_data_ptr() APIs at::TensorBase tensor = at::ones({2, 3}, at::kFloat); @@ -77,6 +80,9 @@ TEST(TensorBaseTest, TypeDeviceAPIs) { } TEST(TensorBaseTest, ModifyOperationAPIs) { + if (!FLAGS_use_stride_kernel) { + return; + } // Test modify operation related APIs at::TensorBase tensor = at::ones({2, 3}, at::kFloat); @@ -356,6 +362,9 @@ TEST(TensorBaseTest, ResetAPI) { } TEST(TensorBaseTest, IsNonOverlappingAndDenseAPI) { + if (!FLAGS_use_stride_kernel) { + return; + } // Test is_non_overlapping_and_dense() API // Case 1: Contiguous tensor - should be non-overlapping and dense @@ -403,6 +412,9 @@ TEST(TensorBaseTest, IsNonOverlappingAndDenseAPI) { } TEST(TensorBaseTest, UndefinedAndNonDenseBranchCoverage) { + if (!FLAGS_use_stride_kernel) { + return; + } at::TensorBase undefined; ASSERT_EQ(undefined.toString(), std::string("UndefinedType")); ASSERT_EQ(undefined.data_ptr(), nullptr); diff --git a/test/cpp/compat/ATen_clamp_test.cc b/test/cpp/compat/ATen_clamp_test.cc index 19aa353ba62a7a..fb0e32b3b77e61 100644 --- a/test/cpp/compat/ATen_clamp_test.cc +++ b/test/cpp/compat/ATen_clamp_test.cc @@ -202,6 +202,7 @@ TEST_F(TensorOperatorIndexTest, OperatorIndexOutOfBounds) { } // Note: Depending on implementation, this may or may not throw // We accept either behavior (return empty/invalid tensor or throw) + (void)threw_exception; // Silence unused variable warning } // ======================= Additional clamp edge case tests diff --git a/test/cpp/compat/ATen_cuda_test.cc b/test/cpp/compat/ATen_cuda_test.cc index 3613945638708f..c218a104c1e74d 100644 --- a/test/cpp/compat/ATen_cuda_test.cc +++ b/test/cpp/compat/ATen_cuda_test.cc @@ -12,19 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + #include #include #include #include #include #include -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include -#endif #include "ATen/ATen.h" #include "gtest/gtest.h" -#include "test/cpp/compat/cuda_test_utils.h" #include "torch/all.h" // ============================================================ @@ -33,7 +32,6 @@ // After cuda(), the tensor should reside on a GPU device. TEST(TensorCudaTest, CpuTensorMovesToCuda) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); at::Tensor cpu_t = at::tensor({1.0f, 2.0f, 3.0f}, at::kFloat); ASSERT_TRUE(cpu_t.is_cpu()); @@ -44,7 +42,6 @@ TEST(TensorCudaTest, CpuTensorMovesToCuda) { // dtype and numel must be preserved. TEST(TensorCudaTest, DtypeAndNumelPreserved) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); at::Tensor cpu_t = at::tensor({1, 2, 3, 4}, at::kInt); at::Tensor cuda_t = cpu_t.cuda(); @@ -54,7 +51,6 @@ TEST(TensorCudaTest, DtypeAndNumelPreserved) { // Values should round-trip back to CPU intact. TEST(TensorCudaTest, ValuesPreservedAfterRoundTrip) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); std::vector data = {1.0f, 2.5f, -3.0f, 4.75f}; at::Tensor cpu_t = at::tensor(data, at::kFloat); at::Tensor cuda_t = cpu_t.cuda(); @@ -68,7 +64,6 @@ TEST(TensorCudaTest, ValuesPreservedAfterRoundTrip) { // shape (sizes) should be preserved. TEST(TensorCudaTest, ShapePreserved) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); at::Tensor cpu_t = at::zeros({2, 3, 4}, at::kFloat); at::Tensor cuda_t = cpu_t.cuda(); @@ -80,7 +75,6 @@ TEST(TensorCudaTest, ShapePreserved) { // An already-CUDA tensor should still be CUDA after another cuda() call. TEST(TensorCudaTest, AlreadyCudaTensorStaysCuda) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); at::Tensor cpu_t = at::tensor({7.0f}, at::kFloat); at::Tensor cuda_t = cpu_t.cuda(); at::Tensor cuda_t2 = cuda_t.cuda(); @@ -91,7 +85,6 @@ TEST(TensorCudaTest, AlreadyCudaTensorStaysCuda) { // device() should report a CUDA device. TEST(TensorCudaTest, DeviceIsCuda) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); at::Tensor cpu_t = at::tensor({0.0f}, at::kFloat); at::Tensor cuda_t = cpu_t.cuda(); @@ -100,10 +93,11 @@ TEST(TensorCudaTest, DeviceIsCuda) { // is_cuda() / is_cpu() are mutually exclusive. TEST(TensorCudaTest, IsCudaAndIsCpuMutuallyExclusive) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); at::Tensor cpu_t = at::tensor({1.0f, 2.0f}, at::kFloat); at::Tensor cuda_t = cpu_t.cuda(); ASSERT_TRUE(cuda_t.is_cuda()); ASSERT_FALSE(cuda_t.is_cpu()); } + +#endif diff --git a/test/cpp/compat/compat_dense_sparse_conversion_test.cc b/test/cpp/compat/ATen_dense_sparse_conversion_test.cc similarity index 100% rename from test/cpp/compat/compat_dense_sparse_conversion_test.cc rename to test/cpp/compat/ATen_dense_sparse_conversion_test.cc diff --git a/test/cpp/compat/ATen_empty_test.cc b/test/cpp/compat/ATen_empty_test.cc index 6a1ea666bc2588..20538b2bcbaac7 100644 --- a/test/cpp/compat/ATen_empty_test.cc +++ b/test/cpp/compat/ATen_empty_test.cc @@ -20,7 +20,6 @@ #include "ATen/ATen.h" #include "gtest/gtest.h" -#include "test/cpp/compat/cuda_test_utils.h" #include "torch/all.h" // ======================== at::empty basic tests ======================== @@ -57,7 +56,6 @@ TEST(ATenEmptyTest, ExplicitArgsCpu) { // TensorOptions overload: pin_memory via options TEST(ATenEmptyTest, PinMemoryViaTensorOptions) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); at::TensorOptions opts = at::TensorOptions().dtype(at::kFloat).pinned_memory(true); at::Tensor t = at::empty({4, 4}, opts); @@ -67,7 +65,6 @@ TEST(ATenEmptyTest, PinMemoryViaTensorOptions) { // 6-argument overload: pin_memory = true (must use CPU device) TEST(ATenEmptyTest, PinMemoryViaExplicitArgs) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); at::Tensor t = at::empty({8}, at::kFloat, at::kStrided, at::kCPU, true, std::nullopt); ASSERT_TRUE(t.is_pinned()) @@ -76,7 +73,6 @@ TEST(ATenEmptyTest, PinMemoryViaExplicitArgs) { // pin_memory = false must NOT produce a pinned tensor TEST(ATenEmptyTest, NoPinMemoryViaExplicitArgs) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); at::Tensor t = at::empty({8}, at::kFloat, at::kStrided, at::kCUDA, false, std::nullopt); ASSERT_FALSE(t.is_pinned()) @@ -85,7 +81,6 @@ TEST(ATenEmptyTest, NoPinMemoryViaExplicitArgs) { // Pinned tensor lives in pinned (host) memory, not on the GPU device itself TEST(ATenEmptyTest, PinnedTensorIsNotCuda) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); at::TensorOptions opts = at::TensorOptions().dtype(at::kFloat).pinned_memory(true); at::Tensor t = at::empty({16}, opts); @@ -96,7 +91,6 @@ TEST(ATenEmptyTest, PinnedTensorIsNotCuda) { // Data pointer of a pinned tensor must be non-null TEST(ATenEmptyTest, PinnedTensorDataPtrNonNull) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); at::TensorOptions opts = at::TensorOptions().dtype(at::kFloat).pinned_memory(true); at::Tensor t = at::empty({32}, opts); diff --git a/test/cpp/compat/ATen_equal_test.cc b/test/cpp/compat/ATen_equal_test.cc index 0b6e99591068b0..c91f2cd18885c4 100644 --- a/test/cpp/compat/ATen_equal_test.cc +++ b/test/cpp/compat/ATen_equal_test.cc @@ -21,7 +21,6 @@ #include #include "gtest/gtest.h" -#include "test/cpp/compat/cuda_test_utils.h" TEST(TensorEqualTest, DifferentShapeReturnsFalse) { at::Tensor a = at::ones({2, 2}, at::kFloat); @@ -41,8 +40,6 @@ TEST(TensorEqualTest, DtypeMismatchCastsOtherTensor) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(TensorEqualTest, DeviceMismatchThrows) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); - at::Tensor cpu = at::ones({2, 2}, at::kFloat); at::Tensor gpu = at::ones({2, 2}, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA)); diff --git a/test/cpp/compat/ATen_eye_test.cc b/test/cpp/compat/ATen_eye_test.cc index 3cd147ddcaa5e8..d9d7eb49fe5f41 100644 --- a/test/cpp/compat/ATen_eye_test.cc +++ b/test/cpp/compat/ATen_eye_test.cc @@ -26,7 +26,6 @@ #include "ATen/ATen.h" #include "gtest/gtest.h" #include "paddle/phi/common/float16.h" -#include "test/cpp/compat/cuda_test_utils.h" #include "torch/all.h" // ============================================================ @@ -159,7 +158,6 @@ TEST(ATenEyeTest, OneByOne) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(ATenEyeTest, SquareOnGPU) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); at::Tensor t = at::eye(4, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA)); at::Tensor t_cpu = t.to(at::kCPU); @@ -167,7 +165,6 @@ TEST(ATenEyeTest, SquareOnGPU) { } TEST(ATenEyeTest, RectangularOnGPU) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); at::Tensor t = at::eye(3, 5, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA)); at::Tensor t_cpu = t.to(at::kCPU); diff --git a/test/cpp/compat/ATen_from_blob_test.cc b/test/cpp/compat/ATen_from_blob_test.cc index 6505f8e70ffb5e..88e04a638b75e4 100644 --- a/test/cpp/compat/ATen_from_blob_test.cc +++ b/test/cpp/compat/ATen_from_blob_test.cc @@ -18,16 +18,17 @@ #include "ATen/ATen.h" #include "gtest/gtest.h" +#include "paddle/common/macros.h" #include "torch/all.h" +COMMON_DECLARE_bool(use_stride_kernel); + #if defined(PADDLE_WITH_CUDA) #include #elif defined(PADDLE_WITH_HIP) #include #endif -#include "test/cpp/compat/cuda_test_utils.h" - // ======================== CPU place detection ======================== // No device specified: CPU pointer → tensor must be on CPU. @@ -67,6 +68,9 @@ TEST(ATenFromBlobTest, ShapeAndStrides) { // Explicit strides overload. TEST(ATenFromBlobTest, ExplicitStrides) { + if (!FLAGS_use_stride_kernel) { + return; + } // Row-major 2×3 laid out in memory, but we interpret as column-major strides float data[6] = {1, 2, 3, 4, 5, 6}; at::Tensor t = at::from_blob(data, {2, 3}, {1, 2}, at::kFloat); @@ -119,7 +123,6 @@ TEST(ATenFromBlobTest, DeleterWithStrides) { // No device specified: GPU pointer → tensor must be on CUDA automatically. TEST(ATenFromBlobTest, GpuPtrDefaultsToCuda) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); float* d_data = nullptr; #if defined(PADDLE_WITH_CUDA) cudaMalloc(&d_data, 4 * sizeof(float)); @@ -144,7 +147,6 @@ TEST(ATenFromBlobTest, GpuPtrDefaultsToCuda) { // Explicit CUDA device option + GPU pointer → still CUDA. TEST(ATenFromBlobTest, GpuPtrWithCudaOptions) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); float* d_data = nullptr; #if defined(PADDLE_WITH_CUDA) cudaMalloc(&d_data, 4 * sizeof(float)); @@ -165,7 +167,6 @@ TEST(ATenFromBlobTest, GpuPtrWithCudaOptions) { // target_device overrides auto-detection. TEST(ATenFromBlobTest, TargetDeviceOverride) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); float* d_data = nullptr; #if defined(PADDLE_WITH_CUDA) cudaMalloc(&d_data, 4 * sizeof(float)); diff --git a/test/cpp/compat/ATen_index_test.cc b/test/cpp/compat/ATen_index_test.cc index 83af96b80c5049..3e640a2bc25947 100644 --- a/test/cpp/compat/ATen_index_test.cc +++ b/test/cpp/compat/ATen_index_test.cc @@ -22,8 +22,11 @@ #include "ATen/ATen.h" #include "gtest/gtest.h" +#include "paddle/common/macros.h" #include "torch/all.h" +COMMON_DECLARE_bool(use_stride_kernel); + // ======================== index tests ======================== TEST(TensorIndexTest, IndexWithSingleTensor) { @@ -53,6 +56,9 @@ TEST(TensorIndexTest, IndexWithSingleTensor) { } TEST(TensorIndexTest, SliceKeepsStrideWithoutContiguousCopy) { + if (!FLAGS_use_stride_kernel) { + return; + } at::Tensor base = at::arange(24, at::kFloat).reshape({4, 6}); at::Tensor transposed = base.t(); // shape: [6, 4], strides: [1, 6] ASSERT_FALSE(transposed.is_contiguous()); @@ -96,6 +102,9 @@ TEST(TensorIndexTest, IndexWithTensorInitializerList) { } TEST(TensorIndexTest, MemberIndexWithArrayRefTensorIndices) { + if (!FLAGS_use_stride_kernel) { + return; + } at::Tensor base = at::arange(24, at::kFloat).reshape({4, 6}); at::Tensor transposed = base.t(); std::vector indices = {at::indexing::Slice(1, 5), diff --git a/test/cpp/compat/ATen_local_scalar_dense_test.cc b/test/cpp/compat/ATen_local_scalar_dense_test.cc index 89c85fa9f47261..60e107fca94aa8 100644 --- a/test/cpp/compat/ATen_local_scalar_dense_test.cc +++ b/test/cpp/compat/ATen_local_scalar_dense_test.cc @@ -28,7 +28,6 @@ #include "gtest/gtest.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" -#include "test/cpp/compat/cuda_test_utils.h" #include "torch/all.h" // ============================================================ @@ -130,7 +129,6 @@ TEST(LocalScalarDenseTest, ZeroValue_Int32) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(LocalScalarDenseTest, GPU_Float32_ReturnsCorrectValue) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); // _local_scalar_dense must copy to CPU when the tensor is on GPU. at::Tensor t = at::tensor( {7.0f}, diff --git a/test/cpp/compat/ATen_memory_test.cc b/test/cpp/compat/ATen_memory_test.cc index 65a33c96fcccfc..9a7bc3a3f6883c 100644 --- a/test/cpp/compat/ATen_memory_test.cc +++ b/test/cpp/compat/ATen_memory_test.cc @@ -28,7 +28,6 @@ #include "ATen/ATen.h" #include "gtest/gtest.h" #include "paddle/phi/common/float16.h" -#include "test/cpp/compat/cuda_test_utils.h" #include "torch/all.h" // ==================== is_pinned tests ==================== @@ -315,7 +314,6 @@ TEST(DetachInplaceTest, DetachInplaceChained) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // Test reciprocal on CUDA TEST(ReciprocalTest, ReciprocalCUDA) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); auto tensor = at::empty({4}, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA)); auto cpu_tensor = at::empty({4}, at::TensorOptions().dtype(at::kFloat)); @@ -338,7 +336,6 @@ TEST(ReciprocalTest, ReciprocalCUDA) { // Test detach on CUDA TEST(DetachTest, DetachCUDA) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); auto tensor = at::arange(5, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA)); diff --git a/test/cpp/compat/ATen_philox_test.cc b/test/cpp/compat/ATen_philox_test.cc index 77a1875c182b36..4c39fa6973d585 100644 --- a/test/cpp/compat/ATen_philox_test.cc +++ b/test/cpp/compat/ATen_philox_test.cc @@ -15,11 +15,16 @@ // Verify that including both headers in the same translation unit compiles // cleanly (no ODR violations) and that the canonical PhiloxCudaState // definition is consistent across both include paths. -#include -#include #include +#include + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include +#include +#endif + // offset_intragraph_ must be uint64_t to match PyTorch upstream. static_assert(std::is_same_v, @@ -31,6 +36,7 @@ TEST(ATenPhiloxTest, TypeConsistency) { EXPECT_EQ(sizeof(at::PhiloxCudaState{}.offset_intragraph_), sizeof(uint64_t)); } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(ATenPhiloxTest, UnpackNonCaptured) { constexpr uint64_t kSeed = 42ULL; constexpr uint64_t kOffset = 100ULL; @@ -55,3 +61,4 @@ TEST(ATenPhiloxTest, NonCapturedOffsetIntragraphIgnored) { EXPECT_EQ(seed, 7ULL); EXPECT_EQ(offset, 13ULL); } +#endif diff --git a/test/cpp/compat/ATen_pin_memory_creation_test.cc b/test/cpp/compat/ATen_pin_memory_creation_test.cc index 8d89d00b07b083..c17bd7c4c121e1 100644 --- a/test/cpp/compat/ATen_pin_memory_creation_test.cc +++ b/test/cpp/compat/ATen_pin_memory_creation_test.cc @@ -25,7 +25,6 @@ #include #include "gtest/gtest.h" -#include "test/cpp/compat/cuda_test_utils.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -41,7 +40,6 @@ void AssertNotPinned(const at::Tensor& t) { ASSERT_FALSE(t.is_pinned()); } } // namespace TEST(ATenPinMemoryCreationTest, FullPinMemory) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); // Test using TensorOptions with pinned_memory auto by_options = at::full( {2, 3}, 1.5f, at::TensorOptions().dtype(at::kFloat).pinned_memory(true)); @@ -59,7 +57,6 @@ TEST(ATenPinMemoryCreationTest, FullPinMemory) { } TEST(ATenPinMemoryCreationTest, FullPinMemoryWithCUDADeviceErrors) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); // Test that pin_memory=true with explicit CUDA device throws error ASSERT_THROW( at::full({2, 3}, 1.5f, at::kFloat, std::nullopt, at::kCUDA, true), @@ -67,7 +64,6 @@ TEST(ATenPinMemoryCreationTest, FullPinMemoryWithCUDADeviceErrors) { } TEST(ATenPinMemoryCreationTest, OnesPinMemory) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); auto by_options = at::ones( {4, 2}, at::TensorOptions().dtype(at::kFloat).pinned_memory(true)); AssertPinned(by_options); @@ -80,13 +76,11 @@ TEST(ATenPinMemoryCreationTest, OnesPinMemory) { } TEST(ATenPinMemoryCreationTest, OnesPinMemoryWithCUDADeviceErrors) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); ASSERT_THROW(at::ones({4, 2}, at::kFloat, std::nullopt, at::kCUDA, true), std::exception); } TEST(ATenPinMemoryCreationTest, ZerosPinMemory) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); auto by_options = at::zeros( {3, 5}, at::TensorOptions().dtype(at::kFloat).pinned_memory(true)); AssertPinned(by_options); @@ -99,13 +93,11 @@ TEST(ATenPinMemoryCreationTest, ZerosPinMemory) { } TEST(ATenPinMemoryCreationTest, ZerosPinMemoryWithCUDADeviceErrors) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); ASSERT_THROW(at::zeros({3, 5}, at::kFloat, at::kStrided, at::kCUDA, true), std::exception); } TEST(ATenPinMemoryCreationTest, EyePinMemory) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); auto by_options = at::eye(6, at::TensorOptions().dtype(at::kFloat).pinned_memory(true)); AssertPinned(by_options); @@ -118,13 +110,11 @@ TEST(ATenPinMemoryCreationTest, EyePinMemory) { } TEST(ATenPinMemoryCreationTest, EyePinMemoryWithCUDADeviceErrors) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); ASSERT_THROW(at::eye(6, at::kFloat, std::nullopt, at::kCUDA, true), std::exception); } TEST(ATenPinMemoryCreationTest, ArangePinMemoryOverloads) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); auto end_only_by_options = at::arange(10, at::TensorOptions().dtype(at::kFloat).pinned_memory(true)); AssertPinned(end_only_by_options); @@ -157,7 +147,6 @@ TEST(ATenPinMemoryCreationTest, ArangePinMemoryOverloads) { } TEST(ATenPinMemoryCreationTest, ArangePinMemoryWithCUDADeviceErrors) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); ASSERT_THROW(at::arange(10, at::kFloat, std::nullopt, at::kCUDA, true), std::exception); ASSERT_THROW(at::arange(0, 10, at::kFloat, std::nullopt, at::kCUDA, true), @@ -167,7 +156,6 @@ TEST(ATenPinMemoryCreationTest, ArangePinMemoryWithCUDADeviceErrors) { } TEST(ATenPinMemoryCreationTest, EmptyLikePinMemory) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); auto base = at::ones({2, 4}, at::kFloat); auto by_options = @@ -186,7 +174,6 @@ TEST(ATenPinMemoryCreationTest, EmptyLikePinMemory) { } TEST(ATenPinMemoryCreationTest, EmptyLikePinMemoryWithCUDADeviceErrors) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); auto base = at::ones({2, 4}, at::kFloat); ASSERT_THROW(at::empty_like(base, at::TensorOptions() @@ -198,7 +185,6 @@ TEST(ATenPinMemoryCreationTest, EmptyLikePinMemoryWithCUDADeviceErrors) { } TEST(ATenPinMemoryCreationTest, ZerosLikePinMemory) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); auto base = at::ones({2, 4}, at::kFloat); auto by_options = @@ -217,7 +203,6 @@ TEST(ATenPinMemoryCreationTest, ZerosLikePinMemory) { } TEST(ATenPinMemoryCreationTest, ZerosLikePinMemoryWithCUDADeviceErrors) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); auto base = at::ones({2, 4}, at::kFloat); ASSERT_THROW(at::zeros_like(base, at::TensorOptions() diff --git a/test/cpp/compat/ATen_record_stream_test.cc b/test/cpp/compat/ATen_record_stream_test.cc index 8be51d243d1022..fc847864e14e79 100644 --- a/test/cpp/compat/ATen_record_stream_test.cc +++ b/test/cpp/compat/ATen_record_stream_test.cc @@ -23,7 +23,6 @@ #endif #include "ATen/ATen.h" #include "gtest/gtest.h" -#include "test/cpp/compat/cuda_test_utils.h" #include "torch/all.h" class RecordStreamTest : public ::testing::Test { @@ -32,10 +31,8 @@ class RecordStreamTest : public ::testing::Test { cpu_tensor = at::ones({4}, at::TensorOptions().dtype(at::kFloat).device(at::kCPU)); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (compat_test::CudaRuntimeAvailable()) { - cuda_tensor = at::ones( - {4}, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA)); - } + cuda_tensor = + at::ones({4}, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA)); #endif } @@ -56,7 +53,6 @@ using RecordRawCudaStreamMethod = void (at::Tensor::*)(cudaStream_t) const; g_record_raw_cuda_stream_method = &at::Tensor::record_stream; TEST_F(RecordStreamTest, CudaTensorCurrentCudaStream) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); auto stream = at::cuda::getCurrentCUDAStream(); // record_stream should not throw EXPECT_NO_THROW(cuda_tensor.record_stream(stream)); @@ -64,13 +60,11 @@ TEST_F(RecordStreamTest, CudaTensorCurrentCudaStream) { // --- Happy path: CUDA tensor + default CUDA stream should succeed --- TEST_F(RecordStreamTest, CudaTensorDefaultCudaStream) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); c10::Stream default_stream = c10::cuda::getDefaultCUDAStream().unwrap(); EXPECT_NO_THROW(cuda_tensor.record_stream(default_stream)); } TEST_F(RecordStreamTest, CudaTensorRawCudaStream) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); auto stream = at::cuda::getCurrentCUDAStream(); EXPECT_NO_THROW(cuda_tensor.record_stream(stream.raw_stream())); } @@ -88,7 +82,6 @@ TEST_F(RecordStreamTest, CpuTensorCpuStream) { // tensors) --- #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST_F(RecordStreamTest, CpuTensorCudaStream) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); auto cuda_stream = at::cuda::getCurrentCUDAStream(); EXPECT_THROW(cpu_tensor.record_stream(cuda_stream), std::exception); } diff --git a/test/cpp/compat/ATen_select_test.cc b/test/cpp/compat/ATen_select_test.cc index 4e8029714a57a4..4af3aa7d6931b5 100644 --- a/test/cpp/compat/ATen_select_test.cc +++ b/test/cpp/compat/ATen_select_test.cc @@ -28,7 +28,6 @@ #include "ATen/ATen.h" #include "gtest/gtest.h" #include "paddle/phi/common/float16.h" -#include "test/cpp/compat/cuda_test_utils.h" #include "torch/all.h" // ==================== select tests ==================== @@ -380,7 +379,6 @@ TEST(MaskedSelectTest, MaskedSelectDifferentDtypes) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // Test for select on CUDA TEST(SelectTest, SelectCUDA) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); auto tensor = at::arange(10, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA)); @@ -395,7 +393,6 @@ TEST(SelectTest, SelectCUDA) { // Test for index_select on CUDA TEST(IndexSelectTest, IndexSelectCUDA) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); auto tensor = at::arange(10, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA)); @@ -420,7 +417,6 @@ TEST(IndexSelectTest, IndexSelectCUDA) { // Test for masked_select on CUDA TEST(MaskedSelectTest, MaskedSelectCUDA) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); auto tensor = at::arange(10, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA)); diff --git a/test/cpp/compat/ATen_split_test.cc b/test/cpp/compat/ATen_split_test.cc index db0ff3cdd6a280..ce0bdc2a1c52ff 100644 --- a/test/cpp/compat/ATen_split_test.cc +++ b/test/cpp/compat/ATen_split_test.cc @@ -28,7 +28,6 @@ #include "ATen/ATen.h" #include "gtest/gtest.h" #include "paddle/phi/common/float16.h" -#include "test/cpp/compat/cuda_test_utils.h" #include "torch/all.h" // Test for tensor_split with sections @@ -310,7 +309,6 @@ TEST(SplitTest, SplitSymInt) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // Test for split on CUDA TEST(SplitTest, SplitCUDA) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); auto tensor = at::arange(10, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA)); @@ -329,7 +327,6 @@ TEST(SplitTest, SplitCUDA) { // Test for tensor_split on CUDA TEST(TensorSplitTest, TensorSplitCUDA) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); auto tensor = at::arange(12, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA)); diff --git a/test/cpp/compat/compat_toString_test.cc b/test/cpp/compat/ATen_toString_test.cc similarity index 100% rename from test/cpp/compat/compat_toString_test.cc rename to test/cpp/compat/ATen_toString_test.cc diff --git a/test/cpp/compat/ATen_to_test.cc b/test/cpp/compat/ATen_to_test.cc index cd9646f11321c2..3c1183dc7e807b 100644 --- a/test/cpp/compat/ATen_to_test.cc +++ b/test/cpp/compat/ATen_to_test.cc @@ -27,7 +27,6 @@ #include "ATen/ATen.h" #include "gtest/gtest.h" #include "paddle/phi/common/float16.h" -#include "test/cpp/compat/cuda_test_utils.h" #include "torch/all.h" // ============================================================ @@ -191,7 +190,6 @@ TEST(TensorToTest, ToOtherTensor_MatchesDevice) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(TensorToTest, ToDtype_GPU_FloatToDouble) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); at::Tensor t = at::tensor( {1.0f, 2.0f}, at::TensorOptions().dtype(at::kFloat).device(c10::Device(c10::kCUDA, 0))); @@ -202,7 +200,6 @@ TEST(TensorToTest, ToDtype_GPU_FloatToDouble) { } TEST(TensorToTest, ToDevice_CPUToGPU) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); at::Tensor t = at::tensor({5.0f}, at::kFloat); at::Tensor result = t.to(c10::Device(c10::kCUDA, 0), at::kFloat, @@ -213,7 +210,6 @@ TEST(TensorToTest, ToDevice_CPUToGPU) { } TEST(TensorToTest, ToDevice_GPUToCPU) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); at::Tensor t = at::tensor( {7.0f}, at::TensorOptions().dtype(at::kFloat).device(c10::Device(c10::kCUDA, 0))); diff --git a/test/cpp/compat/ATen_transpose_test.cc b/test/cpp/compat/ATen_transpose_test.cc index bc3d20f69e3a06..6d24ff2f628b5c 100644 --- a/test/cpp/compat/ATen_transpose_test.cc +++ b/test/cpp/compat/ATen_transpose_test.cc @@ -26,9 +26,12 @@ #endif #include "ATen/ATen.h" #include "gtest/gtest.h" +#include "paddle/common/macros.h" #include "paddle/phi/common/float16.h" #include "torch/all.h" +COMMON_DECLARE_bool(use_stride_kernel); + // ============================================================ // Tests for at::Tensor::transpose_(int64_t dim0, int64_t dim1) // (in-place variant; out-of-place transpose is a separate path) @@ -69,6 +72,9 @@ TEST(TensorTransposeInplaceTest, Transpose3D_SwapLastTwo) { } TEST(TensorTransposeInplaceTest, TransposeInplace_PreservesValues) { + if (!FLAGS_use_stride_kernel) { + return; + } // Verify values are correctly accessed after in-place transpose at::Tensor t = at::arange(6, at::kFloat).reshape({2, 3}); // t = [[0,1,2],[3,4,5]] diff --git a/test/cpp/compat/ATen_viewAs_test.cc b/test/cpp/compat/ATen_viewAs_test.cc index 38dd12b97bacdf..bdf863b2a873c0 100644 --- a/test/cpp/compat/ATen_viewAs_test.cc +++ b/test/cpp/compat/ATen_viewAs_test.cc @@ -26,9 +26,12 @@ #endif #include "ATen/ATen.h" #include "gtest/gtest.h" +#include "paddle/common/macros.h" #include "paddle/phi/common/float16.h" #include "torch/all.h" +COMMON_DECLARE_bool(use_stride_kernel); + // ============================================================ // Tests for at::Tensor::view_as(const at::Tensor& other) // ============================================================ @@ -44,6 +47,9 @@ TEST(TensorViewAsTest, ViewAsSameShape) { } TEST(TensorViewAsTest, ViewAsDifferentShape_CompatibleNumel) { + if (!FLAGS_use_stride_kernel) { + return; + } // view_as with a different but numel-compatible shape at::Tensor t = at::arange(12, at::kFloat); at::Tensor other = at::zeros({3, 4}, at::kFloat); @@ -54,6 +60,9 @@ TEST(TensorViewAsTest, ViewAsDifferentShape_CompatibleNumel) { } TEST(TensorViewAsTest, ViewAsPreservesData) { + if (!FLAGS_use_stride_kernel) { + return; + } // Elements are accessible with the new shape and preserve original values at::Tensor t = at::arange(6, at::kFloat); // t = [0, 1, 2, 3, 4, 5] @@ -68,6 +77,9 @@ TEST(TensorViewAsTest, ViewAsPreservesData) { } TEST(TensorViewAsTest, ViewAs1D_Flattens) { + if (!FLAGS_use_stride_kernel) { + return; + } // view_as a 1-D tensor to flatten a higher-rank tensor at::Tensor t = at::ones({2, 3, 4}, at::kFloat); at::Tensor flat_ref = at::zeros({24}, at::kFloat); diff --git a/test/cpp/compat/CMakeLists.txt b/test/cpp/compat/CMakeLists.txt index 38270a5c16eec0..efe00fec74112f 100644 --- a/test/cpp/compat/CMakeLists.txt +++ b/test/cpp/compat/CMakeLists.txt @@ -1,72 +1,81 @@ -if(NOT WIN32) +if(NOT WIN32 AND NOT WITH_ROCM) + # c10 core tests (CPU compatible) + cc_test(c10_Device_test SRCS c10_Device_test.cc) + cc_test(c10_DispatchKeySet_test SRCS c10_DispatchKeySet_test.cc) + cc_test(c10_DispatchKey_test SRCS c10_DispatchKey_test.cc) + cc_test(c10_MemoryFormat_test SRCS c10_MemoryFormat_test.cc) + cc_test(c10_ScalarType_test SRCS c10_ScalarType_test.cc) + cc_test(c10_SizesAndStrides_test SRCS c10_SizesAndStrides_test.cc) + cc_test(c10_TensorOptions_test SRCS c10_TensorOptions_test.cc) + cc_test(c10_TypeMeta_test SRCS c10_TypeMeta_test.cc) + cc_test(c10_intrusive_ptr_lifecycle_test + SRCS c10_intrusive_ptr_lifecycle_test.cc) + cc_test(c10_layout_test SRCS c10_layout_test.cc) + cc_test(c10_ptr_test SRCS c10_ptr_test.cc) + cc_test(c10_storage_test SRCS c10_storage_test.cc) + + # ATen core tests (CPU compatible) + cc_test(ATen_all_test SRCS ATen_all_test.cc) + cc_test(ATen_any_test SRCS ATen_any_test.cc) + cc_test(ATen_as_strided_test SRCS ATen_as_strided_test.cc) + cc_test(ATen_autograd_test SRCS ATen_autograd_test.cc) + cc_test(ATen_chunk_test SRCS ATen_chunk_test.cc) + cc_test(ATen_clamp_test SRCS ATen_clamp_test.cc) + cc_test(ATen_coalesce_test SRCS ATen_coalesce_test.cc) + cc_test(ATen_dense_sparse_conversion_test + SRCS ATen_dense_sparse_conversion_test.cc) + cc_test(ATen_empty_test SRCS ATen_empty_test.cc) + cc_test(ATen_equal_test SRCS ATen_equal_test.cc) + cc_test(ATen_expand_test SRCS ATen_expand_test.cc) + cc_test(ATen_eye_test SRCS ATen_eye_test.cc) + cc_test(ATen_factory_default_dtype_test + SRCS ATen_factory_default_dtype_test.cc) + cc_test(ATen_flatten_test SRCS ATen_flatten_test.cc) + cc_test(ATen_from_blob_test SRCS ATen_from_blob_test.cc) + cc_test(ATen_hook_test SRCS ATen_hook_test.cc) + cc_test(ATen_index_test SRCS ATen_index_test.cc) + cc_test(ATen_item_test SRCS ATen_item_test.cc) + cc_test(ATen_narrow_test SRCS ATen_narrow_test.cc) + cc_test(ATen_new_test SRCS ATen_new_test.cc) + cc_test(ATen_nnz_test SRCS ATen_nnz_test.cc) + cc_test(ATen_rename_test SRCS ATen_rename_test.cc) + cc_test(ATen_reshape_test SRCS ATen_reshape_test.cc) + cc_test(ATen_resize_test SRCS ATen_resize_test.cc) + cc_test(ATen_squeeze_test SRCS ATen_squeeze_test.cc) + cc_test(ATen_std_var_test SRCS ATen_std_var_test.cc) + cc_test(ATen_sum_test SRCS ATen_sum_test.cc) + cc_test(ATen_t_test SRCS ATen_t_test.cc) + cc_test(ATen_tensor_data_test SRCS ATen_tensor_data_test.cc) + cc_test(ATen_toString_test SRCS ATen_toString_test.cc) + cc_test(ATen_to_test SRCS ATen_to_test.cc) + cc_test(ATen_transpose_test SRCS ATen_transpose_test.cc) + cc_test(ATen_Utils_test SRCS ATen_Utils_test.cc) + cc_test(ATen_values_test SRCS ATen_values_test.cc) + cc_test(ATen_viewAs_test SRCS ATen_viewAs_test.cc) + + # torch library tests (CPU compatible) + cc_test(torch_library_test SRCS torch_library_test.cc) + cc_test(torch_library_dispatch_test SRCS torch_library_dispatch_test.cc) + + # ATen tests with GPU code guarded by PADDLE_WITH_CUDA macros + cc_test(ATen_TensorAccessor_test SRCS ATen_TensorAccessor_test.cc) + cc_test(ATen_basic_test SRCS ATen_basic_test.cc) + cc_test(ATen_local_scalar_dense_test SRCS ATen_local_scalar_dense_test.cc) + cc_test(ATen_memory_test SRCS ATen_memory_test.cc) + cc_test(ATen_pin_memory_creation_test SRCS ATen_pin_memory_creation_test.cc) + cc_test(ATen_record_stream_test SRCS ATen_record_stream_test.cc) + cc_test(ATen_select_test SRCS ATen_select_test.cc) + cc_test(ATen_split_test SRCS ATen_split_test.cc) + + cc_test(ATen_CUDAContext_test SRCS ATen_CUDAContext_test.cc) + cc_test(ATen_philox_test SRCS ATen_philox_test.cc) + cc_test(c10_Event_test SRCS c10_Event_test.cc) + cc_test(c10_Stream_test SRCS c10_Stream_test.cc) + if(WITH_GPU) nv_test(ATen_CUDABlas_test SRCS ATen_CUDABlas_test.cc) - nv_test(ATen_CUDAContext_test SRCS ATen_CUDAContext_test.cc) - nv_test(ATen_TensorAccessor_test SRCS ATen_TensorAccessor_test.cc) - nv_test(compat_basic_test SRCS compat_basic_test.cc) - nv_test(ATen_autograd_test SRCS ATen_autograd_test.cc) - nv_test(ATen_memory_test SRCS ATen_memory_test.cc) - nv_test(ATen_record_stream_test SRCS ATen_record_stream_test.cc) - nv_test(ATen_select_test SRCS ATen_select_test.cc) - nv_test(ATen_split_test SRCS ATen_split_test.cc) - nv_test(ATen_narrow_test SRCS ATen_narrow_test.cc) - nv_test(ATen_reshape_test SRCS ATen_reshape_test.cc) - nv_test(ATen_flatten_test SRCS ATen_flatten_test.cc) - nv_test(ATen_squeeze_test SRCS ATen_squeeze_test.cc) - nv_test(compat_toString_test SRCS compat_toString_test.cc) - nv_test(ATen_all_test SRCS ATen_all_test.cc) - nv_test(c10_ptr_test SRCS c10_ptr_test.cc) - nv_test(c10_MemoryFormat_test SRCS c10_MemoryFormat_test.cc) - nv_test(c10_Device_test SRCS c10_Device_test.cc) - nv_test(c10_ScalarType_test SRCS c10_ScalarType_test.cc) - nv_test(c10_TypeMeta_test SRCS c10_TypeMeta_test.cc) - nv_test(c10_storage_test SRCS c10_storage_test.cc) - nv_test(c10_Stream_test SRCS c10_Stream_test.cc) - nv_test(c10_Event_test SRCS c10_Event_test.cc) - nv_test(c10_SizesAndStrides_test SRCS c10_SizesAndStrides_test.cc) - nv_test(c10_layout_test SRCS c10_layout_test.cc) - nv_test(ATen_clamp_test SRCS ATen_clamp_test.cc) - nv_test(ATen_as_strided_test SRCS ATen_as_strided_test.cc) - nv_test(ATen_std_var_test SRCS ATen_std_var_test.cc) - nv_test(ATen_index_test SRCS ATen_index_test.cc) - nv_test(ATen_tensor_data_test SRCS ATen_tensor_data_test.cc) - nv_test(ATen_any_test SRCS ATen_any_test.cc) - nv_test(ATen_chunk_test SRCS ATen_chunk_test.cc) - nv_test(ATen_expand_test SRCS ATen_expand_test.cc) - nv_test(ATen_hook_test SRCS ATen_hook_test.cc) - nv_test(ATen_new_test SRCS ATen_new_test.cc) - nv_test(ATen_empty_test SRCS ATen_empty_test.cc) - nv_test(ATen_factory_default_dtype_test - SRCS ATen_factory_default_dtype_test.cc) - nv_test(ATen_pin_memory_creation_test SRCS ATen_pin_memory_creation_test.cc) - nv_test(ATen_resize_test SRCS ATen_resize_test.cc) - nv_test(ATen_rename_test SRCS ATen_rename_test.cc) - nv_test(cuda_generator_test SRCS cuda_generator_test.cc) - nv_test(c10_generator_impl_test SRCS c10_generator_impl_test.cc) - nv_test(c10_intrusive_ptr_lifecycle_test - SRCS c10_intrusive_ptr_lifecycle_test.cc) - nv_test(c10_DispatchKey_test SRCS c10_DispatchKey_test.cc) - nv_test(c10_DispatchKeySet_test SRCS c10_DispatchKeySet_test.cc) - cc_test(torch_library_test SRCS torch_library_test.cc) - nv_test(ATen_eye_test SRCS ATen_eye_test.cc) - nv_test(ATen_sum_test SRCS ATen_sum_test.cc) - nv_test(ATen_t_test SRCS ATen_t_test.cc) - nv_test(ATen_transpose_test SRCS ATen_transpose_test.cc) - nv_test(ATen_viewAs_test SRCS ATen_viewAs_test.cc) - nv_test(ATen_coalesce_test SRCS ATen_coalesce_test.cc) - nv_test(ATen_item_test SRCS ATen_item_test.cc) - nv_test(ATen_local_scalar_dense_test SRCS ATen_local_scalar_dense_test.cc) nv_test(ATen_cuda_test SRCS ATen_cuda_test.cc) - nv_test(ATen_to_test SRCS ATen_to_test.cc) - nv_test(ATen_equal_test SRCS ATen_equal_test.cc) - nv_test(ATen_Utils_test SRCS ATen_Utils_test.cc) - nv_test(ATen_from_blob_test SRCS ATen_from_blob_test.cc) - nv_test(compat_dense_sparse_conversion_test - SRCS compat_dense_sparse_conversion_test.cc) - nv_test(ATen_values_test SRCS ATen_values_test.cc) - nv_test(ATen_nnz_test SRCS ATen_nnz_test.cc) - nv_test(c10_TensorOptions_test SRCS c10_TensorOptions_test.cc) - nv_test(ATen_philox_test SRCS ATen_philox_test.cc) + nv_test(c10_cuda_generator_test SRCS c10_cuda_generator_test.cc) + nv_test(c10_generator_impl_test SRCS c10_generator_impl_test.cc) endif() - cc_test(torch_library_dispatch_test SRCS torch_library_dispatch_test.cc) endif() diff --git a/test/cpp/compat/c10_Event_test.cc b/test/cpp/compat/c10_Event_test.cc index a6933e06efea36..218fbe624d0ce5 100644 --- a/test/cpp/compat/c10_Event_test.cc +++ b/test/cpp/compat/c10_Event_test.cc @@ -13,15 +13,14 @@ // limitations under the License. #include +#include + +#include "gtest/gtest.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include #include #endif -#include "gtest/gtest.h" -#include "test/cpp/compat/cuda_test_utils.h" - TEST(EventTest, CpuEventDefaultProperties) { c10::Event event(c10::DeviceType::CPU); EXPECT_EQ(event.device_type(), c10::DeviceType::CPU); @@ -40,6 +39,18 @@ TEST(EventTest, CpuEventRecordThrows) { EXPECT_THROW(event.recordOnce(stream), std::exception); } +// Test device_count() works in both CPU and CUDA builds +TEST(EventTest, DeviceCount) { + c10::DeviceIndex count = c10::cuda::device_count(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // In CUDA builds, should return actual device count (>= 0) + EXPECT_GE(count, 0); +#else + // In CPU-only builds, should return 0 + EXPECT_EQ(count, 0); +#endif +} + #ifdef PADDLE_WITH_CUDA using RawEventRecordMethod = void (c10::Event::*)(const cudaStream_t&); [[maybe_unused]] static RawEventRecordMethod g_raw_event_record_method = @@ -48,7 +59,6 @@ using RawEventRecordMethod = void (c10::Event::*)(const cudaStream_t&); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(EventTest, CudaEventLazyCreateAndRecord) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); c10::Event event(c10::DeviceType::CUDA); auto stream = c10::cuda::getCurrentCUDAStream(); @@ -66,7 +76,6 @@ TEST(EventTest, CudaEventLazyCreateAndRecord) { } TEST(EventTest, CudaEventElapsedTimeRequiresTimingFlag) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); auto stream = c10::cuda::getCurrentCUDAStream(); c10::Event start(c10::DeviceType::CUDA); c10::Event end(c10::DeviceType::CUDA); @@ -79,7 +88,6 @@ TEST(EventTest, CudaEventElapsedTimeRequiresTimingFlag) { } TEST(EventTest, CudaEventElapsedTimeWithTimingEnabled) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); auto stream = c10::cuda::getCurrentCUDAStream(); c10::Event start(c10::DeviceType::CUDA, c10::EventFlag::BACKEND_DEFAULT); c10::Event end(c10::DeviceType::CUDA, c10::EventFlag::BACKEND_DEFAULT); @@ -95,7 +103,6 @@ TEST(EventTest, CudaEventElapsedTimeWithTimingEnabled) { #ifdef PADDLE_WITH_CUDA TEST(EventTest, CudaEventRawStreamRecordCompatibility) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); auto stream = c10::cuda::getCurrentCUDAStream(); c10::Event event(c10::DeviceType::CUDA); EXPECT_NO_THROW(event.record(stream.raw_stream())); @@ -105,7 +112,6 @@ TEST(EventTest, CudaEventRawStreamRecordCompatibility) { #endif TEST(EventTest, CudaEventRejectsDifferentDeviceRecord) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); if (c10::cuda::device_count() < 2) { return; } diff --git a/test/cpp/compat/c10_Stream_test.cc b/test/cpp/compat/c10_Stream_test.cc index d41421c10abe53..45c73988b2ab21 100644 --- a/test/cpp/compat/c10_Stream_test.cc +++ b/test/cpp/compat/c10_Stream_test.cc @@ -13,14 +13,25 @@ // limitations under the License. #include +#include #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include #include #endif #include "gtest/gtest.h" -#include "test/cpp/compat/cuda_test_utils.h" + +// Test device_count() works in both CPU and CUDA builds +TEST(StreamTest, DeviceCount) { + c10::DeviceIndex count = c10::cuda::device_count(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // In CUDA builds, should return actual device count (>= 0) + EXPECT_GE(count, 0); +#else + // In CPU-only builds, should return 0 + EXPECT_EQ(count, 0); +#endif +} // ==================== native_handle ==================== @@ -29,14 +40,12 @@ // encoded as void*. For the default (null) stream the id is 0, so the // pointer is nullptr; for a real stream it must be non-null. TEST(StreamTest, NativeHandleCudaDefaultStream) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); c10::Stream s = c10::cuda::getDefaultCUDAStream().unwrap(); // Default stream encodes nullptr (id == 0), so native_handle() == nullptr. EXPECT_EQ(s.native_handle(), nullptr); } TEST(StreamTest, NativeHandleCudaCurrentStream) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); auto cuda_stream = c10::cuda::getCurrentCUDAStream(); c10::Stream s = cuda_stream.unwrap(); // getCurrentCUDAStream wraps the real phi stream handle; calling @@ -64,7 +73,6 @@ TEST(StreamTest, QueryCpuStreamReturnsTrue) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // A freshly-obtained CUDA stream with no pending work must report ready. TEST(StreamTest, QueryCudaStreamReady) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); auto cuda_stream = c10::cuda::getCurrentCUDAStream(); c10::Stream s = cuda_stream.unwrap(); // synchronize first to ensure no pending work, then query should be true. @@ -85,7 +93,6 @@ TEST(StreamTest, SynchronizeCpuStream) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // CUDA stream: synchronize() must complete without error. TEST(StreamTest, SynchronizeCudaStream) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); auto cuda_stream = c10::cuda::getCurrentCUDAStream(); c10::Stream s = cuda_stream.unwrap(); EXPECT_NO_THROW(s.synchronize()); @@ -98,7 +105,6 @@ TEST(StreamTest, SynchronizeCudaStream) { // getDefaultCUDAStream must always return the null stream (id == 0), // which corresponds to cudaStreamDefault on the device. TEST(CUDAStreamTest, DefaultStreamIsNullStream) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); auto default_stream = c10::cuda::getDefaultCUDAStream(); // id == 0 encodes cudaStreamDefault (the null stream, handle nullptr). EXPECT_EQ(default_stream.id(), static_cast(0)); @@ -106,14 +112,12 @@ TEST(CUDAStreamTest, DefaultStreamIsNullStream) { // getDefaultCUDAStream must be stable: calling it twice returns equal streams. TEST(CUDAStreamTest, DefaultStreamIsStable) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); auto s1 = c10::cuda::getDefaultCUDAStream(); auto s2 = c10::cuda::getDefaultCUDAStream(); EXPECT_EQ(s1, s2); } TEST(CUDAStreamTest, GetStreamFromPoolBoolOverloadPreservesHighPriority) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); auto low_priority_stream = c10::cuda::getStreamFromPool(/*isHighPriority=*/false); auto high_priority_stream = @@ -135,7 +139,6 @@ TEST(CUDAStreamTest, GetStreamFromPoolBoolOverloadPreservesHighPriority) { // After setCurrentCUDAStream redirects the per-thread current stream, // getDefaultCUDAStream must still return the null stream. TEST(CUDAStreamTest, DefaultStreamUnaffectedBySetCurrentCUDAStream) { - SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); // Snapshot the per-thread current stream before we touch it so we can // restore it afterward and avoid polluting subsequent tests. auto original_stream = c10::cuda::getCurrentCUDAStream(); diff --git a/test/cpp/compat/cuda_generator_test.cc b/test/cpp/compat/c10_cuda_generator_test.cc similarity index 98% rename from test/cpp/compat/cuda_generator_test.cc rename to test/cpp/compat/c10_cuda_generator_test.cc index cfed6cd6bbbd60..489560cdc55480 100644 --- a/test/cpp/compat/cuda_generator_test.cc +++ b/test/cpp/compat/c10_cuda_generator_test.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + #include #include @@ -132,6 +134,8 @@ TEST(CUDAGeneratorTest, PhiloxStateThroughGetGeneratorOrDefault) { // Further advance via philox_cuda_state. at::PhiloxCudaState state = impl->philox_cuda_state(8); + (void)state; // Silence unused variable warning - state is used for its side + // effect ASSERT_EQ(impl->philox_offset_per_thread(), 12u); } @@ -356,3 +360,5 @@ TEST(CUDAGeneratorTest, CheckGeneratorSucceedsWithMatchingDeviceType) { ASSERT_NE(impl, nullptr); ASSERT_EQ(impl->current_seed(), 555u); } + +#endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP diff --git a/test/cpp/compat/cuda_test_utils.h b/test/cpp/compat/cuda_test_utils.h deleted file mode 100644 index c1085c250cd0b3..00000000000000 --- a/test/cpp/compat/cuda_test_utils.h +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "gtest/gtest.h" - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include - -#if defined(PADDLE_WITH_CUDA) -#include -#elif defined(PADDLE_WITH_HIP) -#include -#endif -#endif - -namespace compat_test { - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -inline bool CudaRuntimeAvailable() { - try { - if (c10::cuda::device_count() <= 0) { - return false; - } - } catch (...) { - return false; - } -#if defined(PADDLE_WITH_CUDA) - return cudaFree(nullptr) == cudaSuccess; -#else - return hipFree(nullptr) == hipSuccess; -#endif -} -#else -inline bool CudaRuntimeAvailable() { return false; } -#endif - -} // namespace compat_test - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#define SKIP_IF_CUDA_RUNTIME_UNAVAILABLE() \ - do { \ - if (!compat_test::CudaRuntimeAvailable()) { \ - return; \ - } \ - } while (false) -#else -#define SKIP_IF_CUDA_RUNTIME_UNAVAILABLE() \ - do { \ - } while (false) -#endif diff --git a/test/cpp/compat/torch_library_dispatch_test.cc b/test/cpp/compat/torch_library_dispatch_test.cc index e27b5b61d3568f..9af94953299945 100644 --- a/test/cpp/compat/torch_library_dispatch_test.cc +++ b/test/cpp/compat/torch_library_dispatch_test.cc @@ -84,7 +84,7 @@ static decltype(torch::OperatorRegistry::instance() .find_operator("") ->implementations.end()) pick_impl(torch::OperatorRegistration* op) { - using DK = torch::DispatchKey; + using DK = c10::DispatchKey; const std::vector preferred_keys = { DK::CPU, DK::BackendSelect, DK::CatchAll}; auto chosen = op->implementations.end(); @@ -110,10 +110,10 @@ TEST(CompatTorchDispatchTest, BackendSelectOnlyRegistration) { auto* op = torch::OperatorRegistry::instance().find_operator(qname); ASSERT_NE(op, nullptr); - EXPECT_EQ(op->implementations.find(torch::DispatchKey::CPU), + EXPECT_EQ(op->implementations.find(c10::DispatchKey::CPU), op->implementations.end()); - auto bs_it = op->implementations.find(torch::DispatchKey::BackendSelect); + auto bs_it = op->implementations.find(c10::DispatchKey::BackendSelect); ASSERT_NE(bs_it, op->implementations.end()); torch::FunctionArgs args; @@ -130,14 +130,14 @@ TEST(CompatTorchDispatchTest, CpuPreferredOverBackendSelect) { auto* op = torch::OperatorRegistry::instance().find_operator(qname); ASSERT_NE(op, nullptr); - ASSERT_NE(op->implementations.find(torch::DispatchKey::CPU), + ASSERT_NE(op->implementations.find(c10::DispatchKey::CPU), op->implementations.end()); - ASSERT_NE(op->implementations.find(torch::DispatchKey::BackendSelect), + ASSERT_NE(op->implementations.find(c10::DispatchKey::BackendSelect), op->implementations.end()); auto chosen = pick_impl(op); ASSERT_NE(chosen, op->implementations.end()); - EXPECT_EQ(chosen->first, torch::DispatchKey::CPU); + EXPECT_EQ(chosen->first, c10::DispatchKey::CPU); torch::FunctionArgs args; args.add_arg(torch::IValue(int64_t(41))); @@ -154,7 +154,7 @@ TEST(CompatTorchDispatchTest, BackendSelectPickedWhenCpuAbsent) { auto chosen = pick_impl(op); ASSERT_NE(chosen, op->implementations.end()); - EXPECT_EQ(chosen->first, torch::DispatchKey::BackendSelect); + EXPECT_EQ(chosen->first, c10::DispatchKey::BackendSelect); torch::FunctionArgs args; args.add_arg(torch::IValue(int64_t(32))); @@ -189,9 +189,9 @@ TEST(CompatTorchDispatchTest, AmbiguousMultiKeyProducesEnd) { ASSERT_NE(op, nullptr); // Registered under CUDA and XPU – neither is in the preferred list. ASSERT_GE(op->implementations.size(), 2UL); - EXPECT_EQ(op->implementations.find(torch::DispatchKey::CPU), + EXPECT_EQ(op->implementations.find(c10::DispatchKey::CPU), op->implementations.end()); - EXPECT_EQ(op->implementations.find(torch::DispatchKey::BackendSelect), + EXPECT_EQ(op->implementations.find(c10::DispatchKey::BackendSelect), op->implementations.end()); auto chosen = pick_impl(op);