PaddlePaddle · SigureMo · Apr 7, 2026 · Apr 3, 2026 · Apr 3, 2026 · Apr 5, 2026
diff --git a/test/cpp/compat/ATen_CUDABlas_test.cc b/test/cpp/compat/ATen_CUDABlas_test.cc
@@ -24,7 +24,6 @@
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/common/float16.h"
-#include "test/cpp/compat/cuda_test_utils.h"
 
 // Helper: allocate three same-sized device buffers, copy host data in,
 // invoke a kernel via |fn|, copy results back, synchronize, then free.
@@ -73,7 +72,6 @@ class GemmTester {
   static double toDouble(T val) { return static_cast<double>(val); }
 
   void Run() {
-    SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
     std::vector<T> h_a = {T(1), T(3), T(2), T(4)};
     std::vector<T> h_b = {T(5), T(7), T(6), T(8)};
     std::vector<T> h_c(N * N, T(0));
@@ -95,7 +93,6 @@ class GemmTester {
   // transA='T': C = alpha * A^T * B + beta * C
   // A^T = [[1,3],[2,4]],  A^T * B = [[26,30],[38,44]]
   void RunTransA() {
-    SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
     std::vector<T> h_a = {T(1), T(3), T(2), T(4)};
     std::vector<T> h_b = {T(5), T(7), T(6), T(8)};
     std::vector<T> h_c(N * N, T(0));
@@ -136,7 +133,6 @@ TEST(CUDABlasTest, GemmFloatTransA) {
 }
 
 TEST(CUDABlasTest, GemmFloatTransALowercase) {
-  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   constexpr int64_t N = 2;
 
   std::vector<float> h_a = {1.F, 3.F, 2.F, 4.F};
@@ -181,7 +177,6 @@ TEST(CUDABlasTest, GemmBFloat16) {
 // A stored col-major: col0={1+i,2+2i}, col1={3+3i,4+4i}
 // A^H stored col-major: col0={1-i,3-3i}, col1={2-2i,4-4i}
 TEST(CUDABlasTest, GemmComplexFloatConjTrans) {
-  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   constexpr int64_t N = 2;
   using T = c10::complex<float>;
 
@@ -209,7 +204,6 @@ TEST(CUDABlasTest, GemmComplexFloatConjTrans) {
 
 // Same as above but uses lowercase 'c'/'n' to exercise that switch-case branch.
 TEST(CUDABlasTest, GemmComplexDoubleConjTransLower) {
-  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   constexpr int64_t N = 2;
   using T = c10::complex<double>;
 

diff --git a/test/cpp/compat/ATen_CUDAContext_test.cc b/test/cpp/compat/ATen_CUDAContext_test.cc
@@ -21,28 +21,24 @@
 
 #include "gtest/gtest.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
-#include "test/cpp/compat/cuda_test_utils.h"
 
 // ---------------------------------------------------------------------------
 // CUDAFunctions.h — covers the 2 missing lines:
 //   c10::cuda::device_synchronize() and c10::cuda::stream_synchronize()
 // ---------------------------------------------------------------------------
 
 TEST(CUDAFunctionsTest, DeviceSynchronize) {
-  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   // Exercises the PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()) branch
   ASSERT_NO_THROW(c10::cuda::device_synchronize());
 }
 
 TEST(CUDAFunctionsTest, StreamSynchronize) {
-  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   // Exercises phi::backends::gpu::GpuStreamSync()
   auto stream = c10::cuda::getCurrentCUDAStream();
   ASSERT_NO_THROW(c10::cuda::stream_synchronize(stream));
 }
 
 TEST(CUDAFunctionsTest, AtNamespaceAliases) {
-  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   // Exercises the using aliases in at::cuda namespace
   ASSERT_NO_THROW(at::cuda::device_synchronize());
   auto stream = c10::cuda::getCurrentCUDAStream();
@@ -65,14 +61,12 @@ TEST(CUDAContextLightTest, IsAvailable) {
 
 // getNumGPUs() delegates to c10::cuda::device_count()
 TEST(CUDAContextLightTest, GetNumGPUs) {
-  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   int64_t n = at::cuda::getNumGPUs();
   ASSERT_GE(n, 1);
 }
 
 // getCurrentDeviceProperties() / getDeviceProperties()
 TEST(CUDAContextLightTest, DeviceProperties) {
-  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
   ASSERT_NE(prop, nullptr);
   // Sanity-check a few well-known fields
@@ -87,15 +81,13 @@ TEST(CUDAContextLightTest, DeviceProperties) {
 
 // warp_size()
 TEST(CUDAContextLightTest, WarpSize) {
-  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   int ws = at::cuda::warp_size();
   // All NVIDIA and AMD GPU architectures have a warp size of 32 or 64
   ASSERT_TRUE(ws == 32 || ws == 64);
 }
 
 // canDeviceAccessPeer() — a device cannot peer-access itself
 TEST(CUDAContextLightTest, CanDeviceAccessPeer) {
-  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   int device_id = phi::backends::gpu::GetCurrentDeviceId();
   // Self-to-self peer access is always false per CUDA spec
   bool self_peer = at::cuda::canDeviceAccessPeer(device_id, device_id);
@@ -104,26 +96,22 @@ TEST(CUDAContextLightTest, CanDeviceAccessPeer) {
 
 // Handle accessors — all must return non-null handles
 TEST(CUDAContextLightTest, GetCurrentCUDABlasHandle) {
-  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   cublasHandle_t h = at::cuda::getCurrentCUDABlasHandle();
   ASSERT_NE(h, nullptr);
 }
 
 TEST(CUDAContextLightTest, GetCurrentCUDABlasLtHandle) {
-  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   cublasLtHandle_t h = at::cuda::getCurrentCUDABlasLtHandle();
   ASSERT_NE(h, nullptr);
 }
 
 TEST(CUDAContextLightTest, GetCurrentCUDASparseHandle) {
-  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   cusparseHandle_t h = at::cuda::getCurrentCUDASparseHandle();
   ASSERT_NE(h, nullptr);
 }
 
 #if defined(CUDART_VERSION) || defined(USE_ROCM)
 TEST(CUDAContextLightTest, GetCurrentCUDASolverDnHandle) {
-  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   cusolverDnHandle_t h = at::cuda::getCurrentCUDASolverDnHandle();
   ASSERT_NE(h, nullptr);
 }
@@ -160,7 +148,6 @@ TEST(CUDAContextLightTest, GetChosenWorkspaceSize) {
 
 // getCUDABlasLtWorkspaceSize() / getCUDABlasLtWorkspace()
 TEST(CUDAContextLightTest, CUDABlasLtWorkspace) {
-  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   size_t sz = at::cuda::getCUDABlasLtWorkspaceSize();
   ASSERT_GT(sz, 0UL);
 
@@ -176,7 +163,6 @@ TEST(CUDAContextLightTest, CUDADeviceAllocatorSingleton) {
 }
 
 TEST(CUDAContextLightTest, CUDADeviceAllocatorCloneAndCopyData) {
-  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   c10::Allocator* alloc = at::cuda::getCUDADeviceAllocator();
   ASSERT_NE(alloc, nullptr);
 
@@ -207,7 +193,6 @@ TEST(CUDAContextLightTest, CUDADeviceAllocatorCloneAndCopyData) {
 }
 
 TEST(CUDAContextLightTest, CUDADeviceAllocatorCloneZeroBytes) {
-  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   c10::Allocator* alloc = at::cuda::getCUDADeviceAllocator();
   ASSERT_NE(alloc, nullptr);
 
@@ -220,7 +205,6 @@ TEST(CUDAContextLightTest, CUDADeviceAllocatorCloneZeroBytes) {
 }
 
 TEST(CUDAContextLightTest, AllocatorZeroSizeAndNoopCopyBranches) {
-  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   c10::Allocator* alloc = at::cuda::getCUDADeviceAllocator();
   ASSERT_NE(alloc, nullptr);
 

diff --git a/test/cpp/compat/ATen_Utils_test.cc b/test/cpp/compat/ATen_Utils_test.cc
@@ -28,7 +28,6 @@
 #include "ATen/ATen.h"
 #include "gtest/gtest.h"
 #include "paddle/phi/common/float16.h"
-#include "test/cpp/compat/cuda_test_utils.h"
 #include "torch/all.h"
 
 // ============================================================
@@ -153,7 +152,6 @@ TEST(ATenUtilsTest, TensorBackend_CPUDevice_MatchesTensorCPU) {
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 TEST(ATenUtilsTest, TensorBackend_GPUDevice) {
-  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   std::vector<float> data = {7.0f, 8.0f};
   at::TensorOptions opts =
       at::TensorOptions().dtype(at::kFloat).device(c10::Device(c10::kCUDA, 0));
@@ -164,7 +162,6 @@ TEST(ATenUtilsTest, TensorBackend_GPUDevice) {
 }
 
 TEST(ATenUtilsTest, TensorComplexBackend_GPUDevice) {
-  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   std::vector<c10::complex<float>> data = {{1.0f, 0.0f}};
   at::TensorOptions opts = at::TensorOptions()
                                .dtype(at::kComplexFloat)

diff --git a/test/cpp/compat/ATen_as_strided_test.cc b/test/cpp/compat/ATen_as_strided_test.cc
@@ -84,6 +84,7 @@ TEST_F(TensorAsStridedTest, AsStridedInplaceWithOffset) {
   t.as_strided_({2, 3}, {3, 1}, 1);
 
   ASSERT_EQ(t.sizes(), c10::IntArrayRef({2, 3}));
+  ASSERT_NE(t.data_ptr<float>(), original_data_ptr);
 
   float* data = t.data_ptr<float>();
   ASSERT_FLOAT_EQ(data[0], 1.0f);

diff --git a/test/cpp/compat/compat_basic_test.cc → test/cpp/compat/ATen_basic_test.cc b/test/cpp/compat/compat_basic_test.cc → test/cpp/compat/ATen_basic_test.cc
diff --git a/test/cpp/compat/ATen_clamp_test.cc b/test/cpp/compat/ATen_clamp_test.cc
@@ -202,6 +202,7 @@ TEST_F(TensorOperatorIndexTest, OperatorIndexOutOfBounds) {
   }
   // Note: Depending on implementation, this may or may not throw
   // We accept either behavior (return empty/invalid tensor or throw)
+  (void)threw_exception;  // Silence unused variable warning
 }
 
 // ======================= Additional clamp edge case tests

diff --git a/test/cpp/compat/ATen_cuda_test.cc b/test/cpp/compat/ATen_cuda_test.cc
@@ -20,11 +20,9 @@
 #include <c10/core/ScalarType.h>
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include <c10/cuda/CUDAFunctions.h>
-#endif
 
 #include "ATen/ATen.h"
 #include "gtest/gtest.h"
-#include "test/cpp/compat/cuda_test_utils.h"
 #include "torch/all.h"
 
 // ============================================================
@@ -33,7 +31,6 @@
 
 // After cuda(), the tensor should reside on a GPU device.
 TEST(TensorCudaTest, CpuTensorMovesToCuda) {
-  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   at::Tensor cpu_t = at::tensor({1.0f, 2.0f, 3.0f}, at::kFloat);
   ASSERT_TRUE(cpu_t.is_cpu());
 
@@ -44,7 +41,6 @@ TEST(TensorCudaTest, CpuTensorMovesToCuda) {
 
 // dtype and numel must be preserved.
 TEST(TensorCudaTest, DtypeAndNumelPreserved) {
-  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   at::Tensor cpu_t = at::tensor({1, 2, 3, 4}, at::kInt);
   at::Tensor cuda_t = cpu_t.cuda();
 
@@ -54,7 +50,6 @@ TEST(TensorCudaTest, DtypeAndNumelPreserved) {
 
 // Values should round-trip back to CPU intact.
 TEST(TensorCudaTest, ValuesPreservedAfterRoundTrip) {
-  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   std::vector<float> data = {1.0f, 2.5f, -3.0f, 4.75f};
   at::Tensor cpu_t = at::tensor(data, at::kFloat);
   at::Tensor cuda_t = cpu_t.cuda();
@@ -68,7 +63,6 @@ TEST(TensorCudaTest, ValuesPreservedAfterRoundTrip) {
 
 // shape (sizes) should be preserved.
 TEST(TensorCudaTest, ShapePreserved) {
-  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   at::Tensor cpu_t = at::zeros({2, 3, 4}, at::kFloat);
   at::Tensor cuda_t = cpu_t.cuda();
 
@@ -80,7 +74,6 @@ TEST(TensorCudaTest, ShapePreserved) {
 
 // An already-CUDA tensor should still be CUDA after another cuda() call.
 TEST(TensorCudaTest, AlreadyCudaTensorStaysCuda) {
-  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   at::Tensor cpu_t = at::tensor({7.0f}, at::kFloat);
   at::Tensor cuda_t = cpu_t.cuda();
   at::Tensor cuda_t2 = cuda_t.cuda();
@@ -91,7 +84,6 @@ TEST(TensorCudaTest, AlreadyCudaTensorStaysCuda) {
 
 // device() should report a CUDA device.
 TEST(TensorCudaTest, DeviceIsCuda) {
-  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   at::Tensor cpu_t = at::tensor({0.0f}, at::kFloat);
   at::Tensor cuda_t = cpu_t.cuda();
 
@@ -100,10 +92,11 @@ TEST(TensorCudaTest, DeviceIsCuda) {
 
 // is_cuda() / is_cpu() are mutually exclusive.
 TEST(TensorCudaTest, IsCudaAndIsCpuMutuallyExclusive) {
-  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   at::Tensor cpu_t = at::tensor({1.0f, 2.0f}, at::kFloat);
   at::Tensor cuda_t = cpu_t.cuda();
 
   ASSERT_TRUE(cuda_t.is_cuda());
   ASSERT_FALSE(cuda_t.is_cpu());
 }
+
+#endif
diff --git a/...at/compat_dense_sparse_conversion_test.cc → ...mpat/ATen_dense_sparse_conversion_test.cc b/...at/compat_dense_sparse_conversion_test.cc → ...mpat/ATen_dense_sparse_conversion_test.cc
diff --git a/test/cpp/compat/ATen_empty_test.cc b/test/cpp/compat/ATen_empty_test.cc
@@ -20,7 +20,6 @@
 
 #include "ATen/ATen.h"
 #include "gtest/gtest.h"
-#include "test/cpp/compat/cuda_test_utils.h"
 #include "torch/all.h"
 
 // ======================== at::empty basic tests ========================
@@ -57,7 +56,6 @@ TEST(ATenEmptyTest, ExplicitArgsCpu) {
 
 // TensorOptions overload: pin_memory via options
 TEST(ATenEmptyTest, PinMemoryViaTensorOptions) {
-  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   at::TensorOptions opts =
       at::TensorOptions().dtype(at::kFloat).pinned_memory(true);
   at::Tensor t = at::empty({4, 4}, opts);
@@ -67,7 +65,6 @@ TEST(ATenEmptyTest, PinMemoryViaTensorOptions) {
 
 // 6-argument overload: pin_memory = true (must use CPU device)
 TEST(ATenEmptyTest, PinMemoryViaExplicitArgs) {
-  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   at::Tensor t =
       at::empty({8}, at::kFloat, at::kStrided, at::kCPU, true, std::nullopt);
   ASSERT_TRUE(t.is_pinned())
@@ -76,7 +73,6 @@ TEST(ATenEmptyTest, PinMemoryViaExplicitArgs) {
 
 // pin_memory = false must NOT produce a pinned tensor
 TEST(ATenEmptyTest, NoPinMemoryViaExplicitArgs) {
-  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   at::Tensor t =
       at::empty({8}, at::kFloat, at::kStrided, at::kCUDA, false, std::nullopt);
   ASSERT_FALSE(t.is_pinned())
@@ -85,7 +81,6 @@ TEST(ATenEmptyTest, NoPinMemoryViaExplicitArgs) {
 
 // Pinned tensor lives in pinned (host) memory, not on the GPU device itself
 TEST(ATenEmptyTest, PinnedTensorIsNotCuda) {
-  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   at::TensorOptions opts =
       at::TensorOptions().dtype(at::kFloat).pinned_memory(true);
   at::Tensor t = at::empty({16}, opts);
@@ -96,7 +91,6 @@ TEST(ATenEmptyTest, PinnedTensorIsNotCuda) {
 
 // Data pointer of a pinned tensor must be non-null
 TEST(ATenEmptyTest, PinnedTensorDataPtrNonNull) {
-  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   at::TensorOptions opts =
       at::TensorOptions().dtype(at::kFloat).pinned_memory(true);
   at::Tensor t = at::empty({32}, opts);

diff --git a/test/cpp/compat/ATen_equal_test.cc b/test/cpp/compat/ATen_equal_test.cc
@@ -21,7 +21,6 @@
 #include <c10/core/TensorOptions.h>
 
 #include "gtest/gtest.h"
-#include "test/cpp/compat/cuda_test_utils.h"
 
 TEST(TensorEqualTest, DifferentShapeReturnsFalse) {
   at::Tensor a = at::ones({2, 2}, at::kFloat);
@@ -41,8 +40,6 @@ TEST(TensorEqualTest, DtypeMismatchCastsOtherTensor) {
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 TEST(TensorEqualTest, DeviceMismatchThrows) {
-  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
-
   at::Tensor cpu = at::ones({2, 2}, at::kFloat);
   at::Tensor gpu =
       at::ones({2, 2}, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA));

diff --git a/test/cpp/compat/ATen_eye_test.cc b/test/cpp/compat/ATen_eye_test.cc
@@ -26,7 +26,6 @@
 #include "ATen/ATen.h"
 #include "gtest/gtest.h"
 #include "paddle/phi/common/float16.h"
-#include "test/cpp/compat/cuda_test_utils.h"
 #include "torch/all.h"
 
 // ============================================================
@@ -159,15 +158,13 @@ TEST(ATenEyeTest, OneByOne) {
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 TEST(ATenEyeTest, SquareOnGPU) {
-  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   at::Tensor t =
       at::eye(4, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA));
   at::Tensor t_cpu = t.to(at::kCPU);
   CheckEye(t_cpu, 4, 4);
 }
 
 TEST(ATenEyeTest, RectangularOnGPU) {
-  SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
   at::Tensor t =
       at::eye(3, 5, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA));
   at::Tensor t_cpu = t.to(at::kCPU);