Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
a45e7e0
[Cpp API Compatibility] Delete useless code and rename test files
youge325 Apr 3, 2026
7cee6e8
fix
youge325 Apr 3, 2026
3dc2c05
Revert "fix"
youge325 Apr 5, 2026
53acaba
fix
youge325 Apr 5, 2026
82768bc
Merge branch 'develop' into cNorm
youge325 Apr 5, 2026
a0be3de
complement CPU compiling branch
youge325 Apr 5, 2026
88c0a55
try to compile test files with cpu
youge325 Apr 5, 2026
90a73da
fix
youge325 Apr 5, 2026
aaac6b3
fix
youge325 Apr 5, 2026
d825db2
fix
youge325 Apr 5, 2026
d6e9be4
fix xpu
youge325 Apr 5, 2026
ce84dbe
fix Mac-CPU TensorOptions compilation error
youge325 Apr 5, 2026
003897d
fix torch_compat.h: add missing DispatchKey.h include and fix namespace
youge325 Apr 5, 2026
f8e3a74
fix dcu build with symbol visibility hidden
youge325 Apr 5, 2026
85ae6b8
Revert "fix xpu"
youge325 Apr 5, 2026
7c0fb0f
Fix ATen compat operator[] to return view instead of copy
youge325 Apr 5, 2026
f9a4cb5
Fix as_strided to work with non-contiguous tensors
youge325 Apr 5, 2026
3d9f777
fix dcu again
youge325 Apr 5, 2026
c661ed6
Revert "fix dcu build with symbol visibility hidden"
youge325 Apr 5, 2026
4447f88
Revert "fix dcu again"
youge325 Apr 6, 2026
ad97976
move dcu related tests to nv_test
youge325 Apr 6, 2026
ac7bab4
skip xpu test when FLAGS_use_stride_kernel is disabled
youge325 Apr 6, 2026
283802c
fix xpu build
youge325 Apr 6, 2026
253d526
fix dcu build, which will be removed due to deepep deprecated API
youge325 Apr 6, 2026
89106a5
fix cpu build due to google test version too old
youge325 Apr 6, 2026
1fcc43d
skip dcu for all tests
youge325 Apr 6, 2026
ea0fa1f
try to compile all tests in cpu environment, except dcu environment, …
youge325 Apr 6, 2026
52662f3
fix all cpu build, and move cuda only test to nv_test
youge325 Apr 6, 2026
934b82a
Merge branch 'develop' into cNorm
SigureMo Apr 6, 2026
d283912
replace `torch::DispatchKey` with `c10::DispatchKey`
SigureMo Apr 6, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ third_party/
bazel-*
.humanize
.codex
.paddle-agent

build_*
# clion workspace.
Expand Down
47 changes: 41 additions & 6 deletions paddle/phi/api/include/compat/ATen/core/TensorBody.h
Original file line number Diff line number Diff line change
Expand Up @@ -678,12 +678,47 @@ class Tensor : public TensorBase {
at::Tensor& absolute_() const { return abs_(); }

Tensor operator[](int64_t index) const {
return paddle::experimental::slice(tensor_,
/*axes=*/{0},
/*starts=*/{index},
/*ends=*/{index + 1},
/*infer_flags=*/{1},
/*decrease_axis=*/{0});
// Use as_strided to create a view (shares storage with original tensor)
// This allows fill_ to modify the original tensor
int64_t numel = tensor_.numel();
if (numel == 0) {
PD_THROW("operator[]: cannot index empty tensor");
}

// Handle negative index
if (index < 0) {
index += tensor_.dims()[0];
}

// Check bounds
if (index < 0 || index >= tensor_.dims()[0]) {
PD_THROW("operator[]: index ",
index,
" out of range for tensor of size ",
tensor_.dims(),
" at dimension 0");
}

// For 1D tensor: create a scalar view (0-dim tensor) with proper offset
// For multi-D tensor: create a view of the row at index
std::vector<int64_t> new_sizes;
std::vector<int64_t> new_strides;

auto dims = tensor_.dims();
auto stride = tensor_.strides();

// Skip the first dimension (dim 0)
for (int i = 1; i < dims.size(); ++i) {
new_sizes.push_back(dims[i]);
new_strides.push_back(stride[i]);
}

// Calculate storage offset
int64_t storage_offset = index * stride[0];

return as_strided(c10::IntArrayRef(new_sizes),
c10::IntArrayRef(new_strides),
storage_offset);
}

void record_stream(at::Stream s) const;
Expand Down
4 changes: 2 additions & 2 deletions paddle/phi/api/include/compat/ATen/cuda/CUDAContextLight.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ using CUDAContextSparseHandle = phi::sparseHandle_t;
using CUDAContextBlasHandle = phi::blasHandle_t;
using CUDAContextBlasLtHandle = phi::blasLtHandle_t;
using CUDAContextSolverHandle = phi::solverHandle_t;
#else
#elif defined(PADDLE_WITH_CUDA)
using CUDAContextDeviceProp = cudaDeviceProp;
using CUDAContextSparseHandle = cusparseHandle_t;
using CUDAContextBlasHandle = cublasHandle_t;
Expand Down Expand Up @@ -90,6 +90,7 @@ inline int64_t getNumGPUs() { return c10::cuda::device_count(); }
*/
inline bool is_available() { return c10::cuda::device_count() > 0; }

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
CUDAContextDeviceProp* getCurrentDeviceProperties();

int warp_size();
Expand All @@ -115,7 +116,6 @@ size_t getChosenWorkspaceSize();
size_t getCUDABlasLtWorkspaceSize();
void* getCUDABlasLtWorkspace();

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
CUDAContextSolverHandle getCurrentCUDASolverDnHandle();

// Get the CUDA device allocator for the current device.
Expand Down
4 changes: 4 additions & 0 deletions paddle/phi/api/include/compat/ATen/cuda/PhiloxUtils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,12 @@ namespace at::cuda::philox {
// In-kernel call to retrieve philox seed and offset from a PhiloxCudaState
// instance whether that instance was created with graph capture underway or
// not. See Note [CUDA Graph-safe RNG states].
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
__host__ __device__ __forceinline__ std::tuple<uint64_t, uint64_t> unpack(
at::PhiloxCudaState arg) {
#else
inline std::tuple<uint64_t, uint64_t> unpack(at::PhiloxCudaState arg) {
#endif
if (arg.captured_) {
// static_cast avoids "warning: invalid narrowing conversion from "long" to
// "unsigned long".
Expand Down
47 changes: 27 additions & 20 deletions paddle/phi/api/include/compat/ATen/ops/as_strided.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,20 +35,28 @@ inline at::Tensor Tensor::as_strided(
if (!src_tensor) {
PD_THROW("as_strided: tensor must be a DenseTensor");
}
auto new_tensor = std::make_shared<phi::DenseTensor>();
new_tensor->ShareDataWith(*src_tensor);
// Create new meta with desired shape and strides first
std::vector<int64_t> size_vec(size.begin(), size.end());
std::vector<int64_t> stride_vec(stride.begin(), stride.end());
new_tensor->Resize(common::make_ddim(size_vec));
new_tensor->set_strides(common::make_ddim(stride_vec));

// Create new DenseTensor with correct meta, then share data
// We need to create a temporary DenseTensor with the right meta
// because ShareDataWith copies the source meta which we don't want
auto new_tensor = std::make_shared<phi::DenseTensor>();

// First, set up the holder by sharing data (this copies src meta, we'll
// override)
new_tensor->ShareDataWith(*src_tensor);

// Now create the correct meta with new shape/strides
phi::DenseTensorMeta meta(src_tensor->dtype(),
common::make_ddim(size_vec),
common::make_ddim(stride_vec));
// Calculate offset in bytes
int64_t offset = storage_offset.has_value() ? storage_offset.value() : 0;
if (offset != 0) {
auto meta = phi::DenseTensorMeta(new_tensor->meta());
// meta.offset is in bytes; storage_offset is in elements
meta.offset =
static_cast<size_t>(offset) * phi::SizeOf(new_tensor->dtype());
new_tensor->set_meta(meta);
}
meta.offset = src_tensor->meta().offset +
static_cast<size_t>(offset) * phi::SizeOf(src_tensor->dtype());
new_tensor->set_meta(meta);
PaddleTensor result;
result.set_impl(new_tensor);
return Tensor(result);
Expand All @@ -67,16 +75,15 @@ inline const at::Tensor& Tensor::as_strided_(
}
std::vector<int64_t> size_vec(size.begin(), size.end());
std::vector<int64_t> stride_vec(stride.begin(), stride.end());
src_tensor->Resize(common::make_ddim(size_vec));
src_tensor->set_strides(common::make_ddim(stride_vec));
// Use set_meta instead of Resize + set_strides to avoid contiguous check
phi::DenseTensorMeta meta(src_tensor->dtype(),
common::make_ddim(size_vec),
common::make_ddim(stride_vec));
meta.layout = src_tensor->layout();
int64_t offset = storage_offset.has_value() ? storage_offset.value() : 0;
if (offset != 0) {
auto meta = phi::DenseTensorMeta(src_tensor->meta());
// meta.offset is in bytes; storage_offset is in elements
meta.offset =
static_cast<size_t>(offset) * phi::SizeOf(src_tensor->dtype());
src_tensor->set_meta(meta);
}
meta.offset = src_tensor->meta().offset +
static_cast<size_t>(offset) * phi::SizeOf(src_tensor->dtype());
src_tensor->set_meta(meta);
return *this;
}

Expand Down
3 changes: 2 additions & 1 deletion paddle/phi/api/include/compat/ATen/ops/record_stream.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ inline void Tensor::record_stream(cudaStream_t s) const {
"tensor implementation.");
PD_CHECK(dense_tensor->place().GetType() != phi::AllocationType::CPU,
"record_stream is not supported for CPU tensors.");
paddle::memory::RecordStream(dense_tensor->Holder(), s);
paddle::memory::RecordStream(dense_tensor->Holder(),
reinterpret_cast<gpuStream_t>(s));
}
#endif
} // namespace at
4 changes: 0 additions & 4 deletions paddle/phi/api/include/compat/c10/core/TensorOptions.h
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,3 @@ inline std::string toString(const TensorOptions& options) {
namespace at {
using namespace c10; // NOLINT
} // namespace at

namespace torch {
using namespace c10; // NOLINT
} // namespace torch
6 changes: 0 additions & 6 deletions test/cpp/compat/ATen_CUDABlas_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/complex.h"
#include "paddle/phi/common/float16.h"
#include "test/cpp/compat/cuda_test_utils.h"

// Helper: allocate three same-sized device buffers, copy host data in,
// invoke a kernel via |fn|, copy results back, synchronize, then free.
Expand Down Expand Up @@ -73,7 +72,6 @@ class GemmTester {
static double toDouble(T val) { return static_cast<double>(val); }

void Run() {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
std::vector<T> h_a = {T(1), T(3), T(2), T(4)};
std::vector<T> h_b = {T(5), T(7), T(6), T(8)};
std::vector<T> h_c(N * N, T(0));
Expand All @@ -95,7 +93,6 @@ class GemmTester {
// transA='T': C = alpha * A^T * B + beta * C
// A^T = [[1,3],[2,4]], A^T * B = [[26,30],[38,44]]
void RunTransA() {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
std::vector<T> h_a = {T(1), T(3), T(2), T(4)};
std::vector<T> h_b = {T(5), T(7), T(6), T(8)};
std::vector<T> h_c(N * N, T(0));
Expand Down Expand Up @@ -136,7 +133,6 @@ TEST(CUDABlasTest, GemmFloatTransA) {
}

TEST(CUDABlasTest, GemmFloatTransALowercase) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
constexpr int64_t N = 2;

std::vector<float> h_a = {1.F, 3.F, 2.F, 4.F};
Expand Down Expand Up @@ -181,7 +177,6 @@ TEST(CUDABlasTest, GemmBFloat16) {
// A stored col-major: col0={1+i,2+2i}, col1={3+3i,4+4i}
// A^H stored col-major: col0={1-i,3-3i}, col1={2-2i,4-4i}
TEST(CUDABlasTest, GemmComplexFloatConjTrans) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
constexpr int64_t N = 2;
using T = c10::complex<float>;

Expand Down Expand Up @@ -209,7 +204,6 @@ TEST(CUDABlasTest, GemmComplexFloatConjTrans) {

// Same as above but uses lowercase 'c'/'n' to exercise that switch-case branch.
TEST(CUDABlasTest, GemmComplexDoubleConjTransLower) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
constexpr int64_t N = 2;
using T = c10::complex<double>;

Expand Down
46 changes: 27 additions & 19 deletions test/cpp/compat/ATen_CUDAContext_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,51 +12,62 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)

#include <ATen/cuda/CUDAContextLight.h>
#include <c10/core/Allocator.h>
#include <c10/cuda/CUDAFunctions.h>
#include <c10/cuda/CUDAStream.h>

#include "gtest/gtest.h"

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include <c10/cuda/CUDAStream.h>
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "test/cpp/compat/cuda_test_utils.h"
#endif

// ---------------------------------------------------------------------------
// CUDAFunctions.h — covers the 2 missing lines:
// c10::cuda::device_synchronize() and c10::cuda::stream_synchronize()
// ---------------------------------------------------------------------------

TEST(CUDAFunctionsTest, DeviceSynchronize) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// Exercises the PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()) branch
ASSERT_NO_THROW(c10::cuda::device_synchronize());
#else
// In CPU-only builds, device_synchronize throws
ASSERT_THROW(c10::cuda::device_synchronize(), std::exception);
#endif
}

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST(CUDAFunctionsTest, StreamSynchronize) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
// Exercises phi::backends::gpu::GpuStreamSync()
auto stream = c10::cuda::getCurrentCUDAStream();
ASSERT_NO_THROW(c10::cuda::stream_synchronize(stream));
}
#endif

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST(CUDAFunctionsTest, AtNamespaceAliases) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
// Exercises the using aliases in at::cuda namespace
ASSERT_NO_THROW(at::cuda::device_synchronize());
auto stream = c10::cuda::getCurrentCUDAStream();
Comment on lines 31 to 53
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这里把运行时 guard 一起删掉后,#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) 只保证“编译时启用了 GPU”,并不保证“执行环境一定有可用设备 / runtime”。

像下面这些调用:getCurrentCUDAStream()getNumGPUs()getCurrentDeviceProperties(),在无卡或驱动不可用的环境里都可能直接抛错;而原来的 cuda_test_utils.h 实际上同时检查了 device_count()cudaFree/hipFree 是否可用。

建议保留这层运行时判断,或者统一改成 if (!at::cuda::is_available()) { GTEST_SKIP() << ...; }。否则这批 compat tests 会从“可跳过”变成“硬失败”。

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@youge325 话说这个当初为啥要加,现在又为啥要删?是因为只在 CUDA 上跑吗?其实我觉得如果可以的话,我还是希望这些单测能测就尽可能在 CPU 上测一测的,当初 CMakeListsWITH_GPU 只是早期希望快速跑通而已

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

当初用 Agent 加测试提高 coverage rate 的时候自动加的,按我的理解,如果能编译成功的话,那么测试也不能随便就 skip,所以现在就删了。

有些单测也确实可以在CPU上跑,条件编译一下测试用例就行,比如 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP), 就 ASSERT_NO_THROW#elseASSERT_THROW,我看看有哪些测试用例可以这样改

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ShigureNyako 帮我看看 test/cpp/compat/CMakeLists.txt 是否可以把 if(WITH_GPU) 删掉

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

刚刚 auth 过期了,@ShigureNyako 看看呢!

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ShigureNyako 请给我一个 GPU build 但运行时无可用 device/runtime 的例子,在 CI 当中并没有出现这种情况, GPU build 就是为了在有可用 device/runtime 的场景中运行,我原来的代码写错了所以现在删掉,你可能倒果为因了

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

请给我一个 GPU build 但运行时无可用 device/runtime 的例子,在 CI 当中并没有出现这种情况, GPU build 就是为了在有可用 device/runtime 的场景中运行,我原来的代码写错了所以现在删掉,你可能倒果为因了

你这个提醒是对的,我这里把表述收敛一下:就这条 PR 当前的 CI 证据来看,我手头没有一个已经实际打出来的 “GPU build 但运行时无 device/runtime” 失败样例。 现在能直接看到的红灯,还是 Coverage 那条删测校验,以及 Linux-IXUCA 的外部 patch apply 失败。

我这边现在更准确的关注点,其实是另一层:不要因为补了部分 CPU-only compiling branch,就直接推导出可以把整块 if(WITH_GPU) 去掉。

原因是当前这批 test 里仍然混着不少明确的 GPU-only case,例如:

  • ATen_CUDABlas_test.cc 直接依赖 cudaMalloc/cudaMemcpy/cudaDeviceSynchronize
  • ATen_cuda_test.cc 直接测试 Tensor::cuda()
  • ATen_record_stream_test.cc 现在在 fixture SetUp() 里也还是会创建 cuda_tensor

所以我认同你说的:“当前 CI 没有证明 runtime 不可用场景会打到这里”。但从代码拆分角度看,我还是倾向于把结论限定成:

  • 这次 a0be3deeATen_CUDAContext_test.cc / c10_Stream_test.cc / c10_Event_test.cc 这类补 CPU compiling branch 的方向是对的;
  • 但如果后面要把更多 compat tests 真正下沉到 CPU 路径,还是得先把 CPU-capable case 和 GPU-only case 拆开,而不是现在就整体删 if(WITH_GPU)

我会按这个收敛后的判断继续看,不再把“当前 CI 已经证明有 runtime availability 问题”当成论据本身。

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ShigureNyako 如果从严谨性上来看,确实会有编译期 WITH_GPU 但是实际 runtime 没有的场景,比如我们 CI 里所有机器都是 CPU 编包的,即便是 Coverage build 也是,但是虽然 Coverage build 本身在 CPU 上,但其产物一定会在后续 runtime 有 GPU 的 Coverage test 上运行,也就是说目前 CI 上基本上有一个假设就是,编译时带了 GPU,那运行时就一定带 GPU,这其实不应该是考虑的重点

当初早期 C++ 兼容性第一个 PR #74402 为赶 3.2 发版在拉分支前夕合入临时跳过了一些内容,应该是 defd6a3 跳过了一些新硬件(根据上一个 commit 99f7b88 的报错和该 commit message 看起来是 XPU 和 NPU,但日志已经过期看不到了)上的运行时问题,请着眼于该场景分析下具体可能存在的问题,以及是否有推荐的解决方案

注意,我们即便不能推广到所有硬件(比如 XPU 和 NPU),即便跳过部分硬件,也比直接放弃 CPU 测试更好,我们的目标不是一次修复所有硬件上的问题(当然可以更好),只是希望在确保 CPU 上能够有所监控

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

请着眼于该场景分析下具体可能存在的问题,以及是否有推荐的解决方案

我按你给的线索回看了一下 PR #74402 里的相关 commit,能从代码里直接确认到的点大概是这样:

  1. 99f7b88d321d68d7fe8597c4d5bd31f7b424272b 本身只是给 ivalue.h 里的 generic_to 补前置声明,主要是 warning / 编译告警层面的修正,和 XPU/NPU 场景本身关系不大。
  2. 真正直接指向“新硬件 build 问题”的是 defd6a3cc60ca6bc3a2df5f58f2043ee0616a48c (fix build on xpu&npu),它至少做了三件很关键的事:
    • test/cpp/compat/CMakeLists.txt 里把 compat_basic_testtorch_library_test 从外层挪进了 if(WITH_GPU)
    • paddle/utils/pybind.h 里把 <ATen/core/TensorBody.h> 包进了 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    • 同一次 commit 还删除了旧的 torch_like_api/ATen/core/TensorBase.h

所以如果按这个历史上下文收敛,我现在更倾向于认为:当时真正要绕开的,不是单纯“这些测试在 runtime 上能不能 skip”,而是“CPU-capable 的测试文件里混入了 CUDA-only 头文件 / 链接依赖,导致 XPU/NPU build 只能整体退回到 WITH_GPU 里面”。

也就是说,这个场景下更具体的问题其实是:

  • 一部分 case 本身是 CPU 可测的;
  • 但它们和真正依赖 CUDA runtime / CUDA headers / CUDA libs 的 case 混在一个 target / 一个文件里;
  • 结果为了规避少数异构硬件问题,只能把整文件都锁回 WITH_GPU,CPU 监控也一起没了。

如果按这个目标来推,我这边比较推荐的解法会是分层处理,而不是一步到位追求“所有后端都修完”:

  1. 先把 CPU-capable case 拆出来恢复 CPU 监控

    • 这类最明显的还是 torch_library_testcompat_basic_test 里的 CPU 段,以及 ATen_toString_test 的 CPU 段;
    • 这部分更适合独立成 cc_test / CPU target,而不是继续和 GPU-only case 绑在一起。
  2. 把“CPU 可编译,但 GPU 语义不同”的 case 继续保留成条件分支

    • 比如这次 ATen_CUDAContext_test.cc 新补的 ASSERT_THROW / ASSERT_FALSE 这种 CPU-only branch,我觉得方向是对的;
    • 它解决的是“同一个 API 在 CPU-only build 下的 contract”。
  3. 真正依赖 CUDA runtime 的 case 继续留在 if(WITH_GPU) / nv_test

    • 例如 ATen_CUDABlas_test.ccATen_cuda_test.ccATen_record_stream_test.cc、GPU pointer 的 ATen_from_blob_test.cc
    • 这些 case 不适合为了扩 CPU 覆盖而整体挪到 CPU 路径。
  4. XPU / NPU 上如果仍有个别 API 不通,优先做 backend-specific 规避,而不是整体放弃 CPU 测试

    • 也就是尽量把“跳过部分硬件”收敛到具体 case / 具体 backend,避免再次把整批 CPU-capable 监控一起关掉。

从这个角度看,我会把当前 review 立场进一步收敛成:支持把 CPU-capable case 往 CPU 流水线拆,但不支持在这一步直接整体移除 if(WITH_GPU),也不支持把仍然混有 GPU-only 依赖的 target 直接默认成 CPU 可跑。

这样比较贴近当年 defd6a3 想绕开的具体问题,也更符合“先把 CPU 上能监控的部分恢复起来,再逐步收敛 XPU/NPU 等异构硬件差异”的目标。

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

我看了最新提交 d6e9be4 (fix xpu),能理解你这里是在为 XPU build 继续收敛 CMakeLists,但我这里还是想把结论再收窄一下:我不太建议把这批 case 继续整体收回 nv_test

这次从 CPU 路径挪回 GPU 路径的文件里,至少有几类情况其实不太一样:

  1. 明显还是 CPU-capable 的 case

    • ATen_all_test.cc
    • ATen_as_strided_test.cc
    • ATen_index_test.cc

    这几个文件我回看了下,正文基本都是纯 CPU tensor / indexing / view 语义,没有实际用到 CUDA runtime。这里更像是“文件级依赖或目标划分”问题,而不是 test case 本身只能放在 GPU 路径里。

  2. 正文基本是 CPU case,但文件头带了无关 CUDA include

    • ATen_transpose_test.cc
    • ATen_viewAs_test.cc

    这两个文件目前也没有真正用到 c10::cuda::* / at::cuda::* 逻辑,感觉更像是可以先把无关的 CUDA 头删掉,再继续留在 CPU 路径监控。

  3. CPU / GPU case 混在同一个文件里,适合继续拆分

    • ATen_basic_test.cc
    • ATen_from_blob_test.cc

    这类我能理解为什么会在 XPU 上出问题,但如果问题来源是同文件里混有 GPU 段,我还是更倾向于:把 CPU-only 部分和 GPU-only 部分拆开,而不是把整文件都回退到 nv_test

所以从 review 角度看,我现在的判断还是一致的:如果目标是“先把 CPU 上能监控的部分保住”,那优先级应该是清理无关 CUDA include / 拆分混合文件,而不是把一批本来可以在 CPU 上监控的 compat tests 再收回 GPU-only 路径。

也就是说,这个 fix xpu 方向我理解,但我这边还不能把它当成最终解法;更理想的收敛方式还是尽量把 CPU-capable case 留在 cc_test,只把确实有 CUDA 依赖的部分留在 nv_test

ASSERT_NO_THROW(at::cuda::stream_synchronize(stream));
}
#endif

// ---------------------------------------------------------------------------
// CUDAContextLight.h — covers the 1 missing line: is_available()
// ---------------------------------------------------------------------------

TEST(CUDAContextLightTest, IsAvailable) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// With GPU compilation and at least one device, this must be true.
int gpu_count = phi::backends::gpu::GetGPUDeviceCount();
ASSERT_EQ(at::cuda::is_available(), gpu_count > 0);
#else
// In CPU-only builds, is_available() should return false
ASSERT_FALSE(at::cuda::is_available());
#endif
}

// ---------------------------------------------------------------------------
Expand All @@ -65,14 +76,21 @@ TEST(CUDAContextLightTest, IsAvailable) {

// getNumGPUs() delegates to c10::cuda::device_count()
TEST(CUDAContextLightTest, GetNumGPUs) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
int64_t n = at::cuda::getNumGPUs();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
ASSERT_GE(n, 1);
#else
// In CPU-only builds, device_count() returns 0
ASSERT_EQ(n, 0);
#endif
}

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)

// The following tests require CUDA runtime and can only run in CUDA builds

// getCurrentDeviceProperties() / getDeviceProperties()
TEST(CUDAContextLightTest, DeviceProperties) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
ASSERT_NE(prop, nullptr);
// Sanity-check a few well-known fields
Expand All @@ -87,15 +105,13 @@ TEST(CUDAContextLightTest, DeviceProperties) {

// warp_size()
TEST(CUDAContextLightTest, WarpSize) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
int ws = at::cuda::warp_size();
// All NVIDIA and AMD GPU architectures have warp size of 32 or 64
ASSERT_TRUE(ws == 32 || ws == 64);
}

// canDeviceAccessPeer() — a device cannot peer-access itself
TEST(CUDAContextLightTest, CanDeviceAccessPeer) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
int device_id = phi::backends::gpu::GetCurrentDeviceId();
// Self-to-self peer access is always false per CUDA spec
bool self_peer = at::cuda::canDeviceAccessPeer(device_id, device_id);
Expand All @@ -104,26 +120,22 @@ TEST(CUDAContextLightTest, CanDeviceAccessPeer) {

// Handle accessors — all must return non-null handles
TEST(CUDAContextLightTest, GetCurrentCUDABlasHandle) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
cublasHandle_t h = at::cuda::getCurrentCUDABlasHandle();
ASSERT_NE(h, nullptr);
}

TEST(CUDAContextLightTest, GetCurrentCUDABlasLtHandle) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
cublasLtHandle_t h = at::cuda::getCurrentCUDABlasLtHandle();
ASSERT_NE(h, nullptr);
}

TEST(CUDAContextLightTest, GetCurrentCUDASparseHandle) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
cusparseHandle_t h = at::cuda::getCurrentCUDASparseHandle();
ASSERT_NE(h, nullptr);
}

#if defined(CUDART_VERSION) || defined(USE_ROCM)
TEST(CUDAContextLightTest, GetCurrentCUDASolverDnHandle) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
cusolverDnHandle_t h = at::cuda::getCurrentCUDASolverDnHandle();
ASSERT_NE(h, nullptr);
}
Expand Down Expand Up @@ -160,7 +172,6 @@ TEST(CUDAContextLightTest, GetChosenWorkspaceSize) {

// getCUDABlasLtWorkspaceSize() / getCUDABlasLtWorkspace()
TEST(CUDAContextLightTest, CUDABlasLtWorkspace) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
size_t sz = at::cuda::getCUDABlasLtWorkspaceSize();
ASSERT_GT(sz, 0UL);

Expand All @@ -176,7 +187,6 @@ TEST(CUDAContextLightTest, CUDADeviceAllocatorSingleton) {
}

TEST(CUDAContextLightTest, CUDADeviceAllocatorCloneAndCopyData) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
c10::Allocator* alloc = at::cuda::getCUDADeviceAllocator();
ASSERT_NE(alloc, nullptr);

Expand Down Expand Up @@ -207,7 +217,6 @@ TEST(CUDAContextLightTest, CUDADeviceAllocatorCloneAndCopyData) {
}

TEST(CUDAContextLightTest, CUDADeviceAllocatorCloneZeroBytes) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
c10::Allocator* alloc = at::cuda::getCUDADeviceAllocator();
ASSERT_NE(alloc, nullptr);

Expand All @@ -220,7 +229,6 @@ TEST(CUDAContextLightTest, CUDADeviceAllocatorCloneZeroBytes) {
}

TEST(CUDAContextLightTest, AllocatorZeroSizeAndNoopCopyBranches) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
c10::Allocator* alloc = at::cuda::getCUDADeviceAllocator();
ASSERT_NE(alloc, nullptr);

Expand Down
Loading
Loading