Skip to content
Merged
Show file tree
Hide file tree
Changes from 28 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
a45e7e0
[Cpp API Compatibility] Delete useless code and rename test files
youge325 Apr 3, 2026
7cee6e8
fix
youge325 Apr 3, 2026
3dc2c05
Revert "fix"
youge325 Apr 5, 2026
53acaba
fix
youge325 Apr 5, 2026
82768bc
Merge branch 'develop' into cNorm
youge325 Apr 5, 2026
a0be3de
complement CPU compiling branch
youge325 Apr 5, 2026
88c0a55
try to compile test files with cpu
youge325 Apr 5, 2026
90a73da
fix
youge325 Apr 5, 2026
aaac6b3
fix
youge325 Apr 5, 2026
d825db2
fix
youge325 Apr 5, 2026
d6e9be4
fix xpu
youge325 Apr 5, 2026
ce84dbe
fix Mac-CPU TensorOptions compilation error
youge325 Apr 5, 2026
003897d
fix torch_compat.h: add missing DispatchKey.h include and fix namespace
youge325 Apr 5, 2026
f8e3a74
fix dcu build with symbol visibility hidden
youge325 Apr 5, 2026
85ae6b8
Revert "fix xpu"
youge325 Apr 5, 2026
7c0fb0f
Fix ATen compat operator[] to return view instead of copy
youge325 Apr 5, 2026
f9a4cb5
Fix as_strided to work with non-contiguous tensors
youge325 Apr 5, 2026
3d9f777
fix dcu again
youge325 Apr 5, 2026
c661ed6
Revert "fix dcu build with symbol visibility hidden"
youge325 Apr 5, 2026
4447f88
Revert "fix dcu again"
youge325 Apr 6, 2026
ad97976
move dcu related tests to nv_test
youge325 Apr 6, 2026
ac7bab4
skip xpu test when FLAGS_use_stride_kernel is disabled
youge325 Apr 6, 2026
283802c
fix xpu build
youge325 Apr 6, 2026
253d526
fix dcu build, which will be removed due to deepep deprecated API
youge325 Apr 6, 2026
89106a5
fix cpu build due to google test version too old
youge325 Apr 6, 2026
1fcc43d
skip dcu for all tests
youge325 Apr 6, 2026
ea0fa1f
try to compile all tests in cpu environment, except dcu environment, …
youge325 Apr 6, 2026
52662f3
fix all cpu build, and move cuda only test to nv_test
youge325 Apr 6, 2026
934b82a
Merge branch 'develop' into cNorm
SigureMo Apr 6, 2026
d283912
replace `torch::DispatchKey` with `c10::DispatchKey`
SigureMo Apr 6, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion paddle/fluid/pybind/torch_compat.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ OperationInvoker::get_op_with_args(const std::string& qualified_name,
"Operator '%s' not found in the registry", qualified_name.c_str()));
}

auto impl_it = op->implementations.find(DispatchKey::CPU);
auto impl_it = op->implementations.find(c10::DispatchKey::CPU);
if (impl_it == op->implementations.end()) {
PADDLE_THROW(common::errors::NotFound(
"No CPU implementation found for operator '%s'",
Expand Down
47 changes: 41 additions & 6 deletions paddle/phi/api/include/compat/ATen/core/TensorBody.h
Original file line number Diff line number Diff line change
Expand Up @@ -678,12 +678,47 @@ class Tensor : public TensorBase {
// ATen-compatibility alias: forwards to abs_() and returns its result.
at::Tensor& absolute_() const { return abs_(); }

Tensor operator[](int64_t index) const {
  // Index along dimension 0, returning a VIEW that shares storage with the
  // original tensor (so in-place ops such as fill_ modify the original).
  // For a 1-D tensor this yields a 0-dim scalar view; for an N-D tensor,
  // a view of the (N-1)-D slice at `index`.
  //
  // NOTE: the stale slice-based early return (which produced a copy and made
  // the code below unreachable) has been removed; only the as_strided-based
  // view implementation remains.
  if (tensor_.numel() == 0) {
    PD_THROW("operator[]: cannot index empty tensor");
  }

  // Normalize a negative index (Python-style indexing from the end).
  if (index < 0) {
    index += tensor_.dims()[0];
  }

  // Bounds check after normalization.
  if (index < 0 || index >= tensor_.dims()[0]) {
    PD_THROW("operator[]: index ",
             index,
             " out of range for tensor of size ",
             tensor_.dims(),
             " at dimension 0");
  }

  // Drop dimension 0; the remaining sizes/strides carry over unchanged.
  std::vector<int64_t> new_sizes;
  std::vector<int64_t> new_strides;

  auto dims = tensor_.dims();
  auto stride = tensor_.strides();

  for (int i = 1; i < dims.size(); ++i) {
    new_sizes.push_back(dims[i]);
    new_strides.push_back(stride[i]);
  }

  // Element offset of the selected slice within the underlying storage.
  int64_t storage_offset = index * stride[0];

  return as_strided(c10::IntArrayRef(new_sizes),
                    c10::IntArrayRef(new_strides),
                    storage_offset);
}

void record_stream(at::Stream s) const;
Expand Down
4 changes: 2 additions & 2 deletions paddle/phi/api/include/compat/ATen/cuda/CUDAContextLight.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ using CUDAContextSparseHandle = phi::sparseHandle_t;
using CUDAContextBlasHandle = phi::blasHandle_t;
using CUDAContextBlasLtHandle = phi::blasLtHandle_t;
using CUDAContextSolverHandle = phi::solverHandle_t;
#else
#elif defined(PADDLE_WITH_CUDA)
using CUDAContextDeviceProp = cudaDeviceProp;
using CUDAContextSparseHandle = cusparseHandle_t;
using CUDAContextBlasHandle = cublasHandle_t;
Expand Down Expand Up @@ -90,6 +90,7 @@ inline int64_t getNumGPUs() { return c10::cuda::device_count(); }
*/
inline bool is_available() { return c10::cuda::device_count() > 0; }

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
CUDAContextDeviceProp* getCurrentDeviceProperties();

int warp_size();
Expand All @@ -115,7 +116,6 @@ size_t getChosenWorkspaceSize();
size_t getCUDABlasLtWorkspaceSize();
void* getCUDABlasLtWorkspace();

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
CUDAContextSolverHandle getCurrentCUDASolverDnHandle();

// Get the CUDA device allocator for the current device.
Expand Down
4 changes: 4 additions & 0 deletions paddle/phi/api/include/compat/ATen/cuda/PhiloxUtils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,12 @@ namespace at::cuda::philox {
// In-kernel call to retrieve philox seed and offset from a PhiloxCudaState
// instance whether that instance was created with graph capture underway or
// not. See Note [CUDA Graph-safe RNG states].
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
__host__ __device__ __forceinline__ std::tuple<uint64_t, uint64_t> unpack(
at::PhiloxCudaState arg) {
#else
inline std::tuple<uint64_t, uint64_t> unpack(at::PhiloxCudaState arg) {
#endif
if (arg.captured_) {
// static_cast avoids "warning: invalid narrowing conversion from "long" to
// "unsigned long".
Expand Down
47 changes: 27 additions & 20 deletions paddle/phi/api/include/compat/ATen/ops/as_strided.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,20 +35,28 @@ inline at::Tensor Tensor::as_strided(
if (!src_tensor) {
PD_THROW("as_strided: tensor must be a DenseTensor");
}
auto new_tensor = std::make_shared<phi::DenseTensor>();
new_tensor->ShareDataWith(*src_tensor);
// Create new meta with desired shape and strides first
std::vector<int64_t> size_vec(size.begin(), size.end());
std::vector<int64_t> stride_vec(stride.begin(), stride.end());
new_tensor->Resize(common::make_ddim(size_vec));
new_tensor->set_strides(common::make_ddim(stride_vec));

// Create new DenseTensor with correct meta, then share data
// We need to create a temporary DenseTensor with the right meta
// because ShareDataWith copies the source meta which we don't want
auto new_tensor = std::make_shared<phi::DenseTensor>();

// First, set up the holder by sharing data (this copies src meta, we'll
// override)
new_tensor->ShareDataWith(*src_tensor);

// Now create the correct meta with new shape/strides
phi::DenseTensorMeta meta(src_tensor->dtype(),
common::make_ddim(size_vec),
common::make_ddim(stride_vec));
// Calculate offset in bytes
int64_t offset = storage_offset.has_value() ? storage_offset.value() : 0;
if (offset != 0) {
auto meta = phi::DenseTensorMeta(new_tensor->meta());
// meta.offset is in bytes; storage_offset is in elements
meta.offset =
static_cast<size_t>(offset) * phi::SizeOf(new_tensor->dtype());
new_tensor->set_meta(meta);
}
meta.offset = src_tensor->meta().offset +
static_cast<size_t>(offset) * phi::SizeOf(src_tensor->dtype());
new_tensor->set_meta(meta);
PaddleTensor result;
result.set_impl(new_tensor);
return Tensor(result);
Expand All @@ -67,16 +75,15 @@ inline const at::Tensor& Tensor::as_strided_(
}
std::vector<int64_t> size_vec(size.begin(), size.end());
std::vector<int64_t> stride_vec(stride.begin(), stride.end());
src_tensor->Resize(common::make_ddim(size_vec));
src_tensor->set_strides(common::make_ddim(stride_vec));
// Use set_meta instead of Resize + set_strides to avoid contiguous check
phi::DenseTensorMeta meta(src_tensor->dtype(),
common::make_ddim(size_vec),
common::make_ddim(stride_vec));
meta.layout = src_tensor->layout();
int64_t offset = storage_offset.has_value() ? storage_offset.value() : 0;
if (offset != 0) {
auto meta = phi::DenseTensorMeta(src_tensor->meta());
// meta.offset is in bytes; storage_offset is in elements
meta.offset =
static_cast<size_t>(offset) * phi::SizeOf(src_tensor->dtype());
src_tensor->set_meta(meta);
}
meta.offset = src_tensor->meta().offset +
static_cast<size_t>(offset) * phi::SizeOf(src_tensor->dtype());
src_tensor->set_meta(meta);
return *this;
}

Expand Down
3 changes: 2 additions & 1 deletion paddle/phi/api/include/compat/ATen/ops/record_stream.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ inline void Tensor::record_stream(cudaStream_t s) const {
"tensor implementation.");
PD_CHECK(dense_tensor->place().GetType() != phi::AllocationType::CPU,
"record_stream is not supported for CPU tensors.");
paddle::memory::RecordStream(dense_tensor->Holder(), s);
paddle::memory::RecordStream(dense_tensor->Holder(),
reinterpret_cast<gpuStream_t>(s));
}
#endif
} // namespace at
4 changes: 0 additions & 4 deletions paddle/phi/api/include/compat/c10/core/TensorOptions.h
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,3 @@ inline std::string toString(const TensorOptions& options) {
namespace at {
using namespace c10; // NOLINT
} // namespace at

namespace torch {
using namespace c10; // NOLINT
} // namespace torch
6 changes: 0 additions & 6 deletions test/cpp/compat/ATen_CUDABlas_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/complex.h"
#include "paddle/phi/common/float16.h"
#include "test/cpp/compat/cuda_test_utils.h"

// Helper: allocate three same-sized device buffers, copy host data in,
// invoke a kernel via |fn|, copy results back, synchronize, then free.
Expand Down Expand Up @@ -73,7 +72,6 @@ class GemmTester {
static double toDouble(T val) { return static_cast<double>(val); }

void Run() {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
std::vector<T> h_a = {T(1), T(3), T(2), T(4)};
std::vector<T> h_b = {T(5), T(7), T(6), T(8)};
std::vector<T> h_c(N * N, T(0));
Expand All @@ -95,7 +93,6 @@ class GemmTester {
// transA='T': C = alpha * A^T * B + beta * C
// A^T = [[1,3],[2,4]], A^T * B = [[26,30],[38,44]]
void RunTransA() {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
std::vector<T> h_a = {T(1), T(3), T(2), T(4)};
std::vector<T> h_b = {T(5), T(7), T(6), T(8)};
std::vector<T> h_c(N * N, T(0));
Expand Down Expand Up @@ -136,7 +133,6 @@ TEST(CUDABlasTest, GemmFloatTransA) {
}

TEST(CUDABlasTest, GemmFloatTransALowercase) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
constexpr int64_t N = 2;

std::vector<float> h_a = {1.F, 3.F, 2.F, 4.F};
Expand Down Expand Up @@ -181,7 +177,6 @@ TEST(CUDABlasTest, GemmBFloat16) {
// A stored col-major: col0={1+i,2+2i}, col1={3+3i,4+4i}
// A^H stored col-major: col0={1-i,3-3i}, col1={2-2i,4-4i}
TEST(CUDABlasTest, GemmComplexFloatConjTrans) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
constexpr int64_t N = 2;
using T = c10::complex<float>;

Expand Down Expand Up @@ -209,7 +204,6 @@ TEST(CUDABlasTest, GemmComplexFloatConjTrans) {

// Same as above but uses lowercase 'c'/'n' to exercise that switch-case branch.
TEST(CUDABlasTest, GemmComplexDoubleConjTransLower) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
constexpr int64_t N = 2;
using T = c10::complex<double>;

Expand Down
Loading
Loading