Skip to content
Merged
Show file tree
Hide file tree
Changes from 28 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
a45e7e0
[Cpp API Compatibility] Delete useless code and rename test files
youge325 Apr 3, 2026
7cee6e8
fix
youge325 Apr 3, 2026
3dc2c05
Revert "fix"
youge325 Apr 5, 2026
53acaba
fix
youge325 Apr 5, 2026
82768bc
Merge branch 'develop' into cNorm
youge325 Apr 5, 2026
a0be3de
complement CPU compiling branch
youge325 Apr 5, 2026
88c0a55
try to compile test files with cpu
youge325 Apr 5, 2026
90a73da
fix
youge325 Apr 5, 2026
aaac6b3
fix
youge325 Apr 5, 2026
d825db2
fix
youge325 Apr 5, 2026
d6e9be4
fix xpu
youge325 Apr 5, 2026
ce84dbe
fix Mac-CPU TensorOptions compilation error
youge325 Apr 5, 2026
003897d
fix torch_compat.h: add missing DispatchKey.h include and fix namespace
youge325 Apr 5, 2026
f8e3a74
fix dcu build with symbol visibility hidden
youge325 Apr 5, 2026
85ae6b8
Revert "fix xpu"
youge325 Apr 5, 2026
7c0fb0f
Fix ATen compat operator[] to return view instead of copy
youge325 Apr 5, 2026
f9a4cb5
Fix as_strided to work with non-contiguous tensors
youge325 Apr 5, 2026
3d9f777
fix dcu again
youge325 Apr 5, 2026
c661ed6
Revert "fix dcu build with symbol visibility hidden"
youge325 Apr 5, 2026
4447f88
Revert "fix dcu again"
youge325 Apr 6, 2026
ad97976
move dcu related tests to nv_test
youge325 Apr 6, 2026
ac7bab4
skip xpu test when FLAGS_use_stride_kernel is disabled
youge325 Apr 6, 2026
283802c
fix xpu build
youge325 Apr 6, 2026
253d526
fix dcu build, which will be removed due to deepep deprecated API
youge325 Apr 6, 2026
89106a5
fix cpu build due to google test version too old
youge325 Apr 6, 2026
1fcc43d
skip dcu for all tests
youge325 Apr 6, 2026
ea0fa1f
try to compile all tests in cpu environment, except dcu environment, …
youge325 Apr 6, 2026
52662f3
fix all cpu build, and move cuda only test to nv_test
youge325 Apr 6, 2026
934b82a
Merge branch 'develop' into cNorm
SigureMo Apr 6, 2026
d283912
replace `torch::DispatchKey` with `c10::DispatchKey`
SigureMo Apr 6, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion paddle/fluid/pybind/torch_compat.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ OperationInvoker::get_op_with_args(const std::string& qualified_name,
"Operator '%s' not found in the registry", qualified_name.c_str()));
}

auto impl_it = op->implementations.find(DispatchKey::CPU);
auto impl_it = op->implementations.find(c10::DispatchKey::CPU);
if (impl_it == op->implementations.end()) {
PADDLE_THROW(common::errors::NotFound(
"No CPU implementation found for operator '%s'",
Expand Down
47 changes: 41 additions & 6 deletions paddle/phi/api/include/compat/ATen/core/TensorBody.h
Original file line number Diff line number Diff line change
Expand Up @@ -678,12 +678,47 @@ class Tensor : public TensorBase {
// ATen-compatibility alias: forwards to abs_() and returns its result.
at::Tensor& absolute_() const { return abs_(); }

Tensor operator[](int64_t index) const {
  // Index along dimension 0, returning a VIEW that shares storage with the
  // original tensor (so in-place ops such as fill_ modify the original).
  // For a 1-D tensor this yields a 0-dim scalar view; for an N-D tensor,
  // a view of the (N-1)-D slice at `index`.
  //
  // NOTE: the stale slice-based early return (which produced a copy and made
  // the code below unreachable) has been removed; only the as_strided-based
  // view implementation remains.
  if (tensor_.numel() == 0) {
    PD_THROW("operator[]: cannot index empty tensor");
  }

  // Normalize a negative index (Python-style indexing from the end).
  if (index < 0) {
    index += tensor_.dims()[0];
  }

  // Bounds check after normalization.
  if (index < 0 || index >= tensor_.dims()[0]) {
    PD_THROW("operator[]: index ",
             index,
             " out of range for tensor of size ",
             tensor_.dims(),
             " at dimension 0");
  }

  // Drop dimension 0; the remaining sizes/strides carry over unchanged.
  std::vector<int64_t> new_sizes;
  std::vector<int64_t> new_strides;

  auto dims = tensor_.dims();
  auto stride = tensor_.strides();

  for (int i = 1; i < dims.size(); ++i) {
    new_sizes.push_back(dims[i]);
    new_strides.push_back(stride[i]);
  }

  // Element offset of the selected slice within the underlying storage.
  int64_t storage_offset = index * stride[0];

  return as_strided(c10::IntArrayRef(new_sizes),
                    c10::IntArrayRef(new_strides),
                    storage_offset);
}

void record_stream(at::Stream s) const;
Expand Down
4 changes: 2 additions & 2 deletions paddle/phi/api/include/compat/ATen/cuda/CUDAContextLight.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ using CUDAContextSparseHandle = phi::sparseHandle_t;
using CUDAContextBlasHandle = phi::blasHandle_t;
using CUDAContextBlasLtHandle = phi::blasLtHandle_t;
using CUDAContextSolverHandle = phi::solverHandle_t;
#else
#elif defined(PADDLE_WITH_CUDA)
using CUDAContextDeviceProp = cudaDeviceProp;
using CUDAContextSparseHandle = cusparseHandle_t;
using CUDAContextBlasHandle = cublasHandle_t;
Expand Down Expand Up @@ -90,6 +90,7 @@ inline int64_t getNumGPUs() { return c10::cuda::device_count(); }
*/
inline bool is_available() { return c10::cuda::device_count() > 0; }

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
CUDAContextDeviceProp* getCurrentDeviceProperties();

int warp_size();
Expand All @@ -115,7 +116,6 @@ size_t getChosenWorkspaceSize();
size_t getCUDABlasLtWorkspaceSize();
void* getCUDABlasLtWorkspace();

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
CUDAContextSolverHandle getCurrentCUDASolverDnHandle();

// Get the CUDA device allocator for the current device.
Expand Down
4 changes: 4 additions & 0 deletions paddle/phi/api/include/compat/ATen/cuda/PhiloxUtils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,12 @@ namespace at::cuda::philox {
// In-kernel call to retrieve philox seed and offset from a PhiloxCudaState
// instance whether that instance was created with graph capture underway or
// not. See Note [CUDA Graph-safe RNG states].
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
__host__ __device__ __forceinline__ std::tuple<uint64_t, uint64_t> unpack(
at::PhiloxCudaState arg) {
#else
inline std::tuple<uint64_t, uint64_t> unpack(at::PhiloxCudaState arg) {
#endif
if (arg.captured_) {
// static_cast avoids "warning: invalid narrowing conversion from "long" to
// "unsigned long".
Expand Down
47 changes: 27 additions & 20 deletions paddle/phi/api/include/compat/ATen/ops/as_strided.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,20 +35,28 @@ inline at::Tensor Tensor::as_strided(
if (!src_tensor) {
PD_THROW("as_strided: tensor must be a DenseTensor");
}
auto new_tensor = std::make_shared<phi::DenseTensor>();
new_tensor->ShareDataWith(*src_tensor);
// Create new meta with desired shape and strides first
std::vector<int64_t> size_vec(size.begin(), size.end());
std::vector<int64_t> stride_vec(stride.begin(), stride.end());
new_tensor->Resize(common::make_ddim(size_vec));
new_tensor->set_strides(common::make_ddim(stride_vec));

// Create new DenseTensor with correct meta, then share data
// We need to create a temporary DenseTensor with the right meta
// because ShareDataWith copies the source meta which we don't want
auto new_tensor = std::make_shared<phi::DenseTensor>();

// First, set up the holder by sharing data (this copies src meta, we'll
// override)
new_tensor->ShareDataWith(*src_tensor);

// Now create the correct meta with new shape/strides
phi::DenseTensorMeta meta(src_tensor->dtype(),
common::make_ddim(size_vec),
common::make_ddim(stride_vec));
// Calculate offset in bytes
int64_t offset = storage_offset.has_value() ? storage_offset.value() : 0;
if (offset != 0) {
auto meta = phi::DenseTensorMeta(new_tensor->meta());
// meta.offset is in bytes; storage_offset is in elements
meta.offset =
static_cast<size_t>(offset) * phi::SizeOf(new_tensor->dtype());
new_tensor->set_meta(meta);
}
meta.offset = src_tensor->meta().offset +
static_cast<size_t>(offset) * phi::SizeOf(src_tensor->dtype());
new_tensor->set_meta(meta);
PaddleTensor result;
result.set_impl(new_tensor);
return Tensor(result);
Expand All @@ -67,16 +75,15 @@ inline const at::Tensor& Tensor::as_strided_(
}
std::vector<int64_t> size_vec(size.begin(), size.end());
std::vector<int64_t> stride_vec(stride.begin(), stride.end());
src_tensor->Resize(common::make_ddim(size_vec));
src_tensor->set_strides(common::make_ddim(stride_vec));
// Use set_meta instead of Resize + set_strides to avoid contiguous check
phi::DenseTensorMeta meta(src_tensor->dtype(),
common::make_ddim(size_vec),
common::make_ddim(stride_vec));
meta.layout = src_tensor->layout();
int64_t offset = storage_offset.has_value() ? storage_offset.value() : 0;
if (offset != 0) {
auto meta = phi::DenseTensorMeta(src_tensor->meta());
// meta.offset is in bytes; storage_offset is in elements
meta.offset =
static_cast<size_t>(offset) * phi::SizeOf(src_tensor->dtype());
src_tensor->set_meta(meta);
}
meta.offset = src_tensor->meta().offset +
static_cast<size_t>(offset) * phi::SizeOf(src_tensor->dtype());
src_tensor->set_meta(meta);
return *this;
}

Expand Down
3 changes: 2 additions & 1 deletion paddle/phi/api/include/compat/ATen/ops/record_stream.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ inline void Tensor::record_stream(cudaStream_t s) const {
"tensor implementation.");
PD_CHECK(dense_tensor->place().GetType() != phi::AllocationType::CPU,
"record_stream is not supported for CPU tensors.");
paddle::memory::RecordStream(dense_tensor->Holder(), s);
paddle::memory::RecordStream(dense_tensor->Holder(),
reinterpret_cast<gpuStream_t>(s));
}
#endif
} // namespace at
4 changes: 0 additions & 4 deletions paddle/phi/api/include/compat/c10/core/TensorOptions.h
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,3 @@ inline std::string toString(const TensorOptions& options) {
namespace at {
using namespace c10; // NOLINT
} // namespace at

namespace torch {
using namespace c10; // NOLINT
} // namespace torch
6 changes: 0 additions & 6 deletions test/cpp/compat/ATen_CUDABlas_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/complex.h"
#include "paddle/phi/common/float16.h"
#include "test/cpp/compat/cuda_test_utils.h"

// Helper: allocate three same-sized device buffers, copy host data in,
// invoke a kernel via |fn|, copy results back, synchronize, then free.
Expand Down Expand Up @@ -73,7 +72,6 @@ class GemmTester {
static double toDouble(T val) { return static_cast<double>(val); }

void Run() {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
std::vector<T> h_a = {T(1), T(3), T(2), T(4)};
std::vector<T> h_b = {T(5), T(7), T(6), T(8)};
std::vector<T> h_c(N * N, T(0));
Expand All @@ -95,7 +93,6 @@ class GemmTester {
// transA='T': C = alpha * A^T * B + beta * C
// A^T = [[1,3],[2,4]], A^T * B = [[26,30],[38,44]]
void RunTransA() {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
std::vector<T> h_a = {T(1), T(3), T(2), T(4)};
std::vector<T> h_b = {T(5), T(7), T(6), T(8)};
std::vector<T> h_c(N * N, T(0));
Expand Down Expand Up @@ -136,7 +133,6 @@ TEST(CUDABlasTest, GemmFloatTransA) {
}

TEST(CUDABlasTest, GemmFloatTransALowercase) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
constexpr int64_t N = 2;

std::vector<float> h_a = {1.F, 3.F, 2.F, 4.F};
Expand Down Expand Up @@ -181,7 +177,6 @@ TEST(CUDABlasTest, GemmBFloat16) {
// A stored col-major: col0={1+i,2+2i}, col1={3+3i,4+4i}
// A^H stored col-major: col0={1-i,3-3i}, col1={2-2i,4-4i}
TEST(CUDABlasTest, GemmComplexFloatConjTrans) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
constexpr int64_t N = 2;
using T = c10::complex<float>;

Expand Down Expand Up @@ -209,7 +204,6 @@ TEST(CUDABlasTest, GemmComplexFloatConjTrans) {

// Same as above but uses lowercase 'c'/'n' to exercise that switch-case branch.
TEST(CUDABlasTest, GemmComplexDoubleConjTransLower) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
constexpr int64_t N = 2;
using T = c10::complex<double>;

Expand Down
Loading
Loading