Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
a45e7e0
[Cpp API Compatibility] Delete useless code and rename test files
youge325 Apr 3, 2026
7cee6e8
fix
youge325 Apr 3, 2026
3dc2c05
Revert "fix"
youge325 Apr 5, 2026
53acaba
fix
youge325 Apr 5, 2026
82768bc
Merge branch 'develop' into cNorm
youge325 Apr 5, 2026
a0be3de
complement CPU compiling branch
youge325 Apr 5, 2026
88c0a55
try to compile test files with cpu
youge325 Apr 5, 2026
90a73da
fix
youge325 Apr 5, 2026
aaac6b3
fix
youge325 Apr 5, 2026
d825db2
fix
youge325 Apr 5, 2026
d6e9be4
fix xpu
youge325 Apr 5, 2026
ce84dbe
fix Mac-CPU TensorOptions compilation error
youge325 Apr 5, 2026
003897d
fix torch_compat.h: add missing DispatchKey.h include and fix namespace
youge325 Apr 5, 2026
f8e3a74
fix dcu build with symbol visibility hidden
youge325 Apr 5, 2026
85ae6b8
Revert "fix xpu"
youge325 Apr 5, 2026
7c0fb0f
Fix ATen compat operator[] to return view instead of copy
youge325 Apr 5, 2026
f9a4cb5
Fix as_strided to work with non-contiguous tensors
youge325 Apr 5, 2026
3d9f777
fix dcu again
youge325 Apr 5, 2026
c661ed6
Revert "fix dcu build with symbol visibility hidden"
youge325 Apr 5, 2026
4447f88
Revert "fix dcu again"
youge325 Apr 6, 2026
ad97976
move dcu related tests to nv_test
youge325 Apr 6, 2026
ac7bab4
skip xpu test when FLAGS_use_stride_kernel is disabled
youge325 Apr 6, 2026
283802c
fix xpu build
youge325 Apr 6, 2026
253d526
fix dcu build, which will be removed due to deepep deprecated API
youge325 Apr 6, 2026
89106a5
fix cpu build due to google test version too old
youge325 Apr 6, 2026
1fcc43d
skip dcu for all tests
youge325 Apr 6, 2026
ea0fa1f
try to compile all tests in cpu environment, except dcu environment, …
youge325 Apr 6, 2026
52662f3
fix all cpu build, and move cuda only test to nv_test
youge325 Apr 6, 2026
934b82a
Merge branch 'develop' into cNorm
SigureMo Apr 6, 2026
d283912
replace `torch::DispatchKey` with `c10::DispatchKey`
SigureMo Apr 6, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ third_party/
bazel-*
.humanize
.codex
.paddle-agent

build_*
# clion workspace.
Expand Down
47 changes: 41 additions & 6 deletions paddle/phi/api/include/compat/ATen/core/TensorBody.h
Original file line number Diff line number Diff line change
Expand Up @@ -678,12 +678,47 @@ class Tensor : public TensorBase {
at::Tensor& absolute_() const { return abs_(); }

Tensor operator[](int64_t index) const {
return paddle::experimental::slice(tensor_,
/*axes=*/{0},
/*starts=*/{index},
/*ends=*/{index + 1},
/*infer_flags=*/{1},
/*decrease_axis=*/{0});
// Use as_strided to create a view (shares storage with original tensor)
// This allows fill_ to modify the original tensor
int64_t numel = tensor_.numel();
if (numel == 0) {
PD_THROW("operator[]: cannot index empty tensor");
}

// Handle negative index
if (index < 0) {
index += tensor_.dims()[0];
}

// Check bounds
if (index < 0 || index >= tensor_.dims()[0]) {
PD_THROW("operator[]: index ",
index,
" out of range for tensor of size ",
tensor_.dims(),
" at dimension 0");
}

// For 1D tensor: create a scalar view (0-dim tensor) with proper offset
// For multi-D tensor: create a view of the row at index
std::vector<int64_t> new_sizes;
std::vector<int64_t> new_strides;

auto dims = tensor_.dims();
auto stride = tensor_.strides();

// Skip the first dimension (dim 0)
for (int i = 1; i < dims.size(); ++i) {
new_sizes.push_back(dims[i]);
new_strides.push_back(stride[i]);
}

// Calculate storage offset
int64_t storage_offset = index * stride[0];

return as_strided(c10::IntArrayRef(new_sizes),
c10::IntArrayRef(new_strides),
storage_offset);
}

void record_stream(at::Stream s) const;
Expand Down
4 changes: 2 additions & 2 deletions paddle/phi/api/include/compat/ATen/cuda/CUDAContextLight.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ using CUDAContextSparseHandle = phi::sparseHandle_t;
using CUDAContextBlasHandle = phi::blasHandle_t;
using CUDAContextBlasLtHandle = phi::blasLtHandle_t;
using CUDAContextSolverHandle = phi::solverHandle_t;
#else
#elif defined(PADDLE_WITH_CUDA)
using CUDAContextDeviceProp = cudaDeviceProp;
using CUDAContextSparseHandle = cusparseHandle_t;
using CUDAContextBlasHandle = cublasHandle_t;
Expand Down Expand Up @@ -90,6 +90,7 @@ inline int64_t getNumGPUs() { return c10::cuda::device_count(); }
*/
inline bool is_available() { return c10::cuda::device_count() > 0; }

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
CUDAContextDeviceProp* getCurrentDeviceProperties();

int warp_size();
Expand All @@ -115,7 +116,6 @@ size_t getChosenWorkspaceSize();
size_t getCUDABlasLtWorkspaceSize();
void* getCUDABlasLtWorkspace();

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
CUDAContextSolverHandle getCurrentCUDASolverDnHandle();

// Get the CUDA device allocator for the current device.
Expand Down
4 changes: 4 additions & 0 deletions paddle/phi/api/include/compat/ATen/cuda/PhiloxUtils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,12 @@ namespace at::cuda::philox {
// In-kernel call to retrieve philox seed and offset from a PhiloxCudaState
// instance whether that instance was created with graph capture underway or
// not. See Note [CUDA Graph-safe RNG states].
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
__host__ __device__ __forceinline__ std::tuple<uint64_t, uint64_t> unpack(
at::PhiloxCudaState arg) {
#else
inline std::tuple<uint64_t, uint64_t> unpack(at::PhiloxCudaState arg) {
#endif
if (arg.captured_) {
// static_cast avoids "warning: invalid narrowing conversion from "long" to
// "unsigned long".
Expand Down
47 changes: 27 additions & 20 deletions paddle/phi/api/include/compat/ATen/ops/as_strided.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,20 +35,28 @@ inline at::Tensor Tensor::as_strided(
if (!src_tensor) {
PD_THROW("as_strided: tensor must be a DenseTensor");
}
auto new_tensor = std::make_shared<phi::DenseTensor>();
new_tensor->ShareDataWith(*src_tensor);
// Create new meta with desired shape and strides first
std::vector<int64_t> size_vec(size.begin(), size.end());
std::vector<int64_t> stride_vec(stride.begin(), stride.end());
new_tensor->Resize(common::make_ddim(size_vec));
new_tensor->set_strides(common::make_ddim(stride_vec));

// Create new DenseTensor with correct meta, then share data
// We need to create a temporary DenseTensor with the right meta
// because ShareDataWith copies the source meta which we don't want
auto new_tensor = std::make_shared<phi::DenseTensor>();

// First, set up the holder by sharing data (this copies src meta, we'll
// override)
new_tensor->ShareDataWith(*src_tensor);

// Now create the correct meta with new shape/strides
phi::DenseTensorMeta meta(src_tensor->dtype(),
common::make_ddim(size_vec),
common::make_ddim(stride_vec));
// Calculate offset in bytes
int64_t offset = storage_offset.has_value() ? storage_offset.value() : 0;
if (offset != 0) {
auto meta = phi::DenseTensorMeta(new_tensor->meta());
// meta.offset is in bytes; storage_offset is in elements
meta.offset =
static_cast<size_t>(offset) * phi::SizeOf(new_tensor->dtype());
new_tensor->set_meta(meta);
}
meta.offset = src_tensor->meta().offset +
static_cast<size_t>(offset) * phi::SizeOf(src_tensor->dtype());
new_tensor->set_meta(meta);
PaddleTensor result;
result.set_impl(new_tensor);
return Tensor(result);
Expand All @@ -67,16 +75,15 @@ inline const at::Tensor& Tensor::as_strided_(
}
std::vector<int64_t> size_vec(size.begin(), size.end());
std::vector<int64_t> stride_vec(stride.begin(), stride.end());
src_tensor->Resize(common::make_ddim(size_vec));
src_tensor->set_strides(common::make_ddim(stride_vec));
// Use set_meta instead of Resize + set_strides to avoid contiguous check
phi::DenseTensorMeta meta(src_tensor->dtype(),
common::make_ddim(size_vec),
common::make_ddim(stride_vec));
meta.layout = src_tensor->layout();
int64_t offset = storage_offset.has_value() ? storage_offset.value() : 0;
if (offset != 0) {
auto meta = phi::DenseTensorMeta(src_tensor->meta());
// meta.offset is in bytes; storage_offset is in elements
meta.offset =
static_cast<size_t>(offset) * phi::SizeOf(src_tensor->dtype());
src_tensor->set_meta(meta);
}
meta.offset = src_tensor->meta().offset +
static_cast<size_t>(offset) * phi::SizeOf(src_tensor->dtype());
src_tensor->set_meta(meta);
return *this;
}

Expand Down
3 changes: 2 additions & 1 deletion paddle/phi/api/include/compat/ATen/ops/record_stream.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ inline void Tensor::record_stream(cudaStream_t s) const {
"tensor implementation.");
PD_CHECK(dense_tensor->place().GetType() != phi::AllocationType::CPU,
"record_stream is not supported for CPU tensors.");
paddle::memory::RecordStream(dense_tensor->Holder(), s);
paddle::memory::RecordStream(dense_tensor->Holder(),
reinterpret_cast<gpuStream_t>(s));
}
#endif
} // namespace at
4 changes: 0 additions & 4 deletions paddle/phi/api/include/compat/c10/core/TensorOptions.h
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,3 @@ inline std::string toString(const TensorOptions& options) {
namespace at {
using namespace c10; // NOLINT
} // namespace at

namespace torch {
using namespace c10; // NOLINT
} // namespace torch
6 changes: 0 additions & 6 deletions test/cpp/compat/ATen_CUDABlas_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/complex.h"
#include "paddle/phi/common/float16.h"
#include "test/cpp/compat/cuda_test_utils.h"

// Helper: allocate three same-sized device buffers, copy host data in,
// invoke a kernel via |fn|, copy results back, synchronize, then free.
Expand Down Expand Up @@ -73,7 +72,6 @@ class GemmTester {
static double toDouble(T val) { return static_cast<double>(val); }

void Run() {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
std::vector<T> h_a = {T(1), T(3), T(2), T(4)};
std::vector<T> h_b = {T(5), T(7), T(6), T(8)};
std::vector<T> h_c(N * N, T(0));
Expand All @@ -95,7 +93,6 @@ class GemmTester {
// transA='T': C = alpha * A^T * B + beta * C
// A^T = [[1,3],[2,4]], A^T * B = [[26,30],[38,44]]
void RunTransA() {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
std::vector<T> h_a = {T(1), T(3), T(2), T(4)};
std::vector<T> h_b = {T(5), T(7), T(6), T(8)};
std::vector<T> h_c(N * N, T(0));
Expand Down Expand Up @@ -136,7 +133,6 @@ TEST(CUDABlasTest, GemmFloatTransA) {
}

TEST(CUDABlasTest, GemmFloatTransALowercase) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
constexpr int64_t N = 2;

std::vector<float> h_a = {1.F, 3.F, 2.F, 4.F};
Expand Down Expand Up @@ -181,7 +177,6 @@ TEST(CUDABlasTest, GemmBFloat16) {
// A stored col-major: col0={1+i,2+2i}, col1={3+3i,4+4i}
// A^H stored col-major: col0={1-i,3-3i}, col1={2-2i,4-4i}
TEST(CUDABlasTest, GemmComplexFloatConjTrans) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
constexpr int64_t N = 2;
using T = c10::complex<float>;

Expand Down Expand Up @@ -209,7 +204,6 @@ TEST(CUDABlasTest, GemmComplexFloatConjTrans) {

// Same as above but uses lowercase 'c'/'n' to exercise that switch-case branch.
TEST(CUDABlasTest, GemmComplexDoubleConjTransLower) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
constexpr int64_t N = 2;
using T = c10::complex<double>;

Expand Down
46 changes: 27 additions & 19 deletions test/cpp/compat/ATen_CUDAContext_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,51 +12,62 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)

#include <ATen/cuda/CUDAContextLight.h>
#include <c10/core/Allocator.h>
#include <c10/cuda/CUDAFunctions.h>
#include <c10/cuda/CUDAStream.h>

#include "gtest/gtest.h"

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include <c10/cuda/CUDAStream.h>
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "test/cpp/compat/cuda_test_utils.h"
#endif

// ---------------------------------------------------------------------------
// CUDAFunctions.h — covers the 2 missing lines:
// c10::cuda::device_synchronize() and c10::cuda::stream_synchronize()
// ---------------------------------------------------------------------------

TEST(CUDAFunctionsTest, DeviceSynchronize) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// Exercises the PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()) branch
ASSERT_NO_THROW(c10::cuda::device_synchronize());
#else
// In CPU-only builds, device_synchronize throws
ASSERT_THROW(c10::cuda::device_synchronize(), std::exception);
#endif
}

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST(CUDAFunctionsTest, StreamSynchronize) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
// Exercises phi::backends::gpu::GpuStreamSync()
auto stream = c10::cuda::getCurrentCUDAStream();
ASSERT_NO_THROW(c10::cuda::stream_synchronize(stream));
}
#endif

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST(CUDAFunctionsTest, AtNamespaceAliases) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
// Exercises the using aliases in at::cuda namespace
ASSERT_NO_THROW(at::cuda::device_synchronize());
auto stream = c10::cuda::getCurrentCUDAStream();
Comment on lines 31 to 53
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这里把运行时 guard 一起删掉后,#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) 只保证“编译时启用了 GPU”,并不保证“执行环境一定有可用设备 / runtime”。

像下面这些调用:getCurrentCUDAStream()getNumGPUs()getCurrentDeviceProperties(),在无卡或驱动不可用的环境里都可能直接抛错;而原来的 cuda_test_utils.h 实际上同时检查了 device_count()cudaFree/hipFree 是否可用。

建议保留这层运行时判断,或者统一改成 if (!at::cuda::is_available()) { GTEST_SKIP() << ...; }。否则这批 compat tests 会从“可跳过”变成“硬失败”。

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@youge325 话说这个当初为啥要加,现在又为啥要删?是因为只在 CUDA 上跑吗?其实我觉得如果可以的话,我还是希望这些单测能测就尽可能在 CPU 上测一测的,当初 CMakeListsWITH_GPU 只是早期希望快速跑通而已

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

当初用 Agent 加测试提高 coverage rate 的时候自动加的,按我的理解,如果能编译成功的话,那么测试也不能随便就 skip,所以现在就删了。

有些单测也确实可以在CPU上跑,条件编译一下测试用例就行,比如 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP), 就 ASSERT_NO_THROW#elseASSERT_THROW,我看看有哪些测试用例可以这样改

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ShigureNyako 帮我看看 test/cpp/compat/CMakeLists.txt 是否可以把 if(WITH_GPU) 删掉

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

刚刚 auth 过期了,@ShigureNyako 看看呢!

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ShigureNyako 请给我一个 GPU build 但运行时无可用 device/runtime 的例子,在 CI 当中并没有出现这种情况, GPU build 就是为了在有可用 device/runtime 的场景中运行,我原来的代码写错了所以现在删掉,你可能倒果为因了

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

请给我一个 GPU build 但运行时无可用 device/runtime 的例子,在 CI 当中并没有出现这种情况, GPU build 就是为了在有可用 device/runtime 的场景中运行,我原来的代码写错了所以现在删掉,你可能倒果为因了

你这个提醒是对的,我这里把表述收敛一下:就这条 PR 当前的 CI 证据来看,我手头没有一个已经实际打出来的 “GPU build 但运行时无 device/runtime” 失败样例。 现在能直接看到的红灯,还是 Coverage 那条删测校验,以及 Linux-IXUCA 的外部 patch apply 失败。

我这边现在更准确的关注点,其实是另一层:不要因为补了部分 CPU-only compiling branch,就直接推导出可以把整块 if(WITH_GPU) 去掉。

原因是当前这批 test 里仍然混着不少明确的 GPU-only case,例如:

  • ATen_CUDABlas_test.cc 直接依赖 cudaMalloc/cudaMemcpy/cudaDeviceSynchronize
  • ATen_cuda_test.cc 直接测试 Tensor::cuda()
  • ATen_record_stream_test.cc 现在在 fixture SetUp() 里也还是会创建 cuda_tensor

所以我认同你说的:“当前 CI 没有证明 runtime 不可用场景会打到这里”。但从代码拆分角度看,我还是倾向于把结论限定成:

  • 这次 a0be3deeATen_CUDAContext_test.cc / c10_Stream_test.cc / c10_Event_test.cc 这类补 CPU compiling branch 的方向是对的;
  • 但如果后面要把更多 compat tests 真正下沉到 CPU 路径,还是得先把 CPU-capable case 和 GPU-only case 拆开,而不是现在就整体删 if(WITH_GPU)

我会按这个收敛后的判断继续看,不再把“当前 CI 已经证明有 runtime availability 问题”当成论据本身。

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ShigureNyako 如果从严谨性上来看,确实会有编译期 WITH_GPU 但是实际 runtime 没有的场景,比如我们 CI 里所有机器都是 CPU 编包的,即便是 Coverage build 也是,但是虽然 Coverage build 本身在 CPU 上,但其产物一定会在后续 runtime 有 GPU 的 Coverage test 上运行,也就是说目前 CI 上基本上有一个假设就是,编译时带了 GPU,那运行时就一定带 GPU,这其实不应该是考虑的重点

当初早期 C++ 兼容性第一个 PR #74402 为赶 3.2 发版在拉分支前夕合入临时跳过了一些内容,应该是 defd6a3 跳过了一些新硬件(根据上一个 commit 99f7b88 的报错和该 commit message 看起来是 XPU 和 NPU,但日志已经过期看不到了)上的运行时问题,请着眼于该场景分析下具体可能存在的问题,以及是否有推荐的解决方案

注意,我们即便不能推广到所有硬件(比如 XPU 和 NPU),即便跳过部分硬件,也比直接放弃 CPU 测试更好,我们的目标不是一次修复所有硬件上的问题(当然可以更好),只是希望在确保 CPU 上能够有所监控

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

请着眼于该场景分析下具体可能存在的问题,以及是否有推荐的解决方案

我按你给的线索回看了一下 PR #74402 里的相关 commit,能从代码里直接确认到的点大概是这样:

  1. 99f7b88d321d68d7fe8597c4d5bd31f7b424272b 本身只是给 ivalue.h 里的 generic_to 补前置声明,主要是 warning / 编译告警层面的修正,和 XPU/NPU 场景本身关系不大。
  2. 真正直接指向“新硬件 build 问题”的是 defd6a3cc60ca6bc3a2df5f58f2043ee0616a48c (fix build on xpu&npu),它至少做了三件很关键的事:
    • test/cpp/compat/CMakeLists.txt 里把 compat_basic_testtorch_library_test 从外层挪进了 if(WITH_GPU)
    • paddle/utils/pybind.h 里把 <ATen/core/TensorBody.h> 包进了 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    • 同一次 commit 还删除了旧的 torch_like_api/ATen/core/TensorBase.h

所以如果按这个历史上下文收敛,我现在更倾向于认为:当时真正要绕开的,不是单纯“这些测试在 runtime 上能不能 skip”,而是“CPU-capable 的测试文件里混入了 CUDA-only 头文件 / 链接依赖,导致 XPU/NPU build 只能整体退回到 WITH_GPU 里面”。

也就是说,这个场景下更具体的问题其实是:

  • 一部分 case 本身是 CPU 可测的;
  • 但它们和真正依赖 CUDA runtime / CUDA headers / CUDA libs 的 case 混在一个 target / 一个文件里;
  • 结果为了规避少数异构硬件问题,只能把整文件都锁回 WITH_GPU,CPU 监控也一起没了。

如果按这个目标来推,我这边比较推荐的解法会是分层处理,而不是一步到位追求“所有后端都修完”:

  1. 先把 CPU-capable case 拆出来恢复 CPU 监控

    • 这类最明显的还是 torch_library_testcompat_basic_test 里的 CPU 段,以及 ATen_toString_test 的 CPU 段;
    • 这部分更适合独立成 cc_test / CPU target,而不是继续和 GPU-only case 绑在一起。
  2. 把“CPU 可编译,但 GPU 语义不同”的 case 继续保留成条件分支

    • 比如这次 ATen_CUDAContext_test.cc 新补的 ASSERT_THROW / ASSERT_FALSE 这种 CPU-only branch,我觉得方向是对的;
    • 它解决的是“同一个 API 在 CPU-only build 下的 contract”。
  3. 真正依赖 CUDA runtime 的 case 继续留在 if(WITH_GPU) / nv_test

    • 例如 ATen_CUDABlas_test.ccATen_cuda_test.ccATen_record_stream_test.cc、GPU pointer 的 ATen_from_blob_test.cc
    • 这些 case 不适合为了扩 CPU 覆盖而整体挪到 CPU 路径。
  4. XPU / NPU 上如果仍有个别 API 不通,优先做 backend-specific 规避,而不是整体放弃 CPU 测试

    • 也就是尽量把“跳过部分硬件”收敛到具体 case / 具体 backend,避免再次把整批 CPU-capable 监控一起关掉。

从这个角度看,我会把当前 review 立场进一步收敛成:支持把 CPU-capable case 往 CPU 流水线拆,但不支持在这一步直接整体移除 if(WITH_GPU),也不支持把仍然混有 GPU-only 依赖的 target 直接默认成 CPU 可跑。

这样比较贴近当年 defd6a3 想绕开的具体问题,也更符合“先把 CPU 上能监控的部分恢复起来,再逐步收敛 XPU/NPU 等异构硬件差异”的目标。

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

我看了最新提交 d6e9be4 (fix xpu),能理解你这里是在为 XPU build 继续收敛 CMakeLists,但我这里还是想把结论再收窄一下:我不太建议把这批 case 继续整体收回 nv_test

这次从 CPU 路径挪回 GPU 路径的文件里,至少有几类情况其实不太一样:

  1. 明显还是 CPU-capable 的 case

    • ATen_all_test.cc
    • ATen_as_strided_test.cc
    • ATen_index_test.cc

    这几个文件我回看了下,正文基本都是纯 CPU tensor / indexing / view 语义,没有实际用到 CUDA runtime。这里更像是“文件级依赖或目标划分”问题,而不是 test case 本身只能放在 GPU 路径里。

  2. 正文基本是 CPU case,但文件头带了无关 CUDA include

    • ATen_transpose_test.cc
    • ATen_viewAs_test.cc

    这两个文件目前也没有真正用到 c10::cuda::* / at::cuda::* 逻辑,感觉更像是可以先把无关的 CUDA 头删掉,再继续留在 CPU 路径监控。

  3. CPU / GPU case 混在同一个文件里,适合继续拆分

    • ATen_basic_test.cc
    • ATen_from_blob_test.cc

    这类我能理解为什么会在 XPU 上出问题,但如果问题来源是同文件里混有 GPU 段,我还是更倾向于:把 CPU-only 部分和 GPU-only 部分拆开,而不是把整文件都回退到 nv_test

所以从 review 角度看,我现在的判断还是一致的:如果目标是“先把 CPU 上能监控的部分保住”,那优先级应该是清理无关 CUDA include / 拆分混合文件,而不是把一批本来可以在 CPU 上监控的 compat tests 再收回 GPU-only 路径。

也就是说,这个 fix xpu 方向我理解,但我这边还不能把它当成最终解法;更理想的收敛方式还是尽量把 CPU-capable case 留在 cc_test,只把确实有 CUDA 依赖的部分留在 nv_test

ASSERT_NO_THROW(at::cuda::stream_synchronize(stream));
}
#endif

// ---------------------------------------------------------------------------
// CUDAContextLight.h — covers the 1 missing line: is_available()
// ---------------------------------------------------------------------------

TEST(CUDAContextLightTest, IsAvailable) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// With GPU compilation and at least one device, this must be true.
int gpu_count = phi::backends::gpu::GetGPUDeviceCount();
ASSERT_EQ(at::cuda::is_available(), gpu_count > 0);
#else
// In CPU-only builds, is_available() should return false
ASSERT_FALSE(at::cuda::is_available());
#endif
}

// ---------------------------------------------------------------------------
Expand All @@ -65,14 +76,21 @@ TEST(CUDAContextLightTest, IsAvailable) {

// getNumGPUs() delegates to c10::cuda::device_count()
TEST(CUDAContextLightTest, GetNumGPUs) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
int64_t n = at::cuda::getNumGPUs();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
ASSERT_GE(n, 1);
#else
// In CPU-only builds, device_count() returns 0
ASSERT_EQ(n, 0);
#endif
}

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)

// The following tests require CUDA runtime and can only run in CUDA builds

// getCurrentDeviceProperties() / getDeviceProperties()
TEST(CUDAContextLightTest, DeviceProperties) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
ASSERT_NE(prop, nullptr);
// Sanity-check a few well-known fields
Expand All @@ -87,15 +105,13 @@ TEST(CUDAContextLightTest, DeviceProperties) {

// warp_size()
TEST(CUDAContextLightTest, WarpSize) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
int ws = at::cuda::warp_size();
// All NVIDIA and AMD GPU architectures have warp size of 32 or 64
ASSERT_TRUE(ws == 32 || ws == 64);
}

// canDeviceAccessPeer() — a device cannot peer-access itself
TEST(CUDAContextLightTest, CanDeviceAccessPeer) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
int device_id = phi::backends::gpu::GetCurrentDeviceId();
// Self-to-self peer access is always false per CUDA spec
bool self_peer = at::cuda::canDeviceAccessPeer(device_id, device_id);
Expand All @@ -104,26 +120,22 @@ TEST(CUDAContextLightTest, CanDeviceAccessPeer) {

// Handle accessors — all must return non-null handles
TEST(CUDAContextLightTest, GetCurrentCUDABlasHandle) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
cublasHandle_t h = at::cuda::getCurrentCUDABlasHandle();
ASSERT_NE(h, nullptr);
}

TEST(CUDAContextLightTest, GetCurrentCUDABlasLtHandle) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
cublasLtHandle_t h = at::cuda::getCurrentCUDABlasLtHandle();
ASSERT_NE(h, nullptr);
}

TEST(CUDAContextLightTest, GetCurrentCUDASparseHandle) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
cusparseHandle_t h = at::cuda::getCurrentCUDASparseHandle();
ASSERT_NE(h, nullptr);
}

#if defined(CUDART_VERSION) || defined(USE_ROCM)
TEST(CUDAContextLightTest, GetCurrentCUDASolverDnHandle) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
cusolverDnHandle_t h = at::cuda::getCurrentCUDASolverDnHandle();
ASSERT_NE(h, nullptr);
}
Expand Down Expand Up @@ -160,7 +172,6 @@ TEST(CUDAContextLightTest, GetChosenWorkspaceSize) {

// getCUDABlasLtWorkspaceSize() / getCUDABlasLtWorkspace()
TEST(CUDAContextLightTest, CUDABlasLtWorkspace) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
size_t sz = at::cuda::getCUDABlasLtWorkspaceSize();
ASSERT_GT(sz, 0UL);

Expand All @@ -176,7 +187,6 @@ TEST(CUDAContextLightTest, CUDADeviceAllocatorSingleton) {
}

TEST(CUDAContextLightTest, CUDADeviceAllocatorCloneAndCopyData) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
c10::Allocator* alloc = at::cuda::getCUDADeviceAllocator();
ASSERT_NE(alloc, nullptr);

Expand Down Expand Up @@ -207,7 +217,6 @@ TEST(CUDAContextLightTest, CUDADeviceAllocatorCloneAndCopyData) {
}

TEST(CUDAContextLightTest, CUDADeviceAllocatorCloneZeroBytes) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
c10::Allocator* alloc = at::cuda::getCUDADeviceAllocator();
ASSERT_NE(alloc, nullptr);

Expand All @@ -220,7 +229,6 @@ TEST(CUDAContextLightTest, CUDADeviceAllocatorCloneZeroBytes) {
}

TEST(CUDAContextLightTest, AllocatorZeroSizeAndNoopCopyBranches) {
SKIP_IF_CUDA_RUNTIME_UNAVAILABLE();
c10::Allocator* alloc = at::cuda::getCUDADeviceAllocator();
ASSERT_NE(alloc, nullptr);

Expand Down
Loading
Loading