-
Notifications
You must be signed in to change notification settings - Fork 6k
[Cpp API Compatibility] Delete useless code and rename test files #78580
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
a45e7e0
7cee6e8
3dc2c05
53acaba
82768bc
a0be3de
88c0a55
90a73da
aaac6b3
d825db2
d6e9be4
ce84dbe
003897d
f8e3a74
85ae6b8
7c0fb0f
f9a4cb5
3d9f777
c661ed6
4447f88
ad97976
ac7bab4
283802c
253d526
89106a5
1fcc43d
ea0fa1f
52662f3
934b82a
d283912
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -71,6 +71,7 @@ third_party/ | |
| bazel-* | ||
| .humanize | ||
| .codex | ||
| .paddle-agent | ||
|
|
||
| build_* | ||
| # clion workspace. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,51 +12,62 @@ | |
| // See the License for the specific language governing permissions and | ||
| // limitations under the License. | ||
|
|
||
| #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) | ||
|
|
||
| #include <ATen/cuda/CUDAContextLight.h> | ||
| #include <c10/core/Allocator.h> | ||
| #include <c10/cuda/CUDAFunctions.h> | ||
| #include <c10/cuda/CUDAStream.h> | ||
|
|
||
| #include "gtest/gtest.h" | ||
|
|
||
| #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) | ||
| #include <c10/cuda/CUDAStream.h> | ||
| #include "paddle/phi/backends/gpu/gpu_info.h" | ||
| #include "test/cpp/compat/cuda_test_utils.h" | ||
| #endif | ||
|
|
||
| // --------------------------------------------------------------------------- | ||
| // CUDAFunctions.h — covers the 2 missing lines: | ||
| // c10::cuda::device_synchronize() and c10::cuda::stream_synchronize() | ||
| // --------------------------------------------------------------------------- | ||
|
|
||
| TEST(CUDAFunctionsTest, DeviceSynchronize) { | ||
| SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); | ||
| #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) | ||
| // Exercises the PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()) branch | ||
| ASSERT_NO_THROW(c10::cuda::device_synchronize()); | ||
| #else | ||
| // In CPU-only builds, device_synchronize throws | ||
| ASSERT_THROW(c10::cuda::device_synchronize(), std::exception); | ||
| #endif | ||
| } | ||
|
|
||
| #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) | ||
| TEST(CUDAFunctionsTest, StreamSynchronize) { | ||
| SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); | ||
| // Exercises phi::backends::gpu::GpuStreamSync() | ||
| auto stream = c10::cuda::getCurrentCUDAStream(); | ||
| ASSERT_NO_THROW(c10::cuda::stream_synchronize(stream)); | ||
| } | ||
| #endif | ||
|
|
||
| #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) | ||
| TEST(CUDAFunctionsTest, AtNamespaceAliases) { | ||
| SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); | ||
| // Exercises the using aliases in at::cuda namespace | ||
| ASSERT_NO_THROW(at::cuda::device_synchronize()); | ||
| auto stream = c10::cuda::getCurrentCUDAStream(); | ||
|
Comment on lines 31 to 53
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 这里把运行时 guard 一起删掉后, 像下面这些调用: 建议保留这层运行时判断,或者统一改成
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @youge325 话说这个当初为啥要加,现在又为啥要删?是因为只在 CUDA 上跑吗?其实我觉得如果可以的话,我还是希望这些单测能测就尽可能在 CPU 上测一测的,当初
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 当初用 Agent 加测试提高 coverage rate 的时候自动加的,按我的理解,如果能编译成功的话,那么测试也不能随便就 skip,所以现在就删了。 有些单测也确实可以在CPU上跑,条件编译一下测试用例就行,比如
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @ShigureNyako 帮我看看
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 刚刚 auth 过期了,@ShigureNyako 看看呢!
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @ShigureNyako 请给我一个 GPU build 但运行时无可用 device/runtime 的例子,在 CI 当中并没有出现这种情况, GPU build 就是为了在有可用 device/runtime 的场景中运行,我原来的代码写错了所以现在删掉,你可能倒果为因了
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
你这个提醒是对的,我这里把表述收敛一下:就这条 PR 当前的 CI 证据来看,我手头没有一个已经实际打出来的 “GPU build 但运行时无 device/runtime” 失败样例。 现在能直接看到的红灯,还是 我这边现在更准确的关注点,其实是另一层:不要因为补了部分 CPU-only compiling branch,就直接推导出可以把整块 原因是当前这批 test 里仍然混着不少明确的 GPU-only case,例如:
所以我认同你说的:“当前 CI 没有证明 runtime 不可用场景会打到这里”。但从代码拆分角度看,我还是倾向于把结论限定成:
我会按这个收敛后的判断继续看,不再把“当前 CI 已经证明有 runtime availability 问题”当成论据本身。
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @ShigureNyako 如果从严谨性上来看,确实会有编译期 当初早期 C++ 兼容性第一个 PR #74402 为赶 3.2 发版在拉分支前夕合入临时跳过了一些内容,应该是 defd6a3 跳过了一些新硬件(根据上一个 commit 99f7b88 的报错和该 commit message 看起来是 XPU 和 NPU,但日志已经过期看不到了)上的运行时问题,请着眼于该场景分析下具体可能存在的问题,以及是否有推荐的解决方案 注意,我们即便不能推广到所有硬件(比如 XPU 和 NPU),即便跳过部分硬件,也比直接放弃 CPU 测试更好,我们的目标不是一次修复所有硬件上的问题(当然可以更好),只是希望在确保 CPU 上能够有所监控
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
我按你给的线索回看了一下 PR #74402 里的相关 commit,能从代码里直接确认到的点大概是这样:
所以如果按这个历史上下文收敛,我现在更倾向于认为:当时真正要绕开的,不是单纯“这些测试在 runtime 上能不能 skip”,而是“CPU-capable 的测试文件里混入了 CUDA-only 头文件 / 链接依赖,导致 XPU/NPU build 只能整体退回到 WITH_GPU 里面”。 也就是说,这个场景下更具体的问题其实是:
如果按这个目标来推,我这边比较推荐的解法会是分层处理,而不是一步到位追求“所有后端都修完”:
从这个角度看,我会把当前 review 立场进一步收敛成:支持把 CPU-capable case 往 CPU 流水线拆,但不支持在这一步直接整体移除 这样比较贴近当年
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 我看了最新提交 这次从 CPU 路径挪回 GPU 路径的文件里,至少有几类情况其实不太一样:
所以从 review 角度看,我现在的判断还是一致的:如果目标是“先把 CPU 上能监控的部分保住”,那优先级应该是清理无关 CUDA include / 拆分混合文件,而不是把一批本来可以在 CPU 上监控的 compat tests 再收回 GPU-only 路径。 也就是说,这个 |
||
| ASSERT_NO_THROW(at::cuda::stream_synchronize(stream)); | ||
| } | ||
| #endif | ||
|
|
||
| // --------------------------------------------------------------------------- | ||
| // CUDAContextLight.h — covers the 1 missing line: is_available() | ||
| // --------------------------------------------------------------------------- | ||
|
|
||
| TEST(CUDAContextLightTest, IsAvailable) { | ||
| #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) | ||
| // With GPU compilation and at least one device, this must be true. | ||
| int gpu_count = phi::backends::gpu::GetGPUDeviceCount(); | ||
| ASSERT_EQ(at::cuda::is_available(), gpu_count > 0); | ||
| #else | ||
| // In CPU-only builds, is_available() should return false | ||
| ASSERT_FALSE(at::cuda::is_available()); | ||
| #endif | ||
| } | ||
|
|
||
| // --------------------------------------------------------------------------- | ||
|
|
@@ -65,14 +76,21 @@ TEST(CUDAContextLightTest, IsAvailable) { | |
|
|
||
| // getNumGPUs() delegates to c10::cuda::device_count() | ||
| TEST(CUDAContextLightTest, GetNumGPUs) { | ||
| SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); | ||
| int64_t n = at::cuda::getNumGPUs(); | ||
| #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) | ||
| ASSERT_GE(n, 1); | ||
| #else | ||
| // In CPU-only builds, device_count() returns 0 | ||
| ASSERT_EQ(n, 0); | ||
| #endif | ||
| } | ||
|
|
||
| #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) | ||
|
|
||
| // The following tests require CUDA runtime and can only run in CUDA builds | ||
|
|
||
| // getCurrentDeviceProperties() / getDeviceProperties() | ||
| TEST(CUDAContextLightTest, DeviceProperties) { | ||
| SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); | ||
| cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); | ||
| ASSERT_NE(prop, nullptr); | ||
| // Sanity-check a few well-known fields | ||
|
|
@@ -87,15 +105,13 @@ TEST(CUDAContextLightTest, DeviceProperties) { | |
|
|
||
| // warp_size() | ||
| TEST(CUDAContextLightTest, WarpSize) { | ||
| SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); | ||
| int ws = at::cuda::warp_size(); | ||
| // All NVIDIA and AMD GPU architectures have warp size of 32 or 64 | ||
| ASSERT_TRUE(ws == 32 || ws == 64); | ||
| } | ||
|
|
||
| // canDeviceAccessPeer() — a device cannot peer-access itself | ||
| TEST(CUDAContextLightTest, CanDeviceAccessPeer) { | ||
| SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); | ||
| int device_id = phi::backends::gpu::GetCurrentDeviceId(); | ||
| // Self-to-self peer access is always false per CUDA spec | ||
| bool self_peer = at::cuda::canDeviceAccessPeer(device_id, device_id); | ||
|
|
@@ -104,26 +120,22 @@ TEST(CUDAContextLightTest, CanDeviceAccessPeer) { | |
|
|
||
| // Handle accessors — all must return non-null handles | ||
| TEST(CUDAContextLightTest, GetCurrentCUDABlasHandle) { | ||
| SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); | ||
| cublasHandle_t h = at::cuda::getCurrentCUDABlasHandle(); | ||
| ASSERT_NE(h, nullptr); | ||
| } | ||
|
|
||
| TEST(CUDAContextLightTest, GetCurrentCUDABlasLtHandle) { | ||
| SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); | ||
| cublasLtHandle_t h = at::cuda::getCurrentCUDABlasLtHandle(); | ||
| ASSERT_NE(h, nullptr); | ||
| } | ||
|
|
||
| TEST(CUDAContextLightTest, GetCurrentCUDASparseHandle) { | ||
| SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); | ||
| cusparseHandle_t h = at::cuda::getCurrentCUDASparseHandle(); | ||
| ASSERT_NE(h, nullptr); | ||
| } | ||
|
|
||
| #if defined(CUDART_VERSION) || defined(USE_ROCM) | ||
| TEST(CUDAContextLightTest, GetCurrentCUDASolverDnHandle) { | ||
| SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); | ||
| cusolverDnHandle_t h = at::cuda::getCurrentCUDASolverDnHandle(); | ||
| ASSERT_NE(h, nullptr); | ||
| } | ||
|
|
@@ -160,7 +172,6 @@ TEST(CUDAContextLightTest, GetChosenWorkspaceSize) { | |
|
|
||
| // getCUDABlasLtWorkspaceSize() / getCUDABlasLtWorkspace() | ||
| TEST(CUDAContextLightTest, CUDABlasLtWorkspace) { | ||
| SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); | ||
| size_t sz = at::cuda::getCUDABlasLtWorkspaceSize(); | ||
| ASSERT_GT(sz, 0UL); | ||
|
|
||
|
|
@@ -176,7 +187,6 @@ TEST(CUDAContextLightTest, CUDADeviceAllocatorSingleton) { | |
| } | ||
|
|
||
| TEST(CUDAContextLightTest, CUDADeviceAllocatorCloneAndCopyData) { | ||
| SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); | ||
| c10::Allocator* alloc = at::cuda::getCUDADeviceAllocator(); | ||
| ASSERT_NE(alloc, nullptr); | ||
|
|
||
|
|
@@ -207,7 +217,6 @@ TEST(CUDAContextLightTest, CUDADeviceAllocatorCloneAndCopyData) { | |
| } | ||
|
|
||
| TEST(CUDAContextLightTest, CUDADeviceAllocatorCloneZeroBytes) { | ||
| SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); | ||
| c10::Allocator* alloc = at::cuda::getCUDADeviceAllocator(); | ||
| ASSERT_NE(alloc, nullptr); | ||
|
|
||
|
|
@@ -220,7 +229,6 @@ TEST(CUDAContextLightTest, CUDADeviceAllocatorCloneZeroBytes) { | |
| } | ||
|
|
||
| TEST(CUDAContextLightTest, AllocatorZeroSizeAndNoopCopyBranches) { | ||
| SKIP_IF_CUDA_RUNTIME_UNAVAILABLE(); | ||
| c10::Allocator* alloc = at::cuda::getCUDADeviceAllocator(); | ||
| ASSERT_NE(alloc, nullptr); | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.