diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1b29c5b..c8306a8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -76,6 +76,33 @@ list(REMOVE_ITEM TEST_SRC_FILES ${UNMATCH_FILES})
 file(GLOB_RECURSE TEST_BASE_FILES ${PROJECT_SOURCE_DIR}/src/*.cpp)
 set(PADDLE_TARGET_FOLDER ${CMAKE_BINARY_DIR}/paddle)
 
+# ---------------------------------------------------------------------------
+# CUDA Toolkit (headers for the CUDA-specific tests). CUDA_INCLUDE_DIRS is
+# appended to both the Torch and the Paddle include lists below; it is left
+# empty when no toolkit is found.
+# ---------------------------------------------------------------------------
+find_package(CUDAToolkit QUIET)
+if(CUDAToolkit_FOUND)
+  message(STATUS "Found CUDA Toolkit: ${CUDAToolkit_INCLUDE_DIRS}")
+  set(CUDA_INCLUDE_DIRS "${CUDAToolkit_INCLUDE_DIRS}")
+elseif(EXISTS "/usr/local/cuda/include")
+  set(CUDA_INCLUDE_DIRS "/usr/local/cuda/include")
+  message(STATUS "Using default CUDA include dir: ${CUDA_INCLUDE_DIRS}")
+else()
+  message(WARNING "CUDA headers not found; CUDA tests may not compile.")
+  set(CUDA_INCLUDE_DIRS "")
+endif()
+
+# Optional "cccl" sub-directory of each CUDA include dir. Only existing
+# directories are kept, so an empty or multi-entry CUDA_INCLUDE_DIRS never
+# turns into a bogus "/cccl" search path.
+set(CUDA_CCCL_INCLUDE_DIRS "")
+foreach(_cuda_inc IN LISTS CUDA_INCLUDE_DIRS)
+  if(IS_DIRECTORY "${_cuda_inc}/cccl")
+    list(APPEND CUDA_CCCL_INCLUDE_DIRS "${_cuda_inc}/cccl")
+  endif()
+endforeach()
+
 # ---------------------------------------------------------------------------
 # Build Torch test case
 # ---------------------------------------------------------------------------
@@ -87,8 +114,9 @@ set(TORCH_LIBRARIES "")
 file(GLOB_RECURSE TORCH_LIBRARIES "${TORCH_DIR}/lib/*.so"
      "${TORCH_DIR}/lib/*.a")
 
-set(TORCH_INCLUDE_DIR "${TORCH_DIR}/include"
-                      "${TORCH_DIR}/include/torch/csrc/api/include/")
+set(TORCH_INCLUDE_DIR
+    "${TORCH_DIR}/include" "${TORCH_DIR}/include/torch/csrc/api/include/"
+    ${CUDA_INCLUDE_DIRS})
 set(TORCH_TARGET_FOLDER ${CMAKE_BINARY_DIR}/torch)
 set(BIN_PREFIX "torch_")
 
@@ -119,7 +147,8 @@ set(PADDLE_INCLUDE_DIR
     "${PADDLE_DIR}/include/third_party"
     "${PADDLE_DIR}/include/paddle/phi/api/include/compat/"
     "${PADDLE_DIR}/include/paddle/phi/api/include/compat/torch/csrc/api/include/"
-)
+    ${CUDA_INCLUDE_DIRS}
+    ${CUDA_CCCL_INCLUDE_DIRS})
 set(PADDLE_LIBRARIES
     "${PADDLE_DIR}/base/libpaddle.so"
     "${PADDLE_DIR}/libs/libcommon.so"
diff --git a/cmake/build.cmake
b/cmake/build.cmake
index 2bcaf4e..4b3f2c9 100644
--- a/cmake/build.cmake
+++ b/cmake/build.cmake
@@ -23,6 +23,22 @@ function(
   message(STATUS "include dir: ${INCLUDE_DIR}")
   target_compile_definitions(${_test_name}
                              PRIVATE USE_PADDLE_API=${USE_PADDLE_API})
+  if(USE_PADDLE_API)
+    # Paddle's CUDA compat headers (CUDAContextLight.h, CUDAFunctions.h)
+    # require PADDLE_WITH_CUDA to be defined so that GPU type aliases
+    # (gpuStream_t, cudaDeviceProp) are resolved via cuda_runtime.h.
+    target_compile_definitions(${_test_name} PRIVATE PADDLE_WITH_CUDA)
+    # Link libcudart for CUDA runtime symbols used by the Paddle CUDA compat
+    # layer; warn instead of silently skipping it when it cannot be located.
+    if(TARGET CUDA::cudart)
+      target_link_libraries(${_test_name} CUDA::cudart)
+    elseif(EXISTS "/usr/local/cuda/lib64/libcudart.so")
+      target_link_libraries(${_test_name}
+                            "/usr/local/cuda/lib64/libcudart.so")
+    else()
+      message(WARNING "libcudart not found; ${_test_name} may fail to link.")
+    endif()
+  endif()
   message(STATUS "USE_PADDLE_API: ${USE_PADDLE_API}")
   add_test(NAME ${_test_name} COMMAND ${_test_name})
   set_tests_properties(${_test_name} PROPERTIES TIMEOUT 5)
diff --git a/test/ops/CUDABlasTest.cpp b/test/ops/CUDABlasTest.cpp
new file mode 100644
index 0000000..c8422fc
--- /dev/null
+++ b/test/ops/CUDABlasTest.cpp
@@ -0,0 +1,580 @@
+#include  // NOTE(review): header names look stripped from this text - restore from the original file
+#include
+
+// [Difference 1] Symbol visibility of at::cuda::blas::gemm.
+// PyTorch (libtorch) compiles at::cuda::blas::gemm with hidden visibility
+// (nm -D lists it with a lowercase 't'), so external code cannot link it.
+// The Paddle compat library exports it (uppercase 'T'), so it is callable.
+// Hence the header is included and the gemm tests are instantiated only in
+// Paddle builds (USE_PADDLE_API=1); Torch emits "not_exported" to stay aligned.
+#if USE_PADDLE_API
+#include
+#endif
+
+#include
+
+#include
+#include
+
+#include "../../src/file_manager.h"
+
+extern paddle_api_test::ThreadSafeParam g_custom_param;
+
+namespace at {
+namespace test {
+
+using paddle_api_test::FileManerger;
+using paddle_api_test::ThreadSafeParam;
+
+class CUDABlasTest : public ::testing::Test {
+ protected:
+  void SetUp() override {}
+};
+
+#if USE_PADDLE_API
+// Write a column-major m×n result matrix (on GPU) to file as float
values.
+// Converts any dtype to float before serialisation so the format is uniform
+// across all dtype tests.
+static void write_gemm_result_to_file(FileManerger* file,
+                                      const at::Tensor& result_gpu,
+                                      int64_t m,
+                                      int64_t n) {
+  at::Tensor cpu = result_gpu.cpu().to(at::kFloat);
+  float* data = cpu.data_ptr();  // NOTE(review): template args look stripped in this text (data_ptr<float>()?) - confirm
+  *file << std::to_string(m) << " " << std::to_string(n) << " ";  // dimensions first
+  for (int64_t i = 0; i < m * n; ++i) {
+    *file << std::to_string(data[i]) << " ";
+  }
+}
+#endif  // USE_PADDLE_API
+
+// ============================================================
+// at::cuda::getCurrentCUDABlasHandle tests
+// ============================================================
+
+// Verify that getCurrentCUDABlasHandle returns a non-null handle.
+// The result is received with `auto` to hide the type difference:
+// [Difference 2] The return type differs.
+// PyTorch: returns cublasHandle_t directly.
+// Paddle compat: returns at::cuda::CUDAContextBlasHandle, which is
+// a typedef of cublasHandle_t in CUDA builds and
+// phi::blasHandle_t (a different type) in HIP/ROCm builds.
+TEST_F(CUDABlasTest, HandleNonNull) {
+  auto file_name = g_custom_param.get();
+  FileManerger file(file_name);
+  file.createFile();  // first test of the file: creates it, later tests append
+  file << "HandleNonNull ";
+  if (!at::cuda::is_available()) {
+    file << "no_cuda ";
+    file << "\n";
+    file.saveFile();
+    return;
+  }
+#if USE_PADDLE_API
+  // [Difference 3] Paddle's getCurrentCUDABlasHandle() relies on global state.
+  // Paddle internally calls phi::DeviceContextPool::Instance().Get(GPUPlace()),
+  // which requires the DeviceContextPool to have been initialised beforehand
+  // via paddle::framework::InitDevices(). In a standalone C++ test binary the
+  // framework is not initialised, so Paddle throws PreconditionNotMet. PyTorch
+  // has no such constraint: initialising the CUDA allocator is enough. The
+  // "exception_needs_pool_init" marker records this behavioural difference.
+  try {
+    auto handle = at::cuda::getCurrentCUDABlasHandle();
+    file << std::to_string(handle != nullptr ? 1 : 0) << " ";
+  } catch (const std::exception&) {
+    file << "exception_needs_pool_init ";
+  }
+#else
+  // PyTorch requires the CUDA caching allocator to be initialised before the
+  // first cuBLAS handle is created; allocating a dummy GPU tensor triggers it.
+  {
+    auto _init = at::zeros({1}, at::kFloat).cuda();
+    (void)_init;
+  }
+  try {
+    auto handle = at::cuda::getCurrentCUDABlasHandle();
+    file << std::to_string(handle != nullptr ? 1 : 0) << " ";
+  } catch (const std::exception& e) {
+    file << "exception:" << e.what() << " ";
+  }
+#endif
+  file << "\n";
+  file.saveFile();
+}
+
+// Verify that two successive calls on the same thread return the same handle
+// (the implementation caches one handle per CUDA stream).
+TEST_F(CUDABlasTest, HandleConsistency) {
+  auto file_name = g_custom_param.get();
+  FileManerger file(file_name);
+  file.openAppend();
+  file << "HandleConsistency ";
+  if (!at::cuda::is_available()) {
+    file << "no_cuda ";
+    file << "\n";
+    file.saveFile();
+    return;
+  }
+#if USE_PADDLE_API
+  // Same pool-init limitation as HandleNonNull (see Difference 3):
+  // Paddle throws when the DeviceContextPool has not been initialised.
+  try {
+    auto handle1 = at::cuda::getCurrentCUDABlasHandle();
+    auto handle2 = at::cuda::getCurrentCUDABlasHandle();
+    file << std::to_string(handle1 == handle2 ? 1 : 0) << " ";
+  } catch (const std::exception&) {
+    file << "exception_needs_pool_init ";
+  }
+#else
+  {
+    auto _init = at::zeros({1}, at::kFloat).cuda();
+    (void)_init;
+  }
+  try {
+    auto handle1 = at::cuda::getCurrentCUDABlasHandle();
+    auto handle2 = at::cuda::getCurrentCUDABlasHandle();
+    file << std::to_string(handle1 == handle2 ? 1 : 0) << " ";
+  } catch (const std::exception& e) {
+    file << "exception:" << e.what() << " ";
+  }
+#endif
+  file << "\n";
+  file.saveFile();
+}
+
+#if USE_PADDLE_API
+
+// [Difference 4] at::tensor(initializer_list) has no TensorOptions-free overload.
+// PyTorch's ATen/Utils.h provides at::tensor(std::initializer_list), an
+// overload that deduces the dtype directly; Paddle compat's ATen/Utils.h only
+// offers at::tensor(ArrayRef, TensorOptions) and has no initializer_list
+// overload without TensorOptions. The cpu_fill_f32 helper is therefore used
+// in place of at::tensor({...}) to build tensors.
+
+// Helper: fill a 1-D float32 CPU tensor from an initializer_list, then move it to the GPU.
+static at::Tensor cpu_fill_f32(std::initializer_list vals) {
+  auto t = at::zeros({(int64_t)vals.size()}, at::kFloat);
+  float* p = t.data_ptr();
+  int64_t i = 0;
+  for (float v : vals) p[i++] = v;
+  return t.cuda();
+}
+
+// ============================================================
+// at::cuda::blas::gemm tests
+//
+// All matrix data use column-major (BLAS native) layout.
+// For a 2×2 example the storage order is:
+//   [A[0,0], A[1,0], A[0,1], A[1,1]]
+//
+// Basic reference setup (used in most tests unless otherwise noted):
+//   A (col-major) = [1, 3, 2, 4] → matrix [[1,2],[3,4]]
+//   B (col-major) = [1, 0, 0, 1] → identity I₂
+//   alpha=1, beta=0 → C = A·I = A → expected output: [1, 3, 2, 4]
+// ============================================================
+
+// --- float32 ---
+
+TEST_F(CUDABlasTest, GemmFloat) {
+  auto file_name = g_custom_param.get();
+  FileManerger file(file_name);
+  file.openAppend();
+  file << "GemmFloat ";
+  if (!at::cuda::is_available()) {
+    file << "no_cuda ";
+    file << "\n";
+    file.saveFile();
+    return;
+  }
+  at::Tensor a = cpu_fill_f32({1.0f, 3.0f, 2.0f, 4.0f});
+  at::Tensor b = cpu_fill_f32({1.0f, 0.0f, 0.0f, 1.0f});
+  at::Tensor c = at::zeros({4}, at::kFloat).cuda();
+  at::cuda::blas::gemm('N',           // transa
+                       'N',           // transb
+                       2,             // m
+                       2,             // n
+                       2,             // k
+                       1.0f,          // alpha
+                       a.data_ptr(),  // A
+                       2,             // lda
+                       b.data_ptr(),  // B
+                       2,             // ldb
+                       0.0f,          // beta
+                       c.data_ptr(),  // C (output)
+                       2);            // ldc
+  write_gemm_result_to_file(&file, c, 2, 2);
+  file << "\n";
+  file.saveFile();
+}
+
+// ---
float64 ---
+
+TEST_F(CUDABlasTest, GemmDouble) {
+  auto file_name = g_custom_param.get();
+  FileManerger file(file_name);
+  file.openAppend();
+  file << "GemmDouble ";
+  if (!at::cuda::is_available()) {
+    file << "no_cuda ";
+    file << "\n";
+    file.saveFile();
+    return;
+  }
+  at::Tensor a = cpu_fill_f32({1.0f, 3.0f, 2.0f, 4.0f}).to(at::kDouble);
+  at::Tensor b = cpu_fill_f32({1.0f, 0.0f, 0.0f, 1.0f}).to(at::kDouble);
+  at::Tensor c = at::zeros({4}, at::kDouble).cuda();
+  at::cuda::blas::gemm('N',           // transa
+                       'N',           // transb
+                       2,             // m
+                       2,             // n
+                       2,             // k
+                       1.0,           // alpha (double literal)
+                       a.data_ptr(),  // A
+                       2,             // lda
+                       b.data_ptr(),  // B
+                       2,             // ldb
+                       0.0,           // beta (double literal)
+                       c.data_ptr(),  // C (output)
+                       2);            // ldc
+  write_gemm_result_to_file(&file, c, 2, 2);
+  file << "\n";
+  file.saveFile();
+}
+
+// --- float16 (at::Half) ---
+// alpha / beta are at::opmath_type = float (NOTE(review): template argument looks stripped here - presumably opmath_type<at::Half>; confirm)
+
+TEST_F(CUDABlasTest, GemmHalf) {
+  auto file_name = g_custom_param.get();
+  FileManerger file(file_name);
+  file.openAppend();
+  file << "GemmHalf ";
+  if (!at::cuda::is_available()) {
+    file << "no_cuda ";
+    file << "\n";
+    file.saveFile();
+    return;
+  }
+  at::Tensor a = cpu_fill_f32({1.0f, 3.0f, 2.0f, 4.0f}).to(at::kHalf);
+  at::Tensor b = cpu_fill_f32({1.0f, 0.0f, 0.0f, 1.0f}).to(at::kHalf);
+  at::Tensor c = at::zeros({4}, at::kHalf).cuda();
+  at::cuda::blas::gemm('N',           // transa
+                       'N',           // transb
+                       2,             // m
+                       2,             // n
+                       2,             // k
+                       1.0f,          // alpha
+                       a.data_ptr(),  // A
+                       2,             // lda
+                       b.data_ptr(),  // B
+                       2,             // ldb
+                       0.0f,          // beta
+                       c.data_ptr(),  // C (output)
+                       2);            // ldc
+  write_gemm_result_to_file(&file, c, 2, 2);
+  file << "\n";
+  file.saveFile();
+}
+
+// --- bfloat16 (at::BFloat16) ---
+
+TEST_F(CUDABlasTest, GemmBFloat16) {
+  auto file_name = g_custom_param.get();
+  FileManerger file(file_name);
+  file.openAppend();
+  file << "GemmBFloat16 ";
+  if (!at::cuda::is_available()) {
+    file << "no_cuda ";
+    file << "\n";
+    file.saveFile();
+    return;
+  }
+  at::Tensor a = cpu_fill_f32({1.0f, 3.0f, 2.0f, 4.0f}).to(at::kBFloat16);
+  at::Tensor b = cpu_fill_f32({1.0f, 0.0f, 0.0f, 1.0f}).to(at::kBFloat16);
+  at::Tensor c = at::zeros({4}, at::kBFloat16).cuda();
+  at::cuda::blas::gemm('N',           // transa
+                       'N',           // transb
+                       2,             // m
+                       2,             // n
+                       2,             // k
+                       1.0f,          // alpha
+                       a.data_ptr(),  // A
+                       2,             // lda
+                       b.data_ptr(),  // B
+                       2,             // ldb
+                       0.0f,          // beta
+                       c.data_ptr(),  // C (output)
+                       2);            // ldc
+  write_gemm_result_to_file(&file, c, 2, 2);
+  file << "\n";
+  file.saveFile();
+}
+
+// --- non-zero beta: C = alpha*A*B + beta*C_init ---
+// C_init (col-major) = [1,1,1,1], beta=0.5
+// Result = [1,3,2,4] + 0.5*[1,1,1,1] = [1.5, 3.5, 2.5, 4.5]
+
+TEST_F(CUDABlasTest, GemmWithBeta) {
+  auto file_name = g_custom_param.get();
+  FileManerger file(file_name);
+  file.openAppend();
+  file << "GemmWithBeta ";
+  if (!at::cuda::is_available()) {
+    file << "no_cuda ";
+    file << "\n";
+    file.saveFile();
+    return;
+  }
+  at::Tensor a = cpu_fill_f32({1.0f, 3.0f, 2.0f, 4.0f});
+  at::Tensor b = cpu_fill_f32({1.0f, 0.0f, 0.0f, 1.0f});
+  at::Tensor c = cpu_fill_f32({1.0f, 1.0f, 1.0f, 1.0f});
+  at::cuda::blas::gemm('N',           // transa
+                       'N',           // transb
+                       2,             // m
+                       2,             // n
+                       2,             // k
+                       1.0f,          // alpha
+                       a.data_ptr(),  // A
+                       2,             // lda
+                       b.data_ptr(),  // B
+                       2,             // ldb
+                       0.5f,          // beta (scales the initial C)
+                       c.data_ptr(),  // C (input and output)
+                       2);            // ldc
+  write_gemm_result_to_file(&file, c, 2, 2);
+  file << "\n";
+  file.saveFile();
+}
+
+// --- negative values in A ---
+// A (col-major) = [-1,-2,3,4] → matrix [[-1,3],[-2,4]]
+// B = identity, C = A → expected: [-1,-2,3,4]
+
+TEST_F(CUDABlasTest, GemmNegativeValues) {
+  auto file_name = g_custom_param.get();
+  FileManerger file(file_name);
+  file.openAppend();
+  file << "GemmNegativeValues ";
+  if (!at::cuda::is_available()) {
+    file << "no_cuda ";
+    file << "\n";
+    file.saveFile();
+    return;
+  }
+  at::Tensor a = cpu_fill_f32({-1.0f, -2.0f, 3.0f, 4.0f});
+  at::Tensor b = cpu_fill_f32({1.0f, 0.0f, 0.0f, 1.0f});
+  at::Tensor c = at::zeros({4}, at::kFloat).cuda();
+  at::cuda::blas::gemm('N',           // transa
+                       'N',           // transb
+                       2,             // m
+                       2,             // n
+                       2,             // k
+                       1.0f,          // alpha
+                       a.data_ptr(),  // A
+                       2,             // lda
+                       b.data_ptr(),  // B
+                       2,             // ldb
+                       0.0f,          // beta
+                       c.data_ptr(),  // C (output)
+                       2);            // ldc
+  write_gemm_result_to_file(&file, c, 2, 2);
+  file << "\n";
+  file.saveFile();
+}
+
+// --- transa='T': C = A^T · B ---
+// A stored (col-major k×m = 2×2) = [1,2,3,4]
+//   A[0,0]=1, A[1,0]=2, A[0,1]=3, A[1,1]=4 → A^T = [[1,2],[3,4]]
+// B (col-major) = [2,1,1,2]
+//   B[0,0]=2, B[1,0]=1, B[0,1]=1, B[1,1]=2
+// C =
A^T · B = [[1,2],[3,4]] · [[2,1],[1,2]]
+//   C[0,0]=4, C[1,0]=10, C[0,1]=5, C[1,1]=11 → col-major: [4,10,5,11]
+
+TEST_F(CUDABlasTest, GemmTransA) {
+  auto file_name = g_custom_param.get();
+  FileManerger file(file_name);
+  file.openAppend();
+  file << "GemmTransA ";
+  if (!at::cuda::is_available()) {
+    file << "no_cuda ";
+    file << "\n";
+    file.saveFile();
+    return;
+  }
+  at::Tensor a = cpu_fill_f32({1.0f, 2.0f, 3.0f, 4.0f});
+  at::Tensor b = cpu_fill_f32({2.0f, 1.0f, 1.0f, 2.0f});
+  at::Tensor c = at::zeros({4}, at::kFloat).cuda();
+  // transa='T': A is stored as k×m (2×2) with lda=k=2
+  at::cuda::blas::gemm('T',           // transa: use A^T
+                       'N',           // transb
+                       2,             // m
+                       2,             // n
+                       2,             // k
+                       1.0f,          // alpha
+                       a.data_ptr(),  // A
+                       2,             // lda
+                       b.data_ptr(),  // B
+                       2,             // ldb
+                       0.0f,          // beta
+                       c.data_ptr(),  // C (output)
+                       2);            // ldc
+  write_gemm_result_to_file(&file, c, 2, 2);
+  file << "\n";
+  file.saveFile();
+}
+
+// --- transb='T': C = A · B^T ---
+// A (col-major m×k = 2×2) = [1,3,2,4] → A = [[1,2],[3,4]]
+// B stored (col-major n×k = 2×2) = [1,2,3,4], ldb=n=2
+//   B[0,0]=1, B[1,0]=2, B[0,1]=3, B[1,1]=4
+//   B = [[1,3],[2,4]]; op(B)[i,j]=B[j,i] → B^T = [[1,2],[3,4]]
+// C = A · B^T = [[1,2],[3,4]] · [[1,2],[3,4]]
+//   C[0,0]=7, C[1,0]=15, C[0,1]=10, C[1,1]=22 → col-major: [7,15,10,22]
+
+TEST_F(CUDABlasTest, GemmTransB) {
+  auto file_name = g_custom_param.get();
+  FileManerger file(file_name);
+  file.openAppend();
+  file << "GemmTransB ";
+  if (!at::cuda::is_available()) {
+    file << "no_cuda ";
+    file << "\n";
+    file.saveFile();
+    return;
+  }
+  at::Tensor a = cpu_fill_f32({1.0f, 3.0f, 2.0f, 4.0f});
+  at::Tensor b = cpu_fill_f32({1.0f, 2.0f, 3.0f, 4.0f});
+  at::Tensor c = at::zeros({4}, at::kFloat).cuda();
+  // transb='T': B is stored as n×k (2×2) with ldb=n=2
+  at::cuda::blas::gemm('N',           // transa
+                       'T',           // transb: use B^T
+                       2,             // m
+                       2,             // n
+                       2,             // k
+                       1.0f,          // alpha
+                       a.data_ptr(),  // A
+                       2,             // lda
+                       b.data_ptr(),  // B
+                       2,             // ldb
+                       0.0f,          // beta
+                       c.data_ptr(),  // C (output)
+                       2);            // ldc
+  write_gemm_result_to_file(&file, c, 2, 2);
+  file << "\n";
+  file.saveFile();
+}
+
+// --- scalar (1×1) gemm ---
+// A=[5], B=[3], alpha=1, beta=0 → C=[15]
+
+TEST_F(CUDABlasTest, GemmScalar) {
+  auto file_name = g_custom_param.get();
+  FileManerger file(file_name);
+  file.openAppend();
+  file << "GemmScalar ";
+  if (!at::cuda::is_available()) {
+    file << "no_cuda ";
+    file << "\n";
+    file.saveFile();
+    return;
+  }
+  at::Tensor a = cpu_fill_f32({5.0f});
+  at::Tensor b = cpu_fill_f32({3.0f});
+  at::Tensor c = at::zeros({1}, at::kFloat).cuda();
+  at::cuda::blas::gemm('N',           // transa
+                       'N',           // transb
+                       1,             // m
+                       1,             // n
+                       1,             // k
+                       1.0f,          // alpha
+                       a.data_ptr(),  // A
+                       1,             // lda
+                       b.data_ptr(),  // B
+                       1,             // ldb
+                       0.0f,          // beta
+                       c.data_ptr(),  // C (output)
+                       1);            // ldc
+  write_gemm_result_to_file(&file, c, 1, 1);
+  file << "\n";
+  file.saveFile();
+}
+
+// --- large matrix (100×100, ≥10000 elements) ---
+// A = all-ones 100×100, B = all-ones 100×100, alpha=1, beta=0
+// Each C[i,j] = sum_k A[i,k]*B[k,j] = 100.
+// Sum of all C elements = 100 * 100 * 100 = 1,000,000.
+// Only the sum is written to keep the output file compact.
+
+TEST_F(CUDABlasTest, GemmLargeMatrix) {
+  auto file_name = g_custom_param.get();
+  FileManerger file(file_name);
+  file.openAppend();
+  file << "GemmLargeMatrix ";
+  if (!at::cuda::is_available()) {
+    file << "no_cuda ";
+    file << "\n";
+    file.saveFile();
+    return;
+  }
+  constexpr int64_t M = 100, N = 100, K = 100;
+  at::Tensor a = at::ones({M * K}, at::kFloat).cuda();
+  at::Tensor b = at::ones({K * N}, at::kFloat).cuda();
+  at::Tensor c = at::zeros({M * N}, at::kFloat).cuda();
+  at::cuda::blas::gemm('N',           // transa
+                       'N',           // transb
+                       M,             // m
+                       N,             // n
+                       K,             // k
+                       1.0f,          // alpha
+                       a.data_ptr(),  // A
+                       M,             // lda
+                       b.data_ptr(),  // B
+                       K,             // ldb
+                       0.0f,          // beta
+                       c.data_ptr(),  // C (output)
+                       M);            // ldc
+  at::Tensor c_cpu = c.cpu();
+  float* data = c_cpu.data_ptr();  // NOTE(review): template args look stripped in this text (data_ptr<float>()?) - confirm
+  float total = 0.0f;
+  for (int64_t i = 0; i < M * N; ++i) {
+    total += data[i];
+  }
+  file << std::to_string(M) << " " << std::to_string(N) << " ";
+  file << std::to_string(total) << " ";
+  file << "\n";
+  file.saveFile();
+}
+
+#else  // !USE_PADDLE_API
+
+// [Stub code for Difference 1]
+// at::cuda::blas::gemm has hidden visibility in libtorch and cannot be linked.
+// Emit a "not_exported" placeholder so the torch/paddle output files keep the
+// same line count and result_cmp.sh can still compare the other tests line by line.
+#define CUDABLAS_GEMM_STUB(name)             \
+  TEST_F(CUDABlasTest, name) {               \
+    auto file_name = g_custom_param.get();   \
+    FileManerger file(file_name);            \
+    file.openAppend();                       \
+    file << #name " not_exported ";          \
+    file << "\n";                            \
+    file.saveFile();                         \
+  }
+
+CUDABLAS_GEMM_STUB(GemmFloat)
+CUDABLAS_GEMM_STUB(GemmDouble)
+CUDABLAS_GEMM_STUB(GemmHalf)
+CUDABLAS_GEMM_STUB(GemmBFloat16)
+CUDABLAS_GEMM_STUB(GemmWithBeta)
+CUDABLAS_GEMM_STUB(GemmNegativeValues)
+CUDABLAS_GEMM_STUB(GemmTransA)
+CUDABLAS_GEMM_STUB(GemmTransB)
+CUDABLAS_GEMM_STUB(GemmScalar)
+CUDABLAS_GEMM_STUB(GemmLargeMatrix)
+
+#undef CUDABLAS_GEMM_STUB
+
+#endif  // USE_PADDLE_API
+
+}  // namespace test
+}  // namespace at