diff --git a/paddle/phi/api/include/compat/ATen/Utils.cpp b/paddle/phi/api/include/compat/ATen/Utils.cpp index 6e2d24477cdc4..2f360a4fb079a 100644 --- a/paddle/phi/api/include/compat/ATen/Utils.cpp +++ b/paddle/phi/api/include/compat/ATen/Utils.cpp @@ -24,6 +24,7 @@ #include +#include "paddle/common/macros.h" #include "paddle/phi/api/include/sparse_api.h" #include "paddle/phi/api/include/tensor.h" @@ -71,24 +72,24 @@ Tensor tensor_complex_backend(ArrayRef values, } // namespace detail -#define TENSOR(T, _1) \ - Tensor tensor(ArrayRef values, const TensorOptions& options) { \ - if (options.device().type() != c10::DeviceType::CPU) { \ - return at::detail::tensor_backend(values, options); \ - } else { \ - return at::detail::tensor_cpu(values, options); \ - } \ +#define TENSOR(T, _1) \ + PADDLE_API Tensor tensor(ArrayRef values, const TensorOptions& options) { \ + if (options.device().type() != c10::DeviceType::CPU) { \ + return at::detail::tensor_backend(values, options); \ + } else { \ + return at::detail::tensor_cpu(values, options); \ + } \ } AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TENSOR) #undef TENSOR -#define TENSOR(T, _1) \ - Tensor tensor(ArrayRef values, const TensorOptions& options) { \ - if (options.device().type() != c10::DeviceType::CPU) { \ - return at::detail::tensor_complex_backend(values, options); \ - } else { \ - return at::detail::tensor_complex_cpu(values, options); \ - } \ +#define TENSOR(T, _1) \ + PADDLE_API Tensor tensor(ArrayRef values, const TensorOptions& options) { \ + if (options.device().type() != c10::DeviceType::CPU) { \ + return at::detail::tensor_complex_backend(values, options); \ + } else { \ + return at::detail::tensor_complex_cpu(values, options); \ + } \ } AT_FORALL_COMPLEX_TYPES(TENSOR) #undef TENSOR diff --git a/paddle/phi/api/include/compat/ATen/cuda/CUDABlas.h b/paddle/phi/api/include/compat/ATen/cuda/CUDABlas.h index 6c785e334a330..9ef86fa0b19ac 100644 --- a/paddle/phi/api/include/compat/ATen/cuda/CUDABlas.h +++ b/paddle/phi/api/include/compat/ATen/cuda/CUDABlas.h @@ -31,6 +31,8 @@ #include #include +#include "paddle/common/macros.h" + namespace at::cuda::blas { /* LEVEL 3 BLAS FUNCTIONS */ @@ -54,16 +56,18 @@ inline void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) { } template <> -void gemm(CUDABLAS_GEMM_ARGTYPES(double)); +PADDLE_API void gemm(CUDABLAS_GEMM_ARGTYPES(double)); template <> -void gemm(CUDABLAS_GEMM_ARGTYPES(float)); +PADDLE_API void gemm(CUDABLAS_GEMM_ARGTYPES(float)); template <> -void gemm>(CUDABLAS_GEMM_ARGTYPES(c10::complex)); +PADDLE_API void gemm>( + CUDABLAS_GEMM_ARGTYPES(c10::complex)); template <> -void gemm>(CUDABLAS_GEMM_ARGTYPES(c10::complex)); +PADDLE_API void gemm>( + CUDABLAS_GEMM_ARGTYPES(c10::complex)); template <> -void gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)); +PADDLE_API void gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)); template <> -void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)); +PADDLE_API void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)); } // namespace at::cuda::blas diff --git a/paddle/phi/api/include/compat/ATen/cuda/CUDAContextLight.h b/paddle/phi/api/include/compat/ATen/cuda/CUDAContextLight.h index a8f67766140b4..e9fb5e0715a52 100644 --- a/paddle/phi/api/include/compat/ATen/cuda/CUDAContextLight.h +++ b/paddle/phi/api/include/compat/ATen/cuda/CUDAContextLight.h @@ -39,6 +39,7 @@ #include #include +#include "paddle/common/macros.h" #include "paddle/phi/backends/gpu/forwards.h" namespace c10 { @@ -95,40 +96,41 @@ inline int64_t getNumGPUs() { return c10::cuda::device_count(); } inline bool is_available() { return c10::cuda::device_count() > 0; } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -CUDAContextDeviceProp* getCurrentDeviceProperties(); +PADDLE_API CUDAContextDeviceProp* getCurrentDeviceProperties(); -int warp_size(); +PADDLE_API int warp_size(); -CUDAContextDeviceProp* getDeviceProperties(c10::DeviceIndex device); +PADDLE_API CUDAContextDeviceProp* getDeviceProperties(c10::DeviceIndex device); -bool canDeviceAccessPeer(c10::DeviceIndex device, c10::DeviceIndex peer_device); +PADDLE_API bool canDeviceAccessPeer(c10::DeviceIndex device, + c10::DeviceIndex peer_device); /* Handles */ -CUDAContextSparseHandle getCurrentCUDASparseHandle(); -CUDAContextBlasHandle getCurrentCUDABlasHandle(); -CUDAContextBlasLtHandle getCurrentCUDABlasLtHandle(); +PADDLE_API CUDAContextSparseHandle getCurrentCUDASparseHandle(); +PADDLE_API CUDAContextBlasHandle getCurrentCUDABlasHandle(); +PADDLE_API CUDAContextBlasLtHandle getCurrentCUDABlasLtHandle(); -void clearCublasWorkspaces(); +PADDLE_API void clearCublasWorkspaces(); struct WorkspaceMapWithMutex { std::map, at::DataPtr> map; std::shared_mutex mutex; }; -WorkspaceMapWithMutex& cublas_handle_stream_to_workspace(); -WorkspaceMapWithMutex& cublaslt_handle_stream_to_workspace(); -size_t getChosenWorkspaceSize(); -size_t getCUDABlasLtWorkspaceSize(); -void* getCUDABlasLtWorkspace(); +PADDLE_API WorkspaceMapWithMutex& cublas_handle_stream_to_workspace(); +PADDLE_API WorkspaceMapWithMutex& cublaslt_handle_stream_to_workspace(); +PADDLE_API size_t getChosenWorkspaceSize(); +PADDLE_API size_t getCUDABlasLtWorkspaceSize(); +PADDLE_API void* getCUDABlasLtWorkspace(); -CUDAContextSolverHandle getCurrentCUDASolverDnHandle(); +PADDLE_API CUDAContextSolverHandle getCurrentCUDASolverDnHandle(); #if defined(USE_CUDSS) -cudssHandle_t getCurrentCudssHandle(); +PADDLE_API cudssHandle_t getCurrentCudssHandle(); #endif // Get the CUDA device allocator for the current device. // Returns a pointer to a c10::Allocator that allocates GPU memory. -c10::Allocator* getCUDADeviceAllocator(); +PADDLE_API c10::Allocator* getCUDADeviceAllocator(); #endif } // namespace at::cuda diff --git a/paddle/phi/api/include/compat/ATen/cuda/EmptyTensor.h b/paddle/phi/api/include/compat/ATen/cuda/EmptyTensor.h index 080f355994c78..4e3dffa1239b1 100644 --- a/paddle/phi/api/include/compat/ATen/cuda/EmptyTensor.h +++ b/paddle/phi/api/include/compat/ATen/cuda/EmptyTensor.h @@ -15,14 +15,18 @@ #pragma once #include +#include "paddle/common/macros.h" + namespace at::detail { using at::Tensor; -at::Tensor empty_cuda(IntArrayRef size, - ScalarType dtype, - std::optional device_opt, - std::optional memory_format_opt); +PADDLE_API at::Tensor empty_cuda( + IntArrayRef size, + ScalarType dtype, + std::optional device_opt, + std::optional memory_format_opt); -at::Tensor empty_cuda(IntArrayRef size, const TensorOptions &options); +PADDLE_API at::Tensor empty_cuda(IntArrayRef size, + const TensorOptions &options); } // namespace at::detail diff --git a/paddle/phi/api/include/compat/ATen/ops/tensor.h b/paddle/phi/api/include/compat/ATen/ops/tensor.h index 947c1c4a0f5de..f22457851a3fa 100644 --- a/paddle/phi/api/include/compat/ATen/ops/tensor.h +++ b/paddle/phi/api/include/compat/ATen/ops/tensor.h @@ -20,23 +20,25 @@ #include #include +#include "paddle/common/macros.h" + namespace at { -#define TENSOR(T, S) \ - Tensor tensor(ArrayRef values, const TensorOptions& options); \ - inline Tensor tensor(std::initializer_list values, \ - const TensorOptions& options) { \ - return at::tensor(ArrayRef(values), options); \ - } \ - inline Tensor tensor(T value, const TensorOptions& options) { \ - return at::tensor(ArrayRef(value), options); \ - } \ - inline Tensor tensor(ArrayRef values) { \ - return at::tensor(std::move(values), at::dtype(k##S)); \ - } \ - inline Tensor tensor(std::initializer_list values) { \ - return at::tensor(ArrayRef(values)); \ - } \ +#define TENSOR(T, S) \ + PADDLE_API Tensor tensor(ArrayRef values, const TensorOptions& options); \ + inline Tensor tensor(std::initializer_list values, \ + const TensorOptions& options) { \ + return at::tensor(ArrayRef(values), options); \ + } \ + inline Tensor tensor(T value, const TensorOptions& options) { \ + return at::tensor(ArrayRef(value), options); \ + } \ + inline Tensor tensor(ArrayRef values) { \ + return at::tensor(std::move(values), at::dtype(k##S)); \ + } \ + inline Tensor tensor(std::initializer_list values) { \ + return at::tensor(ArrayRef(values)); \ + } \ inline Tensor tensor(T value) { return at::tensor(ArrayRef(value)); } AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TENSOR) AT_FORALL_COMPLEX_TYPES(TENSOR) diff --git a/paddle/phi/api/include/compat/CMakeLists.txt b/paddle/phi/api/include/compat/CMakeLists.txt index f5ccb62aa4554..060d9debc21c7 100644 --- a/paddle/phi/api/include/compat/CMakeLists.txt +++ b/paddle/phi/api/include/compat/CMakeLists.txt @@ -1,4 +1,5 @@ collect_srcs(api_srcs SRCS c10/core/Device.cpp) +collect_srcs(api_srcs SRCS c10/core/DefaultDtype.cpp) collect_srcs(api_srcs SRCS c10/core/Stream.cpp) collect_srcs(api_srcs SRCS c10/cuda/CUDAFunctions.cpp) collect_srcs(api_srcs SRCS c10/cuda/CUDAStream.cpp) diff --git a/paddle/phi/api/include/compat/c10/core/DefaultDtype.cpp b/paddle/phi/api/include/compat/c10/core/DefaultDtype.cpp new file mode 100644 index 0000000000000..aea6069310223 --- /dev/null +++ b/paddle/phi/api/include/compat/c10/core/DefaultDtype.cpp @@ -0,0 +1,50 @@ +// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +namespace c10 { +static auto default_dtype = caffe2::TypeMeta::Make(); +static auto default_dtype_as_scalartype = default_dtype.toScalarType(); +static auto default_complex_dtype = + caffe2::TypeMeta::Make>(); + +void set_default_dtype(caffe2::TypeMeta dtype) { + default_dtype = dtype; + default_dtype_as_scalartype = default_dtype.toScalarType(); + switch (default_dtype_as_scalartype) { + case ScalarType::Half: + default_complex_dtype = ScalarType::ComplexHalf; + break; + case ScalarType::Double: + default_complex_dtype = ScalarType::ComplexDouble; + break; + default: + default_complex_dtype = ScalarType::ComplexFloat; + break; + } +} + +const caffe2::TypeMeta get_default_dtype() { return default_dtype; } + +ScalarType get_default_dtype_as_scalartype() { + return default_dtype_as_scalartype; +} + +const caffe2::TypeMeta get_default_complex_dtype() { + return default_complex_dtype; +} +} // namespace c10 diff --git a/paddle/phi/api/include/compat/c10/core/DefaultDtype.h b/paddle/phi/api/include/compat/c10/core/DefaultDtype.h index 02b588476045a..927f7a51eabe5 100644 --- a/paddle/phi/api/include/compat/c10/core/DefaultDtype.h +++ b/paddle/phi/api/include/compat/c10/core/DefaultDtype.h @@ -15,21 +15,16 @@ #pragma once #include -#include -namespace c10 { -static auto default_dtype = ScalarType::Float; -static auto default_complex_dtype = ScalarType::ComplexFloat; - -void inline set_default_dtype(ScalarType dtype) { default_dtype = dtype; } +#include "paddle/common/macros.h" -ScalarType inline get_default_dtype_as_scalartype() { return default_dtype; } +namespace caffe2 { +class TypeMeta; +} // namespace caffe2 -ScalarType inline get_default_complex_dtype() { return default_complex_dtype; } - -/// Returns default dtype as caffe2::TypeMeta (the canonical form, mirrors -/// PyTorch). -inline caffe2::TypeMeta get_default_dtype() { - return caffe2::TypeMeta::fromScalarType(default_dtype); -} +namespace c10 { +PADDLE_API void set_default_dtype(caffe2::TypeMeta dtype); +PADDLE_API const caffe2::TypeMeta get_default_dtype(); +PADDLE_API ScalarType get_default_dtype_as_scalartype(); +PADDLE_API const caffe2::TypeMeta get_default_complex_dtype(); } // namespace c10 diff --git a/paddle/phi/api/include/compat/c10/core/Device.h b/paddle/phi/api/include/compat/c10/core/Device.h index 4e4a8cdb140ed..41b45605ef962 100644 --- a/paddle/phi/api/include/compat/c10/core/Device.h +++ b/paddle/phi/api/include/compat/c10/core/Device.h @@ -33,13 +33,14 @@ using gpuStream_t = hipStream_t; #include #include +#include "paddle/common/macros.h" #include "paddle/phi/core/platform/device/gpu/gpu_info.h" #include "paddle/phi/core/platform/device_event_base.h" namespace c10 { using DeviceIndex = int8_t; -struct Device final { +struct PADDLE_API Device final { using Type = DeviceType; Device() = default; Device(phi::Place place) @@ -161,7 +162,7 @@ struct Device final { } }; -std::ostream& operator<<(std::ostream& stream, const Device& device); +PADDLE_API std::ostream& operator<<(std::ostream& stream, const Device& device); } // namespace c10 diff --git a/paddle/phi/api/include/compat/c10/core/Stream.h b/paddle/phi/api/include/compat/c10/core/Stream.h index e9bcbc939d921..040fb09789454 100644 --- a/paddle/phi/api/include/compat/c10/core/Stream.h +++ b/paddle/phi/api/include/compat/c10/core/Stream.h @@ -22,6 +22,8 @@ #include #include +#include "paddle/common/macros.h" + namespace c10 { using StreamId = int64_t; @@ -32,7 +34,7 @@ struct StreamData3 { DeviceType device_type; }; -class Stream final { +class PADDLE_API Stream final { private: Device device_; StreamId id_; diff --git a/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.h b/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.h index 25a0301783b60..fe42648ea775d 100644 --- a/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.h +++ b/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.h @@ -18,9 +18,9 @@ namespace c10::cuda { -c10::DeviceIndex device_count(); +PADDLE_API c10::DeviceIndex device_count(); -void device_synchronize(); +PADDLE_API void device_synchronize(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void __inline__ stream_synchronize(gpuStream_t stream) { diff --git a/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h b/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h index f51800cbfaf3b..d69536fa292b1 100644 --- a/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h +++ b/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h @@ -24,6 +24,7 @@ #include +#include "paddle/common/macros.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/common/place.h" @@ -165,29 +166,29 @@ inline CUDAStream make_cuda_stream(cudaStream_t raw, * Get the current CUDA stream for the passed CUDA device, or for the * current device if no device index is passed. */ -CUDAStream getCurrentCUDAStream(c10::DeviceIndex device_index = -1); +PADDLE_API CUDAStream getCurrentCUDAStream(c10::DeviceIndex device_index = -1); /** * Get a new stream from the CUDA stream pool. * Priority -1 is high priority, 0 is default/low priority. * Matches PyTorch behavior where negative priority = high priority. */ -CUDAStream getStreamFromPool(const int priority = 0, - c10::DeviceIndex device_index = -1); +PADDLE_API CUDAStream getStreamFromPool(const int priority = 0, + c10::DeviceIndex device_index = -1); /** * Get a new stream from the CUDA stream pool. * Bool overload: true = high priority (-1), false = default priority (0). */ -CUDAStream getStreamFromPool(const bool isHighPriority, - c10::DeviceIndex device_index = -1); +PADDLE_API CUDAStream getStreamFromPool(const bool isHighPriority, + c10::DeviceIndex device_index = -1); #ifdef PADDLE_WITH_HIP -CUDAStream getStreamFromExternal(hipStream_t ext_stream, - c10::DeviceIndex device_index); +PADDLE_API CUDAStream getStreamFromExternal(hipStream_t ext_stream, + c10::DeviceIndex device_index); #else -CUDAStream getStreamFromExternal(cudaStream_t ext_stream, - c10::DeviceIndex device_index); +PADDLE_API CUDAStream getStreamFromExternal(cudaStream_t ext_stream, + c10::DeviceIndex device_index); #endif /** @@ -196,9 +197,9 @@ CUDAStream getStreamFromExternal(cudaStream_t ext_stream, * Keeps the compat c10 stream state aligned with Paddle's GPUContext so * Paddle stream guards and c10 callers observe the same current stream. */ -void setCurrentCUDAStream(CUDAStream stream); +PADDLE_API void setCurrentCUDAStream(CUDAStream stream); -CUDAStream getDefaultCUDAStream(c10::DeviceIndex device_index = -1); +PADDLE_API CUDAStream getDefaultCUDAStream(c10::DeviceIndex device_index = -1); inline std::ostream& operator<<(std::ostream& stream, const CUDAStream& s) { return stream << s.unwrap(); diff --git a/paddle/phi/api/include/compat/c10/util/typeid.h b/paddle/phi/api/include/compat/c10/util/typeid.h index dfaa4db287d76..a58d751abbcb2 100644 --- a/paddle/phi/api/include/compat/c10/util/typeid.h +++ b/paddle/phi/api/include/compat/c10/util/typeid.h @@ -551,9 +551,15 @@ inline std::ostream& operator<<(std::ostream& stream, #define EXPORT_IF_NOT_GCC #endif +#if defined(_MSC_VER) +#define C10_TEMPLATE_API C10_API +#else +#define C10_TEMPLATE_API +#endif + // For use in a .cpp file. #define CAFFE_KNOWN_TYPE(T) \ - template uint16_t TypeMeta::addTypeMetaData(); \ + template C10_TEMPLATE_API uint16_t TypeMeta::addTypeMetaData(); \ template <> \ EXPORT_IF_NOT_GCC uint16_t TypeMeta::_typeMetaData() noexcept { \ static const uint16_t index = addTypeMetaData(); \ @@ -561,11 +567,11 @@ inline std::ostream& operator<<(std::ostream& stream, } // For use in a .cpp file when a declaration in the header is provided. -#define CAFFE_DEFINE_KNOWN_TYPE(T, ident) \ - template uint16_t TypeMeta::addTypeMetaData(); \ - namespace detail { \ - EXPORT_IF_NOT_GCC extern const uint16_t ident##_metadata_index = \ - TypeMeta::addTypeMetaData(); \ +#define CAFFE_DEFINE_KNOWN_TYPE(T, ident) \ + template C10_TEMPLATE_API uint16_t TypeMeta::addTypeMetaData(); \ + namespace detail { \ + EXPORT_IF_NOT_GCC extern const uint16_t ident##_metadata_index = \ + TypeMeta::addTypeMetaData(); \ } /* namespace detail */ // Declaration counterpart: provides an inline fast-path via a detail var. @@ -575,7 +581,7 @@ inline std::ostream& operator<<(std::ostream& stream, // upstream declare/define model. #if defined(_MSC_VER) #define CAFFE_DECLARE_KNOWN_TYPE(T, ident) \ - extern template uint16_t TypeMeta::addTypeMetaData(); \ + extern template C10_API uint16_t TypeMeta::addTypeMetaData(); \ namespace detail { \ extern C10_API const uint16_t ident##_metadata_index; \ } /* namespace detail */ \ diff --git a/paddle/phi/api/include/compat/torch/library.h b/paddle/phi/api/include/compat/torch/library.h index b336f083bc17a..fa6bc830ce286 100644 --- a/paddle/phi/api/include/compat/torch/library.h +++ b/paddle/phi/api/include/compat/torch/library.h @@ -871,7 +871,7 @@ class PADDLE_API OperatorRegistry { } }; -class Library { +class PADDLE_API Library { public: enum Kind { DEF, // TORCH_LIBRARY diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc index 99d3733da0490..b3312fc5a7b64 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.cc +++ b/paddle/phi/backends/dynload/dynamic_loader.cc @@ -173,6 +173,9 @@ static constexpr char* win_cudnn_lib = "cudnn64_" CUDNN_MAJOR_VERSION ".dll"; static constexpr char* win_cublas_lib = "cublas64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR ".dll;cublas64_" CUDA_VERSION_MAJOR ".dll"; +static constexpr char* win_cublaslt_lib = + "cublasLt64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR + ".dll;cublasLt64_" CUDA_VERSION_MAJOR ".dll"; static constexpr char* win_curand_lib = "curand64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR ".dll;curand64_" CUDA_VERSION_MAJOR ".dll;curand64_10.dll"; @@ -524,21 +527,21 @@ void* GetCublasLtDsoHandle() { return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublasLt64_11.dll"); #else return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path}); + FLAGS_cuda_dir, win_cublaslt_lib, true, {cuda_lib_path}); #endif } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) { #ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublasLt64_12.dll"); #else return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path}); + FLAGS_cuda_dir, win_cublaslt_lib, true, {cuda_lib_path}); #endif } else if (CUDA_VERSION >= 13000 && CUDA_VERSION < 14000) { #ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublasLt64_13.dll"); #else return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path}); + FLAGS_cuda_dir, win_cublaslt_lib, true, {cuda_lib_path}); #endif } else { std::string warning_msg( diff --git a/paddle/phi/backends/gpu/gpu_info.h b/paddle/phi/backends/gpu/gpu_info.h index 73f167d7e865a..63c630345c83b 100644 --- a/paddle/phi/backends/gpu/gpu_info.h +++ b/paddle/phi/backends/gpu/gpu_info.h @@ -107,7 +107,7 @@ void GpuMemcpyPeerSync( void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream); //! Blocks until stream has completed all operations. -void GpuStreamSync(gpuStream_t stream); +PADDLE_API void GpuStreamSync(gpuStream_t stream); void GpuDestroyStream(gpuStream_t stream); diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h index 024a7de73eb72..d07575028c1fa 100644 --- a/paddle/phi/core/enforce.h +++ b/paddle/phi/core/enforce.h @@ -95,7 +95,7 @@ std::string GetCompleteTraceBackString(StrType&& what, inline bool is_error(bool stat) { return !stat; } -void ThrowWarnInternal(const std::string& message); +PADDLE_API void ThrowWarnInternal(const std::string& message); #if defined(__CUDA_ARCH__) // For cuda, the assertions can affect performance and it is therefore diff --git a/paddle/phi/core/platform/cuda_graph_with_memory_pool.cc b/paddle/phi/core/platform/cuda_graph_with_memory_pool.cc index be92a20250674..7122b5c5edb9d 100644 --- a/paddle/phi/core/platform/cuda_graph_with_memory_pool.cc +++ b/paddle/phi/core/platform/cuda_graph_with_memory_pool.cc @@ -25,7 +25,24 @@ COMMON_DECLARE_bool(new_executor_use_cuda_graph); namespace paddle::platform { +// Keep capture-state queries out of headers so Windows callers in different +// DLLs observe the same CUDAGraph global state. +bool IsCUDAGraphCapturing() { + return phi::backends::gpu::IsCUDAGraphCapturing(); +} + +phi::Place CUDAGraphCapturingPlace() { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_CUSTOM_DEVICE) + return phi::backends::gpu::CUDAGraph::CapturingPlace(); +#else + PADDLE_THROW(common::errors::Unimplemented( + "CUDA Graph is only supported on NVIDIA GPU device.")); +#endif +} + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + void InitCUDNNRelatedHandle(phi::GPUContext* dev_ctx) { dev_ctx->cudnn_workspace_handle().ResetWorkspace(); diff --git a/paddle/phi/core/platform/cuda_graph_with_memory_pool.h b/paddle/phi/core/platform/cuda_graph_with_memory_pool.h index 7054bb5e7ddb2..8c528b22d9dff 100644 --- a/paddle/phi/core/platform/cuda_graph_with_memory_pool.h +++ b/paddle/phi/core/platform/cuda_graph_with_memory_pool.h @@ -25,6 +25,9 @@ namespace paddle { namespace platform { +PADDLE_API bool IsCUDAGraphCapturing(); +PADDLE_API phi::Place CUDAGraphCapturingPlace(); + // NOTE: These APIs are not thread-safe. #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_CUSTOM_DEVICE) @@ -46,18 +49,6 @@ PADDLE_API void BeginCUDAGraphCapture( int64_t pool_id = CUDAGraph::kInvalidPoolID); #endif -inline phi::Place CUDAGraphCapturingPlace() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_CUSTOM_DEVICE) - return CUDAGraph::CapturingPlace(); -#else - PADDLE_THROW(common::errors::Unimplemented( - "CUDA Graph is only supported on NVIDIA GPU device.")); -#endif -} - -using phi::backends::gpu::IsCUDAGraphCapturing; - using phi::backends::gpu::AddPostResetCallbackIfCapturingCUDAGraph; using phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph; diff --git a/test/cpp/compat/ATen_factory_default_dtype_test.cc b/test/cpp/compat/ATen_factory_default_dtype_test.cc index b982d0e3d96f7..3e27476138578 100644 --- a/test/cpp/compat/ATen_factory_default_dtype_test.cc +++ b/test/cpp/compat/ATen_factory_default_dtype_test.cc @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -29,14 +30,14 @@ namespace { class DefaultDtypeGuard { public: explicit DefaultDtypeGuard(c10::ScalarType dtype) - : previous_(c10::get_default_dtype_as_scalartype()) { - c10::set_default_dtype(dtype); + : previous_(c10::get_default_dtype()) { + c10::set_default_dtype(c10::scalarTypeToTypeMeta(dtype)); } ~DefaultDtypeGuard() { c10::set_default_dtype(previous_); } private: - c10::ScalarType previous_; + caffe2::TypeMeta previous_; }; } // namespace diff --git a/test/cpp/compat/CMakeLists.txt b/test/cpp/compat/CMakeLists.txt index 8e99a773fb051..6368129b02f67 100644 --- a/test/cpp/compat/CMakeLists.txt +++ b/test/cpp/compat/CMakeLists.txt @@ -1,92 +1,89 @@ -if(NOT WIN32) - # c10 core tests (CPU compatible) - cc_test(c10_Device_test SRCS c10_Device_test.cc) - cc_test(c10_DispatchKeySet_test SRCS c10_DispatchKeySet_test.cc) - cc_test(c10_DispatchKey_test SRCS c10_DispatchKey_test.cc) - cc_test(c10_MemoryFormat_test SRCS c10_MemoryFormat_test.cc) - cc_test(c10_ScalarType_test SRCS c10_ScalarType_test.cc) - cc_test(c10_SizesAndStrides_test SRCS c10_SizesAndStrides_test.cc) - cc_test(c10_TensorOptions_test SRCS c10_TensorOptions_test.cc) - cc_test(c10_TypeMeta_test SRCS c10_TypeMeta_test.cc) - cc_test(c10_intrusive_ptr_lifecycle_test - SRCS c10_intrusive_ptr_lifecycle_test.cc) - cc_test(c10_layout_test SRCS c10_layout_test.cc) - cc_test(c10_ptr_test SRCS c10_ptr_test.cc) - cc_test(c10_storage_test SRCS c10_storage_test.cc) +# c10 core tests (CPU compatible) +cc_test(c10_Device_test SRCS c10_Device_test.cc) +cc_test(c10_DispatchKeySet_test SRCS c10_DispatchKeySet_test.cc) +cc_test(c10_DispatchKey_test SRCS c10_DispatchKey_test.cc) +cc_test(c10_MemoryFormat_test SRCS c10_MemoryFormat_test.cc) +cc_test(c10_ScalarType_test SRCS c10_ScalarType_test.cc) +cc_test(c10_SizesAndStrides_test SRCS c10_SizesAndStrides_test.cc) +cc_test(c10_TensorOptions_test SRCS c10_TensorOptions_test.cc) +cc_test(c10_TypeMeta_test SRCS c10_TypeMeta_test.cc) +cc_test(c10_intrusive_ptr_lifecycle_test + SRCS c10_intrusive_ptr_lifecycle_test.cc) +cc_test(c10_layout_test SRCS c10_layout_test.cc) +cc_test(c10_ptr_test SRCS c10_ptr_test.cc) +cc_test(c10_storage_test SRCS c10_storage_test.cc) - # ATen core tests (CPU compatible) - cc_test(ATen_all_test SRCS ATen_all_test.cc) - cc_test(ATen_any_test SRCS ATen_any_test.cc) - cc_test(ATen_as_strided_test SRCS ATen_as_strided_test.cc) - cc_test(ATen_autograd_test SRCS ATen_autograd_test.cc) - cc_test(ATen_chunk_test SRCS ATen_chunk_test.cc) - cc_test(ATen_clamp_test SRCS ATen_clamp_test.cc) - cc_test(ATen_coalesce_test SRCS ATen_coalesce_test.cc) - cc_test(ATen_dense_sparse_conversion_test - SRCS ATen_dense_sparse_conversion_test.cc) - cc_test(ATen_empty_test SRCS ATen_empty_test.cc) - cc_test(ATen_equal_test SRCS ATen_equal_test.cc) - cc_test(ATen_expand_test SRCS ATen_expand_test.cc) - cc_test(ATen_eye_test SRCS ATen_eye_test.cc) - cc_test(ATen_factory_default_dtype_test - SRCS ATen_factory_default_dtype_test.cc) - cc_test(ATen_flatten_test SRCS ATen_flatten_test.cc) - cc_test(ATen_from_blob_test SRCS ATen_from_blob_test.cc) - cc_test(ATen_hook_test SRCS ATen_hook_test.cc) - cc_test(ATen_index_test SRCS ATen_index_test.cc) - cc_test(ATen_item_test SRCS ATen_item_test.cc) - cc_test(ATen_narrow_test SRCS ATen_narrow_test.cc) - cc_test(ATen_new_test SRCS ATen_new_test.cc) - cc_test(ATen_nnz_test SRCS ATen_nnz_test.cc) - cc_test(ATen_rename_test SRCS ATen_rename_test.cc) - cc_test(ATen_reshape_test SRCS ATen_reshape_test.cc) - cc_test(ATen_resize_test SRCS ATen_resize_test.cc) - cc_test(ATen_squeeze_test SRCS ATen_squeeze_test.cc) - cc_test(ATen_std_var_test SRCS ATen_std_var_test.cc) - cc_test(ATen_sum_test SRCS ATen_sum_test.cc) - cc_test(ATen_t_test SRCS ATen_t_test.cc) - cc_test(ATen_tensor_data_test SRCS ATen_tensor_data_test.cc) - cc_test(ATen_toString_test SRCS ATen_toString_test.cc) - cc_test(ATen_to_test SRCS ATen_to_test.cc) - cc_test(ATen_transpose_test SRCS ATen_transpose_test.cc) - cc_test(ATen_Utils_test SRCS ATen_Utils_test.cc) - cc_test(ATen_values_test SRCS ATen_values_test.cc) - cc_test(ATen_viewAs_test SRCS ATen_viewAs_test.cc) +# ATen core tests (CPU compatible) +cc_test(ATen_all_test SRCS ATen_all_test.cc) +cc_test(ATen_any_test SRCS ATen_any_test.cc) +cc_test(ATen_as_strided_test SRCS ATen_as_strided_test.cc) +cc_test(ATen_autograd_test SRCS ATen_autograd_test.cc) +cc_test(ATen_chunk_test SRCS ATen_chunk_test.cc) +cc_test(ATen_clamp_test SRCS ATen_clamp_test.cc) +cc_test(ATen_coalesce_test SRCS ATen_coalesce_test.cc) +cc_test(ATen_dense_sparse_conversion_test + SRCS ATen_dense_sparse_conversion_test.cc) +cc_test(ATen_empty_test SRCS ATen_empty_test.cc) +cc_test(ATen_equal_test SRCS ATen_equal_test.cc) +cc_test(ATen_expand_test SRCS ATen_expand_test.cc) +cc_test(ATen_eye_test SRCS ATen_eye_test.cc) +cc_test(ATen_factory_default_dtype_test SRCS ATen_factory_default_dtype_test.cc) +cc_test(ATen_flatten_test SRCS ATen_flatten_test.cc) +cc_test(ATen_from_blob_test SRCS ATen_from_blob_test.cc) +cc_test(ATen_hook_test SRCS ATen_hook_test.cc) +cc_test(ATen_index_test SRCS ATen_index_test.cc) +cc_test(ATen_item_test SRCS ATen_item_test.cc) +cc_test(ATen_narrow_test SRCS ATen_narrow_test.cc) +cc_test(ATen_new_test SRCS ATen_new_test.cc) +cc_test(ATen_nnz_test SRCS ATen_nnz_test.cc) +cc_test(ATen_rename_test SRCS ATen_rename_test.cc) +cc_test(ATen_reshape_test SRCS ATen_reshape_test.cc) +cc_test(ATen_resize_test SRCS ATen_resize_test.cc) +cc_test(ATen_squeeze_test SRCS ATen_squeeze_test.cc) +cc_test(ATen_std_var_test SRCS ATen_std_var_test.cc) +cc_test(ATen_sum_test SRCS ATen_sum_test.cc) +cc_test(ATen_t_test SRCS ATen_t_test.cc) +cc_test(ATen_tensor_data_test SRCS ATen_tensor_data_test.cc) +cc_test(ATen_toString_test SRCS ATen_toString_test.cc) +cc_test(ATen_to_test SRCS ATen_to_test.cc) +cc_test(ATen_transpose_test SRCS ATen_transpose_test.cc) +cc_test(ATen_Utils_test SRCS ATen_Utils_test.cc) +cc_test(ATen_values_test SRCS ATen_values_test.cc) +cc_test(ATen_viewAs_test SRCS ATen_viewAs_test.cc) - # torch library tests (CPU compatible) - cc_test(torch_library_test SRCS torch_library_test.cc) - cc_test(torch_library_dispatch_test SRCS torch_library_dispatch_test.cc) +# torch library tests (CPU compatible) +cc_test(torch_library_test SRCS torch_library_test.cc) +cc_test(torch_library_dispatch_test SRCS torch_library_dispatch_test.cc) - # GPU-runtime compat tests are not fully audited on ROCm/DCU yet. - # Keep the DCU surface limited to the cases adapted in this PR. - if(WITH_ROCM) - cc_test(ATen_CUDAContext_test SRCS ATen_CUDAContext_test.cc) - cc_test(ATen_record_stream_test SRCS ATen_record_stream_test.cc) - cc_test(c10_Event_test SRCS c10_Event_test.cc) - cc_test(c10_Stream_test SRCS c10_Stream_test.cc) - else() - cc_test(ATen_TensorAccessor_test SRCS ATen_TensorAccessor_test.cc) - cc_test(ATen_basic_test SRCS ATen_basic_test.cc) - cc_test(ATen_local_scalar_dense_test SRCS ATen_local_scalar_dense_test.cc) - cc_test(ATen_memory_test SRCS ATen_memory_test.cc) - cc_test(ATen_pin_memory_creation_test SRCS ATen_pin_memory_creation_test.cc) - cc_test(ATen_record_stream_test SRCS ATen_record_stream_test.cc) - cc_test(ATen_select_test SRCS ATen_select_test.cc) - cc_test(ATen_split_test SRCS ATen_split_test.cc) +# GPU-runtime compat tests are not fully audited on ROCm/DCU yet. +# Keep the DCU surface limited to the cases adapted in this PR. +if(WITH_ROCM) + cc_test(ATen_CUDAContext_test SRCS ATen_CUDAContext_test.cc) + cc_test(ATen_record_stream_test SRCS ATen_record_stream_test.cc) + cc_test(c10_Event_test SRCS c10_Event_test.cc) + cc_test(c10_Stream_test SRCS c10_Stream_test.cc) +else() + cc_test(ATen_TensorAccessor_test SRCS ATen_TensorAccessor_test.cc) + cc_test(ATen_basic_test SRCS ATen_basic_test.cc) + cc_test(ATen_local_scalar_dense_test SRCS ATen_local_scalar_dense_test.cc) + cc_test(ATen_memory_test SRCS ATen_memory_test.cc) + cc_test(ATen_pin_memory_creation_test SRCS ATen_pin_memory_creation_test.cc) + cc_test(ATen_record_stream_test SRCS ATen_record_stream_test.cc) + cc_test(ATen_select_test SRCS ATen_select_test.cc) + cc_test(ATen_split_test SRCS ATen_split_test.cc) - cc_test(ATen_CUDAContext_test SRCS ATen_CUDAContext_test.cc) - cc_test(ATen_philox_test SRCS ATen_philox_test.cc) - cc_test(c10_Event_test SRCS c10_Event_test.cc) - cc_test(c10_Stream_test SRCS c10_Stream_test.cc) - endif() + cc_test(ATen_CUDAContext_test SRCS ATen_CUDAContext_test.cc) + cc_test(ATen_philox_test SRCS ATen_philox_test.cc) + cc_test(c10_Event_test SRCS c10_Event_test.cc) + cc_test(c10_Stream_test SRCS c10_Stream_test.cc) +endif() - if(WITH_GPU) - nv_test(ATen_CUDABlas_test SRCS ATen_CUDABlas_test.cc) - nv_test(ATen_cuda_test SRCS ATen_cuda_test.cc) - nv_test(c10_cuda_generator_test SRCS c10_cuda_generator_test.cc) - nv_test(c10_generator_impl_test SRCS c10_generator_impl_test.cc) - endif() - cc_test(schema_parser_type_test SRCS schema_parser_type_test.cc) +if(WITH_GPU) + nv_test(ATen_CUDABlas_test SRCS ATen_CUDABlas_test.cc) + nv_test(ATen_cuda_test SRCS ATen_cuda_test.cc) + nv_test(c10_cuda_generator_test SRCS c10_cuda_generator_test.cc) + nv_test(c10_generator_impl_test SRCS c10_generator_impl_test.cc) endif() +cc_test(schema_parser_type_test SRCS schema_parser_type_test.cc) add_subdirectory(torch) diff --git a/test/cpp/compat/c10_TensorOptions_test.cc b/test/cpp/compat/c10_TensorOptions_test.cc index 63019c8a4c0ec..070999f2febf8 100644 --- a/test/cpp/compat/c10_TensorOptions_test.cc +++ b/test/cpp/compat/c10_TensorOptions_test.cc @@ -33,14 +33,14 @@ namespace { class DefaultDtypeGuard { public: explicit DefaultDtypeGuard(c10::ScalarType dtype) - : previous_(c10::get_default_dtype_as_scalartype()) { - c10::set_default_dtype(dtype); + : previous_(c10::get_default_dtype()) { + c10::set_default_dtype(c10::scalarTypeToTypeMeta(dtype)); } ~DefaultDtypeGuard() { c10::set_default_dtype(previous_); } private: - c10::ScalarType previous_; + caffe2::TypeMeta previous_; }; } // namespace @@ -154,6 +154,32 @@ TEST(TensorOptionsTest, DtypeDefaultTracksGlobalDefaultDtype) { c10::kDouble); } +TEST(TensorOptionsTest, DefaultComplexDtypeTracksGlobalDefaultDtype) { + { + DefaultDtypeGuard guard(c10::kHalf); + + ASSERT_EQ(c10::get_default_dtype_as_scalartype(), c10::kHalf); + ASSERT_EQ(c10::get_default_complex_dtype().toScalarType(), + c10::ScalarType::ComplexHalf); + } + + { + DefaultDtypeGuard guard(c10::kDouble); + + ASSERT_EQ(c10::get_default_dtype_as_scalartype(), c10::kDouble); + ASSERT_EQ(c10::get_default_complex_dtype().toScalarType(), + c10::ScalarType::ComplexDouble); + } + + { + DefaultDtypeGuard guard(c10::kFloat); + + ASSERT_EQ(c10::get_default_dtype_as_scalartype(), c10::kFloat); + ASSERT_EQ(c10::get_default_complex_dtype().toScalarType(), + c10::ScalarType::ComplexFloat); + } +} + // ---- device ---- TEST(TensorOptionsTest, SetDevice_CPU) { diff --git a/test/cpp/compat/c10_generator_impl_test.cc b/test/cpp/compat/c10_generator_impl_test.cc index 92192b4affca5..b950cb1e0f965 100644 --- a/test/cpp/compat/c10_generator_impl_test.cc +++ b/test/cpp/compat/c10_generator_impl_test.cc @@ -27,13 +27,13 @@ // ---------- Construction ---------------------------------------------------- TEST(GeneratorImplTest, ConstructWithNullptrCreatesDefaultGen) { - c10::GeneratorImpl impl(c10::Device(c10::kCPU)); + c10::GeneratorImpl impl{c10::Device(c10::kCPU)}; ASSERT_NE(impl.paddle_generator(), nullptr); } TEST(GeneratorImplTest, ConstructWithExistingGen) { auto gen = std::make_shared(42u); - c10::GeneratorImpl impl(c10::Device(c10::kCPU), gen); + c10::GeneratorImpl impl{c10::Device(c10::kCPU), gen}; ASSERT_EQ(impl.paddle_generator(), gen); ASSERT_EQ(impl.current_seed(), 42u); } @@ -41,13 +41,13 @@ TEST(GeneratorImplTest, ConstructWithExistingGen) { // ---------- Seed / offset API (base-class versions) ------------------------- TEST(GeneratorImplTest, SetAndGetCurrentSeed) { - c10::GeneratorImpl impl(c10::Device(c10::kCPU)); + c10::GeneratorImpl impl{c10::Device(c10::kCPU)}; impl.set_current_seed(12345); ASSERT_EQ(impl.current_seed(), 12345u); } TEST(GeneratorImplTest, SeedGeneratesNewSeed) { - c10::GeneratorImpl impl(c10::Device(c10::kCPU)); + c10::GeneratorImpl impl{c10::Device(c10::kCPU)}; impl.set_current_seed(1); uint64_t new_seed = impl.seed(); // seed() should return a new random seed (very unlikely to be 1 again). @@ -56,13 +56,13 @@ TEST(GeneratorImplTest, SeedGeneratesNewSeed) { } TEST(GeneratorImplTest, GetOffsetInitiallyZero) { - c10::GeneratorImpl impl(c10::Device(c10::kCPU)); + c10::GeneratorImpl impl{c10::Device(c10::kCPU)}; ASSERT_EQ(impl.get_offset(), 0u); } TEST(GeneratorImplTest, SetOffsetForward) { auto gen = std::make_shared(100u); - c10::GeneratorImpl impl(c10::Device(c10::kCUDA, 0), gen); + c10::GeneratorImpl impl{c10::Device(c10::kCUDA, 0), gen}; impl.set_offset(10); ASSERT_EQ(impl.get_offset(), 10u); @@ -70,7 +70,7 @@ TEST(GeneratorImplTest, SetOffsetForward) { TEST(GeneratorImplTest, SetOffsetBackward) { auto gen = std::make_shared(100u); - c10::GeneratorImpl impl(c10::Device(c10::kCUDA, 0), gen); + c10::GeneratorImpl impl{c10::Device(c10::kCUDA, 0), gen}; impl.set_offset(20); ASSERT_EQ(impl.get_offset(), 20u); @@ -81,7 +81,7 @@ TEST(GeneratorImplTest, SetOffsetBackward) { TEST(GeneratorImplTest, SetOffsetSameValue) { auto gen = std::make_shared(100u); - c10::GeneratorImpl impl(c10::Device(c10::kCUDA, 0), gen); + c10::GeneratorImpl impl{c10::Device(c10::kCUDA, 0), gen}; impl.set_offset(10); impl.set_offset(10); @@ -91,19 +91,19 @@ TEST(GeneratorImplTest, SetOffsetSameValue) { // ---------- Device / DispatchKeySet ----------------------------------------- TEST(GeneratorImplTest, DeviceReturnsCorrectDevice) { - c10::Device cpu_dev(c10::kCPU); - c10::GeneratorImpl impl(cpu_dev); + c10::Device cpu_dev{c10::kCPU}; + c10::GeneratorImpl impl{cpu_dev}; ASSERT_EQ(impl.device(), cpu_dev); } TEST(GeneratorImplTest, KeySetCPU) { - c10::GeneratorImpl impl(c10::Device(c10::kCPU)); + c10::GeneratorImpl impl{c10::Device(c10::kCPU)}; c10::DispatchKeySet ks = impl.key_set(); ASSERT_TRUE(ks.has(c10::DispatchKey::CPU)); } TEST(GeneratorImplTest, KeySetCUDA) { - c10::GeneratorImpl impl(c10::Device(c10::kCUDA, 0)); + c10::GeneratorImpl impl{c10::Device(c10::kCUDA, 0)}; c10::DispatchKeySet ks = impl.key_set(); ASSERT_TRUE(ks.has(c10::DispatchKey::CUDA)); } @@ -111,7 +111,7 @@ TEST(GeneratorImplTest, KeySetCUDA) { TEST(GeneratorImplTest, KeySetOtherDevice) { // Use kCUSTOM which is neither CPU nor CUDA to exercise the fallback // branch that returns an empty DispatchKeySet. - c10::GeneratorImpl impl(c10::Device(c10::kCUSTOM, 0)); + c10::GeneratorImpl impl{c10::Device(c10::kCUSTOM, 0)}; c10::DispatchKeySet ks = impl.key_set(); ASSERT_FALSE(ks.has(c10::DispatchKey::CPU)); ASSERT_FALSE(ks.has(c10::DispatchKey::CUDA)); @@ -121,7 +121,7 @@ TEST(GeneratorImplTest, KeySetOtherDevice) { TEST(GeneratorImplTest, ClonePreservesState) { auto gen = std::make_shared(42u); - c10::GeneratorImpl impl(c10::Device(c10::kCPU), gen); + c10::GeneratorImpl impl{c10::Device(c10::kCPU), gen}; impl.set_current_seed(777); auto cloned = impl.clone(); @@ -137,12 +137,12 @@ TEST(GeneratorImplTest, ClonePreservesState) { // ---------- PyObject binding ------------------------------------------------ TEST(GeneratorImplTest, PyObjDefaultNull) { - c10::GeneratorImpl impl(c10::Device(c10::kCPU)); + c10::GeneratorImpl impl{c10::Device(c10::kCPU)}; ASSERT_EQ(impl.pyobj(), nullptr); } TEST(GeneratorImplTest, SetAndGetPyObj) { - c10::GeneratorImpl impl(c10::Device(c10::kCPU)); + c10::GeneratorImpl impl{c10::Device(c10::kCPU)}; // Use a dummy pointer (we never dereference it). int dummy = 0; @@ -183,7 +183,7 @@ TEST(GeneratorImplTest, MoveIntrusivePtrKeepsRefcount) { TEST(GeneratorImplTest, PaddleGeneratorAccessor) { auto gen = std::make_shared(99u); - c10::GeneratorImpl impl(c10::Device(c10::kCPU), gen); + c10::GeneratorImpl impl{c10::Device(c10::kCPU), gen}; ASSERT_EQ(impl.paddle_generator(), gen); ASSERT_EQ(impl.paddle_generator()->GetCurrentSeed(), 99u); } diff --git a/test/cpp/compat/torch_library_test.cc b/test/cpp/compat/torch_library_test.cc index e1b96502fc2bd..1db9fa7d37470 100644 --- a/test/cpp/compat/torch_library_test.cc +++ b/test/cpp/compat/torch_library_test.cc @@ -971,12 +971,10 @@ TEST(test_torch_library, TestLibraryPrintInfoWithDispatchKey) { __FILE__, __LINE__); - std::ostringstream captured_output; - auto* original_buffer = std::cout.rdbuf(captured_output.rdbuf()); + testing::internal::CaptureStdout(); library.print_info(); - std::cout.rdbuf(original_buffer); + auto output = testing::internal::GetCapturedStdout(); - auto output = captured_output.str(); ASSERT_NE(output.find("Library Info: IMPL"), std::string::npos); ASSERT_NE(output.find("namespace=runtime_library_info"), std::string::npos); ASSERT_NE(output.find("dispatch_key="), std::string::npos);