Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 15 additions & 14 deletions paddle/phi/api/include/compat/ATen/Utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

#include <algorithm>

#include "paddle/common/macros.h"
#include "paddle/phi/api/include/sparse_api.h"
#include "paddle/phi/api/include/tensor.h"

Expand Down Expand Up @@ -71,24 +72,24 @@ Tensor tensor_complex_backend(ArrayRef<T> values,

} // namespace detail

#define TENSOR(T, _1) \
Tensor tensor(ArrayRef<T> values, const TensorOptions& options) { \
if (options.device().type() != c10::DeviceType::CPU) { \
return at::detail::tensor_backend(values, options); \
} else { \
return at::detail::tensor_cpu(values, options); \
} \
#define TENSOR(T, _1) \
PADDLE_API Tensor tensor(ArrayRef<T> values, const TensorOptions& options) { \
if (options.device().type() != c10::DeviceType::CPU) { \
return at::detail::tensor_backend(values, options); \
} else { \
return at::detail::tensor_cpu(values, options); \
} \
}
AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TENSOR)
#undef TENSOR

#define TENSOR(T, _1) \
Tensor tensor(ArrayRef<T> values, const TensorOptions& options) { \
if (options.device().type() != c10::DeviceType::CPU) { \
return at::detail::tensor_complex_backend(values, options); \
} else { \
return at::detail::tensor_complex_cpu(values, options); \
} \
#define TENSOR(T, _1) \
PADDLE_API Tensor tensor(ArrayRef<T> values, const TensorOptions& options) { \
if (options.device().type() != c10::DeviceType::CPU) { \
return at::detail::tensor_complex_backend(values, options); \
} else { \
return at::detail::tensor_complex_cpu(values, options); \
} \
}
AT_FORALL_COMPLEX_TYPES(TENSOR)
#undef TENSOR
Expand Down
16 changes: 10 additions & 6 deletions paddle/phi/api/include/compat/ATen/cuda/CUDABlas.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
#include <ATen/OpMathType.h>
#include <ATen/cuda/CUDAContext.h>

#include "paddle/common/macros.h"

namespace at::cuda::blas {

/* LEVEL 3 BLAS FUNCTIONS */
Expand All @@ -54,16 +56,18 @@ inline void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) {
}

template <>
void gemm<double>(CUDABLAS_GEMM_ARGTYPES(double));
PADDLE_API void gemm<double>(CUDABLAS_GEMM_ARGTYPES(double));
template <>
void gemm<float>(CUDABLAS_GEMM_ARGTYPES(float));
PADDLE_API void gemm<float>(CUDABLAS_GEMM_ARGTYPES(float));
template <>
void gemm<c10::complex<double>>(CUDABLAS_GEMM_ARGTYPES(c10::complex<double>));
PADDLE_API void gemm<c10::complex<double>>(
CUDABLAS_GEMM_ARGTYPES(c10::complex<double>));
template <>
void gemm<c10::complex<float>>(CUDABLAS_GEMM_ARGTYPES(c10::complex<float>));
PADDLE_API void gemm<c10::complex<float>>(
CUDABLAS_GEMM_ARGTYPES(c10::complex<float>));
template <>
void gemm<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half));
PADDLE_API void gemm<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half));
template <>
void gemm<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16));
PADDLE_API void gemm<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16));

} // namespace at::cuda::blas
34 changes: 18 additions & 16 deletions paddle/phi/api/include/compat/ATen/cuda/CUDAContextLight.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
#include <shared_mutex>
#include <tuple>

#include "paddle/common/macros.h"
#include "paddle/phi/backends/gpu/forwards.h"

namespace c10 {
Expand Down Expand Up @@ -95,40 +96,41 @@ inline int64_t getNumGPUs() { return c10::cuda::device_count(); }
inline bool is_available() { return c10::cuda::device_count() > 0; }

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
CUDAContextDeviceProp* getCurrentDeviceProperties();
PADDLE_API CUDAContextDeviceProp* getCurrentDeviceProperties();

int warp_size();
PADDLE_API int warp_size();

CUDAContextDeviceProp* getDeviceProperties(c10::DeviceIndex device);
PADDLE_API CUDAContextDeviceProp* getDeviceProperties(c10::DeviceIndex device);

bool canDeviceAccessPeer(c10::DeviceIndex device, c10::DeviceIndex peer_device);
PADDLE_API bool canDeviceAccessPeer(c10::DeviceIndex device,
c10::DeviceIndex peer_device);

/* Handles */
CUDAContextSparseHandle getCurrentCUDASparseHandle();
CUDAContextBlasHandle getCurrentCUDABlasHandle();
CUDAContextBlasLtHandle getCurrentCUDABlasLtHandle();
PADDLE_API CUDAContextSparseHandle getCurrentCUDASparseHandle();
PADDLE_API CUDAContextBlasHandle getCurrentCUDABlasHandle();
PADDLE_API CUDAContextBlasLtHandle getCurrentCUDABlasLtHandle();

void clearCublasWorkspaces();
PADDLE_API void clearCublasWorkspaces();
struct WorkspaceMapWithMutex {
std::map<std::tuple<void*, void*>, at::DataPtr> map;
std::shared_mutex mutex;
};

WorkspaceMapWithMutex& cublas_handle_stream_to_workspace();
WorkspaceMapWithMutex& cublaslt_handle_stream_to_workspace();
size_t getChosenWorkspaceSize();
size_t getCUDABlasLtWorkspaceSize();
void* getCUDABlasLtWorkspace();
PADDLE_API WorkspaceMapWithMutex& cublas_handle_stream_to_workspace();
PADDLE_API WorkspaceMapWithMutex& cublaslt_handle_stream_to_workspace();
PADDLE_API size_t getChosenWorkspaceSize();
PADDLE_API size_t getCUDABlasLtWorkspaceSize();
PADDLE_API void* getCUDABlasLtWorkspace();

CUDAContextSolverHandle getCurrentCUDASolverDnHandle();
PADDLE_API CUDAContextSolverHandle getCurrentCUDASolverDnHandle();

#if defined(USE_CUDSS)
cudssHandle_t getCurrentCudssHandle();
PADDLE_API cudssHandle_t getCurrentCudssHandle();
#endif

// Get the CUDA device allocator for the current device.
// Returns a pointer to a c10::Allocator that allocates GPU memory.
c10::Allocator* getCUDADeviceAllocator();
PADDLE_API c10::Allocator* getCUDADeviceAllocator();
#endif

} // namespace at::cuda
14 changes: 9 additions & 5 deletions paddle/phi/api/include/compat/ATen/cuda/EmptyTensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,18 @@
#pragma once
#include <ATen/core/TensorBody.h>

#include "paddle/common/macros.h"

namespace at::detail {

using at::Tensor;
at::Tensor empty_cuda(IntArrayRef size,
ScalarType dtype,
std::optional<Device> device_opt,
std::optional<c10::MemoryFormat> memory_format_opt);
PADDLE_API at::Tensor empty_cuda(
IntArrayRef size,
ScalarType dtype,
std::optional<Device> device_opt,
std::optional<c10::MemoryFormat> memory_format_opt);

at::Tensor empty_cuda(IntArrayRef size, const TensorOptions &options);
PADDLE_API at::Tensor empty_cuda(IntArrayRef size,
const TensorOptions &options);

} // namespace at::detail
32 changes: 17 additions & 15 deletions paddle/phi/api/include/compat/ATen/ops/tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,23 +20,25 @@
#include <ATen/core/Tensor.h>
#include <c10/core/ScalarType.h>

#include "paddle/common/macros.h"

namespace at {

#define TENSOR(T, S) \
Tensor tensor(ArrayRef<T> values, const TensorOptions& options); \
inline Tensor tensor(std::initializer_list<T> values, \
const TensorOptions& options) { \
return at::tensor(ArrayRef<T>(values), options); \
} \
inline Tensor tensor(T value, const TensorOptions& options) { \
return at::tensor(ArrayRef<T>(value), options); \
} \
inline Tensor tensor(ArrayRef<T> values) { \
return at::tensor(std::move(values), at::dtype(k##S)); \
} \
inline Tensor tensor(std::initializer_list<T> values) { \
return at::tensor(ArrayRef<T>(values)); \
} \
#define TENSOR(T, S) \
PADDLE_API Tensor tensor(ArrayRef<T> values, const TensorOptions& options); \
inline Tensor tensor(std::initializer_list<T> values, \
const TensorOptions& options) { \
return at::tensor(ArrayRef<T>(values), options); \
} \
inline Tensor tensor(T value, const TensorOptions& options) { \
return at::tensor(ArrayRef<T>(value), options); \
} \
inline Tensor tensor(ArrayRef<T> values) { \
return at::tensor(std::move(values), at::dtype(k##S)); \
} \
inline Tensor tensor(std::initializer_list<T> values) { \
return at::tensor(ArrayRef<T>(values)); \
} \
inline Tensor tensor(T value) { return at::tensor(ArrayRef<T>(value)); }
AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TENSOR)
AT_FORALL_COMPLEX_TYPES(TENSOR)
Expand Down
1 change: 1 addition & 0 deletions paddle/phi/api/include/compat/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
collect_srcs(api_srcs SRCS c10/core/Device.cpp)
collect_srcs(api_srcs SRCS c10/core/DefaultDtype.cpp)
collect_srcs(api_srcs SRCS c10/core/Stream.cpp)
collect_srcs(api_srcs SRCS c10/cuda/CUDAFunctions.cpp)
collect_srcs(api_srcs SRCS c10/cuda/CUDAStream.cpp)
Expand Down
50 changes: 50 additions & 0 deletions paddle/phi/api/include/compat/c10/core/DefaultDtype.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <c10/core/DefaultDtype.h>
#include <c10/util/complex.h>
#include <c10/util/typeid.h>

namespace c10 {

// Process-wide default dtype state for this translation unit. Mirrors
// PyTorch's c10 defaults: float for real tensors, complex<float> for
// complex tensors. The scalar-type cache is derived from the TypeMeta so
// the two never disagree.
static caffe2::TypeMeta g_default_dtype = caffe2::TypeMeta::Make<float>();
static ScalarType g_default_scalar_type = g_default_dtype.toScalarType();
static caffe2::TypeMeta g_default_complex_dtype =
    caffe2::TypeMeta::Make<c10::complex<float>>();

// Sets the global default dtype and keeps the cached scalar type and the
// matching complex dtype in sync (Half -> ComplexHalf, Double ->
// ComplexDouble, everything else -> ComplexFloat).
void set_default_dtype(caffe2::TypeMeta dtype) {
  g_default_dtype = dtype;
  g_default_scalar_type = g_default_dtype.toScalarType();
  if (g_default_scalar_type == ScalarType::Half) {
    g_default_complex_dtype = ScalarType::ComplexHalf;
  } else if (g_default_scalar_type == ScalarType::Double) {
    g_default_complex_dtype = ScalarType::ComplexDouble;
  } else {
    g_default_complex_dtype = ScalarType::ComplexFloat;
  }
}

// Returns the current default dtype as a caffe2::TypeMeta (canonical form).
const caffe2::TypeMeta get_default_dtype() { return g_default_dtype; }

// Returns the current default dtype as a ScalarType (cached, no conversion).
ScalarType get_default_dtype_as_scalartype() { return g_default_scalar_type; }

// Returns the complex dtype paired with the current default real dtype.
const caffe2::TypeMeta get_default_complex_dtype() {
  return g_default_complex_dtype;
}
} // namespace c10
23 changes: 9 additions & 14 deletions paddle/phi/api/include/compat/c10/core/DefaultDtype.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,21 +15,16 @@
#pragma once

#include <c10/core/ScalarType.h>
#include <c10/util/typeid.h>

namespace c10 {
static auto default_dtype = ScalarType::Float;
static auto default_complex_dtype = ScalarType::ComplexFloat;

void inline set_default_dtype(ScalarType dtype) { default_dtype = dtype; }
#include "paddle/common/macros.h"

ScalarType inline get_default_dtype_as_scalartype() { return default_dtype; }
namespace caffe2 {
class TypeMeta;
} // namespace caffe2

ScalarType inline get_default_complex_dtype() { return default_complex_dtype; }

/// Returns default dtype as caffe2::TypeMeta (the canonical form, mirrors
/// PyTorch).
inline caffe2::TypeMeta get_default_dtype() {
return caffe2::TypeMeta::fromScalarType(default_dtype);
}
namespace c10 {
// Sets the process-wide default dtype; also updates the scalar-type cache
// and the matching default complex dtype (see DefaultDtype.cpp).
PADDLE_API void set_default_dtype(caffe2::TypeMeta dtype);
// Current default dtype in canonical caffe2::TypeMeta form.
PADDLE_API const caffe2::TypeMeta get_default_dtype();
// Current default dtype as a ScalarType (no conversion at call time).
PADDLE_API ScalarType get_default_dtype_as_scalartype();
// Complex dtype paired with the current default real dtype.
PADDLE_API const caffe2::TypeMeta get_default_complex_dtype();
} // namespace c10
5 changes: 3 additions & 2 deletions paddle/phi/api/include/compat/c10/core/Device.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,14 @@ using gpuStream_t = hipStream_t;
#include <string>
#include <utility>

#include "paddle/common/macros.h"
#include "paddle/phi/core/platform/device/gpu/gpu_info.h"
#include "paddle/phi/core/platform/device_event_base.h"

namespace c10 {
using DeviceIndex = int8_t;

struct Device final {
struct PADDLE_API Device final {
using Type = DeviceType;
Device() = default;
Device(phi::Place place)
Expand Down Expand Up @@ -161,7 +162,7 @@ struct Device final {
}
};

std::ostream& operator<<(std::ostream& stream, const Device& device);
PADDLE_API std::ostream& operator<<(std::ostream& stream, const Device& device);

} // namespace c10

Expand Down
4 changes: 3 additions & 1 deletion paddle/phi/api/include/compat/c10/core/Stream.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
#include <functional>
#include <ostream>

#include "paddle/common/macros.h"

namespace c10 {

using StreamId = int64_t;
Expand All @@ -32,7 +34,7 @@ struct StreamData3 {
DeviceType device_type;
};

class Stream final {
class PADDLE_API Stream final {
private:
Device device_;
StreamId id_;
Expand Down
4 changes: 2 additions & 2 deletions paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@

namespace c10::cuda {

c10::DeviceIndex device_count();
PADDLE_API c10::DeviceIndex device_count();

void device_synchronize();
PADDLE_API void device_synchronize();

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void __inline__ stream_synchronize(gpuStream_t stream) {
Expand Down
Loading
Loading