diff --git a/paddle/phi/api/include/compat/ATen/Utils.cpp b/paddle/phi/api/include/compat/ATen/Utils.cpp
index 6e2d24477cdc4..2f360a4fb079a 100644
--- a/paddle/phi/api/include/compat/ATen/Utils.cpp
+++ b/paddle/phi/api/include/compat/ATen/Utils.cpp
@@ -24,6 +24,7 @@
 
 #include <algorithm>
 
+#include "paddle/common/macros.h"
 #include "paddle/phi/api/include/sparse_api.h"
 #include "paddle/phi/api/include/tensor.h"
 
@@ -71,24 +72,24 @@ Tensor tensor_complex_backend(ArrayRef<T> values,
 
 }  // namespace detail
 
-#define TENSOR(T, _1)                                               \
-  Tensor tensor(ArrayRef<T> values, const TensorOptions& options) { \
-    if (options.device().type() != c10::DeviceType::CPU) {          \
-      return at::detail::tensor_backend(values, options);           \
-    } else {                                                        \
-      return at::detail::tensor_cpu(values, options);               \
-    }                                                               \
+#define TENSOR(T, _1)                                                          \
+  PADDLE_API Tensor tensor(ArrayRef<T> values, const TensorOptions& options) { \
+    if (options.device().type() != c10::DeviceType::CPU) {                     \
+      return at::detail::tensor_backend(values, options);                      \
+    } else {                                                                   \
+      return at::detail::tensor_cpu(values, options);                          \
+    }                                                                          \
   }
 AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TENSOR)
 #undef TENSOR
 
-#define TENSOR(T, _1)                                               \
-  Tensor tensor(ArrayRef<T> values, const TensorOptions& options) { \
-    if (options.device().type() != c10::DeviceType::CPU) {          \
-      return at::detail::tensor_complex_backend(values, options);   \
-    } else {                                                        \
-      return at::detail::tensor_complex_cpu(values, options);       \
-    }                                                               \
+#define TENSOR(T, _1)                                                          \
+  PADDLE_API Tensor tensor(ArrayRef<T> values, const TensorOptions& options) { \
+    if (options.device().type() != c10::DeviceType::CPU) {                     \
+      return at::detail::tensor_complex_backend(values, options);              \
+    } else {                                                                   \
+      return at::detail::tensor_complex_cpu(values, options);                  \
+    }                                                                          \
   }
 AT_FORALL_COMPLEX_TYPES(TENSOR)
 #undef TENSOR
diff --git a/paddle/phi/api/include/compat/ATen/cuda/CUDABlas.h b/paddle/phi/api/include/compat/ATen/cuda/CUDABlas.h
index 6c785e334a330..9ef86fa0b19ac 100644
--- a/paddle/phi/api/include/compat/ATen/cuda/CUDABlas.h
+++ b/paddle/phi/api/include/compat/ATen/cuda/CUDABlas.h
@@ -31,6 +31,8 @@
 #include <ATen/OpMathType.h>
 #include <ATen/cuda/CUDAContext.h>
 
+#include "paddle/common/macros.h"
+
 namespace at::cuda::blas {
 
 /* LEVEL 3 BLAS FUNCTIONS */
@@ -54,16 +56,18 @@ inline void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) {
 }
 
 template <>
-void gemm<double>(CUDABLAS_GEMM_ARGTYPES(double));
+PADDLE_API void gemm<double>(CUDABLAS_GEMM_ARGTYPES(double));
 template <>
-void gemm<float>(CUDABLAS_GEMM_ARGTYPES(float));
+PADDLE_API void gemm<float>(CUDABLAS_GEMM_ARGTYPES(float));
 template <>
-void gemm<c10::complex<double>>(CUDABLAS_GEMM_ARGTYPES(c10::complex<double>));
+PADDLE_API void gemm<c10::complex<double>>(
+    CUDABLAS_GEMM_ARGTYPES(c10::complex<double>));
 template <>
-void gemm<c10::complex<float>>(CUDABLAS_GEMM_ARGTYPES(c10::complex<float>));
+PADDLE_API void gemm<c10::complex<float>>(
+    CUDABLAS_GEMM_ARGTYPES(c10::complex<float>));
 template <>
-void gemm<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half));
+PADDLE_API void gemm<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half));
 template <>
-void gemm<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16));
+PADDLE_API void gemm<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16));
 
 }  // namespace at::cuda::blas
diff --git a/paddle/phi/api/include/compat/ATen/cuda/CUDAContextLight.h b/paddle/phi/api/include/compat/ATen/cuda/CUDAContextLight.h
index a8f67766140b4..e9fb5e0715a52 100644
--- a/paddle/phi/api/include/compat/ATen/cuda/CUDAContextLight.h
+++ b/paddle/phi/api/include/compat/ATen/cuda/CUDAContextLight.h
@@ -39,6 +39,7 @@
 #include <shared_mutex>
 #include <tuple>
 
+#include "paddle/common/macros.h"
 #include "paddle/phi/backends/gpu/forwards.h"
 
 namespace c10 {
@@ -95,40 +96,41 @@ inline int64_t getNumGPUs() { return c10::cuda::device_count(); }
 inline bool is_available() { return c10::cuda::device_count() > 0; }
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-CUDAContextDeviceProp* getCurrentDeviceProperties();
+PADDLE_API CUDAContextDeviceProp* getCurrentDeviceProperties();
 
-int warp_size();
+PADDLE_API int warp_size();
 
-CUDAContextDeviceProp* getDeviceProperties(c10::DeviceIndex device);
+PADDLE_API CUDAContextDeviceProp* getDeviceProperties(c10::DeviceIndex device);
 
-bool canDeviceAccessPeer(c10::DeviceIndex device, c10::DeviceIndex peer_device);
+PADDLE_API bool canDeviceAccessPeer(c10::DeviceIndex device,
+                                    c10::DeviceIndex peer_device);
 
 /* Handles */
-CUDAContextSparseHandle getCurrentCUDASparseHandle();
-CUDAContextBlasHandle getCurrentCUDABlasHandle();
-CUDAContextBlasLtHandle getCurrentCUDABlasLtHandle();
+PADDLE_API CUDAContextSparseHandle getCurrentCUDASparseHandle();
+PADDLE_API CUDAContextBlasHandle getCurrentCUDABlasHandle();
+PADDLE_API CUDAContextBlasLtHandle getCurrentCUDABlasLtHandle();
 
-void clearCublasWorkspaces();
+PADDLE_API void clearCublasWorkspaces();
 struct WorkspaceMapWithMutex {
   std::map<std::tuple<void*, void*>, at::DataPtr> map;
   std::shared_mutex mutex;
 };
 
-WorkspaceMapWithMutex& cublas_handle_stream_to_workspace();
-WorkspaceMapWithMutex& cublaslt_handle_stream_to_workspace();
-size_t getChosenWorkspaceSize();
-size_t getCUDABlasLtWorkspaceSize();
-void* getCUDABlasLtWorkspace();
+PADDLE_API WorkspaceMapWithMutex& cublas_handle_stream_to_workspace();
+PADDLE_API WorkspaceMapWithMutex& cublaslt_handle_stream_to_workspace();
+PADDLE_API size_t getChosenWorkspaceSize();
+PADDLE_API size_t getCUDABlasLtWorkspaceSize();
+PADDLE_API void* getCUDABlasLtWorkspace();
 
-CUDAContextSolverHandle getCurrentCUDASolverDnHandle();
+PADDLE_API CUDAContextSolverHandle getCurrentCUDASolverDnHandle();
 
 #if defined(USE_CUDSS)
-cudssHandle_t getCurrentCudssHandle();
+PADDLE_API cudssHandle_t getCurrentCudssHandle();
 #endif
 
 // Get the CUDA device allocator for the current device.
 // Returns a pointer to a c10::Allocator that allocates GPU memory.
-c10::Allocator* getCUDADeviceAllocator();
+PADDLE_API c10::Allocator* getCUDADeviceAllocator();
 #endif
 
 }  // namespace at::cuda
diff --git a/paddle/phi/api/include/compat/ATen/cuda/EmptyTensor.h b/paddle/phi/api/include/compat/ATen/cuda/EmptyTensor.h
index 080f355994c78..4e3dffa1239b1 100644
--- a/paddle/phi/api/include/compat/ATen/cuda/EmptyTensor.h
+++ b/paddle/phi/api/include/compat/ATen/cuda/EmptyTensor.h
@@ -15,14 +15,18 @@
 #pragma once
 #include <ATen/core/TensorBody.h>
 
+#include "paddle/common/macros.h"
+
 namespace at::detail {
 
 using at::Tensor;
-at::Tensor empty_cuda(IntArrayRef size,
-                      ScalarType dtype,
-                      std::optional<Device> device_opt,
-                      std::optional<c10::MemoryFormat> memory_format_opt);
+PADDLE_API at::Tensor empty_cuda(
+    IntArrayRef size,
+    ScalarType dtype,
+    std::optional<Device> device_opt,
+    std::optional<c10::MemoryFormat> memory_format_opt);
 
-at::Tensor empty_cuda(IntArrayRef size, const TensorOptions &options);
+PADDLE_API at::Tensor empty_cuda(IntArrayRef size,
+                                 const TensorOptions &options);
 
 }  // namespace at::detail
diff --git a/paddle/phi/api/include/compat/ATen/ops/tensor.h b/paddle/phi/api/include/compat/ATen/ops/tensor.h
index 947c1c4a0f5de..f22457851a3fa 100644
--- a/paddle/phi/api/include/compat/ATen/ops/tensor.h
+++ b/paddle/phi/api/include/compat/ATen/ops/tensor.h
@@ -20,23 +20,25 @@
 #include <ATen/core/Tensor.h>
 #include <c10/core/ScalarType.h>
 
+#include "paddle/common/macros.h"
+
 namespace at {
 
-#define TENSOR(T, S)                                               \
-  Tensor tensor(ArrayRef<T> values, const TensorOptions& options); \
-  inline Tensor tensor(std::initializer_list<T> values,            \
-                       const TensorOptions& options) {             \
-    return at::tensor(ArrayRef<T>(values), options);               \
-  }                                                                \
-  inline Tensor tensor(T value, const TensorOptions& options) {    \
-    return at::tensor(ArrayRef<T>(value), options);                \
-  }                                                                \
-  inline Tensor tensor(ArrayRef<T> values) {                       \
-    return at::tensor(std::move(values), at::dtype(k##S));         \
-  }                                                                \
-  inline Tensor tensor(std::initializer_list<T> values) {          \
-    return at::tensor(ArrayRef<T>(values));                        \
-  }                                                                \
+#define TENSOR(T, S)                                                          \
+  PADDLE_API Tensor tensor(ArrayRef<T> values, const TensorOptions& options); \
+  inline Tensor tensor(std::initializer_list<T> values,                       \
+                       const TensorOptions& options) {                        \
+    return at::tensor(ArrayRef<T>(values), options);                          \
+  }                                                                           \
+  inline Tensor tensor(T value, const TensorOptions& options) {               \
+    return at::tensor(ArrayRef<T>(value), options);                           \
+  }                                                                           \
+  inline Tensor tensor(ArrayRef<T> values) {                                  \
+    return at::tensor(std::move(values), at::dtype(k##S));                    \
+  }                                                                           \
+  inline Tensor tensor(std::initializer_list<T> values) {                     \
+    return at::tensor(ArrayRef<T>(values));                                   \
+  }                                                                           \
   inline Tensor tensor(T value) { return at::tensor(ArrayRef<T>(value)); }
 AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TENSOR)
 AT_FORALL_COMPLEX_TYPES(TENSOR)
diff --git a/paddle/phi/api/include/compat/CMakeLists.txt b/paddle/phi/api/include/compat/CMakeLists.txt
index f5ccb62aa4554..060d9debc21c7 100644
--- a/paddle/phi/api/include/compat/CMakeLists.txt
+++ b/paddle/phi/api/include/compat/CMakeLists.txt
@@ -1,4 +1,5 @@
 collect_srcs(api_srcs SRCS c10/core/Device.cpp)
+collect_srcs(api_srcs SRCS c10/core/DefaultDtype.cpp)
 collect_srcs(api_srcs SRCS c10/core/Stream.cpp)
 collect_srcs(api_srcs SRCS c10/cuda/CUDAFunctions.cpp)
 collect_srcs(api_srcs SRCS c10/cuda/CUDAStream.cpp)
diff --git a/paddle/phi/api/include/compat/c10/core/DefaultDtype.cpp b/paddle/phi/api/include/compat/c10/core/DefaultDtype.cpp
new file mode 100644
index 0000000000000..aea6069310223
--- /dev/null
+++ b/paddle/phi/api/include/compat/c10/core/DefaultDtype.cpp
@@ -0,0 +1,50 @@
+// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <c10/core/DefaultDtype.h>
+#include <c10/util/complex.h>
+#include <c10/util/typeid.h>
+
+namespace c10 {
+static auto default_dtype = caffe2::TypeMeta::Make<float>();
+static auto default_dtype_as_scalartype = default_dtype.toScalarType();
+static auto default_complex_dtype =
+    caffe2::TypeMeta::Make<c10::complex<float>>();
+
+void set_default_dtype(caffe2::TypeMeta dtype) {
+  default_dtype = dtype;
+  default_dtype_as_scalartype = default_dtype.toScalarType();
+  switch (default_dtype_as_scalartype) {  // keep the complex default in sync
+    case ScalarType::Half:
+      default_complex_dtype = ScalarType::ComplexHalf;
+      break;
+    case ScalarType::Double:
+      default_complex_dtype = ScalarType::ComplexDouble;
+      break;
+    default:  // every other dtype maps to ComplexFloat
+      default_complex_dtype = ScalarType::ComplexFloat;
+      break;
+  }
+}
+
+const caffe2::TypeMeta get_default_dtype() { return default_dtype; }
+
+ScalarType get_default_dtype_as_scalartype() {
+  return default_dtype_as_scalartype;  // kept in sync by set_default_dtype()
+}
+
+const caffe2::TypeMeta get_default_complex_dtype() {
+  return default_complex_dtype;  // updated by set_default_dtype()
+}
+}  // namespace c10
diff --git a/paddle/phi/api/include/compat/c10/core/DefaultDtype.h b/paddle/phi/api/include/compat/c10/core/DefaultDtype.h
index 02b588476045a..927f7a51eabe5 100644
--- a/paddle/phi/api/include/compat/c10/core/DefaultDtype.h
+++ b/paddle/phi/api/include/compat/c10/core/DefaultDtype.h
@@ -15,21 +15,16 @@
 #pragma once
 
 #include <c10/core/ScalarType.h>
-#include <c10/util/typeid.h>
 
-namespace c10 {
-static auto default_dtype = ScalarType::Float;
-static auto default_complex_dtype = ScalarType::ComplexFloat;
-
-void inline set_default_dtype(ScalarType dtype) { default_dtype = dtype; }
+#include "paddle/common/macros.h"
 
-ScalarType inline get_default_dtype_as_scalartype() { return default_dtype; }
+namespace caffe2 {
+class TypeMeta;
+}  // namespace caffe2
 
-ScalarType inline get_default_complex_dtype() { return default_complex_dtype; }
-
-/// Returns default dtype as caffe2::TypeMeta (the canonical form, mirrors
-/// PyTorch).
-inline caffe2::TypeMeta get_default_dtype() {
-  return caffe2::TypeMeta::fromScalarType(default_dtype);
-}
+namespace c10 {
+PADDLE_API void set_default_dtype(caffe2::TypeMeta dtype);
+PADDLE_API const caffe2::TypeMeta get_default_dtype();
+PADDLE_API ScalarType get_default_dtype_as_scalartype();
+PADDLE_API const caffe2::TypeMeta get_default_complex_dtype();
 }  // namespace c10
diff --git a/paddle/phi/api/include/compat/c10/core/Device.h b/paddle/phi/api/include/compat/c10/core/Device.h
index 4e4a8cdb140ed..41b45605ef962 100644
--- a/paddle/phi/api/include/compat/c10/core/Device.h
+++ b/paddle/phi/api/include/compat/c10/core/Device.h
@@ -33,13 +33,14 @@ using gpuStream_t = hipStream_t;
 #include <string>
 #include <utility>
 
+#include "paddle/common/macros.h"
 #include "paddle/phi/core/platform/device/gpu/gpu_info.h"
 #include "paddle/phi/core/platform/device_event_base.h"
 
 namespace c10 {
 using DeviceIndex = int8_t;
 
-struct Device final {
+struct PADDLE_API Device final {
   using Type = DeviceType;
   Device() = default;
   Device(phi::Place place)
@@ -161,7 +162,7 @@ struct Device final {
   }
 };
 
-std::ostream& operator<<(std::ostream& stream, const Device& device);
+PADDLE_API std::ostream& operator<<(std::ostream& stream, const Device& device);
 
 }  // namespace c10
 
diff --git a/paddle/phi/api/include/compat/c10/core/Stream.h b/paddle/phi/api/include/compat/c10/core/Stream.h
index e9bcbc939d921..040fb09789454 100644
--- a/paddle/phi/api/include/compat/c10/core/Stream.h
+++ b/paddle/phi/api/include/compat/c10/core/Stream.h
@@ -22,6 +22,8 @@
 #include <functional>
 #include <ostream>
 
+#include "paddle/common/macros.h"
+
 namespace c10 {
 
 using StreamId = int64_t;
@@ -32,7 +34,7 @@ struct StreamData3 {
   DeviceType device_type;
 };
 
-class Stream final {
+class PADDLE_API Stream final {
  private:
   Device device_;
   StreamId id_;
diff --git a/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.h b/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.h
index 25a0301783b60..fe42648ea775d 100644
--- a/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.h
+++ b/paddle/phi/api/include/compat/c10/cuda/CUDAFunctions.h
@@ -18,9 +18,9 @@
 
 namespace c10::cuda {
 
-c10::DeviceIndex device_count();
+PADDLE_API c10::DeviceIndex device_count();
 
-void device_synchronize();
+PADDLE_API void device_synchronize();
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 void __inline__ stream_synchronize(gpuStream_t stream) {
diff --git a/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h b/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h
index f51800cbfaf3b..d69536fa292b1 100644
--- a/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h
+++ b/paddle/phi/api/include/compat/c10/cuda/CUDAStream.h
@@ -24,6 +24,7 @@
 
 #include <ostream>
 
+#include "paddle/common/macros.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
 #include "paddle/phi/common/place.h"
 
@@ -165,29 +166,29 @@ inline CUDAStream make_cuda_stream(cudaStream_t raw,
  * Get the current CUDA stream for the passed CUDA device, or for the
  * current device if no device index is passed.
  */
-CUDAStream getCurrentCUDAStream(c10::DeviceIndex device_index = -1);
+PADDLE_API CUDAStream getCurrentCUDAStream(c10::DeviceIndex device_index = -1);
 
 /**
  * Get a new stream from the CUDA stream pool.
  * Priority -1 is high priority, 0 is default/low priority.
  * Matches PyTorch behavior where negative priority = high priority.
  */
-CUDAStream getStreamFromPool(const int priority = 0,
-                             c10::DeviceIndex device_index = -1);
+PADDLE_API CUDAStream getStreamFromPool(const int priority = 0,
+                                        c10::DeviceIndex device_index = -1);
 
 /**
  * Get a new stream from the CUDA stream pool.
  * Bool overload: true = high priority (-1), false = default priority (0).
  */
-CUDAStream getStreamFromPool(const bool isHighPriority,
-                             c10::DeviceIndex device_index = -1);
+PADDLE_API CUDAStream getStreamFromPool(const bool isHighPriority,
+                                        c10::DeviceIndex device_index = -1);
 
 #ifdef PADDLE_WITH_HIP
-CUDAStream getStreamFromExternal(hipStream_t ext_stream,
-                                 c10::DeviceIndex device_index);
+PADDLE_API CUDAStream getStreamFromExternal(hipStream_t ext_stream,
+                                            c10::DeviceIndex device_index);
 #else
-CUDAStream getStreamFromExternal(cudaStream_t ext_stream,
-                                 c10::DeviceIndex device_index);
+PADDLE_API CUDAStream getStreamFromExternal(cudaStream_t ext_stream,
+                                            c10::DeviceIndex device_index);
 #endif
 
 /**
@@ -196,9 +197,9 @@ CUDAStream getStreamFromExternal(cudaStream_t ext_stream,
  * Keeps the compat c10 stream state aligned with Paddle's GPUContext so
  * Paddle stream guards and c10 callers observe the same current stream.
  */
-void setCurrentCUDAStream(CUDAStream stream);
+PADDLE_API void setCurrentCUDAStream(CUDAStream stream);
 
-CUDAStream getDefaultCUDAStream(c10::DeviceIndex device_index = -1);
+PADDLE_API CUDAStream getDefaultCUDAStream(c10::DeviceIndex device_index = -1);
 
 inline std::ostream& operator<<(std::ostream& stream, const CUDAStream& s) {
   return stream << s.unwrap();
diff --git a/paddle/phi/api/include/compat/c10/util/typeid.h b/paddle/phi/api/include/compat/c10/util/typeid.h
index dfaa4db287d76..a58d751abbcb2 100644
--- a/paddle/phi/api/include/compat/c10/util/typeid.h
+++ b/paddle/phi/api/include/compat/c10/util/typeid.h
@@ -551,9 +551,15 @@ inline std::ostream& operator<<(std::ostream& stream,
 #define EXPORT_IF_NOT_GCC
 #endif
 
+#if defined(_MSC_VER)
+#define C10_TEMPLATE_API C10_API
+#else  // non-MSVC: explicit instantiations get no extra attribute
+#define C10_TEMPLATE_API
+#endif  // defined(_MSC_VER)
+
 // For use in a .cpp file.
 #define CAFFE_KNOWN_TYPE(T)                                          \
-  template uint16_t TypeMeta::addTypeMetaData<T>();                  \
+  template C10_TEMPLATE_API uint16_t TypeMeta::addTypeMetaData<T>(); \
   template <>                                                        \
   EXPORT_IF_NOT_GCC uint16_t TypeMeta::_typeMetaData<T>() noexcept { \
     static const uint16_t index = addTypeMetaData<T>();              \
@@ -561,11 +567,11 @@ inline std::ostream& operator<<(std::ostream& stream,
   }
 
 // For use in a .cpp file when a declaration in the header is provided.
-#define CAFFE_DEFINE_KNOWN_TYPE(T, ident)                          \
-  template uint16_t TypeMeta::addTypeMetaData<T>();                \
-  namespace detail {                                               \
-  EXPORT_IF_NOT_GCC extern const uint16_t ident##_metadata_index = \
-      TypeMeta::addTypeMetaData<T>();                              \
+#define CAFFE_DEFINE_KNOWN_TYPE(T, ident)                            \
+  template C10_TEMPLATE_API uint16_t TypeMeta::addTypeMetaData<T>(); \
+  namespace detail {                                                 \
+  EXPORT_IF_NOT_GCC extern const uint16_t ident##_metadata_index =   \
+      TypeMeta::addTypeMetaData<T>();                                \
   } /* namespace detail */
 
 // Declaration counterpart: provides an inline fast-path via a detail var.
@@ -575,7 +581,7 @@ inline std::ostream& operator<<(std::ostream& stream,
 // upstream declare/define model.
 #if defined(_MSC_VER)
 #define CAFFE_DECLARE_KNOWN_TYPE(T, ident)                           \
-  extern template uint16_t TypeMeta::addTypeMetaData<T>();           \
+  extern template C10_API uint16_t TypeMeta::addTypeMetaData<T>();   \
   namespace detail {                                                 \
   extern C10_API const uint16_t ident##_metadata_index;              \
   } /* namespace detail */                                           \
diff --git a/paddle/phi/api/include/compat/torch/library.h b/paddle/phi/api/include/compat/torch/library.h
index b336f083bc17a..fa6bc830ce286 100644
--- a/paddle/phi/api/include/compat/torch/library.h
+++ b/paddle/phi/api/include/compat/torch/library.h
@@ -871,7 +871,7 @@ class PADDLE_API OperatorRegistry {
   }
 };
 
-class Library {
+class PADDLE_API Library {
  public:
   enum Kind {
     DEF,      // TORCH_LIBRARY
diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc
index 99d3733da0490..b3312fc5a7b64 100644
--- a/paddle/phi/backends/dynload/dynamic_loader.cc
+++ b/paddle/phi/backends/dynload/dynamic_loader.cc
@@ -173,6 +173,9 @@ static constexpr char* win_cudnn_lib = "cudnn64_" CUDNN_MAJOR_VERSION ".dll";
 static constexpr char* win_cublas_lib =
     "cublas64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
     ".dll;cublas64_" CUDA_VERSION_MAJOR ".dll";
+static constexpr char* win_cublaslt_lib =
+    "cublasLt64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
+    ".dll;cublasLt64_" CUDA_VERSION_MAJOR ".dll";
 static constexpr char* win_curand_lib =
     "curand64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR
     ".dll;curand64_" CUDA_VERSION_MAJOR ".dll;curand64_10.dll";
@@ -524,21 +527,21 @@ void* GetCublasLtDsoHandle() {
     return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublasLt64_11.dll");
 #else
     return GetDsoHandleFromSearchPath(
-        FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path});
+        FLAGS_cuda_dir, win_cublaslt_lib, true, {cuda_lib_path});
 #endif
   } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) {
 #ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
     return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublasLt64_12.dll");
 #else
     return GetDsoHandleFromSearchPath(
-        FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path});
+        FLAGS_cuda_dir, win_cublaslt_lib, true, {cuda_lib_path});
 #endif
   } else if (CUDA_VERSION >= 13000 && CUDA_VERSION < 14000) {
 #ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
     return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublasLt64_13.dll");
 #else
     return GetDsoHandleFromSearchPath(
-        FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path});
+        FLAGS_cuda_dir, win_cublaslt_lib, true, {cuda_lib_path});
 #endif
   } else {
     std::string warning_msg(
diff --git a/paddle/phi/backends/gpu/gpu_info.h b/paddle/phi/backends/gpu/gpu_info.h
index 73f167d7e865a..63c630345c83b 100644
--- a/paddle/phi/backends/gpu/gpu_info.h
+++ b/paddle/phi/backends/gpu/gpu_info.h
@@ -107,7 +107,7 @@ void GpuMemcpyPeerSync(
 void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream);
 
 //! Blocks until stream has completed all operations.
-void GpuStreamSync(gpuStream_t stream);
+PADDLE_API void GpuStreamSync(gpuStream_t stream);
 
 void GpuDestroyStream(gpuStream_t stream);
 
diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h
index 024a7de73eb72..d07575028c1fa 100644
--- a/paddle/phi/core/enforce.h
+++ b/paddle/phi/core/enforce.h
@@ -95,7 +95,7 @@ std::string GetCompleteTraceBackString(StrType&& what,
 
 inline bool is_error(bool stat) { return !stat; }
 
-void ThrowWarnInternal(const std::string& message);
+PADDLE_API void ThrowWarnInternal(const std::string& message);
 
 #if defined(__CUDA_ARCH__)
 // For cuda, the assertions can affect performance and it is therefore
diff --git a/paddle/phi/core/platform/cuda_graph_with_memory_pool.cc b/paddle/phi/core/platform/cuda_graph_with_memory_pool.cc
index be92a20250674..7122b5c5edb9d 100644
--- a/paddle/phi/core/platform/cuda_graph_with_memory_pool.cc
+++ b/paddle/phi/core/platform/cuda_graph_with_memory_pool.cc
@@ -25,7 +25,24 @@ COMMON_DECLARE_bool(new_executor_use_cuda_graph);
 
 namespace paddle::platform {
 
+// Out-of-line forwarders: keep capture-state queries out of headers so Windows
+// callers in different DLLs observe the same CUDAGraph global state.
+bool IsCUDAGraphCapturing() {
+  return phi::backends::gpu::IsCUDAGraphCapturing();
+}
+
+phi::Place CUDAGraphCapturingPlace() {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_CUSTOM_DEVICE)
+  return phi::backends::gpu::CUDAGraph::CapturingPlace();
+#else  // no CUDA / HIP / custom-device backend compiled in
+  PADDLE_THROW(common::errors::Unimplemented(
+      "CUDA Graph is only supported on NVIDIA GPU device."));
+#endif  // PADDLE_WITH_CUDA || PADDLE_WITH_HIP || PADDLE_WITH_CUSTOM_DEVICE
+}
+
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+
 void InitCUDNNRelatedHandle(phi::GPUContext* dev_ctx) {
   dev_ctx->cudnn_workspace_handle().ResetWorkspace();
 
diff --git a/paddle/phi/core/platform/cuda_graph_with_memory_pool.h b/paddle/phi/core/platform/cuda_graph_with_memory_pool.h
index 7054bb5e7ddb2..8c528b22d9dff 100644
--- a/paddle/phi/core/platform/cuda_graph_with_memory_pool.h
+++ b/paddle/phi/core/platform/cuda_graph_with_memory_pool.h
@@ -25,6 +25,9 @@
 namespace paddle {
 namespace platform {
 
+PADDLE_API bool IsCUDAGraphCapturing();
+PADDLE_API phi::Place CUDAGraphCapturingPlace();
+
 // NOTE: These APIs are not thread-safe.
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
     defined(PADDLE_WITH_CUSTOM_DEVICE)
@@ -46,18 +49,6 @@ PADDLE_API void BeginCUDAGraphCapture(
     int64_t pool_id = CUDAGraph::kInvalidPoolID);
 #endif
 
-inline phi::Place CUDAGraphCapturingPlace() {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
-    defined(PADDLE_WITH_CUSTOM_DEVICE)
-  return CUDAGraph::CapturingPlace();
-#else
-  PADDLE_THROW(common::errors::Unimplemented(
-      "CUDA Graph is only supported on NVIDIA GPU device."));
-#endif
-}
-
-using phi::backends::gpu::IsCUDAGraphCapturing;
-
 using phi::backends::gpu::AddPostResetCallbackIfCapturingCUDAGraph;
 
 using phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph;
diff --git a/test/cpp/compat/ATen_factory_default_dtype_test.cc b/test/cpp/compat/ATen_factory_default_dtype_test.cc
index b982d0e3d96f7..3e27476138578 100644
--- a/test/cpp/compat/ATen_factory_default_dtype_test.cc
+++ b/test/cpp/compat/ATen_factory_default_dtype_test.cc
@@ -19,6 +19,7 @@
 #include <ATen/ops/ones.h>
 #include <ATen/ops/zeros.h>
 #include <c10/core/DefaultDtype.h>
+#include <c10/core/ScalarTypeToTypeMeta.h>
 #include <c10/core/SymIntArrayRef.h>
 #include <c10/core/TensorOptions.h>
 
@@ -29,14 +30,14 @@ namespace {
 class DefaultDtypeGuard {
  public:
   explicit DefaultDtypeGuard(c10::ScalarType dtype)
-      : previous_(c10::get_default_dtype_as_scalartype()) {
-    c10::set_default_dtype(dtype);
+      : previous_(c10::get_default_dtype()) {
+    c10::set_default_dtype(c10::scalarTypeToTypeMeta(dtype));
   }
 
   ~DefaultDtypeGuard() { c10::set_default_dtype(previous_); }
 
  private:
-  c10::ScalarType previous_;
+  caffe2::TypeMeta previous_;
 };
 
 }  // namespace
diff --git a/test/cpp/compat/CMakeLists.txt b/test/cpp/compat/CMakeLists.txt
index 8e99a773fb051..6368129b02f67 100644
--- a/test/cpp/compat/CMakeLists.txt
+++ b/test/cpp/compat/CMakeLists.txt
@@ -1,92 +1,89 @@
-if(NOT WIN32)
-  # c10 core tests (CPU compatible)
-  cc_test(c10_Device_test SRCS c10_Device_test.cc)
-  cc_test(c10_DispatchKeySet_test SRCS c10_DispatchKeySet_test.cc)
-  cc_test(c10_DispatchKey_test SRCS c10_DispatchKey_test.cc)
-  cc_test(c10_MemoryFormat_test SRCS c10_MemoryFormat_test.cc)
-  cc_test(c10_ScalarType_test SRCS c10_ScalarType_test.cc)
-  cc_test(c10_SizesAndStrides_test SRCS c10_SizesAndStrides_test.cc)
-  cc_test(c10_TensorOptions_test SRCS c10_TensorOptions_test.cc)
-  cc_test(c10_TypeMeta_test SRCS c10_TypeMeta_test.cc)
-  cc_test(c10_intrusive_ptr_lifecycle_test
-          SRCS c10_intrusive_ptr_lifecycle_test.cc)
-  cc_test(c10_layout_test SRCS c10_layout_test.cc)
-  cc_test(c10_ptr_test SRCS c10_ptr_test.cc)
-  cc_test(c10_storage_test SRCS c10_storage_test.cc)
+# c10 core tests (CPU compatible)
+cc_test(c10_Device_test SRCS c10_Device_test.cc)
+cc_test(c10_DispatchKeySet_test SRCS c10_DispatchKeySet_test.cc)
+cc_test(c10_DispatchKey_test SRCS c10_DispatchKey_test.cc)
+cc_test(c10_MemoryFormat_test SRCS c10_MemoryFormat_test.cc)
+cc_test(c10_ScalarType_test SRCS c10_ScalarType_test.cc)
+cc_test(c10_SizesAndStrides_test SRCS c10_SizesAndStrides_test.cc)
+cc_test(c10_TensorOptions_test SRCS c10_TensorOptions_test.cc)
+cc_test(c10_TypeMeta_test SRCS c10_TypeMeta_test.cc)
+cc_test(c10_intrusive_ptr_lifecycle_test
+        SRCS c10_intrusive_ptr_lifecycle_test.cc)
+cc_test(c10_layout_test SRCS c10_layout_test.cc)
+cc_test(c10_ptr_test SRCS c10_ptr_test.cc)
+cc_test(c10_storage_test SRCS c10_storage_test.cc)
 
-  # ATen core tests (CPU compatible)
-  cc_test(ATen_all_test SRCS ATen_all_test.cc)
-  cc_test(ATen_any_test SRCS ATen_any_test.cc)
-  cc_test(ATen_as_strided_test SRCS ATen_as_strided_test.cc)
-  cc_test(ATen_autograd_test SRCS ATen_autograd_test.cc)
-  cc_test(ATen_chunk_test SRCS ATen_chunk_test.cc)
-  cc_test(ATen_clamp_test SRCS ATen_clamp_test.cc)
-  cc_test(ATen_coalesce_test SRCS ATen_coalesce_test.cc)
-  cc_test(ATen_dense_sparse_conversion_test
-          SRCS ATen_dense_sparse_conversion_test.cc)
-  cc_test(ATen_empty_test SRCS ATen_empty_test.cc)
-  cc_test(ATen_equal_test SRCS ATen_equal_test.cc)
-  cc_test(ATen_expand_test SRCS ATen_expand_test.cc)
-  cc_test(ATen_eye_test SRCS ATen_eye_test.cc)
-  cc_test(ATen_factory_default_dtype_test
-          SRCS ATen_factory_default_dtype_test.cc)
-  cc_test(ATen_flatten_test SRCS ATen_flatten_test.cc)
-  cc_test(ATen_from_blob_test SRCS ATen_from_blob_test.cc)
-  cc_test(ATen_hook_test SRCS ATen_hook_test.cc)
-  cc_test(ATen_index_test SRCS ATen_index_test.cc)
-  cc_test(ATen_item_test SRCS ATen_item_test.cc)
-  cc_test(ATen_narrow_test SRCS ATen_narrow_test.cc)
-  cc_test(ATen_new_test SRCS ATen_new_test.cc)
-  cc_test(ATen_nnz_test SRCS ATen_nnz_test.cc)
-  cc_test(ATen_rename_test SRCS ATen_rename_test.cc)
-  cc_test(ATen_reshape_test SRCS ATen_reshape_test.cc)
-  cc_test(ATen_resize_test SRCS ATen_resize_test.cc)
-  cc_test(ATen_squeeze_test SRCS ATen_squeeze_test.cc)
-  cc_test(ATen_std_var_test SRCS ATen_std_var_test.cc)
-  cc_test(ATen_sum_test SRCS ATen_sum_test.cc)
-  cc_test(ATen_t_test SRCS ATen_t_test.cc)
-  cc_test(ATen_tensor_data_test SRCS ATen_tensor_data_test.cc)
-  cc_test(ATen_toString_test SRCS ATen_toString_test.cc)
-  cc_test(ATen_to_test SRCS ATen_to_test.cc)
-  cc_test(ATen_transpose_test SRCS ATen_transpose_test.cc)
-  cc_test(ATen_Utils_test SRCS ATen_Utils_test.cc)
-  cc_test(ATen_values_test SRCS ATen_values_test.cc)
-  cc_test(ATen_viewAs_test SRCS ATen_viewAs_test.cc)
+# ATen core tests (CPU compatible)
+cc_test(ATen_all_test SRCS ATen_all_test.cc)
+cc_test(ATen_any_test SRCS ATen_any_test.cc)
+cc_test(ATen_as_strided_test SRCS ATen_as_strided_test.cc)
+cc_test(ATen_autograd_test SRCS ATen_autograd_test.cc)
+cc_test(ATen_chunk_test SRCS ATen_chunk_test.cc)
+cc_test(ATen_clamp_test SRCS ATen_clamp_test.cc)
+cc_test(ATen_coalesce_test SRCS ATen_coalesce_test.cc)
+cc_test(ATen_dense_sparse_conversion_test
+        SRCS ATen_dense_sparse_conversion_test.cc)
+cc_test(ATen_empty_test SRCS ATen_empty_test.cc)
+cc_test(ATen_equal_test SRCS ATen_equal_test.cc)
+cc_test(ATen_expand_test SRCS ATen_expand_test.cc)
+cc_test(ATen_eye_test SRCS ATen_eye_test.cc)
+cc_test(ATen_factory_default_dtype_test SRCS ATen_factory_default_dtype_test.cc)
+cc_test(ATen_flatten_test SRCS ATen_flatten_test.cc)
+cc_test(ATen_from_blob_test SRCS ATen_from_blob_test.cc)
+cc_test(ATen_hook_test SRCS ATen_hook_test.cc)
+cc_test(ATen_index_test SRCS ATen_index_test.cc)
+cc_test(ATen_item_test SRCS ATen_item_test.cc)
+cc_test(ATen_narrow_test SRCS ATen_narrow_test.cc)
+cc_test(ATen_new_test SRCS ATen_new_test.cc)
+cc_test(ATen_nnz_test SRCS ATen_nnz_test.cc)
+cc_test(ATen_rename_test SRCS ATen_rename_test.cc)
+cc_test(ATen_reshape_test SRCS ATen_reshape_test.cc)
+cc_test(ATen_resize_test SRCS ATen_resize_test.cc)
+cc_test(ATen_squeeze_test SRCS ATen_squeeze_test.cc)
+cc_test(ATen_std_var_test SRCS ATen_std_var_test.cc)
+cc_test(ATen_sum_test SRCS ATen_sum_test.cc)
+cc_test(ATen_t_test SRCS ATen_t_test.cc)
+cc_test(ATen_tensor_data_test SRCS ATen_tensor_data_test.cc)
+cc_test(ATen_toString_test SRCS ATen_toString_test.cc)
+cc_test(ATen_to_test SRCS ATen_to_test.cc)
+cc_test(ATen_transpose_test SRCS ATen_transpose_test.cc)
+cc_test(ATen_Utils_test SRCS ATen_Utils_test.cc)
+cc_test(ATen_values_test SRCS ATen_values_test.cc)
+cc_test(ATen_viewAs_test SRCS ATen_viewAs_test.cc)
 
-  # torch library tests (CPU compatible)
-  cc_test(torch_library_test SRCS torch_library_test.cc)
-  cc_test(torch_library_dispatch_test SRCS torch_library_dispatch_test.cc)
+# torch library tests (CPU compatible)
+cc_test(torch_library_test SRCS torch_library_test.cc)
+cc_test(torch_library_dispatch_test SRCS torch_library_dispatch_test.cc)
 
-  # GPU-runtime compat tests are not fully audited on ROCm/DCU yet.
-  # Keep the DCU surface limited to the cases adapted in this PR.
-  if(WITH_ROCM)
-    cc_test(ATen_CUDAContext_test SRCS ATen_CUDAContext_test.cc)
-    cc_test(ATen_record_stream_test SRCS ATen_record_stream_test.cc)
-    cc_test(c10_Event_test SRCS c10_Event_test.cc)
-    cc_test(c10_Stream_test SRCS c10_Stream_test.cc)
-  else()
-    cc_test(ATen_TensorAccessor_test SRCS ATen_TensorAccessor_test.cc)
-    cc_test(ATen_basic_test SRCS ATen_basic_test.cc)
-    cc_test(ATen_local_scalar_dense_test SRCS ATen_local_scalar_dense_test.cc)
-    cc_test(ATen_memory_test SRCS ATen_memory_test.cc)
-    cc_test(ATen_pin_memory_creation_test SRCS ATen_pin_memory_creation_test.cc)
-    cc_test(ATen_record_stream_test SRCS ATen_record_stream_test.cc)
-    cc_test(ATen_select_test SRCS ATen_select_test.cc)
-    cc_test(ATen_split_test SRCS ATen_split_test.cc)
+# GPU-runtime compat tests are not fully audited on ROCm/DCU yet.
+# Keep the DCU surface limited to the cases adapted in this PR.
+if(WITH_ROCM)
+  cc_test(ATen_CUDAContext_test SRCS ATen_CUDAContext_test.cc)
+  cc_test(ATen_record_stream_test SRCS ATen_record_stream_test.cc)
+  cc_test(c10_Event_test SRCS c10_Event_test.cc)
+  cc_test(c10_Stream_test SRCS c10_Stream_test.cc)
+else()
+  cc_test(ATen_TensorAccessor_test SRCS ATen_TensorAccessor_test.cc)
+  cc_test(ATen_basic_test SRCS ATen_basic_test.cc)
+  cc_test(ATen_local_scalar_dense_test SRCS ATen_local_scalar_dense_test.cc)
+  cc_test(ATen_memory_test SRCS ATen_memory_test.cc)
+  cc_test(ATen_pin_memory_creation_test SRCS ATen_pin_memory_creation_test.cc)
+  cc_test(ATen_record_stream_test SRCS ATen_record_stream_test.cc)
+  cc_test(ATen_select_test SRCS ATen_select_test.cc)
+  cc_test(ATen_split_test SRCS ATen_split_test.cc)
 
-    cc_test(ATen_CUDAContext_test SRCS ATen_CUDAContext_test.cc)
-    cc_test(ATen_philox_test SRCS ATen_philox_test.cc)
-    cc_test(c10_Event_test SRCS c10_Event_test.cc)
-    cc_test(c10_Stream_test SRCS c10_Stream_test.cc)
-  endif()
+  cc_test(ATen_CUDAContext_test SRCS ATen_CUDAContext_test.cc)
+  cc_test(ATen_philox_test SRCS ATen_philox_test.cc)
+  cc_test(c10_Event_test SRCS c10_Event_test.cc)
+  cc_test(c10_Stream_test SRCS c10_Stream_test.cc)
+endif()
 
-  if(WITH_GPU)
-    nv_test(ATen_CUDABlas_test SRCS ATen_CUDABlas_test.cc)
-    nv_test(ATen_cuda_test SRCS ATen_cuda_test.cc)
-    nv_test(c10_cuda_generator_test SRCS c10_cuda_generator_test.cc)
-    nv_test(c10_generator_impl_test SRCS c10_generator_impl_test.cc)
-  endif()
-  cc_test(schema_parser_type_test SRCS schema_parser_type_test.cc)
+if(WITH_GPU)
+  nv_test(ATen_CUDABlas_test SRCS ATen_CUDABlas_test.cc)
+  nv_test(ATen_cuda_test SRCS ATen_cuda_test.cc)
+  nv_test(c10_cuda_generator_test SRCS c10_cuda_generator_test.cc)
+  nv_test(c10_generator_impl_test SRCS c10_generator_impl_test.cc)
 endif()
+cc_test(schema_parser_type_test SRCS schema_parser_type_test.cc)
 
 add_subdirectory(torch)
diff --git a/test/cpp/compat/c10_TensorOptions_test.cc b/test/cpp/compat/c10_TensorOptions_test.cc
index 63019c8a4c0ec..070999f2febf8 100644
--- a/test/cpp/compat/c10_TensorOptions_test.cc
+++ b/test/cpp/compat/c10_TensorOptions_test.cc
@@ -33,14 +33,14 @@ namespace {
 class DefaultDtypeGuard {
  public:
   explicit DefaultDtypeGuard(c10::ScalarType dtype)
-      : previous_(c10::get_default_dtype_as_scalartype()) {
-    c10::set_default_dtype(dtype);
+      : previous_(c10::get_default_dtype()) {
+    c10::set_default_dtype(c10::scalarTypeToTypeMeta(dtype));
   }
 
   ~DefaultDtypeGuard() { c10::set_default_dtype(previous_); }
 
  private:
-  c10::ScalarType previous_;
+  caffe2::TypeMeta previous_;
 };
 
 }  // namespace
@@ -154,6 +154,32 @@ TEST(TensorOptionsTest, DtypeDefaultTracksGlobalDefaultDtype) {
             c10::kDouble);
 }
 
+TEST(TensorOptionsTest, DefaultComplexDtypeTracksGlobalDefaultDtype) {
+  {
+    DefaultDtypeGuard guard(c10::kHalf);
+
+    ASSERT_EQ(c10::get_default_dtype_as_scalartype(), c10::kHalf);
+    ASSERT_EQ(c10::get_default_complex_dtype().toScalarType(),
+              c10::ScalarType::ComplexHalf);
+  }
+
+  {
+    DefaultDtypeGuard guard(c10::kDouble);
+
+    ASSERT_EQ(c10::get_default_dtype_as_scalartype(), c10::kDouble);
+    ASSERT_EQ(c10::get_default_complex_dtype().toScalarType(),
+              c10::ScalarType::ComplexDouble);
+  }
+
+  {
+    DefaultDtypeGuard guard(c10::kFloat);
+
+    ASSERT_EQ(c10::get_default_dtype_as_scalartype(), c10::kFloat);
+    ASSERT_EQ(c10::get_default_complex_dtype().toScalarType(),
+              c10::ScalarType::ComplexFloat);
+  }
+}
+
 // ---- device ----
 
 TEST(TensorOptionsTest, SetDevice_CPU) {
diff --git a/test/cpp/compat/c10_generator_impl_test.cc b/test/cpp/compat/c10_generator_impl_test.cc
index 92192b4affca5..b950cb1e0f965 100644
--- a/test/cpp/compat/c10_generator_impl_test.cc
+++ b/test/cpp/compat/c10_generator_impl_test.cc
@@ -27,13 +27,13 @@
 // ---------- Construction ----------------------------------------------------
 
 TEST(GeneratorImplTest, ConstructWithNullptrCreatesDefaultGen) {
-  c10::GeneratorImpl impl(c10::Device(c10::kCPU));
+  c10::GeneratorImpl impl{c10::Device(c10::kCPU)};
   ASSERT_NE(impl.paddle_generator(), nullptr);
 }
 
 TEST(GeneratorImplTest, ConstructWithExistingGen) {
   auto gen = std::make_shared<phi::Generator>(42u);
-  c10::GeneratorImpl impl(c10::Device(c10::kCPU), gen);
+  c10::GeneratorImpl impl{c10::Device(c10::kCPU), gen};
   ASSERT_EQ(impl.paddle_generator(), gen);
   ASSERT_EQ(impl.current_seed(), 42u);
 }
@@ -41,13 +41,13 @@ TEST(GeneratorImplTest, ConstructWithExistingGen) {
 // ---------- Seed / offset API (base-class versions) -------------------------
 
 TEST(GeneratorImplTest, SetAndGetCurrentSeed) {
-  c10::GeneratorImpl impl(c10::Device(c10::kCPU));
+  c10::GeneratorImpl impl{c10::Device(c10::kCPU)};
   impl.set_current_seed(12345);
   ASSERT_EQ(impl.current_seed(), 12345u);
 }
 
 TEST(GeneratorImplTest, SeedGeneratesNewSeed) {
-  c10::GeneratorImpl impl(c10::Device(c10::kCPU));
+  c10::GeneratorImpl impl{c10::Device(c10::kCPU)};
   impl.set_current_seed(1);
   uint64_t new_seed = impl.seed();
   // seed() should return a new random seed (very unlikely to be 1 again).
@@ -56,13 +56,13 @@ TEST(GeneratorImplTest, SeedGeneratesNewSeed) {
 }
 
 TEST(GeneratorImplTest, GetOffsetInitiallyZero) {
-  c10::GeneratorImpl impl(c10::Device(c10::kCPU));
+  c10::GeneratorImpl impl{c10::Device(c10::kCPU)};
   ASSERT_EQ(impl.get_offset(), 0u);
 }
 
 TEST(GeneratorImplTest, SetOffsetForward) {
   auto gen = std::make_shared<phi::Generator>(100u);
-  c10::GeneratorImpl impl(c10::Device(c10::kCUDA, 0), gen);
+  c10::GeneratorImpl impl{c10::Device(c10::kCUDA, 0), gen};
 
   impl.set_offset(10);
   ASSERT_EQ(impl.get_offset(), 10u);
@@ -70,7 +70,7 @@ TEST(GeneratorImplTest, SetOffsetForward) {
 
 TEST(GeneratorImplTest, SetOffsetBackward) {
   auto gen = std::make_shared<phi::Generator>(100u);
-  c10::GeneratorImpl impl(c10::Device(c10::kCUDA, 0), gen);
+  c10::GeneratorImpl impl{c10::Device(c10::kCUDA, 0), gen};
 
   impl.set_offset(20);
   ASSERT_EQ(impl.get_offset(), 20u);
@@ -81,7 +81,7 @@ TEST(GeneratorImplTest, SetOffsetBackward) {
 
 TEST(GeneratorImplTest, SetOffsetSameValue) {
   auto gen = std::make_shared<phi::Generator>(100u);
-  c10::GeneratorImpl impl(c10::Device(c10::kCUDA, 0), gen);
+  c10::GeneratorImpl impl{c10::Device(c10::kCUDA, 0), gen};
 
   impl.set_offset(10);
   impl.set_offset(10);
@@ -91,19 +91,19 @@ TEST(GeneratorImplTest, SetOffsetSameValue) {
 // ---------- Device / DispatchKeySet -----------------------------------------
 
 TEST(GeneratorImplTest, DeviceReturnsCorrectDevice) {
-  c10::Device cpu_dev(c10::kCPU);
-  c10::GeneratorImpl impl(cpu_dev);
+  c10::Device cpu_dev{c10::kCPU};
+  c10::GeneratorImpl impl{cpu_dev};
   ASSERT_EQ(impl.device(), cpu_dev);
 }
 
 TEST(GeneratorImplTest, KeySetCPU) {
-  c10::GeneratorImpl impl(c10::Device(c10::kCPU));
+  c10::GeneratorImpl impl{c10::Device(c10::kCPU)};
   c10::DispatchKeySet ks = impl.key_set();
   ASSERT_TRUE(ks.has(c10::DispatchKey::CPU));
 }
 
 TEST(GeneratorImplTest, KeySetCUDA) {
-  c10::GeneratorImpl impl(c10::Device(c10::kCUDA, 0));
+  c10::GeneratorImpl impl{c10::Device(c10::kCUDA, 0)};
   c10::DispatchKeySet ks = impl.key_set();
   ASSERT_TRUE(ks.has(c10::DispatchKey::CUDA));
 }
@@ -111,7 +111,7 @@ TEST(GeneratorImplTest, KeySetCUDA) {
 TEST(GeneratorImplTest, KeySetOtherDevice) {
   // Use kCUSTOM which is neither CPU nor CUDA to exercise the fallback
   // branch that returns an empty DispatchKeySet.
-  c10::GeneratorImpl impl(c10::Device(c10::kCUSTOM, 0));
+  c10::GeneratorImpl impl{c10::Device(c10::kCUSTOM, 0)};
   c10::DispatchKeySet ks = impl.key_set();
   ASSERT_FALSE(ks.has(c10::DispatchKey::CPU));
   ASSERT_FALSE(ks.has(c10::DispatchKey::CUDA));
@@ -121,7 +121,7 @@ TEST(GeneratorImplTest, KeySetOtherDevice) {
 
 TEST(GeneratorImplTest, ClonePreservesState) {
   auto gen = std::make_shared<phi::Generator>(42u);
-  c10::GeneratorImpl impl(c10::Device(c10::kCPU), gen);
+  c10::GeneratorImpl impl{c10::Device(c10::kCPU), gen};
   impl.set_current_seed(777);
 
   auto cloned = impl.clone();
@@ -137,12 +137,12 @@ TEST(GeneratorImplTest, ClonePreservesState) {
 // ---------- PyObject binding ------------------------------------------------
 
 TEST(GeneratorImplTest, PyObjDefaultNull) {
-  c10::GeneratorImpl impl(c10::Device(c10::kCPU));
+  c10::GeneratorImpl impl{c10::Device(c10::kCPU)};
   ASSERT_EQ(impl.pyobj(), nullptr);
 }
 
 TEST(GeneratorImplTest, SetAndGetPyObj) {
-  c10::GeneratorImpl impl(c10::Device(c10::kCPU));
+  c10::GeneratorImpl impl{c10::Device(c10::kCPU)};
 
   // Use a dummy pointer (we never dereference it).
   int dummy = 0;
@@ -183,7 +183,7 @@ TEST(GeneratorImplTest, MoveIntrusivePtrKeepsRefcount) {
 
 TEST(GeneratorImplTest, PaddleGeneratorAccessor) {
   auto gen = std::make_shared<phi::Generator>(99u);
-  c10::GeneratorImpl impl(c10::Device(c10::kCPU), gen);
+  c10::GeneratorImpl impl{c10::Device(c10::kCPU), gen};
   ASSERT_EQ(impl.paddle_generator(), gen);
   ASSERT_EQ(impl.paddle_generator()->GetCurrentSeed(), 99u);
 }
diff --git a/test/cpp/compat/torch_library_test.cc b/test/cpp/compat/torch_library_test.cc
index e1b96502fc2bd..1db9fa7d37470 100644
--- a/test/cpp/compat/torch_library_test.cc
+++ b/test/cpp/compat/torch_library_test.cc
@@ -971,12 +971,10 @@ TEST(test_torch_library, TestLibraryPrintInfoWithDispatchKey) {
                          __FILE__,
                          __LINE__);
 
-  std::ostringstream captured_output;
-  auto* original_buffer = std::cout.rdbuf(captured_output.rdbuf());
+  testing::internal::CaptureStdout();
   library.print_info();
-  std::cout.rdbuf(original_buffer);
+  auto output = testing::internal::GetCapturedStdout();
 
-  auto output = captured_output.str();
   ASSERT_NE(output.find("Library Info: IMPL"), std::string::npos);
   ASSERT_NE(output.find("namespace=runtime_library_info"), std::string::npos);
   ASSERT_NE(output.find("dispatch_key="), std::string::npos);