diff --git a/paddle/phi/kernels/fusion/gpu/attn_gemm_int8.h b/paddle/phi/kernels/fusion/gpu/attn_gemm_int8.h
index 382d69fc450d6a..9bce833790f51c 100644
--- a/paddle/phi/kernels/fusion/gpu/attn_gemm_int8.h
+++ b/paddle/phi/kernels/fusion/gpu/attn_gemm_int8.h
@@ -26,7 +26,7 @@ namespace phi {
 namespace fusion {
-using phi::backends::gpu::GpuLaunchConfig;
+using backends::gpu::GpuLaunchConfig;
 template
 class AttnMatmulINT8 {
@@ -37,8 +37,8 @@ class AttnMatmulINT8 {
     auto helper = std::make_shared(
         m, k, n, dev_ctx.cublaslt_handle());
     helpers_.emplace_back(helper);
-    gpu_config_ = std::make_unique(
-        phi::backends::gpu::GetGpuLaunchConfig1D(
+    gpu_config_ =
+        std::make_unique(backends::gpu::GetGpuLaunchConfig1D(
             dev_ctx, m * n, DequantKernelVecSize));
   }
   ~AttnMatmulINT8() {}
diff --git a/paddle/phi/kernels/fusion/gpu/block_attn.h b/paddle/phi/kernels/fusion/gpu/block_attn.h
index d81450b62e6086..8ee3c2f0c7e11b 100644
--- a/paddle/phi/kernels/fusion/gpu/block_attn.h
+++ b/paddle/phi/kernels/fusion/gpu/block_attn.h
@@ -1812,10 +1812,10 @@ inline GPU(Error_t) GetNumBlocks(int64_t n, int *num_blocks) {
   constexpr int kBlockSize = 128;
   constexpr int kNumWaves = 16;
-  const int device_id = phi::backends::gpu::GetCurrentDeviceId();
-  const int sm_count = phi::backends::gpu::GetGPUMultiProcessors(device_id);
+  const int device_id = backends::gpu::GetCurrentDeviceId();
+  const int sm_count = backends::gpu::GetGPUMultiProcessors(device_id);
   const int max_thread_per_multiprocessor =
-      phi::backends::gpu::GetGPUMaxThreadsPerMultiProcessor(device_id);
+      backends::gpu::GetGPUMaxThreadsPerMultiProcessor(device_id);
   *num_blocks =
       std::max(1,
diff --git a/paddle/phi/kernels/fusion/gpu/block_multi_head_attention_kernel.cu b/paddle/phi/kernels/fusion/gpu/block_multi_head_attention_kernel.cu
index ee2cc986c3eb26..f223b0d3abc1cb 100644
--- a/paddle/phi/kernels/fusion/gpu/block_multi_head_attention_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/block_multi_head_attention_kernel.cu
@@ -22,13 +22,6 @@
 #include "paddle/phi/kernels/gpu/flash_attn_utils.h"
 #include "paddle/utils/none.h"
-inline int getSMVersion() {
-  const int device = phi::backends::gpu::GetCurrentDeviceId();
-  const phi::gpuDeviceProp prop =
-      phi::backends::gpu::GetDeviceProperties(device);
-  return prop.major * 10 + prop.minor;
-}
-
 #if defined(__CUDACC__) && CUDA_VERSION >= 11000
 #define CUDA_BFLOAT16_AVAILABLE
 #include
@@ -37,6 +30,12 @@ inline int getSMVersion() {
 namespace phi {
 namespace fusion {
+inline int getSMVersion() {
+  const int device = backends::gpu::GetCurrentDeviceId();
+  const phi::gpuDeviceProp prop = backends::gpu::GetDeviceProperties(device);
+  return prop.major * 10 + prop.minor;
+}
+
 int GetMaxLen(const GPUContext& dev_ctx,
               const DenseTensor& seq_lens_tensor,
               DenseTensor* max_len_tensor,
diff --git a/paddle/phi/kernels/fusion/gpu/cast_with_ptr.h b/paddle/phi/kernels/fusion/gpu/cast_with_ptr.h
index 49355243a5ceba..9dfd2214ce449e 100644
--- a/paddle/phi/kernels/fusion/gpu/cast_with_ptr.h
+++ b/paddle/phi/kernels/fusion/gpu/cast_with_ptr.h
@@ -31,7 +31,7 @@ static void VecCastKernel(const GPUContext &dev_ctx,
                           const InT *x,
                           OutT *y,
                           size_t n) {
-  auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n, VecSize);
+  auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n, VecSize);
   auto block = config.GetGridSize();
   auto thread = config.GetBlockSize();
   auto main_offset = n / (VecSize * thread) * VecSize * thread;
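Every hunk in this patch makes the same kind of change: inside code that already lives in namespace phi (usually phi::fusion), the leading phi:: on phi::backends::gpu::... is dropped, because unqualified name lookup also searches the enclosing namespace phi and finds backends::gpu there. A minimal standalone sketch of that lookup rule follows; the helper names are hypothetical stand-ins, not Paddle's real implementations.

#include <cstdio>

namespace phi {
namespace backends {
namespace gpu {
// Stand-in for helpers such as GetCurrentDeviceId(); not the real implementation.
inline int GetCurrentDeviceId() { return 0; }
}  // namespace gpu
}  // namespace backends

namespace fusion {
// Inside phi::fusion, lookup also searches the enclosing namespace phi,
// so backends::gpu::GetCurrentDeviceId resolves without the phi:: prefix.
inline int CurrentDeviceSketch() { return backends::gpu::GetCurrentDeviceId(); }
}  // namespace fusion
}  // namespace phi

int main() {
  std::printf("device = %d\n", phi::fusion::CurrentDeviceSketch());
  return 0;
}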
diff --git a/paddle/phi/kernels/fusion/gpu/cudnn_bn_stats_finalize.cu.h b/paddle/phi/kernels/fusion/gpu/cudnn_bn_stats_finalize.cu.h
index 7afb087ea92a37..89a7c7c054d255 100644
--- a/paddle/phi/kernels/fusion/gpu/cudnn_bn_stats_finalize.cu.h
+++ b/paddle/phi/kernels/fusion/gpu/cudnn_bn_stats_finalize.cu.h
@@ -22,16 +22,15 @@ namespace fusion {
 template
 using BatchNormParamType =
-    typename phi::backends::gpu::CudnnDataType::BatchNormParamType;
+    typename backends::gpu::CudnnDataType::BatchNormParamType;
 #if CUDNN_VERSION >= 8000
 template
 struct BNStatsFinalizeArgs {
   BNStatsFinalizeArgs() {
-    dtype = phi::backends::gpu::CudnnDataType::type;
-    param_dtype =
-        phi::backends::gpu::CudnnDataType>::type;
+    dtype = backends::gpu::CudnnDataType::type;
+    param_dtype = backends::gpu::CudnnDataType>::type;
     format = CUDNN_TENSOR_NHWC;
   }
@@ -53,8 +52,8 @@ struct BNStatsFinalizeArgs {
   cudnnDataType_t param_dtype;
   cudnnTensorFormat_t format;
-  phi::backends::gpu::TensorDescriptor in_desc;
-  phi::backends::gpu::TensorDescriptor out_desc;
+  backends::gpu::TensorDescriptor in_desc;
+  backends::gpu::TensorDescriptor out_desc;
 };
 template
diff --git a/paddle/phi/kernels/fusion/gpu/cudnn_norm_conv.cu.h b/paddle/phi/kernels/fusion/gpu/cudnn_norm_conv.cu.h
index f27d8f8e8cddea..a39724f16ee342 100644
--- a/paddle/phi/kernels/fusion/gpu/cudnn_norm_conv.cu.h
+++ b/paddle/phi/kernels/fusion/gpu/cudnn_norm_conv.cu.h
@@ -23,7 +23,7 @@ namespace dynload = phi::dynload;
 template
 using ScalingParamType =
-    typename phi::backends::gpu::CudnnDataType::ScalingParamType;
+    typename backends::gpu::CudnnDataType::ScalingParamType;
 #if CUDNN_VERSION >= 8000
@@ -32,9 +32,9 @@ static size_t RoundUp(int64_t a, int64_t b) { return (a + b - 1) / b * b; }
 template
 struct NormConvolutionArgs {
   NormConvolutionArgs() {
-    dtype = phi::backends::gpu::CudnnDataType::type;
+    dtype = backends::gpu::CudnnDataType::type;
     format = CUDNN_TENSOR_NHWC;
-    compute_type = phi::backends::gpu::CudnnDataType::type;
+    compute_type = backends::gpu::CudnnDataType::type;
   }
   void Set(const GPUContext &dev_ctx,
@@ -163,11 +163,11 @@ struct NormConvolutionArgs {
   std::vector paddings;
   std::vector dilations;
-  phi::backends::gpu::TensorDescriptor in_desc;
-  phi::backends::gpu::FilterDescriptor filter_desc;
-  phi::backends::gpu::TensorDescriptor out_desc;
-  phi::backends::gpu::TensorDescriptor out_stats_desc;
-  phi::backends::gpu::ConvolutionDescriptor conv_desc;
+  backends::gpu::TensorDescriptor in_desc;
+  backends::gpu::FilterDescriptor filter_desc;
+  backends::gpu::TensorDescriptor out_desc;
+  backends::gpu::TensorDescriptor out_stats_desc;
+  backends::gpu::ConvolutionDescriptor conv_desc;
   bool is_support;
 };
diff --git a/paddle/phi/kernels/fusion/gpu/cudnn_scale_bias_add_relu.cu.h b/paddle/phi/kernels/fusion/gpu/cudnn_scale_bias_add_relu.cu.h
index 7aff27dd267758..839304f3117308 100644
--- a/paddle/phi/kernels/fusion/gpu/cudnn_scale_bias_add_relu.cu.h
+++ b/paddle/phi/kernels/fusion/gpu/cudnn_scale_bias_add_relu.cu.h
@@ -20,20 +20,19 @@ namespace phi {
 namespace fusion {
 template
-using CudnnDataType = phi::backends::gpu::CudnnDataType;
+using CudnnDataType = backends::gpu::CudnnDataType;
 namespace dynload = phi::dynload;
 template
 using BatchNormParamType =
-    typename phi::backends::gpu::CudnnDataType::BatchNormParamType;
+    typename backends::gpu::CudnnDataType::BatchNormParamType;
 #if CUDNN_VERSION >= 8000
 template
 struct ScaleBiasAddReluArgs {
   ScaleBiasAddReluArgs() {
-    dtype = phi::backends::gpu::CudnnDataType::type;
-    param_dtype =
-        phi::backends::gpu::CudnnDataType>::type;
+    dtype = backends::gpu::CudnnDataType::type;
+    param_dtype = backends::gpu::CudnnDataType>::type;
     format = CUDNN_TENSOR_NHWC;
   }
@@ -89,12 +88,12 @@ struct ScaleBiasAddReluArgs {
   cudnnDataType_t param_dtype;
   cudnnTensorFormat_t format;
-  phi::backends::gpu::TensorDescriptor in_desc;
-  phi::backends::gpu::TensorDescriptor out_desc;
-  phi::backends::gpu::TensorDescriptor equiv_scale_bias_desc;
-  phi::backends::gpu::TensorDescriptor scale_bias_mean_var_desc;
-  phi::backends::gpu::TensorDescriptor bitmask_desc;
-  phi::backends::gpu::ActivationDescriptor activation_desc;
+  backends::gpu::TensorDescriptor in_desc;
+  backends::gpu::TensorDescriptor out_desc;
+  backends::gpu::TensorDescriptor equiv_scale_bias_desc;
+  backends::gpu::TensorDescriptor scale_bias_mean_var_desc;
+  backends::gpu::TensorDescriptor bitmask_desc;
+  backends::gpu::ActivationDescriptor activation_desc;
 };
 template
diff --git a/paddle/phi/kernels/fusion/gpu/distributed_fused_lamb_init_kernel.cu b/paddle/phi/kernels/fusion/gpu/distributed_fused_lamb_init_kernel.cu
index 00a22d8f05bb49..5b5b2292143f70 100644
--- a/paddle/phi/kernels/fusion/gpu/distributed_fused_lamb_init_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/distributed_fused_lamb_init_kernel.cu
@@ -494,7 +494,7 @@ void DistributedFusedLambInitOpKernel(
   VLOG(10) << "rank = " << rank << ", nranks = " << nranks
            << " , alignment = " << alignment;
   if (alignment <= 0) {
-    alignment = phi::backends::gpu::GpuMinChunkSize();
+    alignment = backends::gpu::GpuMinChunkSize();
   }
   PADDLE_ENFORCE_GE(
       alignment,
diff --git a/paddle/phi/kernels/fusion/gpu/fused_bias_act_utils.h b/paddle/phi/kernels/fusion/gpu/fused_bias_act_utils.h
index f7250122bd79a0..11db8f7e673825 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_bias_act_utils.h
+++ b/paddle/phi/kernels/fusion/gpu/fused_bias_act_utils.h
@@ -116,10 +116,10 @@ inline gpuError_t GetNumBlocks(int64_t n, int *num_blocks) {
   constexpr int kBlockSize = 128;
   constexpr int kNumWaves = 16;
-  const int device_id = phi::backends::gpu::GetCurrentDeviceId();
-  const int sm_count = phi::backends::gpu::GetGPUMultiProcessors(device_id);
+  const int device_id = backends::gpu::GetCurrentDeviceId();
+  const int sm_count = backends::gpu::GetGPUMultiProcessors(device_id);
   const int max_thread_per_multiprocessor =
-      phi::backends::gpu::GetGPUMaxThreadsPerMultiProcessor(device_id);
+      backends::gpu::GetGPUMaxThreadsPerMultiProcessor(device_id);
   *num_blocks =
       std::max(1,
diff --git a/paddle/phi/kernels/fusion/gpu/fused_bn_activation_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_bn_activation_grad_kernel.cu
index a153bd2ac2c74f..db9f35bf9385dc 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_bn_activation_grad_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_bn_activation_grad_kernel.cu
@@ -55,7 +55,7 @@ void FusedBatchNormActGradKernel(const Context &dev_ctx,
                                  DenseTensor *bias_grad) {
   // Note(andsonder): Fused bn activation only used in the gpu place.
 #if defined(PADDLE_WITH_CUDA) and CUDNN_VERSION >= 7401
-  using CudnnDataType = phi::backends::gpu::CudnnDataType;
+  using CudnnDataType = backends::gpu::CudnnDataType;
   using BatchNormParamType = typename CudnnDataType::BatchNormParamType;
   double epsilon1 = static_cast(epsilon);
@@ -147,7 +147,7 @@ void FusedBatchNormActGradKernel(const Context &dev_ctx,
     DenseTensor workspace_tensor;
     auto reserve_space_size = reserve_space.memory_size();
     cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ACTIVATION;
-    phi::backends::gpu::ScopedActivationDescriptor scope_act_desc;
+    backends::gpu::ScopedActivationDescriptor scope_act_desc;
     cudnnActivationDescriptor_t activation_desc_ =
         scope_act_desc.descriptor(act_type);
     // --------------- cudnn batchnorm workspace ---------------
diff --git a/paddle/phi/kernels/fusion/gpu/fused_bn_activation_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_bn_activation_kernel.cu
index cd08ea15d8a4a5..bfafd0ac2d0f57 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_bn_activation_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_bn_activation_kernel.cu
@@ -54,7 +54,7 @@ void FusedBatchNormActKernel(const Context &dev_ctx,
                              DenseTensor *reserve_space) {
   // Note(andsonder): Fused bn activation only used in the gpu place.
 #if defined(PADDLE_WITH_CUDA) and CUDNN_VERSION >= 7401
-  using CudnnDataType = phi::backends::gpu::CudnnDataType;
+  using CudnnDataType = backends::gpu::CudnnDataType;
   using BatchNormParamType = typename CudnnDataType::BatchNormParamType;
   double epsilon1 = static_cast(epsilon);
@@ -129,7 +129,7 @@ void FusedBatchNormActKernel(const Context &dev_ctx,
     double this_factor = 1. - momentum;
     cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ACTIVATION;
-    phi::backends::gpu::ScopedActivationDescriptor scope_act_desc;
+    backends::gpu::ScopedActivationDescriptor scope_act_desc;
     cudnnActivationDescriptor_t activation_desc_ =
         scope_act_desc.descriptor(act_type);
     size_t workspace_size = 0;
diff --git a/paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_grad_kernel.cu
index 7177bdda8a09bd..642ef1761fe884 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_grad_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_grad_kernel.cu
@@ -38,7 +38,7 @@ namespace phi {
 namespace fusion {
 template
-using CudnnDataType = phi::backends::gpu::CudnnDataType;
+using CudnnDataType = backends::gpu::CudnnDataType;
 template
 using BatchNormParamType = typename CudnnDataType::BatchNormParamType;
@@ -141,7 +141,7 @@ void FusedBatchNormAddActGradKernel(const Context &dev_ctx,
     DenseTensor workspace_tensor;
     auto reserve_space_size = reserve_space_ptr->memory_size();
     cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION;
-    phi::backends::gpu::ScopedActivationDescriptor scope_act_desc;
+    backends::gpu::ScopedActivationDescriptor scope_act_desc;
     cudnnActivationDescriptor_t activation_desc_ =
         scope_act_desc.descriptor(act_type);
     // --------------- cudnn batchnorm workspace ---------------
diff --git a/paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_kernel.cu
index 837e0d40f89f41..b5a69bc25871e5 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_kernel.cu
@@ -38,7 +38,7 @@ namespace phi {
 namespace fusion {
 template
-using CudnnDataType = phi::backends::gpu::CudnnDataType;
+using CudnnDataType = backends::gpu::CudnnDataType;
 template
 using BatchNormParamType = typename CudnnDataType::BatchNormParamType;
@@ -112,7 +112,7 @@ void FusedBatchNormAddActKernel(const Context &dev_ctx,
     double this_factor = 1. - momentum;
     cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION;
-    phi::backends::gpu::ScopedActivationDescriptor scope_act_desc;
+    backends::gpu::ScopedActivationDescriptor scope_act_desc;
     cudnnActivationDescriptor_t activation_desc_ =
         scope_act_desc.descriptor(act_type);
     size_t workspace_size = 0;
diff --git a/paddle/phi/kernels/fusion/gpu/fused_conv2d_add_act_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_conv2d_add_act_kernel.cu
index efa8d043d2e61d..c98c667ed75af4 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_conv2d_add_act_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_conv2d_add_act_kernel.cu
@@ -47,12 +47,12 @@ class CudnnConvDescManager {
   }
   struct CudnnCacheInfo {
-    phi::backends::gpu::TensorDescriptor* x_desc{nullptr};
-    phi::backends::gpu::FilterDescriptor* w_desc{nullptr};
-    phi::backends::gpu::TensorDescriptor* b_desc{nullptr};
-    phi::backends::gpu::TensorDescriptor* o_desc{nullptr};
-    phi::backends::gpu::ConvolutionDescriptor* conv_desc{nullptr};
-    phi::backends::gpu::ActivationDescriptor* act_desc{nullptr};
+    backends::gpu::TensorDescriptor* x_desc{nullptr};
+    backends::gpu::FilterDescriptor* w_desc{nullptr};
+    backends::gpu::TensorDescriptor* b_desc{nullptr};
+    backends::gpu::TensorDescriptor* o_desc{nullptr};
+    backends::gpu::ConvolutionDescriptor* conv_desc{nullptr};
+    backends::gpu::ActivationDescriptor* act_desc{nullptr};
     size_t workspace_size;
     cudnnConvolutionFwdAlgo_t algo;
@@ -283,42 +283,42 @@ class CudnnConvDescManager {
   }
  private:
-  phi::backends::gpu::TensorDescriptor* GetTensorDescInfo(
+  backends::gpu::TensorDescriptor* GetTensorDescInfo(
       const std::vector& input_dims,
       phi::DataType input_dtype,
       cudnnTensorFormat_t input_format) {
-    auto* desc = new phi::backends::gpu::TensorDescriptor();
+    auto* desc = new backends::gpu::TensorDescriptor();
     desc->set(
         input_dims, input_format, backends::gpu::ToCudnnDataType(input_dtype));
     return desc;
   }
-  phi::backends::gpu::FilterDescriptor* GetFilterDescInfo(
+  backends::gpu::FilterDescriptor* GetFilterDescInfo(
      const std::vector& input_dims,
      phi::DataType input_dtype,
      cudnnTensorFormat_t input_format) {
-    auto* desc = new phi::backends::gpu::FilterDescriptor();
+    auto* desc = new backends::gpu::FilterDescriptor();
     desc->set(
         input_dims, input_format, backends::gpu::ToCudnnDataType(input_dtype));
     return desc;
   }
-  phi::backends::gpu::ConvolutionDescriptor* GetConvDescInfo(
+  backends::gpu::ConvolutionDescriptor* GetConvDescInfo(
      const std::vector& paddings,
      const std::vector& strides,
      const std::vector& dilations,
      int groups,
      cudnnDataType_t dtype) {
-    auto* desc = new phi::backends::gpu::ConvolutionDescriptor();
+    auto* desc = new backends::gpu::ConvolutionDescriptor();
     desc->set(
         dtype, paddings, strides, dilations, phi::AllowTF32Cudnn(), groups);
     return desc;
   }
-  phi::backends::gpu::ActivationDescriptor* GetActivationDescInfo(
+  backends::gpu::ActivationDescriptor* GetActivationDescInfo(
      const std::string& act,
      double value_max = std::numeric_limits::max()) {
-    auto* desc = new phi::backends::gpu::ActivationDescriptor();
+    auto* desc = new backends::gpu::ActivationDescriptor();
     cudnnActivationMode_t mode;
     double relu_ceiling = 0.0;
     if (act == "identity") {
@@ -545,7 +545,7 @@ void FusedConv2dAddActKernel(const Context& dev_ctx,
         conv_attr_cache->dilations,
         transformed_input.dtype(),
         groups,
-        phi::backends::gpu::CudnnDataType::type,
+        backends::gpu::CudnnDataType::type,
         compute_format,
         search_func,
         activation);
diff --git a/paddle/phi/kernels/fusion/gpu/fused_dconv_drelu_dbn_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_dconv_drelu_dbn_kernel.cu
index f187543d70d14c..5b0cc43081f424 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_dconv_drelu_dbn_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_dconv_drelu_dbn_kernel.cu
@@ -34,7 +34,7 @@ namespace fusion {
 using helper = phi::CudnnFrontendConvHelper;
 template
-using CudnnDataType = phi::backends::gpu::CudnnDataType;
+using CudnnDataType = backends::gpu::CudnnDataType;
 namespace {
 cudnn_frontend::Operation MakeDreluOp(cudnnDataType_t dtype,
@@ -131,17 +131,16 @@ void _DgradDreluBnBwdWeightImpl(const Context& dev_ctx,
   TransToChannelLast(dev_ctx, weight, &w_tensor_transformed);
   // build tensor descriptors
   cudnnTensorFormat_t layout_format = CUDNN_TENSOR_NHWC;
-  auto tensor_format =
-      phi::backends::gpu::ToCudnnDataType(grad_output->dtype());
+  auto tensor_format = backends::gpu::ToCudnnDataType(grad_output->dtype());
   auto tensor_format_math = CUDNN_DATA_FLOAT;
   auto compute_dtype = CUDNN_DATA_FLOAT;
   // get dims in CUDNN manner: [N, C, H, W]
-  auto dim_x = phi::backends::gpu::TransformDimOrder(
-      vectorize(bn1_input->dims()));
-  auto dim_filt = phi::backends::gpu::TransformDimOrder(
+  auto dim_x =
+      backends::gpu::TransformDimOrder(vectorize(bn1_input->dims()));
+  auto dim_filt = backends::gpu::TransformDimOrder(
       vectorize(w_tensor_transformed.dims()));
-  auto dim_y = phi::backends::gpu::TransformDimOrder(
-      vectorize(grad_output->dims()));
+  auto dim_y =
+      backends::gpu::TransformDimOrder(vectorize(grad_output->dims()));
   std::vector dim_scale(dim_x.size(), 1);
   dim_scale[1] = dim_x[1];  //  [1, C, 1, 1]
@@ -542,12 +541,12 @@ void _DbnApplyImpl(const Context& dev_ctx,
   auto handle = dev_ctx.cudnn_handle();
   auto workspace_handle = dev_ctx.cudnn_workspace_handle();
   cudnnTensorFormat_t layout_format = CUDNN_TENSOR_NHWC;
-  auto tensor_format = phi::backends::gpu::ToCudnnDataType(dY_tensor->dtype());
+  auto tensor_format = backends::gpu::ToCudnnDataType(dY_tensor->dtype());
   auto tensor_format_math = CUDNN_DATA_FLOAT;
   auto compute_dtype = CUDNN_DATA_FLOAT;
   // build tensor descriptors
-  auto dim_x = phi::backends::gpu::TransformDimOrder(
-      vectorize(X_tensor->dims()));
+  auto dim_x =
+      backends::gpu::TransformDimOrder(vectorize(X_tensor->dims()));
   std::vector dim_a(dim_x.size(), 1);
   dim_a[1] = dim_x[1];  //  [1, C, 1, 1]
@@ -746,16 +745,16 @@ void _BnActWgradImpl(const Context& dev_ctx,
   ResizeToChannelLast(dev_ctx, dw_tensor, &dw_tensor_transformed);
   // create tensor descriptors
   cudnnTensorFormat_t layout_format = CUDNN_TENSOR_NHWC;
-  auto tensor_format = phi::backends::gpu::ToCudnnDataType(conv_input->dtype());
+  auto tensor_format = backends::gpu::ToCudnnDataType(conv_input->dtype());
   auto tensor_format_math = CUDNN_DATA_FLOAT;
   auto compute_dtype = CUDNN_DATA_FLOAT;
   // create tensor descriptors
-  auto dim_x = phi::backends::gpu::TransformDimOrder(
-      vectorize(conv_input->dims()));
-  auto dim_filt = phi::backends::gpu::TransformDimOrder(
+  auto dim_x =
+      backends::gpu::TransformDimOrder(vectorize(conv_input->dims()));
+  auto dim_filt = backends::gpu::TransformDimOrder(
       vectorize(dw_tensor_transformed.dims()));
-  auto dim_y = phi::backends::gpu::TransformDimOrder(
-      vectorize(grad_output->dims()));
+  auto dim_y =
+      backends::gpu::TransformDimOrder(vectorize(grad_output->dims()));
   std::vector dim_scale(dim_x.size(), 1);
   dim_scale[1] = dim_x[1];  //  [1, C, 1, 1]
@@ -995,7 +994,7 @@ void FusedDconvDreluDbnKernel(const Context& dev_ctx,
           "This op only supports Ampere and later devices, "
           "but got compute capability: %d.",
           dev_ctx.GetComputeCapability()));
-  auto cudnn_version = phi::backends::gpu::DnnVersion();
+  auto cudnn_version = backends::gpu::DnnVersion();
   PADDLE_ENFORCE_GE(cudnn_version,
                     8900,
                     common::errors::PreconditionNotMet(
diff --git a/paddle/phi/kernels/fusion/gpu/fused_dot_product_attention_op.cu b/paddle/phi/kernels/fusion/gpu/fused_dot_product_attention_op.cu
index fff5cf7398b837..54c6888bcf4391 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_dot_product_attention_op.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_dot_product_attention_op.cu
@@ -86,7 +86,7 @@ void FusedDotProductAttentionKernel(const Context &dev_ctx,
           "This op only supports Ampere and later devices, "
           "but got compute capability: %d.",
           dev_ctx.GetComputeCapability()));
-  auto cudnn_version = phi::backends::gpu::DnnVersion();
+  auto cudnn_version = backends::gpu::DnnVersion();
   PADDLE_ENFORCE_GE(cudnn_version,
                     8906,
                     common::errors::PreconditionNotMet(
@@ -294,7 +294,7 @@ void FusedDotProductAttentionGradKernel(
           "This op only supports Ampere and later devices, "
           "but got compute capability: %d.",
           dev_ctx.GetComputeCapability()));
-  auto cudnn_version = phi::backends::gpu::DnnVersion();
+  auto cudnn_version = backends::gpu::DnnVersion();
   PADDLE_ENFORCE_GE(cudnn_version,
                     8906,
                     common::errors::PreconditionNotMet(
diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h b/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h
index fdfbcfc7d57e9f..c1c007a966c076 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h
+++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h
@@ -459,8 +459,8 @@ void LaunchDropoutActBiasGrad(Functor act_functor,
     }
   } else {
     const uint64_t n = rows * cols;
-    phi::backends::gpu::GpuLaunchConfig config =
-        phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n / real_vec_size);
+    backends::gpu::GpuLaunchConfig config =
+        backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n / real_vec_size);
     if (n % VecSize == 0) {
       FusedDropoutActGrad
          <<();
   const uint64_t seed_data = static_cast(seed_offset_data[0]);
   const uint64_t increment = static_cast(seed_offset_data[1]);
@@ -217,7 +217,7 @@ void FusedDropoutAddGradKernel(const Context& dev_ctx,
                  << ", increment = " << increment;
        };
-    phi::backends::gpu::CUDAGraphNodeLauncher::gpuKernelCallback_t
+    backends::gpu::CUDAGraphNodeLauncher::gpuKernelCallback_t
        cudaKernelCallback = [=](unsigned int id) {
          void* functionPtr = reinterpret_cast(
              &(VectorizedDropoutBackward>));
@@ -244,7 +244,7 @@ void FusedDropoutAddGradKernel(const Context& dev_ctx,
              functor);
          return cudaFunc;
        };
-    phi::backends::gpu::CUDAGraphNodeLauncher::Instance().KernelNodeLaunch(
+    backends::gpu::CUDAGraphNodeLauncher::Instance().KernelNodeLaunch(
        parameterSetter, cudaKernelCallback);
     VLOG(10) << "NON_CUDA_GRAPH seed = " << seed_data
diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu
index 3766d881dedc74..1ab2d3df35e277 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu
@@ -193,35 +193,34 @@ void FusedDropoutAddKernel(const Context& dev_ctx,
     // seed_offset_data should preserved by cudaGraph pool
     auto gen_cuda = dev_ctx.GetGenerator();
     auto state_index = gen_cuda->GetStateIndex();
-    auto parameterSetter =
-        [dev_ctx_p,
-         offset,
-         seed_offset_data,
-         state_index,
-         seed_tensor_ptr,
-         fix_seed](phi::backends::gpu::gpuKernelParams& params) {
-          if (!fix_seed) {
-            auto gen_cuda = dev_ctx_p->GetGenerator();
-            // ensure the generator use correct state index
-            gen_cuda->SetStateIndex(state_index);
-
-            // we assume seed is null pointer
-            // seed copy to cpu is meaningless here
-            assert(seed_tensor_ptr == nullptr);
-
-            uint64_t seed, increment;
-            std::tie(seed, increment) = gen_cuda->IncrementOffset(offset);
-            VLOG(10) << "CUDA_GRAPH seed = " << seed
-                     << ", increment = " << increment;
-
-            params.As(2) = seed;
-            params.As(6) = increment;
-
-            seed_offset_data[0] = static_cast(seed);
-            seed_offset_data[1] = static_cast(increment);
-          }
-        };
-    phi::backends::gpu::CUDAGraphNodeLauncher::gpuKernelCallback_t
+    auto parameterSetter = [dev_ctx_p,
+                            offset,
+                            seed_offset_data,
+                            state_index,
+                            seed_tensor_ptr,
+                            fix_seed](backends::gpu::gpuKernelParams& params) {
+      if (!fix_seed) {
+        auto gen_cuda = dev_ctx_p->GetGenerator();
+        // ensure the generator use correct state index
+        gen_cuda->SetStateIndex(state_index);
+
+        // we assume seed is null pointer
+        // seed copy to cpu is meaningless here
+        assert(seed_tensor_ptr == nullptr);
+
+        uint64_t seed, increment;
+        std::tie(seed, increment) = gen_cuda->IncrementOffset(offset);
+        VLOG(10) << "CUDA_GRAPH seed = " << seed
+                 << ", increment = " << increment;
+
+        params.As(2) = seed;
+        params.As(6) = increment;
+
+        seed_offset_data[0] = static_cast(seed);
+        seed_offset_data[1] = static_cast(increment);
+      }
+    };
+    backends::gpu::CUDAGraphNodeLauncher::gpuKernelCallback_t
        cudaKernelCallback = [=](unsigned int id) {
          void* functionPtr = reinterpret_cast(
              &(VectorizedDropoutForward>));
@@ -247,7 +246,7 @@ void FusedDropoutAddKernel(const Context& dev_ctx,
              dst_functor);
          return cudaFunc;
        };
-    phi::backends::gpu::CUDAGraphNodeLauncher::Instance().KernelNodeLaunch(
+    backends::gpu::CUDAGraphNodeLauncher::Instance().KernelNodeLaunch(
        parameterSetter, cudaKernelCallback);
     VLOG(10) << "NON_CUDA_GRAPH seed = " << seed_data
diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_utils.h b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_utils.h
index 83d180b42ae3a5..9f42dcdd21cdb5 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_utils.h
+++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_utils.h
@@ -28,7 +28,7 @@ static inline std::vector GetRandomCudaProp(int64_t numel,
   size_t grid_size = gpu_config.GetGridSize();
   size_t block_size = gpu_config.GetBlockSize();
   int64_t device_id = dev_ctx.GetPlace().GetDeviceId();
-  const auto& prop = phi::backends::gpu::GetDeviceProperties(device_id);
+  const auto& prop = backends::gpu::GetDeviceProperties(device_id);
   size_t max_grid_size =
       prop.maxThreadsPerMultiProcessor * prop.multiProcessorCount / block_size;
   grid_size = std::min(grid_size, max_grid_size);
diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_common.h b/paddle/phi/kernels/fusion/gpu/fused_dropout_common.h
index 4cf7c8ed9060f2..c2e0f109174466 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_dropout_common.h
+++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_common.h
@@ -51,7 +51,7 @@ namespace fusion {
  * 1D blocks: blockDim.x = cols
  * 2D grids: gridDim.y = rows
  */
-inline phi::backends::gpu::GpuLaunchConfig Get1DBlocksAnd2DGrids(
+inline backends::gpu::GpuLaunchConfig Get1DBlocksAnd2DGrids(
     const GPUContext &dev_ctx,
     const uint64_t rows,
     const uint64_t cols,
@@ -79,7 +79,7 @@ inline phi::backends::gpu::GpuLaunchConfig Get1DBlocksAnd2DGrids(
     blocks_y = (blocks_y + blocks_z - 1) / blocks_z;
     blocks_y = blocks_y >= 65536 ? 65535 : blocks_y;
   }
-  phi::backends::gpu::GpuLaunchConfig config;
+  backends::gpu::GpuLaunchConfig config;
   config.block_per_grid.x = static_cast(blocks_x);
   config.block_per_grid.y = static_cast(blocks_y);
   config.block_per_grid.z = static_cast(blocks_z);
diff --git a/paddle/phi/kernels/fusion/gpu/fused_embedding_eltwise_layernorm_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_embedding_eltwise_layernorm_kernel.cu
index 8409e70911a6be..1b43ebfddf3787 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_embedding_eltwise_layernorm_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_embedding_eltwise_layernorm_kernel.cu
@@ -67,7 +67,7 @@ void EmbeddingEltWiseLayerNormKernel(
   }
   const int64_t* stable_in1s =
-      phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
+      backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
           const_cast(in1s.data()), in1s.size());
   phi::memory_utils::Copy(GPUPlace{},
                           in_ids_d,
                           sizeof(int64_t) * input_num,
                           dev_ctx.stream());
   const int64_t* stable_in2s =
-      phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
+      backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
          const_cast(in2s.data()), in2s.size());
   phi::memory_utils::Copy(GPUPlace{},
                           in_embs_d,
diff --git a/paddle/phi/kernels/fusion/gpu/fused_fc_elementwise_layernorm_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_fc_elementwise_layernorm_kernel.cu
index 2eded99e6416e5..40553cecc2f7ae 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_fc_elementwise_layernorm_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_fc_elementwise_layernorm_kernel.cu
@@ -312,7 +312,7 @@ void AddReluAddLayerNorm(const Context& dev_ctx,
                          int N,
                          float epsilon) {
   if (with_relu) {
-    switch (phi::backends::gpu::RoundToPowerOfTwo(N)) {
+    switch (backends::gpu::RoundToPowerOfTwo(N)) {
      CUDA_LAUNCH_KERNEL_HELPER(
          InplaceAddReluAddLayerNormKernel << << << <<
-using CudnnDataType = phi::backends::gpu::CudnnDataType;
+using CudnnDataType = backends::gpu::CudnnDataType;
 template
 using LayerNormParamType = typename CudnnDataType::BatchNormParamType;
diff --git a/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_op.cu.h b/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_op.cu.h
index 6dd276d4e2151b..37bba718254417 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_op.cu.h
+++ b/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_op.cu.h
@@ -2093,10 +2093,10 @@ inline cudaError_t GetNumBlocks(int64_t n, int *num_blocks) {
   constexpr int kBlockSize = 128;
   constexpr int kNumWaves = 16;
-  const int device_id = phi::backends::gpu::GetCurrentDeviceId();
-  const int sm_count = phi::backends::gpu::GetGPUMultiProcessors(device_id);
+  const int device_id = backends::gpu::GetCurrentDeviceId();
+  const int sm_count = backends::gpu::GetGPUMultiProcessors(device_id);
   const int max_thread_per_multiprocessor =
-      phi::backends::gpu::GetGPUMaxThreadsPerMultiProcessor(device_id);
+      backends::gpu::GetGPUMaxThreadsPerMultiProcessor(device_id);
   *num_blocks =
       std::max(1,
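The GetNumBlocks helpers touched in this patch (block_attn.h, fused_bias_act_utils.h, fused_multi_transformer_op.cu.h) all size a 1D grid with the same occupancy idea: enough blocks to cover n elements, capped at a fixed number of "waves" of resident blocks across the SMs. The statement that computes the cap is elided above, so the following is only a hedged, generic sketch of that heuristic; the constants and names mirror the visible locals but are illustrative, not the elided Paddle code.

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Illustrative occupancy heuristic: cover n elements with kBlockSize-wide
// blocks, but launch at most kNumWaves waves of resident blocks per SM.
inline int NumBlocksSketch(int64_t n, int sm_count, int max_threads_per_sm) {
  constexpr int kBlockSize = 128;
  constexpr int kNumWaves = 16;
  const int64_t blocks_to_cover_n = (n + kBlockSize - 1) / kBlockSize;
  const int64_t blocks_per_wave =
      static_cast<int64_t>(sm_count) * max_threads_per_sm / kBlockSize;
  return static_cast<int>(std::max<int64_t>(
      1, std::min(blocks_to_cover_n, blocks_per_wave * kNumWaves)));
}

int main() {
  // Example: 1M elements on a hypothetical 80-SM device, 2048 threads per SM.
  std::printf("num_blocks = %d\n", NumBlocksSketch(1 << 20, 80, 2048));
  return 0;
}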
diff --git a/paddle/phi/kernels/fusion/gpu/fused_residual_dropout_bias.h b/paddle/phi/kernels/fusion/gpu/fused_residual_dropout_bias.h
index 227e5a2413b050..e4d9da8723e0af 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_residual_dropout_bias.h
+++ b/paddle/phi/kernels/fusion/gpu/fused_residual_dropout_bias.h
@@ -504,9 +504,8 @@ void LaunchResidualDropoutBiasGrad(const T *dout,
                    dev_ctx.stream());                                    \
   } else {                                                               \
     const uint64_t n = rows * cols;                                      \
-    phi::backends::gpu::GpuLaunchConfig config =                         \
-        phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx,                \
-                                                 n / real_vec_size);     \
+    backends::gpu::GpuLaunchConfig config =                              \
+        backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n / real_vec_size); \
     if (n % VecSize == 0) {                                              \
       FusedResidualDropoutGrad                                           \
           <<
-using CudnnDataType = phi::backends::gpu::CudnnDataType;
+using CudnnDataType = backends::gpu::CudnnDataType;
 template
 void FusedScaleBiasAddReluKernel(const Context& dev_ctx,
@@ -75,12 +75,11 @@ void FusedScaleBiasAddReluKernel(const Context& dev_ctx,
   auto workspace_handle = dev_ctx.cudnn_workspace_handle();
   // create tensor descriptors
   cudnnTensorFormat_t layout_format = CUDNN_TENSOR_NHWC;
-  auto tensor_format = phi::backends::gpu::ToCudnnDataType(x1.dtype());
+  auto tensor_format = backends::gpu::ToCudnnDataType(x1.dtype());
   auto tensor_format_math = CUDNN_DATA_FLOAT;
   auto compute_dtype = CUDNN_DATA_FLOAT;
-  auto dim_x =
-      phi::backends::gpu::TransformDimOrder(vectorize(x1.dims()));
+  auto dim_x = backends::gpu::TransformDimOrder(vectorize(x1.dims()));
   std::vector dim_c(dim_x.size(), 1);
   dim_c[1] = dim_x[1];  //  [1, C, 1, 1]
diff --git a/paddle/phi/kernels/fusion/gpu/fused_scale_bias_relu_conv_bn_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_scale_bias_relu_conv_bn_kernel.cu
index cedae622480554..10f2cef97cf20b 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_scale_bias_relu_conv_bn_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_scale_bias_relu_conv_bn_kernel.cu
@@ -33,7 +33,7 @@ namespace fusion {
 using helper = phi::CudnnFrontendConvHelper;
 template
-using CudnnDataType = phi::backends::gpu::CudnnDataType;
+using CudnnDataType = backends::gpu::CudnnDataType;
 /*
  * Implements Scale + Bias + ReLU + Conv + BNStats fusion pattern.
@@ -115,18 +115,17 @@ void FusedScaleBiasReluConvBnstatsImpl(const Context& dev_ctx,
   // build tensors
   cudnnTensorFormat_t layout_format = CUDNN_TENSOR_NHWC;
-  auto tensor_format = phi::backends::gpu::ToCudnnDataType(x.dtype());
+  auto tensor_format = backends::gpu::ToCudnnDataType(x.dtype());
   auto tensor_format_math = CUDNN_DATA_FLOAT;
   auto compute_dtype = CUDNN_DATA_FLOAT;
   // get dims in CUDNN manner: [N, C, H, W]
-  auto dim_x =
-      phi::backends::gpu::TransformDimOrder(vectorize(in_dims));
+  auto dim_x = backends::gpu::TransformDimOrder(vectorize(in_dims));
   auto dim_filt =
-      phi::backends::gpu::TransformDimOrder(vectorize(filter_dims));
+      backends::gpu::TransformDimOrder(vectorize(filter_dims));
   auto dim_y =
-      phi::backends::gpu::TransformDimOrder(vectorize(output->dims()));
+      backends::gpu::TransformDimOrder(vectorize(output->dims()));
   std::vector dim_scale(dim_x.size(), 1);
   dim_scale[1] = dim_x[1];  //  [1, C, 1, 1]
   std::vector dim_sum(dim_x.size(), 1);  // [1, K, 1, 1]
@@ -317,9 +316,8 @@ void BNFinalizeImpl(const Context& dev_ctx,
   auto workspace_handle = dev_ctx.cudnn_workspace_handle();
   // set dtypes
   cudnnTensorFormat_t layout_format = CUDNN_TENSOR_NHWC;
-  auto tensor_format_bn =
-      phi::backends::gpu::ToCudnnDataType(sum_tensor.dtype());
-  auto tensor_format = phi::backends::gpu::ToCudnnDataType(eq_scale->dtype());
+  auto tensor_format_bn = backends::gpu::ToCudnnDataType(sum_tensor.dtype());
+  auto tensor_format = backends::gpu::ToCudnnDataType(eq_scale->dtype());
   auto compute_dtype = CUDNN_DATA_FLOAT;
   // create tensor descriptors
   auto dim_input = vectorize(sum_tensor.dims());
@@ -499,7 +497,7 @@ void FusedScaleBiasReluConvBnKernel(const Context& dev_ctx,
                                     DenseTensor* saved_var,
                                     DenseTensor* eq_scale,
                                     DenseTensor* eq_bias) {
-  auto cudnn_version = phi::backends::gpu::DnnVersion();
+  auto cudnn_version = backends::gpu::DnnVersion();
   PADDLE_ENFORCE_GE(cudnn_version,
                     8800,
                     common::errors::PreconditionNotMet(
diff --git a/paddle/phi/kernels/fusion/gpu/fused_seqpool_cvm_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_seqpool_cvm_grad_kernel.cu
index 20cbe5e06ddc66..b458c6e07f979d 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_seqpool_cvm_grad_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_seqpool_cvm_grad_kernel.cu
@@ -132,9 +132,9 @@ void FusedSeqpoolCVMGrad(const GPUContext &dev_ctx,
       dev_ctx.GetPlace(), total_ptr_len * sizeof(void *));
 #ifdef PADDLE_WITH_HIP
   T **gpu_out_grads_values = reinterpret_cast(temp_ptr->ptr());
-  phi::backends::gpu::GpuMemcpyAsync(
+  backends::gpu::GpuMemcpyAsync(
       gpu_out_grads_values,
-      phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
+      backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
           const_cast(out_grads_data.data()), out_grads_data.size()),
       out_grads_data.size() * sizeof(T *),
       hipMemcpyHostToDevice,
   T **gpu_in_grads_values =
       reinterpret_cast(&gpu_out_grads_values[out_grads_data.size()]);
-  phi::backends::gpu::GpuMemcpyAsync(
+  backends::gpu::GpuMemcpyAsync(
       gpu_in_grads_values,
-      phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
+      backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
          const_cast(in_grads_data.data()), in_grads_data.size()),
       in_grads_data.size() * sizeof(T *),
       hipMemcpyHostToDevice,
   T **gpu_cvm_values =
       reinterpret_cast(&gpu_in_grads_values[in_grads_data.size()]);
-  phi::backends::gpu::GpuMemcpyAsync(
+  backends::gpu::GpuMemcpyAsync(
       gpu_cvm_values,
-      phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
+      backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
          const_cast(cvm_data.data()), cvm_data.size()),
       cvm_data.size() * sizeof(T *),
       hipMemcpyHostToDevice,
   size_t **lods_values =
       reinterpret_cast(&gpu_cvm_values[cvm_data.size()]);
-  phi::backends::gpu::GpuMemcpyAsync(
+  backends::gpu::GpuMemcpyAsync(
       lods_values,
-      phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
+      backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
          const_cast(lods.data()), lods.size()),
       lods.size() * sizeof(size_t *),
       hipMemcpyHostToDevice,
       stream);
 #else
   T **gpu_out_grads_values = reinterpret_cast(temp_ptr->ptr());
-  phi::backends::gpu::GpuMemcpyAsync(
+  backends::gpu::GpuMemcpyAsync(
       gpu_out_grads_values,
-      phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
+      backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
          const_cast(out_grads_data.data()), out_grads_data.size()),
       out_grads_data.size() * sizeof(T *),
       cudaMemcpyHostToDevice,
   T **gpu_in_grads_values =
       reinterpret_cast(&gpu_out_grads_values[out_grads_data.size()]);
-  phi::backends::gpu::GpuMemcpyAsync(
+  backends::gpu::GpuMemcpyAsync(
       gpu_in_grads_values,
-      phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
+      backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
          const_cast(in_grads_data.data()), in_grads_data.size()),
       in_grads_data.size() * sizeof(T *),
       cudaMemcpyHostToDevice,
   T **gpu_cvm_values =
       reinterpret_cast(&gpu_in_grads_values[in_grads_data.size()]);
-  phi::backends::gpu::GpuMemcpyAsync(
+  backends::gpu::GpuMemcpyAsync(
       gpu_cvm_values,
-      phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
+      backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
          const_cast(cvm_data.data()), cvm_data.size()),
       cvm_data.size() * sizeof(T *),
       cudaMemcpyHostToDevice,
   size_t **lods_values =
       reinterpret_cast(&gpu_cvm_values[cvm_data.size()]);
-  phi::backends::gpu::GpuMemcpyAsync(
+  backends::gpu::GpuMemcpyAsync(
       lods_values,
-      phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
+      backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
          const_cast(lods.data()), lods.size()),
       lods.size() * sizeof(size_t *),
       cudaMemcpyHostToDevice,
 #endif
   size_t N = static_cast(batch_size * slot_num * embedding_size);
-  auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, N);
+  auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, N);
   if (use_cvm) {
     // join grad
     FusedSeqpoolCVMGradKernelWithCVM<<(temp_ptr->ptr());
-  phi::backends::gpu::GpuMemcpyAsync(
+  backends::gpu::GpuMemcpyAsync(
       gpu_input_values,
-      phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
+      backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
          const_cast(input_data.data()), input_data.size()),
       input_data.size() * sizeof(T *),
       hipMemcpyHostToDevice,
       stream);
   T **gpu_output_values =
       reinterpret_cast(&gpu_input_values[input_data.size()]);
-  phi::backends::gpu::GpuMemcpyAsync(
+  backends::gpu::GpuMemcpyAsync(
       gpu_output_values,
-      phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
+      backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
          const_cast(output_data.data()), output_data.size()),
       output_data.size() * sizeof(T *),
       hipMemcpyHostToDevice,
       stream);
   T **gpu_seqpool_output_values =
       reinterpret_cast(&gpu_output_values[output_data.size()]);
-  phi::backends::gpu::GpuMemcpyAsync(
+  backends::gpu::GpuMemcpyAsync(
       gpu_seqpool_output_values,
-      phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
+      backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
          const_cast(seqpool_output_data.data()),
          seqpool_output_data.size()),
       seqpool_output_data.size() * sizeof(T *),
@@ -150,36 +150,36 @@ void FusedSeqpoolCVM(
       stream);
   size_t **lods_values = reinterpret_cast(
       &gpu_seqpool_output_values[seqpool_output_data.size()]);
-  phi::backends::gpu::GpuMemcpyAsync(
+  backends::gpu::GpuMemcpyAsync(
       lods_values,
-      phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
+      backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
          const_cast(lods.data()), lods.size()),
       lods.size() * sizeof(size_t *),
       hipMemcpyHostToDevice,
       stream);
 #else
   T **gpu_input_values = reinterpret_cast(temp_ptr->ptr());
-  phi::backends::gpu::GpuMemcpyAsync(
+  backends::gpu::GpuMemcpyAsync(
       gpu_input_values,
-      phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
+      backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
          const_cast(input_data.data()), input_data.size()),
       input_data.size() * sizeof(T *),
       cudaMemcpyHostToDevice,
       stream);
   T **gpu_output_values =
       reinterpret_cast(&gpu_input_values[input_data.size()]);
-  phi::backends::gpu::GpuMemcpyAsync(
+  backends::gpu::GpuMemcpyAsync(
       gpu_output_values,
-      phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
+      backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
          const_cast(output_data.data()), output_data.size()),
       output_data.size() * sizeof(T *),
       cudaMemcpyHostToDevice,
       stream);
   T **gpu_seqpool_output_values =
       reinterpret_cast(&gpu_output_values[output_data.size()]);
-  phi::backends::gpu::GpuMemcpyAsync(
+  backends::gpu::GpuMemcpyAsync(
       gpu_seqpool_output_values,
-      phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
+      backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
          const_cast(seqpool_output_data.data()),
          seqpool_output_data.size()),
       seqpool_output_data.size() * sizeof(T *),
@@ -187,9 +187,9 @@ void FusedSeqpoolCVM(
       stream);
   size_t **lods_values = reinterpret_cast(
       &gpu_seqpool_output_values[seqpool_output_data.size()]);
-  phi::backends::gpu::GpuMemcpyAsync(
+  backends::gpu::GpuMemcpyAsync(
       lods_values,
-      phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
+      backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
          const_cast(lods.data()), lods.size()),
       lods.size() * sizeof(size_t *),
       cudaMemcpyHostToDevice,
 #endif
   size_t N = static_cast(batch_size * slot_num * embedding_size);
-  phi::backends::gpu::GpuLaunchConfig config =
-      phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, N);
+  backends::gpu::GpuLaunchConfig config =
+      backends::gpu::GetGpuLaunchConfig1D(dev_ctx, N);
   // first sum pool
   FusedSeqpoolKernelNormal<<(batch_size * slot_num * (embedding_size - cvm_offset));
-  phi::backends::gpu::GpuLaunchConfig config =
-      phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, N);
+  backends::gpu::GpuLaunchConfig config =
+      backends::gpu::GetGpuLaunchConfig1D(dev_ctx, N);
   FusedCVMKernelNoCVM<<
-using CudnnDataType = phi::backends::gpu::CudnnDataType;
+using CudnnDataType = backends::gpu::CudnnDataType;
 template
 void TransposeFlattenConcatFusionKernel(
diff --git a/paddle/phi/kernels/fusion/gpu/max_pool2d_v2_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/max_pool2d_v2_grad_kernel.cu
index 9c0eb5e1d81be9..93246925b39f13 100644
--- a/paddle/phi/kernels/fusion/gpu/max_pool2d_v2_grad_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/max_pool2d_v2_grad_kernel.cu
@@ -122,8 +122,8 @@ void MaxPoolV2GradCUDNNKernel(const Context& dev_ctx,
   auto workspace_handle = dev_ctx.cudnn_workspace_handle();
   auto layout = GetLayoutFromStr(data_format);
-  auto layout_format = phi::backends::gpu::GetCudnnTensorFormat(layout);
-  auto input_dtype = phi::backends::gpu::CudnnDataType::type;
+  auto layout_format = backends::gpu::GetCudnnTensorFormat(layout);
+  auto input_dtype = backends::gpu::CudnnDataType::type;
   auto saved_idx_dtype = CudnnIndexType::type;
   // Create plan and execute
diff --git a/paddle/phi/kernels/fusion/gpu/max_pool2d_v2_kernel.cu b/paddle/phi/kernels/fusion/gpu/max_pool2d_v2_kernel.cu
index efaca556b50cea..88b80f899cb721 100644
--- a/paddle/phi/kernels/fusion/gpu/max_pool2d_v2_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/max_pool2d_v2_kernel.cu
@@ -104,8 +104,8 @@ void MaxPoolV2CUDNNKernel(const Context& dev_ctx,
   auto workspace_handle = dev_ctx.cudnn_workspace_handle();
   auto layout = GetLayoutFromStr(data_format);
-  auto layout_format = phi::backends::gpu::GetCudnnTensorFormat(layout);
-  auto input_dtype = phi::backends::gpu::CudnnDataType::type;
+  auto layout_format = backends::gpu::GetCudnnTensorFormat(layout);
+  auto input_dtype = backends::gpu::CudnnDataType::type;
   auto saved_idx_dtype = CudnnIndexType::type;
   // Create plan and execute
diff --git a/paddle/phi/kernels/fusion/gpu/quant_dequant_kernel.h b/paddle/phi/kernels/fusion/gpu/quant_dequant_kernel.h
index b703192190992b..1e313e4f6f21ef 100644
--- a/paddle/phi/kernels/fusion/gpu/quant_dequant_kernel.h
+++ b/paddle/phi/kernels/fusion/gpu/quant_dequant_kernel.h
@@ -24,7 +24,7 @@ namespace phi {
 namespace fusion {
-using phi::backends::gpu::GpuLaunchConfig;
+using backends::gpu::GpuLaunchConfig;
 constexpr int DequantKernelVecSize = 4;
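The paddle/phi/kernels/gpu/ files below repeatedly guard host/device copies with IsCUDAGraphCapturing checks or route host pointers through RestoreHostMemIfCapturingCUDAGraph. As I read these hunks, the reason is that an async copy recorded into a CUDA graph only executes at replay time, when a transient host buffer may already be gone, and a D2H sync during capture is simply illegal. A standalone sketch of the underlying check using the plain CUDA runtime (not Paddle's wrappers) follows.

#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  std::vector<float> host(256, 1.0f);  // transient host buffer
  float* dev = nullptr;
  cudaMalloc(&dev, host.size() * sizeof(float));

  cudaStreamCaptureStatus status = cudaStreamCaptureStatusNone;
  cudaStreamIsCapturing(stream, &status);
  if (status == cudaStreamCaptureStatusNone) {
    // Not capturing: the async H2D copy runs now, while `host` is still alive.
    cudaMemcpyAsync(dev, host.data(), host.size() * sizeof(float),
                    cudaMemcpyHostToDevice, stream);
    cudaStreamSynchronize(stream);
  } else {
    // Capturing: the copy is only replayed later, so the host memory would
    // have to be kept alive (e.g. copied into a capture-owned buffer) first.
    std::printf("stream is capturing; keep the host buffer alive for replays\n");
  }

  cudaFree(dev);
  cudaStreamDestroy(stream);
  return 0;
}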
b/paddle/phi/kernels/fusion/gpu/resnet_unit_grad_kernel.cu index e1aaca0038b9f1..ba7ec1a45fdb60 100644 --- a/paddle/phi/kernels/fusion/gpu/resnet_unit_grad_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/resnet_unit_grad_kernel.cu @@ -64,7 +64,7 @@ void ResNetUnitGradKernel(const Context &dev_ctx, DenseTensor *filter_z_grad, DenseTensor *scale_z_grad, DenseTensor *bias_z_grad) { - PADDLE_ENFORCE_EQ(phi::backends::gpu::CudnnDataType::type, + PADDLE_ENFORCE_EQ(backends::gpu::CudnnDataType::type, CUDNN_DATA_HALF, common::errors::Unavailable( "ResNetUnitOp only supports float16 for now.")); diff --git a/paddle/phi/kernels/fusion/gpu/resnet_unit_kernel.cu b/paddle/phi/kernels/fusion/gpu/resnet_unit_kernel.cu index 77ccbe117f96b4..5479e96c030852 100644 --- a/paddle/phi/kernels/fusion/gpu/resnet_unit_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/resnet_unit_kernel.cu @@ -63,7 +63,7 @@ void ResNetUnitKernel(const Context &dev_ctx, DenseTensor *saved_invstd_z, DenseTensor *running_mean_z, DenseTensor *running_var_z) { - PADDLE_ENFORCE_EQ(phi::backends::gpu::CudnnDataType::type, + PADDLE_ENFORCE_EQ(backends::gpu::CudnnDataType::type, CUDNN_DATA_HALF, common::errors::Unavailable( "ResNetUnitOp only supports float16 for now.")); diff --git a/paddle/phi/kernels/gpu/add_n_kernel.cu b/paddle/phi/kernels/gpu/add_n_kernel.cu index d2cc2b6e2b02a0..1c35fd225e1cd7 100644 --- a/paddle/phi/kernels/gpu/add_n_kernel.cu +++ b/paddle/phi/kernels/gpu/add_n_kernel.cu @@ -189,10 +189,9 @@ void AddNKernel(const Context &dev_ctx, auto tmp_in_array = phi::memory_utils::Alloc( dev_ctx.GetPlace(), in_data.size() * sizeof(void *)); size_t nbytes_in = in_data.size() * sizeof(void *); - const void *stable_in = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( - reinterpret_cast(const_cast(in_data.data())), - nbytes_in); + const void *stable_in = backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + reinterpret_cast(const_cast(in_data.data())), + nbytes_in); memory_utils::Copy(dev_ctx.GetPlace(), tmp_in_array->ptr(), CPUPlace(), @@ -285,9 +284,8 @@ void AddNKernel(const Context &dev_ctx, dev_ctx.GetPlace(), sr_in_out_data.size() * sizeof(T *)); size_t nbytes_sr = sr_in_out_data.size() * sizeof(T *); - const void *stable_sr = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( - reinterpret_cast(sr_in_out_data.data()), nbytes_sr); + const void *stable_sr = backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + reinterpret_cast(sr_in_out_data.data()), nbytes_sr); memory_utils::Copy(dev_ctx.GetPlace(), tmp_sr_in_out_array->ptr(), CPUPlace(), @@ -310,10 +308,9 @@ void AddNKernel(const Context &dev_ctx, memory_utils::Alloc(dev_ctx.GetPlace(), in_data.size() * sizeof(T *)); size_t nbytes_in2 = in_data.size() * sizeof(T *); - const void *stable_in2 = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( - reinterpret_cast(const_cast(in_data.data())), - nbytes_in2); + const void *stable_in2 = backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + reinterpret_cast(const_cast(in_data.data())), + nbytes_in2); memory_utils::Copy(dev_ctx.GetPlace(), tmp_in_array->ptr(), CPUPlace(), diff --git a/paddle/phi/kernels/gpu/amp_kernel.cu b/paddle/phi/kernels/gpu/amp_kernel.cu index 22875642d68075..f65ae8303c2b89 100644 --- a/paddle/phi/kernels/gpu/amp_kernel.cu +++ b/paddle/phi/kernels/gpu/amp_kernel.cu @@ -175,9 +175,8 @@ class LazyZeros { for (int i = 0; i < xs_size; i++) { h_starts[i + 1] = h_starts[i] + outs[i]->numel(); } - auto* stable_h_starts = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(h_starts, - xs_size + 1); + 
auto* stable_h_starts = backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + h_starts, xs_size + 1); memory_utils::Copy(dev_ctx.GetPlace(), d_starts, cpu_place, @@ -199,8 +198,7 @@ class LazyZeros { h_out_addrs[i] = dev_ctx.Alloc(outs[i]); } auto* stable_h_out_addrs = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(h_out_addrs, - xs_size); + backends::gpu::RestoreHostMemIfCapturingCUDAGraph(h_out_addrs, xs_size); memory_utils::Copy(dev_ctx.GetPlace(), d_out_addrs, cpu_place, @@ -311,8 +309,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx, } int64_t total_num = h_starts[xs_size]; auto* stable_h_starts = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(h_starts, - xs_size + 1); + backends::gpu::RestoreHostMemIfCapturingCUDAGraph(h_starts, xs_size + 1); memory_utils::Copy(dev_ctx.GetPlace(), d_starts, cpu_place, @@ -337,7 +334,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx, h_outs[i] = dev_ctx.template Alloc(outs[i]); } auto* stable_h_xs = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(h_xs, 2 * xs_size); + backends::gpu::RestoreHostMemIfCapturingCUDAGraph(h_xs, 2 * xs_size); memory_utils::Copy(dev_ctx.GetPlace(), d_xs, cpu_place, diff --git a/paddle/phi/kernels/gpu/average_accumulates_kernel.cu b/paddle/phi/kernels/gpu/average_accumulates_kernel.cu index aa57e24e041271..cae023a1642eb4 100644 --- a/paddle/phi/kernels/gpu/average_accumulates_kernel.cu +++ b/paddle/phi/kernels/gpu/average_accumulates_kernel.cu @@ -70,8 +70,7 @@ void SetAccumulators(const GPUContext& dev_ctx, auto cuda_place = out_old_num_accumulates->place(); const int64_t* stable_na = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(&num_accumulates, - 1); + backends::gpu::RestoreHostMemIfCapturingCUDAGraph(&num_accumulates, 1); memory_utils::Copy(dev_ctx.GetPlace(), out_num_accumulates_ptr, CPUPlace(), @@ -79,9 +78,8 @@ void SetAccumulators(const GPUContext& dev_ctx, sizeof(int64_t), stream); - const int64_t* stable_ona = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( - &old_num_accumulates, 1); + const int64_t* stable_ona = backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + &old_num_accumulates, 1); memory_utils::Copy(dev_ctx.GetPlace(), out_old_num_accumulates_ptr, CPUPlace(), @@ -90,7 +88,7 @@ void SetAccumulators(const GPUContext& dev_ctx, stream); const int64_t* stable_nu = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(&num_updates, 1); + backends::gpu::RestoreHostMemIfCapturingCUDAGraph(&num_updates, 1); memory_utils::Copy(cuda_place, out_num_updates_ptr, CPUPlace(), diff --git a/paddle/phi/kernels/gpu/box_coder_kernel.cu b/paddle/phi/kernels/gpu/box_coder_kernel.cu index bc2ed13f307649..9d7e9a221de6fd 100644 --- a/paddle/phi/kernels/gpu/box_coder_kernel.cu +++ b/paddle/phi/kernels/gpu/box_coder_kernel.cu @@ -217,7 +217,7 @@ void BoxCoderKernel(const Context &dev_ctx, auto cplace = CPUPlace(); const auto gplace = dev_ctx.GetPlace(); const float *stable_variance = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + backends::gpu::RestoreHostMemIfCapturingCUDAGraph( const_cast(variance.data()), variance.size()); memory_utils::Copy( gplace, dev_var_data, cplace, stable_variance, bytes, dev_ctx.stream()); diff --git a/paddle/phi/kernels/gpu/check_numerics_kernel.cu b/paddle/phi/kernels/gpu/check_numerics_kernel.cu index fab45447fb5c95..00282c339c3a33 100644 --- a/paddle/phi/kernels/gpu/check_numerics_kernel.cu +++ b/paddle/phi/kernels/gpu/check_numerics_kernel.cu @@ -323,7 +323,7 @@ static char* GetGpuHintStringPtr(const GPUContext& 
dev_ctx, dev_ctx.stream())); #else const char* stable_str = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + backends::gpu::RestoreHostMemIfCapturingCUDAGraph( const_cast(iter->first.c_str()), op_var.length() + 1); PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(gpu_str_ptr, stable_str, diff --git a/paddle/phi/kernels/gpu/cholesky_kernel.cu b/paddle/phi/kernels/gpu/cholesky_kernel.cu index a74998897135e4..231597aea9d483 100644 --- a/paddle/phi/kernels/gpu/cholesky_kernel.cu +++ b/paddle/phi/kernels/gpu/cholesky_kernel.cu @@ -254,7 +254,7 @@ void CholeskyKernel(const Context& dev_ctx, #endif // check the info PADDLE_ENFORCE_EQ( - phi::backends::gpu::IsCUDAGraphCapturing(), + backends::gpu::IsCUDAGraphCapturing(), false, common::errors::InvalidArgument( "CholeskyKernel does not support CUDA Graph capture: async D2H copy " diff --git a/paddle/phi/kernels/gpu/collect_fpn_proposals_kernel.cu b/paddle/phi/kernels/gpu/collect_fpn_proposals_kernel.cu index a5ef2390d0d911..0419156609847c 100644 --- a/paddle/phi/kernels/gpu/collect_fpn_proposals_kernel.cu +++ b/paddle/phi/kernels/gpu/collect_fpn_proposals_kernel.cu @@ -238,7 +238,7 @@ void GPUCollectFpnProposalsOpKernel( GetLengthLoD<<>>( real_post_num, out_id_data, length_lod_data); PADDLE_ENFORCE_EQ( - phi::backends::gpu::IsCUDAGraphCapturing(), + backends::gpu::IsCUDAGraphCapturing(), false, common::errors::InvalidArgument( "CollectFpnProposals does not support CUDA Graph capture: async D2H " diff --git a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu index a19d4c27cc7db7..e2984db1ae7fca 100644 --- a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu +++ b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu @@ -313,7 +313,7 @@ template __device__ __forceinline__ T WarpReduceSumDown(T val) { #pragma unroll for (int offset = warpSize / 2; offset > 0; offset >>= 1) { - val += phi::backends::gpu::CudaShuffleDownSync(0xFFFFFFFF, val, offset); + val += backends::gpu::CudaShuffleDownSync(0xFFFFFFFF, val, offset); } return val; } @@ -322,7 +322,7 @@ template __device__ __forceinline__ T WarpReduceMaxDown(T val) { #pragma unroll for (int offset = warpSize / 2; offset > 0; offset >>= 1) { - T other = phi::backends::gpu::CudaShuffleDownSync(0xFFFFFFFF, val, offset); + T other = backends::gpu::CudaShuffleDownSync(0xFFFFFFFF, val, offset); val = max(val, other); } return val; @@ -1035,10 +1035,10 @@ static void SoftmaxWithCrossEntropySoftLabel(const GPUContext& dev_ctx, : MIOPEN_SOFTMAX_MODE_CHANNEL; PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSoftmaxForward_V2( handle, - phi::backends::gpu::CudnnDataType::kOne(), + backends::gpu::CudnnDataType::kOne(), descp, logits_data, - phi::backends::gpu::CudnnDataType::kZero(), + backends::gpu::CudnnDataType::kZero(), descp, softmax_data, MIOPEN_SOFTMAX_LOG, @@ -1781,10 +1781,10 @@ static void SoftmaxWithCrossEntropyHardLabel(const GPUContext& dev_ctx, : MIOPEN_SOFTMAX_MODE_CHANNEL; PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSoftmaxForward_V2( handle, - phi::backends::gpu::CudnnDataType::kOne(), + backends::gpu::CudnnDataType::kOne(), descp, logits_data, - phi::backends::gpu::CudnnDataType::kZero(), + backends::gpu::CudnnDataType::kZero(), descp, softmax_data, MIOPEN_SOFTMAX_LOG, diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h index 9a280de8315a06..cbb17c02179b14 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv.h +++ b/paddle/phi/kernels/gpu/depthwise_conv.h @@ -177,7 +177,7 @@ class DepthwiseConvFilterGradFunctor { 
template __forceinline__ __device__ T WarpReduceSum(T val, unsigned lane_mask) { for (int mask = HALF_WARP; mask > 0; mask >>= 1) - val += phi::backends::gpu::CudaShuffleDownSync(lane_mask, val, mask); + val += backends::gpu::CudaShuffleDownSync(lane_mask, val, mask); return val; } diff --git a/paddle/phi/kernels/gpu/distribute_fpn_proposals_kernel.cu b/paddle/phi/kernels/gpu/distribute_fpn_proposals_kernel.cu index 3c6d6a74e299dc..2ab6c3abe383ed 100644 --- a/paddle/phi/kernels/gpu/distribute_fpn_proposals_kernel.cu +++ b/paddle/phi/kernels/gpu/distribute_fpn_proposals_kernel.cu @@ -228,7 +228,7 @@ void DistributeFpnProposalsKernel( size_t start = 0; PADDLE_ENFORCE_EQ( - phi::backends::gpu::IsCUDAGraphCapturing(), + backends::gpu::IsCUDAGraphCapturing(), false, common::errors::InvalidArgument( "DistributeFpnProposals does not support CUDA Graph capture: async " diff --git a/paddle/phi/kernels/gpu/edit_distance_kernel.cu b/paddle/phi/kernels/gpu/edit_distance_kernel.cu index af3c25ef1251ed..94ac3aaa626822 100644 --- a/paddle/phi/kernels/gpu/edit_distance_kernel.cu +++ b/paddle/phi/kernels/gpu/edit_distance_kernel.cu @@ -143,7 +143,7 @@ void EditDistanceKernel(const Context& dev_ctx, distance = distance / n; } const T* stable_dist = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(&distance, 1); + backends::gpu::RestoreHostMemIfCapturingCUDAGraph(&distance, 1); memory_utils::Copy(dev_ctx.GetPlace(), out_data + num, CPUPlace(), diff --git a/paddle/phi/kernels/gpu/eig_grad_kernel.cu b/paddle/phi/kernels/gpu/eig_grad_kernel.cu index 89ee3f6a347ff3..2c0ac79014a49e 100644 --- a/paddle/phi/kernels/gpu/eig_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/eig_grad_kernel.cu @@ -213,7 +213,7 @@ void SolveLinearSystemGPU>( std::vector h_info(batch_count, 0); PADDLE_ENFORCE_EQ( - phi::backends::gpu::IsCUDAGraphCapturing(), + backends::gpu::IsCUDAGraphCapturing(), false, common::errors::InvalidArgument( "EigGradKernel does not support CUDA Graph capture: async D2H copy " @@ -402,7 +402,7 @@ void SolveLinearSystemGPU>( std::vector h_info(batch_count, 0); PADDLE_ENFORCE_EQ( - phi::backends::gpu::IsCUDAGraphCapturing(), + backends::gpu::IsCUDAGraphCapturing(), false, common::errors::InvalidArgument( "EigGradKernel does not support CUDA Graph capture: async D2H copy " diff --git a/paddle/phi/kernels/gpu/fill_diagonal_tensor_grad_kernel.cu b/paddle/phi/kernels/gpu/fill_diagonal_tensor_grad_kernel.cu index f266a2bbb5596c..b59fe1c70bd30d 100644 --- a/paddle/phi/kernels/gpu/fill_diagonal_tensor_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/fill_diagonal_tensor_grad_kernel.cu @@ -77,7 +77,7 @@ void FillDiagonalTensorGradKernel(const Context &dev_ctx, tensor_tmp.Resize({2 + matrows}); int64_t *memory_block_cu = dev_ctx.template Alloc(&tensor_tmp); const auto gpu_place = dev_ctx.GetPlace(); - auto *stable_mb = phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + auto *stable_mb = backends::gpu::RestoreHostMemIfCapturingCUDAGraph( memory_block.data(), memory_block.size()); memory_utils::Copy(gpu_place, memory_block_cu, diff --git a/paddle/phi/kernels/gpu/fill_diagonal_tensor_kernel.cu b/paddle/phi/kernels/gpu/fill_diagonal_tensor_kernel.cu index 82d65405e9308a..716177eca199d4 100644 --- a/paddle/phi/kernels/gpu/fill_diagonal_tensor_kernel.cu +++ b/paddle/phi/kernels/gpu/fill_diagonal_tensor_kernel.cu @@ -93,7 +93,7 @@ void FillDiagonalTensorKernel(const Context &dev_ctx, tensor_tmp.Resize({2 + fill_dims[0]}); int64_t *memory_block_cu = dev_ctx.template Alloc(&tensor_tmp); const auto gpu_place = 
dev_ctx.GetPlace(); - auto *stable_mb = phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + auto *stable_mb = backends::gpu::RestoreHostMemIfCapturingCUDAGraph( memory_block.data(), memory_block.size()); memory_utils::Copy(gpu_place, memory_block_cu, diff --git a/paddle/phi/kernels/gpu/generate_proposals_kernel.cu b/paddle/phi/kernels/gpu/generate_proposals_kernel.cu index bed31526f6ccb7..c3713f8aa66cfa 100644 --- a/paddle/phi/kernels/gpu/generate_proposals_kernel.cu +++ b/paddle/phi/kernels/gpu/generate_proposals_kernel.cu @@ -306,7 +306,7 @@ static void NMS(const GPUContext &dev_ctx, memset(&remv[0], 0, sizeof(uint64_t) * col_blocks); PADDLE_ENFORCE_EQ( - phi::backends::gpu::IsCUDAGraphCapturing(), + backends::gpu::IsCUDAGraphCapturing(), false, common::errors::InvalidArgument( "GenerateProposals does not support CUDA Graph capture: async D2H " @@ -338,9 +338,8 @@ static void NMS(const GPUContext &dev_ctx, } keep_out->Resize({num_to_keep}); int *keep = dev_ctx.template Alloc(keep_out); - const int *stable_keep = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( - const_cast(keep_vec.data()), keep_vec.size()); + const int *stable_keep = backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + const_cast(keep_vec.data()), keep_vec.size()); memory_utils::Copy(place, keep, CPUPlace(), @@ -582,9 +581,8 @@ void GenerateProposalsKernel(const Context &dev_ctx, rpn_rois_num->Resize({num}); dev_ctx.template Alloc(rpn_rois_num); int *num_data = rpn_rois_num->data(); - const int *stable_num = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( - const_cast(tmp_num.data()), num); + const int *stable_num = backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + const_cast(tmp_num.data()), num); memory_utils::Copy(place, num_data, cpu_place, diff --git a/paddle/phi/kernels/gpu/index_fill_kernel.cu b/paddle/phi/kernels/gpu/index_fill_kernel.cu index e525c8af330e38..6c5e2053614623 100644 --- a/paddle/phi/kernels/gpu/index_fill_kernel.cu +++ b/paddle/phi/kernels/gpu/index_fill_kernel.cu @@ -99,7 +99,7 @@ void LaunchIndexFillCudaKernelImpl(const Context& dev_ctx, T fill_value, T* out_data) { IndexT numel = outer_size * index_size * inner_size; - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel); IndexFillCudaKernel <<>>( x_data, diff --git a/paddle/phi/kernels/gpu/masked_scatter_grad_kernel.cu b/paddle/phi/kernels/gpu/masked_scatter_grad_kernel.cu index ab855097866552..e5c1785834e079 100644 --- a/paddle/phi/kernels/gpu/masked_scatter_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/masked_scatter_grad_kernel.cu @@ -109,7 +109,7 @@ void MaskedScatterGradKernel(const Context& dev_ctx, // Compute x_grad if (x_grad) { auto x_grad_dims = x_grad->dims(); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, total); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, total); if (x_grad_dims == out_grad_dims) { // No broadcast happened, compute directly into x_grad. 
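
Reviewer note, not part of the patch: every hunk in this series applies the same mechanical rewrite, dropping the leading "phi::" from names such as phi::backends::gpu::GetGpuLaunchConfig1D in code that already lives inside namespace phi. The short, self-contained C++ sketch below illustrates why the shorter qualification resolves to the same symbol; the GetGpuLaunchConfig1D stand-in here is a dummy with an invented signature, used purely for illustration and unrelated to the real helper.

// The leading component of a qualified name is looked up outward through the
// enclosing scopes, so inside namespace phi (or phi::fusion) the name
// backends::gpu::Foo finds phi::backends::gpu::Foo.
#include <cstdio>

namespace phi {
namespace backends {
namespace gpu {
// Stand-in for the real helper; this signature is invented for the sketch only.
inline int GetGpuLaunchConfig1D(int n) { return n; }
}  // namespace gpu
}  // namespace backends

namespace fusion {
inline int Demo(int n) {
  int a = phi::backends::gpu::GetGpuLaunchConfig1D(n);  // spelling removed by the patch
  int b = backends::gpu::GetGpuLaunchConfig1D(n);       // spelling introduced by the patch
  return a + b;  // both calls bind to the same function
}
}  // namespace fusion
}  // namespace phi

int main() { std::printf("%d\n", phi::fusion::Demo(21)); }

Because lookup of the first name component proceeds through the enclosing namespaces, the rewrite is behavior-preserving: it only shortens the spelling.
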
@@ -180,7 +180,7 @@ void MaskedScatterGradKernel(const Context& dev_ctx, } } - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, total); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, total); MaskedScatterGradValueKernel <<>>( out_grad.data(), diff --git a/paddle/phi/kernels/gpu/masked_scatter_kernel.cu b/paddle/phi/kernels/gpu/masked_scatter_kernel.cu index 1fd78187d6d361..23b25213a0ac21 100644 --- a/paddle/phi/kernels/gpu/masked_scatter_kernel.cu +++ b/paddle/phi/kernels/gpu/masked_scatter_kernel.cu @@ -155,7 +155,7 @@ void MaskedScatterKernel(const Context& dev_ctx, &prefix_sum_data[total - 1], &mask_bool_data[total - 1], value.numel()); // Launch masked scatter kernel - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, total); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, total); MaskedScatterCUDAKernel <<>>( x_expand.data(), diff --git a/paddle/phi/kernels/gpu/moe_permute_kernel.cu b/paddle/phi/kernels/gpu/moe_permute_kernel.cu index 9c0d0dacb56fcc..dff267774ec22e 100644 --- a/paddle/phi/kernels/gpu/moe_permute_kernel.cu +++ b/paddle/phi/kernels/gpu/moe_permute_kernel.cu @@ -583,9 +583,8 @@ void dispatch_preprocess(const Context &dev_ctx, padding_tokens_tensor.Resize({static_cast(padding_rows.size())}); dev_ctx.template Alloc(&padding_tokens_tensor); - auto *stable_padding_rows = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( - const_cast(padding_rows.data()), padding_rows.size()); + auto *stable_padding_rows = backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + const_cast(padding_rows.data()), padding_rows.size()); PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(padding_tokens_tensor.data(), stable_padding_rows, sizeof(int) * padding_rows.size(), @@ -800,16 +799,16 @@ void MoePermuteKernel(const Context &dev_ctx, } } auto *stable_expert_offset = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(expert_offset, - kMaxNumExperts); + backends::gpu::RestoreHostMemIfCapturingCUDAGraph(expert_offset, + kMaxNumExperts); PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(expert_offset_tensor.data(), stable_expert_offset, sizeof(int) * kMaxNumExperts, cudaMemcpyHostToDevice, dev_ctx.stream())); auto *stable_expert_offset_end = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( - expert_offset_end, kMaxNumExperts); + backends::gpu::RestoreHostMemIfCapturingCUDAGraph(expert_offset_end, + kMaxNumExperts); PADDLE_ENFORCE_GPU_SUCCESS( cudaMemcpyAsync(expert_offset_end_tensor.data(), stable_expert_offset_end, diff --git a/paddle/phi/kernels/gpu/nms_kernel.cu b/paddle/phi/kernels/gpu/nms_kernel.cu index 2a09c0f3658c42..f68448980dfb07 100644 --- a/paddle/phi/kernels/gpu/nms_kernel.cu +++ b/paddle/phi/kernels/gpu/nms_kernel.cu @@ -83,7 +83,7 @@ void NMSKernel(const Context& dev_ctx, NMS<<>>( boxes.data(), threshold, num_boxes, mask_dev); PADDLE_ENFORCE_EQ( - phi::backends::gpu::IsCUDAGraphCapturing(), + backends::gpu::IsCUDAGraphCapturing(), false, common::errors::InvalidArgument( "NMSKernel does not support CUDA Graph capture: async D2H copy to " @@ -115,8 +115,8 @@ void NMSKernel(const Context& dev_ctx, output->Resize({last_box_num}); auto* output_data = dev_ctx.template Alloc(output); const int64_t* stable_output = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(output_host, - last_box_num); + backends::gpu::RestoreHostMemIfCapturingCUDAGraph(output_host, + last_box_num); memory_utils::Copy(dev_ctx.GetPlace(), output_data, CPUPlace(), diff --git a/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu 
b/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu index e9f04b0f812dac..7b2170f3152df6 100644 --- a/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu @@ -440,7 +440,7 @@ void PNormGradKernel(const Context& dev_ctx, GetPreAxisPost(xdim, axis, reduce_all, &pre, &axis_size, &post); int64_t total = in_x->numel(); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, total); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, total); PNormGradP1Kernel<<numel(); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, total); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, total); PNormGradP2Kernel<<numel(); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, total); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, total); PNormGradPLessThan1Kernel<<numel(); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, total); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, total); PNormGradPBetween1And2Kernel<<numel(); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, total); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, total); PNormGradPGreaterThan2Kernel<<(dev_ctx.stream()))); size_t nbytes_out = out_data.size() * sizeof(T *); - const void *stable_out = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( - reinterpret_cast(const_cast(out_data.data())), - nbytes_out); + const void *stable_out = backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + reinterpret_cast(const_cast(out_data.data())), + nbytes_out); phi::memory_utils::Copy(dev_ctx.GetPlace(), tmp_out_array->ptr(), CPUPlace(), diff --git a/paddle/phi/kernels/gpu/partial_concat_kernel.cu b/paddle/phi/kernels/gpu/partial_concat_kernel.cu index a7ee5838b8d82f..8bef20f4422556 100644 --- a/paddle/phi/kernels/gpu/partial_concat_kernel.cu +++ b/paddle/phi/kernels/gpu/partial_concat_kernel.cu @@ -109,10 +109,8 @@ void PartialConcatOpCUDAKernel(const Context &dev_ctx, in_data.size() * sizeof(T *), phi::Stream(reinterpret_cast(dev_ctx.stream()))); size_t nbytes_in = in_data.size() * sizeof(T *); - const void *stable_in = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( - reinterpret_cast(const_cast(in_data.data())), - nbytes_in); + const void *stable_in = backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + reinterpret_cast(const_cast(in_data.data())), nbytes_in); phi::memory_utils::Copy(dev_ctx.GetPlace(), tmp_in_array->ptr(), CPUPlace(), diff --git a/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu b/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu index ad0e9c2bcc6d69..744a4713074b1f 100644 --- a/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu @@ -254,7 +254,7 @@ void RoiAlignGradKernel(const Context& dev_ctx, int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); int64_t bytes = box_batch_id_list.numel() * sizeof(int); const int* stable_box_batch_size = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + backends::gpu::RestoreHostMemIfCapturingCUDAGraph( box_batch_size, static_cast(bytes / sizeof(int))); memory_utils::Copy(gplace, roi_id_data, diff --git a/paddle/phi/kernels/gpu/roi_align_kernel.cu b/paddle/phi/kernels/gpu/roi_align_kernel.cu index a6f2bb15759ad9..d34ab1bf471439 100644 --- a/paddle/phi/kernels/gpu/roi_align_kernel.cu +++ b/paddle/phi/kernels/gpu/roi_align_kernel.cu @@ -268,7 +268,7 @@ void RoiAlignKernel(const Context& dev_ctx, Stream(reinterpret_cast(dev_ctx.stream()))); int* roi_id_data = 
reinterpret_cast(roi_ptr->ptr()); const int* stable_roi_batch_id = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + backends::gpu::RestoreHostMemIfCapturingCUDAGraph( roi_batch_id_data, static_cast(bytes / sizeof(int))); memory_utils::Copy(gplace, roi_id_data, diff --git a/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu b/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu index f20398b971f06a..180ca66227e039 100644 --- a/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu @@ -139,7 +139,7 @@ void RoiPoolGradKernel(const Context& dev_ctx, Stream(reinterpret_cast(dev_ctx.stream()))); int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); const int* stable_box_batch_id = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + backends::gpu::RestoreHostMemIfCapturingCUDAGraph( box_batch_id_data, static_cast(bytes / sizeof(int))); memory_utils::Copy(gplace, roi_id_data, diff --git a/paddle/phi/kernels/gpu/roi_pool_kernel.cu b/paddle/phi/kernels/gpu/roi_pool_kernel.cu index e3635cf6fe1877..a347a25247f093 100644 --- a/paddle/phi/kernels/gpu/roi_pool_kernel.cu +++ b/paddle/phi/kernels/gpu/roi_pool_kernel.cu @@ -202,7 +202,7 @@ void RoiPoolKernel(const Context& dev_ctx, Stream(reinterpret_cast(dev_ctx.stream()))); int* box_id_data = reinterpret_cast(box_ptr->ptr()); const int* stable_box_batch_id = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + backends::gpu::RestoreHostMemIfCapturingCUDAGraph( box_batch_id_data, static_cast(bytes / sizeof(int))); memory_utils::Copy(gplace, box_id_data, diff --git a/paddle/phi/kernels/gpu/seed_kernel.cu b/paddle/phi/kernels/gpu/seed_kernel.cu index d7b8afc95df46e..b5e7caaf8c1d89 100644 --- a/paddle/phi/kernels/gpu/seed_kernel.cu +++ b/paddle/phi/kernels/gpu/seed_kernel.cu @@ -44,7 +44,7 @@ void GPUSeedKernel(const Context &dev_ctx, auto *out_data = dev_ctx.template Alloc(out); auto stream = dev_ctx.stream(); const int *stable_seed = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(&seed, 1); + backends::gpu::RestoreHostMemIfCapturingCUDAGraph(&seed, 1); phi::memory_utils::Copy(dev_ctx.GetPlace(), out_data, CPUPlace(), diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu index d07f3db4ac43cc..eb08365e9bb306 100644 --- a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu @@ -140,7 +140,7 @@ void SigmoidCrossEntropyWithLogitsGradKernel( dev_ctx, counts_tensor, &norm_tensor, NonzeroFunctor(), reduce_dim); T *norm = dev_ctx.template Alloc(&norm_tensor); PADDLE_ENFORCE_EQ( - phi::backends::gpu::IsCUDAGraphCapturing(), + backends::gpu::IsCUDAGraphCapturing(), false, common::errors::InvalidArgument( "SigmoidCrossEntropyWithLogitsGrad does not support CUDA Graph " diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu index 2fa7b2a5f6a69e..34f24d356337a7 100644 --- a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu +++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu @@ -135,7 +135,7 @@ void SigmoidCrossEntropyWithLogitsKernel( dev_ctx, counts_tensor, &norm_tensor, NonzeroFunctor(), reduce_dim); T *norm = dev_ctx.template Alloc(&norm_tensor); PADDLE_ENFORCE_EQ( - phi::backends::gpu::IsCUDAGraphCapturing(), + backends::gpu::IsCUDAGraphCapturing(), false, 
common::errors::InvalidArgument( "SigmoidCrossEntropyWithLogits does not support CUDA Graph " diff --git a/paddle/phi/kernels/gpu/slogdeterminant_kernel.cu b/paddle/phi/kernels/gpu/slogdeterminant_kernel.cu index cd769520d60336..13c1b5624107a8 100644 --- a/paddle/phi/kernels/gpu/slogdeterminant_kernel.cu +++ b/paddle/phi/kernels/gpu/slogdeterminant_kernel.cu @@ -149,7 +149,7 @@ struct SlogDeterminantFunctor, Context> { phi::Stream(reinterpret_cast(dev_ctx.stream()))); size_t nbytes_ptrs_c1 = cpu_ptrs.size() * sizeof(phi::dtype::complex*); const void* stable_ptrs_c1 = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + backends::gpu::RestoreHostMemIfCapturingCUDAGraph( reinterpret_cast( const_cast**>(cpu_ptrs.data())), nbytes_ptrs_c1); @@ -339,7 +339,7 @@ struct SlogDeterminantV2Functor { phi::Stream(reinterpret_cast(dev_ctx.stream()))); size_t nbytes_ptrs_v2 = cpu_ptrs.size() * sizeof(T*); const void* stable_ptrs_v2 = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + backends::gpu::RestoreHostMemIfCapturingCUDAGraph( reinterpret_cast(const_cast(cpu_ptrs.data())), nbytes_ptrs_v2); memory_utils::Copy(dev_ctx.GetPlace(), @@ -492,7 +492,7 @@ struct SlogDeterminantV2Functor, Context> { phi::Stream(reinterpret_cast(dev_ctx.stream()))); size_t nbytes_ptrs_v2c = cpu_ptrs.size() * sizeof(phi::dtype::complex*); const void* stable_ptrs_v2c = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + backends::gpu::RestoreHostMemIfCapturingCUDAGraph( reinterpret_cast( const_cast**>(cpu_ptrs.data())), nbytes_ptrs_v2c); diff --git a/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu b/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu index 80378c9f9af1ab..6993c6731ae9aa 100644 --- a/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu +++ b/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu @@ -240,9 +240,9 @@ template __forceinline__ __device__ Pair WarpReduce(Pair input) { #pragma unroll for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1) { - T tmp_val = phi::backends::gpu::CudaShuffleDownSync( + T tmp_val = backends::gpu::CudaShuffleDownSync( FINAL_MASK, input.v, offset, WARP_SIZE); - int tmp_id = phi::backends::gpu::CudaShuffleDownSync( + int tmp_id = backends::gpu::CudaShuffleDownSync( FINAL_MASK, input.id, offset, WARP_SIZE); if (static_cast(input.v) < static_cast(tmp_val)) { input.v = tmp_val; diff --git a/paddle/phi/kernels/gpu/triangular_solve_kernel.cu b/paddle/phi/kernels/gpu/triangular_solve_kernel.cu index 3dbf561c09cf46..e09f51421ec3e7 100644 --- a/paddle/phi/kernels/gpu/triangular_solve_kernel.cu +++ b/paddle/phi/kernels/gpu/triangular_solve_kernel.cu @@ -114,7 +114,7 @@ void TriangularSolveKernel(const Context& dev_ctx, phi::Stream(reinterpret_cast(dev_ctx.stream()))); size_t nbytes_a_ptrs = cpu_a_ptrs.size() * sizeof(T*); const void* stable_a_ptrs = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + backends::gpu::RestoreHostMemIfCapturingCUDAGraph( reinterpret_cast(const_cast(cpu_a_ptrs.data())), nbytes_a_ptrs); memory_utils::Copy(dev_ctx.GetPlace(), @@ -143,7 +143,7 @@ void TriangularSolveKernel(const Context& dev_ctx, } size_t nbytes_b_ptrs = cpu_b_ptrs_for_chunk.size() * sizeof(T*); const void* stable_b_ptrs = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + backends::gpu::RestoreHostMemIfCapturingCUDAGraph( reinterpret_cast(cpu_b_ptrs_for_chunk.data()), nbytes_b_ptrs); memory_utils::Copy(dev_ctx.GetPlace(), @@ -180,7 +180,7 @@ void TriangularSolveKernel(const Context& dev_ctx, size_t nbytes_ptrs = cpu_ptrs.size() * sizeof(T*); const void* 
stable_ptrs = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + backends::gpu::RestoreHostMemIfCapturingCUDAGraph( reinterpret_cast(const_cast(cpu_ptrs.data())), nbytes_ptrs); memory_utils::Copy(dev_ctx.GetPlace(), diff --git a/paddle/phi/kernels/gpu/yolo_box_kernel.cu b/paddle/phi/kernels/gpu/yolo_box_kernel.cu index f05863f2aa9ea1..18e2083b846c71 100644 --- a/paddle/phi/kernels/gpu/yolo_box_kernel.cu +++ b/paddle/phi/kernels/gpu/yolo_box_kernel.cu @@ -142,9 +142,8 @@ void YoloBoxKernel(const Context& dev_ctx, int* anchors_data = dev_ctx.template Alloc(&tmp_anchors); const auto gplace = dev_ctx.GetPlace(); const auto cplace = CPUPlace(); - const int* stable_anchors = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( - const_cast(anchors.data()), anchors.size()); + const int* stable_anchors = backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + const_cast(anchors.data()), anchors.size()); memory_utils::Copy( gplace, anchors_data, cplace, stable_anchors, bytes, dev_ctx.stream()); diff --git a/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu b/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu index 94a2e1e6884f03..8634e64ffe3867 100644 --- a/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu @@ -53,14 +53,14 @@ void CoalesceCooGPUKernel(const GPUContext& dev_ctx, dev_ctx, DenseTensorMeta(x_indices.dtype(), {nnz}, x_indices.layout())); IntT* indices_ptr = indices.data(); - phi::backends::gpu::GpuMemcpyAsync(d_sparse_offsets.data(), - sparse_offsets.data(), - sizeof(IntT) * sparse_dim, - gpuMemcpyHostToDevice, - dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(d_sparse_offsets.data(), + sparse_offsets.data(), + sizeof(IntT) * sparse_dim, + gpuMemcpyHostToDevice, + dev_ctx.stream()); // 1. flatten indices - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, nnz, 1); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, nnz, 1); funcs::sparse::FlattenIndicesKernel<<()); IntT out_nnz = 0; - phi::backends::gpu::GpuMemcpyAsync(&out_nnz, - out_indices.data(), - sizeof(IntT), - gpuMemcpyDeviceToHost, - dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(&out_nnz, + out_indices.data(), + sizeof(IntT), + gpuMemcpyDeviceToHost, + dev_ctx.stream()); dev_ctx.Wait(); out_indices.Resize({x_indices.dims()[0], out_nnz}); @@ -128,8 +128,8 @@ void CoalesceCooGPUKernel(const GPUContext& dev_ctx, // 5. 
scatter the values const int VecSize = VecBytes / sizeof(T); if (stride % VecSize == 0) { - config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, nnz * stride / VecSize, 1); + config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, nnz * stride / VecSize, 1); funcs::sparse::ScatterKernel <<()); } else { - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, nnz * stride, 1); + config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, nnz * stride, 1); funcs::sparse::ScatterKernel << <<>>(inputs, indices, output, indices_size, channels); } else { - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + auto config = backends::gpu::GetGpuLaunchConfig1D( dev_ctx, indices_size * channels, 1); GatherKernel <<<<<<numel(), - dev_ctx.stream()); - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); + backends::gpu::GpuMemsetAsync(counter_ptr, + 0, + sizeof(int) * counter_per_kernel->numel(), + dev_ctx.stream()); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); const int rulebook_rows = 2; const int rulebook_cols = kernel_size * non_zero_num; @@ -516,7 +515,7 @@ int ProductRuleBook(const Context& dev_ctx, int index_flags_size = (table_size + 31) / 32; DenseTensor index_flags = Empty(dev_ctx, {index_flags_size}); int* index_flags_ptr = index_flags.data(); - phi::backends::gpu::GpuMemsetAsync( + backends::gpu::GpuMemsetAsync( index_flags_ptr, 0, sizeof(int) * index_flags.numel(), dev_ctx.stream()); if (subm) { @@ -528,8 +527,7 @@ int ProductRuleBook(const Context& dev_ctx, phi::Copy(dev_ctx, x.indices(), dev_ctx.GetPlace(), false, &out_indices); - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); GetOutIndexTable1<<(dev_ctx, {rulebook_rows, rulebook_len}); IntT* out_rulebook_ptr = out_rulebook.data(); - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); + config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); cache_size = kernel_size * 2 * sizeof(int); CopyRuleBook<<data(); int* unique_key_ptr = unique_key.data(); - phi::backends::gpu::GpuMemsetAsync( + backends::gpu::GpuMemsetAsync( unique_key_ptr, 0, sizeof(int), dev_ctx.stream()); - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); + config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); size_t cache_size = sizeof(int) * config.thread_per_block.x; UniqueKernel<<(); - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_nnz, 1); + config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_nnz, 1); GetOutIndexTable<<ResizeAndAllocate({static_cast(out_nnz * kernel_size)}); int* unique_value_ptr = unique_value->data(); diff --git a/paddle/phi/kernels/sparse/gpu/conv_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/conv_grad_kernel.cu index 26dfd82779d65e..ba53f46058e595 100644 --- a/paddle/phi/kernels/sparse/gpu/conv_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/conv_grad_kernel.cu @@ -79,7 +79,7 @@ void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx, if (!is_params_freezing) { *kernel_grad = EmptyLike(dev_ctx, kernel); d_kernel_ptr = kernel_grad->data(); - phi::backends::gpu::GpuMemsetAsync( + backends::gpu::GpuMemsetAsync( d_kernel_ptr, 0, sizeof(T) * kernel_grad->numel(), dev_ctx.stream()); } @@ -88,11 +88,11 @@ void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx, DenseTensor x_grad_indices = EmptyLike(dev_ctx, x.indices()); DenseTensor x_grad_values = 
EmptyLike(dev_ctx, x.values()); T* x_grad_values_ptr = x_grad_values.data(); - phi::backends::gpu::GpuMemsetAsync(x_grad_values_ptr, - 0, - sizeof(T) * x_grad_values.numel(), - dev_ctx.stream()); - phi::backends::gpu::GpuMemsetAsync( + backends::gpu::GpuMemsetAsync(x_grad_values_ptr, + 0, + sizeof(T) * x_grad_values.numel(), + dev_ctx.stream()); + backends::gpu::GpuMemsetAsync( d_x_features_ptr, 0, sizeof(T) * d_x_features.numel(), dev_ctx.stream()); phi::Copy( dev_ctx, x.indices(), dev_ctx.GetPlace(), false, &x_grad_indices); @@ -125,14 +125,13 @@ void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx, } } - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); DenseTensor unique_value = Empty(dev_ctx, {static_cast(x_grad->nnz() * kernel_size * 2)}); DenseTensor out_index = Empty(dev_ctx, {static_cast(x.nnz() * 2)}); int* out_index_ptr = out_index.data(); int* unique_value_ptr = unique_value.data(); - phi::backends::gpu::GpuMemsetAsync( + backends::gpu::GpuMemsetAsync( out_index_ptr, 0, sizeof(int) * x.nnz() * 2, dev_ctx.stream()); GroupIndicesV2<<(out->nnz() * kernel_size)}); out_index.ResizeAndAllocate({static_cast(rulebook_len)}); int* out_index_ptr = out_index.data(); int* unique_value_ptr = unique_value.data(); - phi::backends::gpu::GpuMemsetAsync( + backends::gpu::GpuMemsetAsync( out_index_ptr, 0, sizeof(int) * rulebook_len, dev_ctx.stream()); GroupIndices<<(), - out.data(), - sizeof(T) * n, - gpuMemcpyDeviceToDevice, - dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(input.data(), + out.data(), + sizeof(T) * n, + gpuMemcpyDeviceToDevice, + dev_ctx.stream()); } template @@ -381,8 +381,7 @@ int ProductRuleBookWithBuffer(const Context& dev_ctx, int* h_buffer) { DenseTensor d_buffer = Empty(dev_ctx, {2 * kernel_size + 3}); const bool is2D = out_dims.size() == 4 ? 
true : false; - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); ProductRuleBookKernel<<data(); int* unique_key_ptr = unique_key.data(); - phi::backends::gpu::GpuMemsetAsync( + backends::gpu::GpuMemsetAsync( unique_key_ptr, 0, sizeof(int), dev_ctx.stream()); - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, max_nnz, 1); + config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, max_nnz, 1); size_t cache_size = sizeof(int) * config.thread_per_block.x; int* index_flags_ptr = index_flags->data(); UniqueKernel<<(), - counter_ptr, - kernel_size * sizeof(int), - gpuMemcpyDeviceToDevice, - dev_ctx.stream()); - phi::backends::gpu::GpuMemcpyAsync(d_buffer.data() + kernel_size, - offsets_ptr, - kernel_size * sizeof(int), - gpuMemcpyDeviceToDevice, - dev_ctx.stream()); - phi::backends::gpu::GpuMemcpyAsync(d_buffer.data() + 2 * kernel_size + 1, - rulebook_len_tensor.data(), - sizeof(int), - gpuMemcpyDeviceToDevice, - dev_ctx.stream()); - phi::backends::gpu::GpuMemcpyAsync(d_buffer.data() + 2 * kernel_size + 2, - unique_key_ptr, - sizeof(int), - gpuMemcpyDeviceToDevice, - dev_ctx.stream()); - phi::backends::gpu::GpuMemcpyAsync(h_buffer, - d_buffer.data(), - (2 * kernel_size + 3) * sizeof(int), - gpuMemcpyDeviceToHost, - dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(d_buffer.data(), + counter_ptr, + kernel_size * sizeof(int), + gpuMemcpyDeviceToDevice, + dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(d_buffer.data() + kernel_size, + offsets_ptr, + kernel_size * sizeof(int), + gpuMemcpyDeviceToDevice, + dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(d_buffer.data() + 2 * kernel_size + 1, + rulebook_len_tensor.data(), + sizeof(int), + gpuMemcpyDeviceToDevice, + dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(d_buffer.data() + 2 * kernel_size + 2, + unique_key_ptr, + sizeof(int), + gpuMemcpyDeviceToDevice, + dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(h_buffer, + d_buffer.data(), + (2 * kernel_size + 3) * sizeof(int), + gpuMemcpyDeviceToHost, + dev_ctx.stream()); dev_ctx.Wait(); int rulebook_len = h_buffer[2 * kernel_size + 1] / 2; @@ -512,7 +511,7 @@ int ProductRuleBookWithBuffer(const Context& dev_ctx, IntT* out_indices_ptr = out_indices.data(); - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_nnz, 1); + config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_nnz, 1); GetOutIndexTable <<>>( out_index_ptr, @@ -522,7 +521,7 @@ int ProductRuleBookWithBuffer(const Context& dev_ctx, out_index_table_ptr, out_indices_ptr); - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); + config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); unique_value->ResizeAndAllocate({static_cast(out_nnz * kernel_size)}); int* unique_value_ptr = unique_value->data(); diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index a63a75dd670047..07fc90eb946982 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -73,15 +73,15 @@ inline IntT* SortedAndUniqueIndex(const Context& dev_ctx, phi::IndexKernel>( dev_ctx, unique_value, kps::IdentityFunctor()); - phi::backends::gpu::GpuMemcpyAsync(unique_key->data(), - rulebook_ptr, - sizeof(IntT) * len, + backends::gpu::GpuMemcpyAsync(unique_key->data(), + rulebook_ptr, + sizeof(IntT) * len, #ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToDevice, + hipMemcpyDeviceToDevice, #else - 
cudaMemcpyDeviceToDevice, + cudaMemcpyDeviceToDevice, #endif - dev_ctx.stream()); + dev_ctx.stream()); // compared with thrust::sort_by_key, thrust::merge_by_key may achieved higher // performance, but thrust::merge_by_key limited by data size #ifdef PADDLE_WITH_HIP @@ -325,8 +325,7 @@ int ProductRuleBook(const Context& dev_ctx, // 1. product rule book funcs::SetConstant set_zero; set_zero(dev_ctx, counter_per_kernel, 0); - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); ProductRuleBookKernel<<<<<1, 1, 0, dev_ctx.stream()>>>( rulebook_ptr, last, rulebook_ptr + 3 * kernel_size * non_zero_num - 1); IntT rulebook_len = 0; - phi::backends::gpu::GpuMemcpyAsync( + backends::gpu::GpuMemcpyAsync( &rulebook_len, rulebook_ptr + 3 * kernel_size * non_zero_num - 1, sizeof(IntT), @@ -396,7 +395,7 @@ int ProductRuleBook(const Context& dev_ctx, out_indices_ptr + rulebook_len, bound_ptr); - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); + config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); UpdateOutIndexAndCounterAfterLowerBound<< <<<1, 1, 0, dev_ctx.stream()>>>(rulebook_ptr, last, bound_ptr); - phi::backends::gpu::GpuMemcpyAsync(&rulebook_len, - bound_ptr, - sizeof(IntT), + backends::gpu::GpuMemcpyAsync(&rulebook_len, + bound_ptr, + sizeof(IntT), #ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, + hipMemcpyDeviceToHost, #else - cudaMemcpyDeviceToHost, + cudaMemcpyDeviceToHost, #endif - dev_ctx.stream()); + dev_ctx.stream()); dev_ctx.Wait(); rulebook_len /= 3; } @@ -444,25 +443,25 @@ int ProductRuleBook(const Context& dev_ctx, counter_ptr + kernel_size, offsets_ptr); - phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], - counter_ptr, - kernel_size * sizeof(int), + backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], + counter_ptr, + kernel_size * sizeof(int), #ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, + hipMemcpyDeviceToHost, #else - cudaMemcpyDeviceToHost, + cudaMemcpyDeviceToHost, #endif - dev_ctx.stream()); + dev_ctx.stream()); - phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], - offsets_ptr, - kernel_size * sizeof(int), + backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], + offsets_ptr, + kernel_size * sizeof(int), #ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, + hipMemcpyDeviceToHost, #else - cudaMemcpyDeviceToHost, + cudaMemcpyDeviceToHost, #endif - dev_ctx.stream()); + dev_ctx.stream()); rulebook->Resize({rulebook_rows, static_cast(rulebook_len)}); @@ -494,14 +493,14 @@ int ProductRuleBook(const Context& dev_ctx, rulebook_ptr + rulebook_rows * rulebook_cols - 1); IntT out_non_zero_num = 0; #ifdef PADDLE_WITH_HIP - phi::backends::gpu::GpuMemcpyAsync( + backends::gpu::GpuMemcpyAsync( &out_non_zero_num, rulebook_ptr + rulebook_rows * rulebook_cols - 1, sizeof(IntT), hipMemcpyDeviceToHost, dev_ctx.stream()); #else - phi::backends::gpu::GpuMemcpyAsync( + backends::gpu::GpuMemcpyAsync( &out_non_zero_num, rulebook_ptr + rulebook_rows * rulebook_cols - 1, sizeof(IntT), @@ -521,8 +520,7 @@ int ProductRuleBook(const Context& dev_ctx, IntT* out_indices_ptr = out_indices.data(); - config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_non_zero_num, 1); + config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_non_zero_num, 1); UpdateIndexKernel << h_sparse_offsets(sparse_dim); funcs::sparse::CalcOffsetsPerDim(dims, sparse_dim, h_sparse_offsets.data()); - phi::backends::gpu::GpuMemcpyAsync(sparse_offsets.data(), - &h_sparse_offsets[0], 
- sizeof(int64_t) * sparse_dim, - gpuMemcpyHostToDevice, - dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(sparse_offsets.data(), + &h_sparse_offsets[0], + sizeof(int64_t) * sparse_dim, + gpuMemcpyHostToDevice, + dev_ctx.stream()); phi::Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &out_indices); @@ -94,7 +94,7 @@ void MaskCooGPUKernel(const GPUContext& dev_ctx, const int cols = dims_2d[1]; auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num * cols, 1); + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num * cols, 1); MaskKernel <<>>( x_ptr, @@ -163,11 +163,11 @@ void MaskCsr2DGPUKernel(const GPUContext& dev_ctx, std::vector h_sparse_offsets(sparse_dim); funcs::sparse::CalcOffsetsPerDim(dims, sparse_dim, h_sparse_offsets.data()); - phi::backends::gpu::GpuMemcpyAsync(sparse_offsets.data(), - &h_sparse_offsets[0], - sizeof(int64_t) * sparse_dim, - gpuMemcpyHostToDevice, - dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(sparse_offsets.data(), + &h_sparse_offsets[0], + sizeof(int64_t) * sparse_dim, + gpuMemcpyHostToDevice, + dev_ctx.stream()); const auto& csr_crows = mask.crows(); const auto& csr_cols = mask.cols(); @@ -186,23 +186,23 @@ void MaskCsr2DGPUKernel(const GPUContext& dev_ctx, IntT* coo_cols_data = coo_rows_data + non_zero_num; IntT* offsets_ptr = nullptr; - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rows, 1); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rows, 1); config.block_per_grid.y = batches; ConvertCsrCrowsToCooRows <<>>( csr_crows_data, offsets_ptr, coo_rows_data, batch_ptr, rows); - phi::backends::gpu::GpuMemcpyAsync(coo_cols_data, - csr_cols_data, - sizeof(IntT) * non_zero_num, - gpuMemcpyDeviceToDevice, - dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(coo_cols_data, + csr_cols_data, + sizeof(IntT) * non_zero_num, + gpuMemcpyDeviceToDevice, + dev_ctx.stream()); const T* x_ptr = x.data(); const IntT* indices_ptr = coo_indices; T* out_values_ptr = out_values.data(); auto config_mask = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num * cols, 1); + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num * cols, 1); MaskKernel<< h_sparse_offsets(sparse_dim); funcs::sparse::CalcOffsetsPerDim(dims, sparse_dim, h_sparse_offsets.data()); - phi::backends::gpu::GpuMemcpyAsync(sparse_offsets.data(), - &h_sparse_offsets[0], - sizeof(int64_t) * sparse_dim, - gpuMemcpyHostToDevice, - dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(sparse_offsets.data(), + &h_sparse_offsets[0], + sizeof(int64_t) * sparse_dim, + gpuMemcpyHostToDevice, + dev_ctx.stream()); const auto& csr_crows = mask.crows(); const auto& csr_cols = mask.cols(); @@ -264,8 +264,7 @@ void MaskCsr3DGPUKernel(const GPUContext& dev_ctx, IntT* coo_cols_data = coo_rows_data + non_zero_num; IntT* offsets_ptr = offsets.data(); - auto config_batch = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, batches, 1); + auto config_batch = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, batches, 1); GetBatchSizes <<>>( csr_crows_data, rows, batches, offsets_ptr); @@ -279,23 +278,23 @@ void MaskCsr3DGPUKernel(const GPUContext& dev_ctx, offsets_ptr + batches, offsets_ptr); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rows, 1); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rows, 1); config.block_per_grid.y = batches; ConvertCsrCrowsToCooRows <<>>( csr_crows_data, offsets_ptr, coo_rows_data, batch_ptr, rows); - phi::backends::gpu::GpuMemcpyAsync(coo_cols_data, - csr_cols_data, - sizeof(IntT) * non_zero_num, 
- gpuMemcpyDeviceToDevice, - dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(coo_cols_data, + csr_cols_data, + sizeof(IntT) * non_zero_num, + gpuMemcpyDeviceToDevice, + dev_ctx.stream()); const T* x_ptr = x.data(); const IntT* indices_ptr = coo_indices; T* out_values_ptr = out_values.data(); auto config_mask = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num * cols, 1); + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num * cols, 1); MaskKernel<<(), - sparse_offsets.data(), - sizeof(IntT) * sparse_dim, - gpuMemcpyHostToDevice, - dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(d_sparse_offsets.data(), + sparse_offsets.data(), + sizeof(IntT) * sparse_dim, + gpuMemcpyHostToDevice, + dev_ctx.stream()); // 3. flatten x indices and mask indices auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_indices.numel(), 1); + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_indices.numel(), 1); funcs::sparse::FlattenIndicesKernel<<(dev_ctx, {table_size}); DenseTensor index_flags = Empty(dev_ctx, {(table_size + 31) / 32}); - phi::backends::gpu::GpuMemsetAsync(index_flags.data(), - 0, - index_flags.numel() * sizeof(int), - dev_ctx.stream()); + backends::gpu::GpuMemsetAsync(index_flags.data(), + 0, + index_flags.numel() * sizeof(int), + dev_ctx.stream()); const int64_t stride = x.dims().size() == sparse_dim ? 1 : x.values().dims()[1]; *out = EmptyLike(dev_ctx, x.values()); funcs::SetConstant set_zero; set_zero(dev_ctx, out, static_cast(0)); T* out_ptr = out->data(); - config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_indices.numel(), 1); + config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_indices.numel(), 1); MaskTable<<(), table.data()); - config = phi::backends::gpu::GetGpuLaunchConfig1D( + config = backends::gpu::GetGpuLaunchConfig1D( dev_ctx, mask_meta_indices.numel(), 1); const int VecBytes = 16; diff --git a/paddle/phi/kernels/sparse/gpu/mv_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/mv_grad_kernel.cu index 210ae220abd5a7..6733cfe8131280 100644 --- a/paddle/phi/kernels/sparse/gpu/mv_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/mv_grad_kernel.cu @@ -69,7 +69,7 @@ void MvCooGradKernel(const Context &dev_ctx, if (dx) { // InferMeta of SparseCooTensor 'dx', CreateLikeInferMeta EmptyLikeCooKernel(dev_ctx, x, dx); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, dx->nnz()); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, dx->nnz()); PD_VISIT_BASE_INTEGRAL_TYPES( dx->indices().dtype(), "MvCooGradKernel", ([&] { MvCooGradGpuKernel @@ -111,8 +111,8 @@ void MvCsrGradKernel(const Context &dev_ctx, int64_t row_number = dx->dims()[0]; int64_t col_number = dx->dims()[1]; - auto config = phi::backends::gpu::GetGpuLaunchConfig2D( - dev_ctx, col_number, row_number); + auto config = + backends::gpu::GetGpuLaunchConfig2D(dev_ctx, col_number, row_number); PD_VISIT_BASE_INTEGRAL_TYPES(dx->crows().dtype(), "MvCsrGradKernel", ([&] { MvCsrGradGpuKernel << <<<<SetMember(dx_indices, dx_values, x.dims(), x.coalesced()); auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_grad_nnz + 1, 1); + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_grad_nnz + 1, 1); GetCooInputGradCudaKernel<<<<<<= ends[0]) { - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n_rows + 1, 1); + config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n_rows + 1, 1); GetCsrInputCrowsPart1CudaKernel<<(dev_ctx, {1}); int* d_out_nnz_ptr = d_out_nnz.data(); - phi::backends::gpu::GpuMemsetAsync( + backends::gpu::GpuMemsetAsync( 
d_out_nnz_ptr, 0, sizeof(int32_t), dev_ctx.stream()); // out_nnz_indices is the indices where the data is valid in out // the length of the out_nnz_indices must be less than x.nnz() DenseTensor d_out_nnz_indices = Empty(dev_ctx, {x.nnz()}); auto* d_out_nnz_indices_ptr = d_out_nnz_indices.data(); - phi::backends::gpu::GpuMemsetAsync( + backends::gpu::GpuMemsetAsync( d_out_nnz_indices_ptr, 0, sizeof(IntT), dev_ctx.stream()); // copy axes to device @@ -151,8 +151,7 @@ void SliceCooGPUCompute(const Context& dev_ctx, const auto* x_indices_data = x.indices().data(); - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x.nnz() + 1, 1); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x.nnz() + 1, 1); GetCooNonZeroNumberCudaKernel <<(); const auto* x_values_data = x.values().data(); - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_nnz + 1, 1); + config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_nnz + 1, 1); GetCooOutCudaKernel <<(dev_ctx, {out_n_rows + 1}); auto* out_crows_data = out_crows.data(); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, ends[0] - starts[0] + 1, 1); + auto config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, ends[0] - starts[0] + 1, 1); GetCsr2DNonZeroNumberCudaKernel<<(dev_ctx, {out_nnz}); DenseTensor out_values = Empty(dev_ctx, {out_nnz}); out->SetMember(out_crows, out_cols, out_values, out_dims); - config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, ends[0] - starts[0] + 1, 1); + config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, ends[0] - starts[0] + 1, 1); GetCsr2DCudaKernel<<(dev_ctx, {x_dim0 + 1}); auto* x_cols_offsets_data = x_cols_offsets.data(); - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_dim0 + 1, 1); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_dim0 + 1, 1); GetXColsOffsetsCudaKernel<<(dev_ctx, {out_dim0 * (out_n_rows + 1)}); auto* out_crows_data = out_crows.data(); - config = phi::backends::gpu::GetGpuLaunchConfig1D( + config = backends::gpu::GetGpuLaunchConfig1D( dev_ctx, x_dim0 * (x_n_rows + 1) + 1, 1); GetCsr3DNonZeroNumberCudaKernel<<(dev_ctx, {out_dim0 * (out_n_rows + 1)}); auto* out_cols_offsets_data = out_cols_offsets.data(); - phi::backends::gpu::GpuMemcpyAsync( - out_cols_offsets_data, - out_crows_data, - out_dim0 * (out_n_rows + 1) * sizeof(int64_t), - gpuMemcpyDeviceToDevice, - dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(out_cols_offsets_data, + out_crows_data, + out_dim0 * (out_n_rows + 1) * sizeof(int64_t), + gpuMemcpyDeviceToDevice, + dev_ctx.stream()); dev_ctx.Wait(); int64_t out_nnz = #ifdef PADDLE_WITH_HIP @@ -565,7 +562,7 @@ void SliceCsrTensor3D(const Context& dev_ctx, DenseTensor out_values = Empty(dev_ctx, {out_nnz}); auto* out_values_data = out_values.data(); out->SetMember(out_crows, out_cols, out_values, out_dims); - config = phi::backends::gpu::GetGpuLaunchConfig1D( + config = backends::gpu::GetGpuLaunchConfig1D( dev_ctx, x_dim0 * (x_n_rows + 1) + 1, 1); GetCsr3DCudaKernel<<(), \ - vector.data(), \ - vector.size() * sizeof(T), \ - gpuMemcpyHostToDevice, \ - dev_ctx.stream()); \ + backends::gpu::GpuMemcpyAsync(tensor.data(), \ + vector.data(), \ + vector.size() * sizeof(T), \ + gpuMemcpyHostToDevice, \ + dev_ctx.stream()); \ } namespace phi { @@ -220,9 +220,8 @@ void DenseToCooKernel(const Context& dev_ctx, // 1. 
get numbers of non zero elements, and get the index of non zero elements int* nums_ptr = nums.data(); - phi::backends::gpu::GpuMemsetAsync( - nums_ptr, 0, sizeof(int), dev_ctx.stream()); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rows, 1); + backends::gpu::GpuMemsetAsync(nums_ptr, 0, sizeof(int), dev_ctx.stream()); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rows, 1); DenseTensor temp_indices = Empty(dev_ctx, {rows}); int* temp_indices_ptr = temp_indices.data(); @@ -244,16 +243,16 @@ void DenseToCooKernel(const Context& dev_ctx, // 2. copy non_zero_num to host, copy x_dims to device int non_zero_num = 0; - phi::backends::gpu::GpuMemcpyAsync(&non_zero_num, - nums_ptr, - sizeof(int), - gpuMemcpyDeviceToHost, - dev_ctx.stream()); - phi::backends::gpu::GpuMemcpyAsync(d_x_dims.data(), - x_dims.Get(), - x_dims.size() * sizeof(x_dims[0]), - gpuMemcpyHostToDevice, - dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(&non_zero_num, + nums_ptr, + sizeof(int), + gpuMemcpyDeviceToHost, + dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(d_x_dims.data(), + x_dims.Get(), + x_dims.size() * sizeof(x_dims[0]), + gpuMemcpyHostToDevice, + dev_ctx.stream()); dev_ctx.Wait(); // wait the copy @@ -268,7 +267,7 @@ void DenseToCooKernel(const Context& dev_ctx, // 3. calc indices by indices and get values by indices if (non_zero_num > 0) { - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); + config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); GetNonZeroElementsAndIndices<<<<>>( csr_crows_data, rows, batches, offsets_ptr); @@ -403,26 +402,26 @@ void CsrToCooGPUKernel(const GPUContext& dev_ctx, rocsparse_index_base_zero); }); #else - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rows, 1); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rows, 1); config.block_per_grid.y = batches; ConvertCsrCrowsToCooRows <<>>( csr_crows_data, offsets_ptr, coo_rows_data, batch_ptr, rows); #endif - phi::backends::gpu::GpuMemcpyAsync(coo_cols_data, - csr_cols_data, + backends::gpu::GpuMemcpyAsync(coo_cols_data, + csr_cols_data, #ifdef PADDLE_WITH_HIP - sizeof(int) * non_zero_num, + sizeof(int) * non_zero_num, #else - sizeof(IntT) * non_zero_num, + sizeof(IntT) * non_zero_num, #endif - gpuMemcpyDeviceToDevice, - dev_ctx.stream()); - phi::backends::gpu::GpuMemcpyAsync(coo_values_data, - csr_values_data, - sizeof(T) * non_zero_num, - gpuMemcpyDeviceToDevice, - dev_ctx.stream()); + gpuMemcpyDeviceToDevice, + dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(coo_values_data, + csr_values_data, + sizeof(T) * non_zero_num, + gpuMemcpyDeviceToDevice, + dev_ctx.stream()); #ifdef PADDLE_WITH_HIP if (std::is_same::value) @@ -534,10 +533,9 @@ void CooToCsrGPUKernel(const GPUContext& dev_ctx, const IntT* coo_cols_data = coo_rows_data + non_zero_num; const T* coo_values_data = coo_values.data(); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, batches, 1); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, batches, 1); if (batches > 1) { - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); DenseTensor batches_offset = Empty(dev_ctx, {batches}); int* batches_offset_ptr = batches_offset.data(); funcs::SetConstant set_zero; @@ -563,16 +561,16 @@ void CooToCsrGPUKernel(const GPUContext& dev_ctx, nullptr, coo_rows_data, csr_crows_data, rows, non_zero_num); } - 
phi::backends::gpu::GpuMemcpyAsync(csr_cols_data, - coo_cols_data, - sizeof(IntT) * non_zero_num, - gpuMemcpyDeviceToDevice, - dev_ctx.stream()); - phi::backends::gpu::GpuMemcpyAsync(csr_values_data, - coo_values_data, - sizeof(T) * non_zero_num, - gpuMemcpyDeviceToDevice, - dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(csr_cols_data, + coo_cols_data, + sizeof(IntT) * non_zero_num, + gpuMemcpyDeviceToDevice, + dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(csr_values_data, + coo_values_data, + sizeof(T) * non_zero_num, + gpuMemcpyDeviceToDevice, + dev_ctx.stream()); out->SetMember(crows, cols, values, x_dims); } @@ -625,7 +623,7 @@ void CooToDenseGPUKernel(const GPUContext& dev_ctx, dev_ctx.template Alloc(out); T* out_data = out->data(); - phi::backends::gpu::GpuMemsetAsync( + backends::gpu::GpuMemsetAsync( out_data, 0, sizeof(T) * out->numel(), dev_ctx.stream()); if (x.nnz() <= 0) { @@ -648,8 +646,7 @@ void CooToDenseGPUKernel(const GPUContext& dev_ctx, BUILD_CUDA_TENSOR(int64_t, sparse_offsets, d_sparse_offsets); - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); KernelCooToDense << <<nnz(), 1); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, dx->nnz(), 1); SetValueCudaKernel <<<<>>( x_crows_data, dout_values_data, x.dims()[0], dx_values_data); } else { - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + auto config = backends::gpu::GetGpuLaunchConfig1D( dev_ctx, x.dims()[0] * (x.dims()[1] + 1), 1); SumCsr3DGradCudaKernel<<(); auto* out_values_data = out_values.data(); - auto config = - phi::backends::gpu::GetGpuLaunchConfig2D(dev_ctx, x.nnz(), x.nnz()); + auto config = backends::gpu::GetGpuLaunchConfig2D(dev_ctx, x.nnz(), x.nnz()); SumCooCudaKernel<<(); auto* out_cols_data = out_cols.data(); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, 2, 1); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, 2, 1); SumAllCsrCudaKernel<<(); auto* out_values_data = out_values.data(); out_dims = make_ddim({x_dim0, 1}); - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_dim0 + 1, 1); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_dim0 + 1, 1); SumCsr2DCudaKernel<<(); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + auto config = backends::gpu::GetGpuLaunchConfig1D( dev_ctx, x.dims()[0] * (x.dims()[1] + 1), 1); SumCsr3DCudaKernel<<<<