diff --git a/paddle/phi/kernels/fusion/gpu/attn_gemm_int8.h b/paddle/phi/kernels/fusion/gpu/attn_gemm_int8.h
index 382d69fc450d6a..9bce833790f51c 100644
--- a/paddle/phi/kernels/fusion/gpu/attn_gemm_int8.h
+++ b/paddle/phi/kernels/fusion/gpu/attn_gemm_int8.h
@@ -26,7 +26,7 @@ namespace phi {
 namespace fusion {
-using phi::backends::gpu::GpuLaunchConfig;
+using backends::gpu::GpuLaunchConfig;
 template
 class AttnMatmulINT8 {
@@ -37,8 +37,8 @@ class AttnMatmulINT8 {
     auto helper = std::make_shared(
         m, k, n, dev_ctx.cublaslt_handle());
     helpers_.emplace_back(helper);
-    gpu_config_ = std::make_unique(
-        phi::backends::gpu::GetGpuLaunchConfig1D(
+    gpu_config_ =
+        std::make_unique(backends::gpu::GetGpuLaunchConfig1D(
             dev_ctx, m * n, DequantKernelVecSize));
   }
   ~AttnMatmulINT8() {}
diff --git a/paddle/phi/kernels/fusion/gpu/block_attn.h b/paddle/phi/kernels/fusion/gpu/block_attn.h
index d81450b62e6086..8ee3c2f0c7e11b 100644
--- a/paddle/phi/kernels/fusion/gpu/block_attn.h
+++ b/paddle/phi/kernels/fusion/gpu/block_attn.h
@@ -1812,10 +1812,10 @@ inline GPU(Error_t) GetNumBlocks(int64_t n, int *num_blocks) {
   constexpr int kBlockSize = 128;
   constexpr int kNumWaves = 16;
-  const int device_id = phi::backends::gpu::GetCurrentDeviceId();
-  const int sm_count = phi::backends::gpu::GetGPUMultiProcessors(device_id);
+  const int device_id = backends::gpu::GetCurrentDeviceId();
+  const int sm_count = backends::gpu::GetGPUMultiProcessors(device_id);
   const int max_thread_per_multiprocessor =
-      phi::backends::gpu::GetGPUMaxThreadsPerMultiProcessor(device_id);
+      backends::gpu::GetGPUMaxThreadsPerMultiProcessor(device_id);
   *num_blocks =
       std::max(1,
diff --git a/paddle/phi/kernels/fusion/gpu/block_multi_head_attention_kernel.cu b/paddle/phi/kernels/fusion/gpu/block_multi_head_attention_kernel.cu
index ee2cc986c3eb26..f223b0d3abc1cb 100644
--- a/paddle/phi/kernels/fusion/gpu/block_multi_head_attention_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/block_multi_head_attention_kernel.cu
@@ -22,13 +22,6 @@
 #include "paddle/phi/kernels/gpu/flash_attn_utils.h"
 #include "paddle/utils/none.h"
-inline int getSMVersion() {
-  const int device = phi::backends::gpu::GetCurrentDeviceId();
-  const phi::gpuDeviceProp prop =
-      phi::backends::gpu::GetDeviceProperties(device);
-  return prop.major * 10 + prop.minor;
-}
-
 #if defined(__CUDACC__) && CUDA_VERSION >= 11000
 #define CUDA_BFLOAT16_AVAILABLE
 #include
@@ -37,6 +30,12 @@ inline int getSMVersion() {
 namespace phi {
 namespace fusion {
+inline int getSMVersion() {
+  const int device = backends::gpu::GetCurrentDeviceId();
+  const phi::gpuDeviceProp prop = backends::gpu::GetDeviceProperties(device);
+  return prop.major * 10 + prop.minor;
+}
+
 int GetMaxLen(const GPUContext& dev_ctx,
               const DenseTensor& seq_lens_tensor,
               DenseTensor* max_len_tensor,
diff --git a/paddle/phi/kernels/fusion/gpu/cast_with_ptr.h b/paddle/phi/kernels/fusion/gpu/cast_with_ptr.h
index 49355243a5ceba..9dfd2214ce449e 100644
--- a/paddle/phi/kernels/fusion/gpu/cast_with_ptr.h
+++ b/paddle/phi/kernels/fusion/gpu/cast_with_ptr.h
@@ -31,7 +31,7 @@ static void VecCastKernel(const GPUContext &dev_ctx,
                           const InT *x,
                           OutT *y,
                           size_t n) {
-  auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n, VecSize);
+  auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n, VecSize);
   auto block = config.GetGridSize();
   auto thread = config.GetBlockSize();
   auto main_offset = n / (VecSize * thread) * VecSize * thread;
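Every hunk in this patch makes the same kind of change: inside code that already lives in namespace phi (usually phi::fusion), the leading phi:: on phi::backends::gpu::... is dropped, because unqualified name lookup also searches the enclosing namespace phi and finds backends::gpu there. A minimal standalone sketch of that lookup rule follows; the helper names are hypothetical stand-ins, not Paddle's real implementations.

#include <cstdio>

namespace phi {
namespace backends {
namespace gpu {
// Stand-in for helpers such as GetCurrentDeviceId(); not the real implementation.
inline int GetCurrentDeviceId() { return 0; }
}  // namespace gpu
}  // namespace backends

namespace fusion {
// Inside phi::fusion, lookup also searches the enclosing namespace phi,
// so backends::gpu::GetCurrentDeviceId resolves without the phi:: prefix.
inline int CurrentDeviceSketch() { return backends::gpu::GetCurrentDeviceId(); }
}  // namespace fusion
}  // namespace phi

int main() {
  std::printf("device = %d\n", phi::fusion::CurrentDeviceSketch());
  return 0;
}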
diff --git a/paddle/phi/kernels/fusion/gpu/cudnn_bn_stats_finalize.cu.h b/paddle/phi/kernels/fusion/gpu/cudnn_bn_stats_finalize.cu.h
index 7afb087ea92a37..89a7c7c054d255 100644
--- a/paddle/phi/kernels/fusion/gpu/cudnn_bn_stats_finalize.cu.h
+++ b/paddle/phi/kernels/fusion/gpu/cudnn_bn_stats_finalize.cu.h
@@ -22,16 +22,15 @@ namespace fusion {
 template
 using BatchNormParamType =
-    typename phi::backends::gpu::CudnnDataType::BatchNormParamType;
+    typename backends::gpu::CudnnDataType::BatchNormParamType;
 #if CUDNN_VERSION >= 8000
 template
 struct BNStatsFinalizeArgs {
   BNStatsFinalizeArgs() {
-    dtype = phi::backends::gpu::CudnnDataType::type;
-    param_dtype =
-        phi::backends::gpu::CudnnDataType>::type;
+    dtype = backends::gpu::CudnnDataType::type;
+    param_dtype = backends::gpu::CudnnDataType>::type;
     format = CUDNN_TENSOR_NHWC;
   }
@@ -53,8 +52,8 @@ struct BNStatsFinalizeArgs {
   cudnnDataType_t param_dtype;
   cudnnTensorFormat_t format;
-  phi::backends::gpu::TensorDescriptor in_desc;
-  phi::backends::gpu::TensorDescriptor out_desc;
+  backends::gpu::TensorDescriptor in_desc;
+  backends::gpu::TensorDescriptor out_desc;
 };
 template
diff --git a/paddle/phi/kernels/fusion/gpu/cudnn_norm_conv.cu.h b/paddle/phi/kernels/fusion/gpu/cudnn_norm_conv.cu.h
index f27d8f8e8cddea..a39724f16ee342 100644
--- a/paddle/phi/kernels/fusion/gpu/cudnn_norm_conv.cu.h
+++ b/paddle/phi/kernels/fusion/gpu/cudnn_norm_conv.cu.h
@@ -23,7 +23,7 @@ namespace dynload = phi::dynload;
 template
 using ScalingParamType =
-    typename phi::backends::gpu::CudnnDataType::ScalingParamType;
+    typename backends::gpu::CudnnDataType::ScalingParamType;
 #if CUDNN_VERSION >= 8000
@@ -32,9 +32,9 @@ static size_t RoundUp(int64_t a, int64_t b) { return (a + b - 1) / b * b; }
 template
 struct NormConvolutionArgs {
   NormConvolutionArgs() {
-    dtype = phi::backends::gpu::CudnnDataType::type;
+    dtype = backends::gpu::CudnnDataType::type;
     format = CUDNN_TENSOR_NHWC;
-    compute_type = phi::backends::gpu::CudnnDataType::type;
+    compute_type = backends::gpu::CudnnDataType::type;
   }
   void Set(const GPUContext &dev_ctx,
@@ -163,11 +163,11 @@ struct NormConvolutionArgs {
   std::vector paddings;
   std::vector dilations;
-  phi::backends::gpu::TensorDescriptor in_desc;
-  phi::backends::gpu::FilterDescriptor filter_desc;
-  phi::backends::gpu::TensorDescriptor out_desc;
-  phi::backends::gpu::TensorDescriptor out_stats_desc;
-  phi::backends::gpu::ConvolutionDescriptor conv_desc;
+  backends::gpu::TensorDescriptor in_desc;
+  backends::gpu::FilterDescriptor filter_desc;
+  backends::gpu::TensorDescriptor out_desc;
+  backends::gpu::TensorDescriptor out_stats_desc;
+  backends::gpu::ConvolutionDescriptor conv_desc;
   bool is_support;
 };
diff --git a/paddle/phi/kernels/fusion/gpu/cudnn_scale_bias_add_relu.cu.h b/paddle/phi/kernels/fusion/gpu/cudnn_scale_bias_add_relu.cu.h
index 7aff27dd267758..839304f3117308 100644
--- a/paddle/phi/kernels/fusion/gpu/cudnn_scale_bias_add_relu.cu.h
+++ b/paddle/phi/kernels/fusion/gpu/cudnn_scale_bias_add_relu.cu.h
@@ -20,20 +20,19 @@ namespace phi {
 namespace fusion {
 template
-using CudnnDataType = phi::backends::gpu::CudnnDataType;
+using CudnnDataType = backends::gpu::CudnnDataType;
 namespace dynload = phi::dynload;
 template
 using BatchNormParamType =
-    typename phi::backends::gpu::CudnnDataType::BatchNormParamType;
+    typename backends::gpu::CudnnDataType::BatchNormParamType;
 #if CUDNN_VERSION >= 8000
 template
 struct ScaleBiasAddReluArgs {
   ScaleBiasAddReluArgs() {
-    dtype = phi::backends::gpu::CudnnDataType::type;
-    param_dtype =
-        phi::backends::gpu::CudnnDataType>::type;
+    dtype = backends::gpu::CudnnDataType::type;
+    param_dtype = backends::gpu::CudnnDataType>::type;
     format = CUDNN_TENSOR_NHWC;
   }
@@ -89,12 +88,12 @@ struct ScaleBiasAddReluArgs {
   cudnnDataType_t param_dtype;
   cudnnTensorFormat_t format;
-  phi::backends::gpu::TensorDescriptor in_desc;
-  phi::backends::gpu::TensorDescriptor out_desc;
-  phi::backends::gpu::TensorDescriptor equiv_scale_bias_desc;
-  phi::backends::gpu::TensorDescriptor scale_bias_mean_var_desc;
-  phi::backends::gpu::TensorDescriptor bitmask_desc;
-  phi::backends::gpu::ActivationDescriptor activation_desc;
+  backends::gpu::TensorDescriptor in_desc;
+  backends::gpu::TensorDescriptor out_desc;
+  backends::gpu::TensorDescriptor equiv_scale_bias_desc;
+  backends::gpu::TensorDescriptor scale_bias_mean_var_desc;
+  backends::gpu::TensorDescriptor bitmask_desc;
+  backends::gpu::ActivationDescriptor activation_desc;
 };
 template
diff --git a/paddle/phi/kernels/fusion/gpu/distributed_fused_lamb_init_kernel.cu b/paddle/phi/kernels/fusion/gpu/distributed_fused_lamb_init_kernel.cu
index 00a22d8f05bb49..5b5b2292143f70 100644
--- a/paddle/phi/kernels/fusion/gpu/distributed_fused_lamb_init_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/distributed_fused_lamb_init_kernel.cu
@@ -494,7 +494,7 @@ void DistributedFusedLambInitOpKernel(
   VLOG(10) << "rank = " << rank << ", nranks = " << nranks
            << " , alignment = " << alignment;
   if (alignment <= 0) {
-    alignment = phi::backends::gpu::GpuMinChunkSize();
+    alignment = backends::gpu::GpuMinChunkSize();
   }
   PADDLE_ENFORCE_GE(
       alignment,
diff --git a/paddle/phi/kernels/fusion/gpu/fused_bias_act_utils.h b/paddle/phi/kernels/fusion/gpu/fused_bias_act_utils.h
index f7250122bd79a0..11db8f7e673825 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_bias_act_utils.h
+++ b/paddle/phi/kernels/fusion/gpu/fused_bias_act_utils.h
@@ -116,10 +116,10 @@ inline gpuError_t GetNumBlocks(int64_t n, int *num_blocks) {
   constexpr int kBlockSize = 128;
   constexpr int kNumWaves = 16;
-  const int device_id = phi::backends::gpu::GetCurrentDeviceId();
-  const int sm_count = phi::backends::gpu::GetGPUMultiProcessors(device_id);
+  const int device_id = backends::gpu::GetCurrentDeviceId();
+  const int sm_count = backends::gpu::GetGPUMultiProcessors(device_id);
   const int max_thread_per_multiprocessor =
-      phi::backends::gpu::GetGPUMaxThreadsPerMultiProcessor(device_id);
+      backends::gpu::GetGPUMaxThreadsPerMultiProcessor(device_id);
   *num_blocks =
       std::max(1,
diff --git a/paddle/phi/kernels/fusion/gpu/fused_bn_activation_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_bn_activation_grad_kernel.cu
index a153bd2ac2c74f..db9f35bf9385dc 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_bn_activation_grad_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_bn_activation_grad_kernel.cu
@@ -55,7 +55,7 @@ void FusedBatchNormActGradKernel(const Context &dev_ctx,
                                  DenseTensor *bias_grad) {
   // Note(andsonder): Fused bn activation only used in the gpu place.
 #if defined(PADDLE_WITH_CUDA) and CUDNN_VERSION >= 7401
-  using CudnnDataType = phi::backends::gpu::CudnnDataType;
+  using CudnnDataType = backends::gpu::CudnnDataType;
   using BatchNormParamType = typename CudnnDataType::BatchNormParamType;
   double epsilon1 = static_cast(epsilon);
@@ -147,7 +147,7 @@ void FusedBatchNormActGradKernel(const Context &dev_ctx,
     DenseTensor workspace_tensor;
     auto reserve_space_size = reserve_space.memory_size();
     cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ACTIVATION;
-    phi::backends::gpu::ScopedActivationDescriptor scope_act_desc;
+    backends::gpu::ScopedActivationDescriptor scope_act_desc;
     cudnnActivationDescriptor_t activation_desc_ =
         scope_act_desc.descriptor(act_type);
     // --------------- cudnn batchnorm workspace ---------------
diff --git a/paddle/phi/kernels/fusion/gpu/fused_bn_activation_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_bn_activation_kernel.cu
index cd08ea15d8a4a5..bfafd0ac2d0f57 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_bn_activation_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_bn_activation_kernel.cu
@@ -54,7 +54,7 @@ void FusedBatchNormActKernel(const Context &dev_ctx,
                              DenseTensor *reserve_space) {
   // Note(andsonder): Fused bn activation only used in the gpu place.
 #if defined(PADDLE_WITH_CUDA) and CUDNN_VERSION >= 7401
-  using CudnnDataType = phi::backends::gpu::CudnnDataType;
+  using CudnnDataType = backends::gpu::CudnnDataType;
   using BatchNormParamType = typename CudnnDataType::BatchNormParamType;
   double epsilon1 = static_cast(epsilon);
@@ -129,7 +129,7 @@ void FusedBatchNormActKernel(const Context &dev_ctx,
     double this_factor = 1. - momentum;
     cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ACTIVATION;
-    phi::backends::gpu::ScopedActivationDescriptor scope_act_desc;
+    backends::gpu::ScopedActivationDescriptor scope_act_desc;
     cudnnActivationDescriptor_t activation_desc_ =
         scope_act_desc.descriptor(act_type);
     size_t workspace_size = 0;
diff --git a/paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_grad_kernel.cu
index 7177bdda8a09bd..642ef1761fe884 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_grad_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_grad_kernel.cu
@@ -38,7 +38,7 @@ namespace phi {
 namespace fusion {
 template
-using CudnnDataType = phi::backends::gpu::CudnnDataType;
+using CudnnDataType = backends::gpu::CudnnDataType;
 template
 using BatchNormParamType = typename CudnnDataType::BatchNormParamType;
@@ -141,7 +141,7 @@ void FusedBatchNormAddActGradKernel(const Context &dev_ctx,
     DenseTensor workspace_tensor;
     auto reserve_space_size = reserve_space_ptr->memory_size();
     cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION;
-    phi::backends::gpu::ScopedActivationDescriptor scope_act_desc;
+    backends::gpu::ScopedActivationDescriptor scope_act_desc;
     cudnnActivationDescriptor_t activation_desc_ =
         scope_act_desc.descriptor(act_type);
     // --------------- cudnn batchnorm workspace ---------------
diff --git a/paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_kernel.cu
index 837e0d40f89f41..b5a69bc25871e5 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_kernel.cu
@@ -38,7 +38,7 @@ namespace phi {
 namespace fusion {
 template
-using CudnnDataType = phi::backends::gpu::CudnnDataType;
+using CudnnDataType = backends::gpu::CudnnDataType;
 template
 using BatchNormParamType = typename CudnnDataType::BatchNormParamType;
@@ -112,7 +112,7 @@ void FusedBatchNormAddActKernel(const Context &dev_ctx,
     double this_factor = 1. - momentum;
     cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION;
-    phi::backends::gpu::ScopedActivationDescriptor scope_act_desc;
+    backends::gpu::ScopedActivationDescriptor scope_act_desc;
     cudnnActivationDescriptor_t activation_desc_ =
         scope_act_desc.descriptor(act_type);
     size_t workspace_size = 0;
diff --git a/paddle/phi/kernels/fusion/gpu/fused_conv2d_add_act_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_conv2d_add_act_kernel.cu
index efa8d043d2e61d..c98c667ed75af4 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_conv2d_add_act_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_conv2d_add_act_kernel.cu
@@ -47,12 +47,12 @@ class CudnnConvDescManager {
   }
   struct CudnnCacheInfo {
-    phi::backends::gpu::TensorDescriptor* x_desc{nullptr};
-    phi::backends::gpu::FilterDescriptor* w_desc{nullptr};
-    phi::backends::gpu::TensorDescriptor* b_desc{nullptr};
-    phi::backends::gpu::TensorDescriptor* o_desc{nullptr};
-    phi::backends::gpu::ConvolutionDescriptor* conv_desc{nullptr};
-    phi::backends::gpu::ActivationDescriptor* act_desc{nullptr};
+    backends::gpu::TensorDescriptor* x_desc{nullptr};
+    backends::gpu::FilterDescriptor* w_desc{nullptr};
+    backends::gpu::TensorDescriptor* b_desc{nullptr};
+    backends::gpu::TensorDescriptor* o_desc{nullptr};
+    backends::gpu::ConvolutionDescriptor* conv_desc{nullptr};
+    backends::gpu::ActivationDescriptor* act_desc{nullptr};
     size_t workspace_size;
     cudnnConvolutionFwdAlgo_t algo;
@@ -283,42 +283,42 @@ class CudnnConvDescManager {
   }
  private:
-  phi::backends::gpu::TensorDescriptor* GetTensorDescInfo(
+  backends::gpu::TensorDescriptor* GetTensorDescInfo(
       const std::vector& input_dims,
       phi::DataType input_dtype,
       cudnnTensorFormat_t input_format) {
-    auto* desc = new phi::backends::gpu::TensorDescriptor();
+    auto* desc = new backends::gpu::TensorDescriptor();
     desc->set(
         input_dims, input_format, backends::gpu::ToCudnnDataType(input_dtype));
     return desc;
   }
-  phi::backends::gpu::FilterDescriptor* GetFilterDescInfo(
+  backends::gpu::FilterDescriptor* GetFilterDescInfo(
      const std::vector& input_dims,
      phi::DataType input_dtype,
      cudnnTensorFormat_t input_format) {
-    auto* desc = new phi::backends::gpu::FilterDescriptor();
+    auto* desc = new backends::gpu::FilterDescriptor();
     desc->set(
         input_dims, input_format, backends::gpu::ToCudnnDataType(input_dtype));
     return desc;
   }
-  phi::backends::gpu::ConvolutionDescriptor* GetConvDescInfo(
+  backends::gpu::ConvolutionDescriptor* GetConvDescInfo(
      const std::vector& paddings,
      const std::vector& strides,
      const std::vector& dilations,
      int groups,
      cudnnDataType_t dtype) {
-    auto* desc = new phi::backends::gpu::ConvolutionDescriptor();
+    auto* desc = new backends::gpu::ConvolutionDescriptor();
     desc->set(
         dtype, paddings, strides, dilations, phi::AllowTF32Cudnn(), groups);
     return desc;
   }
-  phi::backends::gpu::ActivationDescriptor* GetActivationDescInfo(
+  backends::gpu::ActivationDescriptor* GetActivationDescInfo(
      const std::string& act,
      double value_max = std::numeric_limits::max()) {
-    auto* desc = new phi::backends::gpu::ActivationDescriptor();
+    auto* desc = new backends::gpu::ActivationDescriptor();
     cudnnActivationMode_t mode;
     double relu_ceiling = 0.0;
     if (act == "identity") {
@@ -545,7 +545,7 @@ void FusedConv2dAddActKernel(const Context& dev_ctx,
         conv_attr_cache->dilations,
         transformed_input.dtype(),
         groups,
-        phi::backends::gpu::CudnnDataType::type,
+        backends::gpu::CudnnDataType::type,
         compute_format,
         search_func,
         activation);
diff --git a/paddle/phi/kernels/fusion/gpu/fused_dconv_drelu_dbn_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_dconv_drelu_dbn_kernel.cu
index f187543d70d14c..5b0cc43081f424 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_dconv_drelu_dbn_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_dconv_drelu_dbn_kernel.cu
@@ -34,7 +34,7 @@ namespace fusion {
 using helper = phi::CudnnFrontendConvHelper;
 template
-using CudnnDataType = phi::backends::gpu::CudnnDataType;
+using CudnnDataType = backends::gpu::CudnnDataType;
 namespace {
 cudnn_frontend::Operation MakeDreluOp(cudnnDataType_t dtype,
@@ -131,17 +131,16 @@ void _DgradDreluBnBwdWeightImpl(const Context& dev_ctx,
   TransToChannelLast(dev_ctx, weight, &w_tensor_transformed);
   // build tensor descriptors
   cudnnTensorFormat_t layout_format = CUDNN_TENSOR_NHWC;
-  auto tensor_format =
-      phi::backends::gpu::ToCudnnDataType(grad_output->dtype());
+  auto tensor_format = backends::gpu::ToCudnnDataType(grad_output->dtype());
   auto tensor_format_math = CUDNN_DATA_FLOAT;
   auto compute_dtype = CUDNN_DATA_FLOAT;
   // get dims in CUDNN manner: [N, C, H, W]
-  auto dim_x = phi::backends::gpu::TransformDimOrder(
-      vectorize(bn1_input->dims()));
-  auto dim_filt = phi::backends::gpu::TransformDimOrder(
+  auto dim_x =
+      backends::gpu::TransformDimOrder(vectorize(bn1_input->dims()));
+  auto dim_filt = backends::gpu::TransformDimOrder(
       vectorize(w_tensor_transformed.dims()));
-  auto dim_y = phi::backends::gpu::TransformDimOrder(
-      vectorize(grad_output->dims()));
+  auto dim_y =
+      backends::gpu::TransformDimOrder(vectorize(grad_output->dims()));
   std::vector dim_scale(dim_x.size(), 1);
   dim_scale[1] = dim_x[1];  //  [1, C, 1, 1]
@@ -542,12 +541,12 @@ void _DbnApplyImpl(const Context& dev_ctx,
   auto handle = dev_ctx.cudnn_handle();
   auto workspace_handle = dev_ctx.cudnn_workspace_handle();
   cudnnTensorFormat_t layout_format = CUDNN_TENSOR_NHWC;
-  auto tensor_format = phi::backends::gpu::ToCudnnDataType(dY_tensor->dtype());
+  auto tensor_format = backends::gpu::ToCudnnDataType(dY_tensor->dtype());
   auto tensor_format_math = CUDNN_DATA_FLOAT;
   auto compute_dtype = CUDNN_DATA_FLOAT;
   // build tensor descriptors
-  auto dim_x = phi::backends::gpu::TransformDimOrder(
-      vectorize(X_tensor->dims()));
+  auto dim_x =
+      backends::gpu::TransformDimOrder(vectorize(X_tensor->dims()));
   std::vector dim_a(dim_x.size(), 1);
   dim_a[1] = dim_x[1];  //  [1, C, 1, 1]
@@ -746,16 +745,16 @@ void _BnActWgradImpl(const Context& dev_ctx,
   ResizeToChannelLast(dev_ctx, dw_tensor, &dw_tensor_transformed);
   // create tensor descriptors
   cudnnTensorFormat_t layout_format = CUDNN_TENSOR_NHWC;
-  auto tensor_format = phi::backends::gpu::ToCudnnDataType(conv_input->dtype());
+  auto tensor_format = backends::gpu::ToCudnnDataType(conv_input->dtype());
   auto tensor_format_math = CUDNN_DATA_FLOAT;
   auto compute_dtype = CUDNN_DATA_FLOAT;
   // create tensor descriptors
-  auto dim_x = phi::backends::gpu::TransformDimOrder(
-      vectorize(conv_input->dims()));
-  auto dim_filt = phi::backends::gpu::TransformDimOrder(
+  auto dim_x =
+      backends::gpu::TransformDimOrder(vectorize(conv_input->dims()));
+  auto dim_filt = backends::gpu::TransformDimOrder(
       vectorize(dw_tensor_transformed.dims()));
-  auto dim_y = phi::backends::gpu::TransformDimOrder(
-      vectorize(grad_output->dims()));
+  auto dim_y =
+      backends::gpu::TransformDimOrder(vectorize(grad_output->dims()));
   std::vector dim_scale(dim_x.size(), 1);
   dim_scale[1] = dim_x[1];  //  [1, C, 1, 1]
@@ -995,7 +994,7 @@ void FusedDconvDreluDbnKernel(const Context& dev_ctx,
           "This op only supports Ampere and later devices, "
           "but got compute capability: %d.",
           dev_ctx.GetComputeCapability()));
-  auto cudnn_version = phi::backends::gpu::DnnVersion();
+  auto cudnn_version = backends::gpu::DnnVersion();
   PADDLE_ENFORCE_GE(cudnn_version,
                     8900,
                     common::errors::PreconditionNotMet(
diff --git a/paddle/phi/kernels/fusion/gpu/fused_dot_product_attention_op.cu b/paddle/phi/kernels/fusion/gpu/fused_dot_product_attention_op.cu
index fff5cf7398b837..54c6888bcf4391 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_dot_product_attention_op.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_dot_product_attention_op.cu
@@ -86,7 +86,7 @@ void FusedDotProductAttentionKernel(const Context &dev_ctx,
           "This op only supports Ampere and later devices, "
           "but got compute capability: %d.",
           dev_ctx.GetComputeCapability()));
-  auto cudnn_version = phi::backends::gpu::DnnVersion();
+  auto cudnn_version = backends::gpu::DnnVersion();
   PADDLE_ENFORCE_GE(cudnn_version,
                     8906,
                     common::errors::PreconditionNotMet(
@@ -294,7 +294,7 @@ void FusedDotProductAttentionGradKernel(
           "This op only supports Ampere and later devices, "
           "but got compute capability: %d.",
           dev_ctx.GetComputeCapability()));
-  auto cudnn_version = phi::backends::gpu::DnnVersion();
+  auto cudnn_version = backends::gpu::DnnVersion();
   PADDLE_ENFORCE_GE(cudnn_version,
                     8906,
                     common::errors::PreconditionNotMet(
diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h b/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h
index fdfbcfc7d57e9f..c1c007a966c076 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h
+++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h
@@ -459,8 +459,8 @@ void LaunchDropoutActBiasGrad(Functor act_functor,
     }
   } else {
     const uint64_t n = rows * cols;
-    phi::backends::gpu::GpuLaunchConfig config =
-        phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n / real_vec_size);
+    backends::gpu::GpuLaunchConfig config =
+        backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n / real_vec_size);
     if (n % VecSize == 0) {
       FusedDropoutActGrad
          <<();
   const uint64_t seed_data = static_cast(seed_offset_data[0]);
   const uint64_t increment = static_cast(seed_offset_data[1]);
@@ -217,7 +217,7 @@ void FusedDropoutAddGradKernel(const Context& dev_ctx,
                  << ", increment = " << increment;
        };
-    phi::backends::gpu::CUDAGraphNodeLauncher::gpuKernelCallback_t
+    backends::gpu::CUDAGraphNodeLauncher::gpuKernelCallback_t
        cudaKernelCallback = [=](unsigned int id) {
          void* functionPtr = reinterpret_cast(
              &(VectorizedDropoutBackward>));
@@ -244,7 +244,7 @@ void FusedDropoutAddGradKernel(const Context& dev_ctx,
              functor);
          return cudaFunc;
        };
-    phi::backends::gpu::CUDAGraphNodeLauncher::Instance().KernelNodeLaunch(
+    backends::gpu::CUDAGraphNodeLauncher::Instance().KernelNodeLaunch(
        parameterSetter, cudaKernelCallback);
     VLOG(10) << "NON_CUDA_GRAPH seed = " << seed_data
diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu
index 3766d881dedc74..1ab2d3df35e277 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu
@@ -193,35 +193,34 @@ void FusedDropoutAddKernel(const Context& dev_ctx,
     // seed_offset_data should preserved by cudaGraph pool
     auto gen_cuda = dev_ctx.GetGenerator();
     auto state_index = gen_cuda->GetStateIndex();
-    auto parameterSetter =
-        [dev_ctx_p,
-         offset,
-         seed_offset_data,
-         state_index,
-         seed_tensor_ptr,
-         fix_seed](phi::backends::gpu::gpuKernelParams& params) {
-          if (!fix_seed) {
-            auto gen_cuda = dev_ctx_p->GetGenerator();
-            // ensure the generator use correct state index
-            gen_cuda->SetStateIndex(state_index);
-
-            // we assume seed is null pointer
-            // seed copy to cpu is meaningless here
-            assert(seed_tensor_ptr == nullptr);
-
-            uint64_t seed, increment;
-            std::tie(seed, increment) = gen_cuda->IncrementOffset(offset);
-            VLOG(10) << "CUDA_GRAPH seed = " << seed
-                     << ", increment = " << increment;
-
-            params.As(2) = seed;
-            params.As(6) = increment;
-
-            seed_offset_data[0] = static_cast(seed);
-            seed_offset_data[1] = static_cast(increment);
-          }
-        };
-    phi::backends::gpu::CUDAGraphNodeLauncher::gpuKernelCallback_t
+    auto parameterSetter = [dev_ctx_p,
+                            offset,
+                            seed_offset_data,
+                            state_index,
+                            seed_tensor_ptr,
+                            fix_seed](backends::gpu::gpuKernelParams& params) {
+      if (!fix_seed) {
+        auto gen_cuda = dev_ctx_p->GetGenerator();
+        // ensure the generator use correct state index
+        gen_cuda->SetStateIndex(state_index);
+
+        // we assume seed is null pointer
+        // seed copy to cpu is meaningless here
+        assert(seed_tensor_ptr == nullptr);
+
+        uint64_t seed, increment;
+        std::tie(seed, increment) = gen_cuda->IncrementOffset(offset);
+        VLOG(10) << "CUDA_GRAPH seed = " << seed
+                 << ", increment = " << increment;
+
+        params.As(2) = seed;
+        params.As(6) = increment;
+
+        seed_offset_data[0] = static_cast(seed);
+        seed_offset_data[1] = static_cast(increment);
+      }
+    };
+    backends::gpu::CUDAGraphNodeLauncher::gpuKernelCallback_t
        cudaKernelCallback = [=](unsigned int id) {
          void* functionPtr = reinterpret_cast(
              &(VectorizedDropoutForward>));
@@ -247,7 +246,7 @@ void FusedDropoutAddKernel(const Context& dev_ctx,
              dst_functor);
          return cudaFunc;
        };
-    phi::backends::gpu::CUDAGraphNodeLauncher::Instance().KernelNodeLaunch(
+    backends::gpu::CUDAGraphNodeLauncher::Instance().KernelNodeLaunch(
        parameterSetter, cudaKernelCallback);
     VLOG(10) << "NON_CUDA_GRAPH seed = " << seed_data
diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_utils.h b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_utils.h
index 83d180b42ae3a5..9f42dcdd21cdb5 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_utils.h
+++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_utils.h
@@ -28,7 +28,7 @@ static inline std::vector GetRandomCudaProp(int64_t numel,
   size_t grid_size = gpu_config.GetGridSize();
   size_t block_size = gpu_config.GetBlockSize();
   int64_t device_id = dev_ctx.GetPlace().GetDeviceId();
-  const auto& prop = phi::backends::gpu::GetDeviceProperties(device_id);
+  const auto& prop = backends::gpu::GetDeviceProperties(device_id);
   size_t max_grid_size =
       prop.maxThreadsPerMultiProcessor * prop.multiProcessorCount / block_size;
   grid_size = std::min(grid_size, max_grid_size);
diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_common.h b/paddle/phi/kernels/fusion/gpu/fused_dropout_common.h
index 4cf7c8ed9060f2..c2e0f109174466 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_dropout_common.h
+++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_common.h
@@ -51,7 +51,7 @@ namespace fusion {
  * 1D blocks: blockDim.x = cols
  * 2D grids: gridDim.y = rows
  */
-inline phi::backends::gpu::GpuLaunchConfig Get1DBlocksAnd2DGrids(
+inline backends::gpu::GpuLaunchConfig Get1DBlocksAnd2DGrids(
     const GPUContext &dev_ctx,
     const uint64_t rows,
     const uint64_t cols,
@@ -79,7 +79,7 @@ inline phi::backends::gpu::GpuLaunchConfig Get1DBlocksAnd2DGrids(
     blocks_y = (blocks_y + blocks_z - 1) / blocks_z;
     blocks_y = blocks_y >= 65536 ? 65535 : blocks_y;
   }
-  phi::backends::gpu::GpuLaunchConfig config;
+  backends::gpu::GpuLaunchConfig config;
   config.block_per_grid.x = static_cast(blocks_x);
   config.block_per_grid.y = static_cast(blocks_y);
   config.block_per_grid.z = static_cast(blocks_z);
diff --git a/paddle/phi/kernels/fusion/gpu/fused_embedding_eltwise_layernorm_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_embedding_eltwise_layernorm_kernel.cu
index 8409e70911a6be..1b43ebfddf3787 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_embedding_eltwise_layernorm_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_embedding_eltwise_layernorm_kernel.cu
@@ -67,7 +67,7 @@ void EmbeddingEltWiseLayerNormKernel(
   }
   const int64_t* stable_in1s =
-      phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
+      backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
           const_cast(in1s.data()), in1s.size());
   phi::memory_utils::Copy(GPUPlace{},
                           in_ids_d,
                           sizeof(int64_t) * input_num,
                           dev_ctx.stream());
   const int64_t* stable_in2s =
-      phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
+      backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
          const_cast(in2s.data()), in2s.size());
   phi::memory_utils::Copy(GPUPlace{},
                           in_embs_d,
diff --git a/paddle/phi/kernels/fusion/gpu/fused_fc_elementwise_layernorm_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_fc_elementwise_layernorm_kernel.cu
index 2eded99e6416e5..40553cecc2f7ae 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_fc_elementwise_layernorm_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_fc_elementwise_layernorm_kernel.cu
@@ -312,7 +312,7 @@ void AddReluAddLayerNorm(const Context& dev_ctx,
                          int N,
                          float epsilon) {
   if (with_relu) {
-    switch (phi::backends::gpu::RoundToPowerOfTwo(N)) {
+    switch (backends::gpu::RoundToPowerOfTwo(N)) {
      CUDA_LAUNCH_KERNEL_HELPER(
          InplaceAddReluAddLayerNormKernel << << << <<
-using CudnnDataType = phi::backends::gpu::CudnnDataType;
+using CudnnDataType = backends::gpu::CudnnDataType;
 template
 using LayerNormParamType = typename CudnnDataType::BatchNormParamType;
diff --git a/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_op.cu.h b/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_op.cu.h
index 6dd276d4e2151b..37bba718254417 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_op.cu.h
+++ b/paddle/phi/kernels/fusion/gpu/fused_multi_transformer_op.cu.h
@@ -2093,10 +2093,10 @@ inline cudaError_t GetNumBlocks(int64_t n, int *num_blocks) {
   constexpr int kBlockSize = 128;
   constexpr int kNumWaves = 16;
-  const int device_id = phi::backends::gpu::GetCurrentDeviceId();
-  const int sm_count = phi::backends::gpu::GetGPUMultiProcessors(device_id);
+  const int device_id = backends::gpu::GetCurrentDeviceId();
+  const int sm_count = backends::gpu::GetGPUMultiProcessors(device_id);
   const int max_thread_per_multiprocessor =
-      phi::backends::gpu::GetGPUMaxThreadsPerMultiProcessor(device_id);
+      backends::gpu::GetGPUMaxThreadsPerMultiProcessor(device_id);
   *num_blocks =
       std::max(1,
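The GetNumBlocks helpers touched in this patch (block_attn.h, fused_bias_act_utils.h, fused_multi_transformer_op.cu.h) all size a 1D grid with the same occupancy idea: enough blocks to cover n elements, capped at a fixed number of "waves" of resident blocks across the SMs. The statement that computes the cap is elided above, so the following is only a hedged, generic sketch of that heuristic; the constants and names mirror the visible locals but are illustrative, not the elided Paddle code.

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Illustrative occupancy heuristic: cover n elements with kBlockSize-wide
// blocks, but launch at most kNumWaves waves of resident blocks per SM.
inline int NumBlocksSketch(int64_t n, int sm_count, int max_threads_per_sm) {
  constexpr int kBlockSize = 128;
  constexpr int kNumWaves = 16;
  const int64_t blocks_to_cover_n = (n + kBlockSize - 1) / kBlockSize;
  const int64_t blocks_per_wave =
      static_cast<int64_t>(sm_count) * max_threads_per_sm / kBlockSize;
  return static_cast<int>(std::max<int64_t>(
      1, std::min(blocks_to_cover_n, blocks_per_wave * kNumWaves)));
}

int main() {
  // Example: 1M elements on a hypothetical 80-SM device, 2048 threads per SM.
  std::printf("num_blocks = %d\n", NumBlocksSketch(1 << 20, 80, 2048));
  return 0;
}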
diff --git a/paddle/phi/kernels/fusion/gpu/fused_residual_dropout_bias.h b/paddle/phi/kernels/fusion/gpu/fused_residual_dropout_bias.h
index 227e5a2413b050..e4d9da8723e0af 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_residual_dropout_bias.h
+++ b/paddle/phi/kernels/fusion/gpu/fused_residual_dropout_bias.h
@@ -504,9 +504,8 @@ void LaunchResidualDropoutBiasGrad(const T *dout,
                    dev_ctx.stream());                                    \
   } else {                                                               \
     const uint64_t n = rows * cols;                                      \
-    phi::backends::gpu::GpuLaunchConfig config =                         \
-        phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx,                \
-                                                 n / real_vec_size);     \
+    backends::gpu::GpuLaunchConfig config =                              \
+        backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n / real_vec_size); \
     if (n % VecSize == 0) {                                              \
       FusedResidualDropoutGrad                                           \
           <<
-using CudnnDataType = phi::backends::gpu::CudnnDataType;
+using CudnnDataType = backends::gpu::CudnnDataType;
 template
 void FusedScaleBiasAddReluKernel(const Context& dev_ctx,
@@ -75,12 +75,11 @@ void FusedScaleBiasAddReluKernel(const Context& dev_ctx,
   auto workspace_handle = dev_ctx.cudnn_workspace_handle();
   // create tensor descriptors
   cudnnTensorFormat_t layout_format = CUDNN_TENSOR_NHWC;
-  auto tensor_format = phi::backends::gpu::ToCudnnDataType(x1.dtype());
+  auto tensor_format = backends::gpu::ToCudnnDataType(x1.dtype());
   auto tensor_format_math = CUDNN_DATA_FLOAT;
   auto compute_dtype = CUDNN_DATA_FLOAT;
-  auto dim_x =
-      phi::backends::gpu::TransformDimOrder(vectorize(x1.dims()));
+  auto dim_x = backends::gpu::TransformDimOrder(vectorize(x1.dims()));
   std::vector dim_c(dim_x.size(), 1);
   dim_c[1] = dim_x[1];  //  [1, C, 1, 1]
diff --git a/paddle/phi/kernels/fusion/gpu/fused_scale_bias_relu_conv_bn_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_scale_bias_relu_conv_bn_kernel.cu
index cedae622480554..10f2cef97cf20b 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_scale_bias_relu_conv_bn_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_scale_bias_relu_conv_bn_kernel.cu
@@ -33,7 +33,7 @@ namespace fusion {
 using helper = phi::CudnnFrontendConvHelper;
 template
-using CudnnDataType = phi::backends::gpu::CudnnDataType;
+using CudnnDataType = backends::gpu::CudnnDataType;
 /*
  * Implements Scale + Bias + ReLU + Conv + BNStats fusion pattern.
@@ -115,18 +115,17 @@ void FusedScaleBiasReluConvBnstatsImpl(const Context& dev_ctx,
   // build tensors
   cudnnTensorFormat_t layout_format = CUDNN_TENSOR_NHWC;
-  auto tensor_format = phi::backends::gpu::ToCudnnDataType(x.dtype());
+  auto tensor_format = backends::gpu::ToCudnnDataType(x.dtype());
   auto tensor_format_math = CUDNN_DATA_FLOAT;
   auto compute_dtype = CUDNN_DATA_FLOAT;
   // get dims in CUDNN manner: [N, C, H, W]
-  auto dim_x =
-      phi::backends::gpu::TransformDimOrder(vectorize(in_dims));
+  auto dim_x = backends::gpu::TransformDimOrder(vectorize(in_dims));
   auto dim_filt =
-      phi::backends::gpu::TransformDimOrder(vectorize(filter_dims));
+      backends::gpu::TransformDimOrder(vectorize(filter_dims));
   auto dim_y =
-      phi::backends::gpu::TransformDimOrder(vectorize(output->dims()));
+      backends::gpu::TransformDimOrder(vectorize(output->dims()));
   std::vector dim_scale(dim_x.size(), 1);
   dim_scale[1] = dim_x[1];  //  [1, C, 1, 1]
   std::vector dim_sum(dim_x.size(), 1);  // [1, K, 1, 1]
@@ -317,9 +316,8 @@ void BNFinalizeImpl(const Context& dev_ctx,
   auto workspace_handle = dev_ctx.cudnn_workspace_handle();
   // set dtypes
   cudnnTensorFormat_t layout_format = CUDNN_TENSOR_NHWC;
-  auto tensor_format_bn =
-      phi::backends::gpu::ToCudnnDataType(sum_tensor.dtype());
-  auto tensor_format = phi::backends::gpu::ToCudnnDataType(eq_scale->dtype());
+  auto tensor_format_bn = backends::gpu::ToCudnnDataType(sum_tensor.dtype());
+  auto tensor_format = backends::gpu::ToCudnnDataType(eq_scale->dtype());
   auto compute_dtype = CUDNN_DATA_FLOAT;
   // create tensor descriptors
   auto dim_input = vectorize(sum_tensor.dims());
@@ -499,7 +497,7 @@ void FusedScaleBiasReluConvBnKernel(const Context& dev_ctx,
                                     DenseTensor* saved_var,
                                     DenseTensor* eq_scale,
                                     DenseTensor* eq_bias) {
-  auto cudnn_version = phi::backends::gpu::DnnVersion();
+  auto cudnn_version = backends::gpu::DnnVersion();
   PADDLE_ENFORCE_GE(cudnn_version,
                     8800,
                     common::errors::PreconditionNotMet(
diff --git a/paddle/phi/kernels/fusion/gpu/fused_seqpool_cvm_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_seqpool_cvm_grad_kernel.cu
index 20cbe5e06ddc66..b458c6e07f979d 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_seqpool_cvm_grad_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_seqpool_cvm_grad_kernel.cu
@@ -132,9 +132,9 @@ void FusedSeqpoolCVMGrad(const GPUContext &dev_ctx,
       dev_ctx.GetPlace(), total_ptr_len * sizeof(void *));
 #ifdef PADDLE_WITH_HIP
   T **gpu_out_grads_values = reinterpret_cast(temp_ptr->ptr());
-  phi::backends::gpu::GpuMemcpyAsync(
+  backends::gpu::GpuMemcpyAsync(
       gpu_out_grads_values,
-      phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
+      backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
           const_cast(out_grads_data.data()), out_grads_data.size()),
       out_grads_data.size() * sizeof(T *),
       hipMemcpyHostToDevice,
   T **gpu_in_grads_values =
       reinterpret_cast(&gpu_out_grads_values[out_grads_data.size()]);
-  phi::backends::gpu::GpuMemcpyAsync(
+  backends::gpu::GpuMemcpyAsync(
       gpu_in_grads_values,
-      phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
+      backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
          const_cast(in_grads_data.data()), in_grads_data.size()),
       in_grads_data.size() * sizeof(T *),
       hipMemcpyHostToDevice,
   T **gpu_cvm_values =
       reinterpret_cast(&gpu_in_grads_values[in_grads_data.size()]);
-  phi::backends::gpu::GpuMemcpyAsync(
+  backends::gpu::GpuMemcpyAsync(
       gpu_cvm_values,
-      phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
+      backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
          const_cast(cvm_data.data()), cvm_data.size()),
       cvm_data.size() * sizeof(T *),
       hipMemcpyHostToDevice,
   size_t **lods_values =
       reinterpret_cast(&gpu_cvm_values[cvm_data.size()]);
-  phi::backends::gpu::GpuMemcpyAsync(
+  backends::gpu::GpuMemcpyAsync(
       lods_values,
-      phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
+      backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
          const_cast(lods.data()), lods.size()),
       lods.size() * sizeof(size_t *),
       hipMemcpyHostToDevice,
       stream);
 #else
   T **gpu_out_grads_values = reinterpret_cast(temp_ptr->ptr());
-  phi::backends::gpu::GpuMemcpyAsync(
+  backends::gpu::GpuMemcpyAsync(
       gpu_out_grads_values,
-      phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
+      backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
          const_cast(out_grads_data.data()), out_grads_data.size()),
       out_grads_data.size() * sizeof(T *),
       cudaMemcpyHostToDevice,
   T **gpu_in_grads_values =
       reinterpret_cast(&gpu_out_grads_values[out_grads_data.size()]);
-  phi::backends::gpu::GpuMemcpyAsync(
+  backends::gpu::GpuMemcpyAsync(
       gpu_in_grads_values,
-      phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
+      backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
          const_cast(in_grads_data.data()), in_grads_data.size()),
       in_grads_data.size() * sizeof(T *),
       cudaMemcpyHostToDevice,
   T **gpu_cvm_values =
       reinterpret_cast(&gpu_in_grads_values[in_grads_data.size()]);
-  phi::backends::gpu::GpuMemcpyAsync(
+  backends::gpu::GpuMemcpyAsync(
       gpu_cvm_values,
-      phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
+      backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
          const_cast(cvm_data.data()), cvm_data.size()),
       cvm_data.size() * sizeof(T *),
       cudaMemcpyHostToDevice,
   size_t **lods_values =
       reinterpret_cast(&gpu_cvm_values[cvm_data.size()]);
-  phi::backends::gpu::GpuMemcpyAsync(
+  backends::gpu::GpuMemcpyAsync(
       lods_values,
-      phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
+      backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
          const_cast(lods.data()), lods.size()),
       lods.size() * sizeof(size_t *),
       cudaMemcpyHostToDevice,
 #endif
   size_t N = static_cast(batch_size * slot_num * embedding_size);
-  auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, N);
+  auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, N);
   if (use_cvm) {
     // join grad
     FusedSeqpoolCVMGradKernelWithCVM<<(temp_ptr->ptr());
-  phi::backends::gpu::GpuMemcpyAsync(
+  backends::gpu::GpuMemcpyAsync(
       gpu_input_values,
-      phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
+      backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
          const_cast(input_data.data()), input_data.size()),
       input_data.size() * sizeof(T *),
       hipMemcpyHostToDevice,
       stream);
   T **gpu_output_values =
       reinterpret_cast(&gpu_input_values[input_data.size()]);
-  phi::backends::gpu::GpuMemcpyAsync(
+  backends::gpu::GpuMemcpyAsync(
       gpu_output_values,
-      phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
+      backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
          const_cast(output_data.data()), output_data.size()),
       output_data.size() * sizeof(T *),
       hipMemcpyHostToDevice,
       stream);
   T **gpu_seqpool_output_values =
       reinterpret_cast(&gpu_output_values[output_data.size()]);
-  phi::backends::gpu::GpuMemcpyAsync(
+  backends::gpu::GpuMemcpyAsync(
       gpu_seqpool_output_values,
-      phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
+      backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
          const_cast(seqpool_output_data.data()),
          seqpool_output_data.size()),
       seqpool_output_data.size() * sizeof(T *),
@@ -150,36 +150,36 @@ void FusedSeqpoolCVM(
       stream);
   size_t **lods_values = reinterpret_cast(
       &gpu_seqpool_output_values[seqpool_output_data.size()]);
-  phi::backends::gpu::GpuMemcpyAsync(
+  backends::gpu::GpuMemcpyAsync(
       lods_values,
-      phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
+      backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
          const_cast(lods.data()), lods.size()),
       lods.size() * sizeof(size_t *),
       hipMemcpyHostToDevice,
       stream);
 #else
   T **gpu_input_values = reinterpret_cast(temp_ptr->ptr());
-  phi::backends::gpu::GpuMemcpyAsync(
+  backends::gpu::GpuMemcpyAsync(
       gpu_input_values,
-      phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
+      backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
          const_cast(input_data.data()), input_data.size()),
       input_data.size() * sizeof(T *),
       cudaMemcpyHostToDevice,
       stream);
   T **gpu_output_values =
       reinterpret_cast(&gpu_input_values[input_data.size()]);
-  phi::backends::gpu::GpuMemcpyAsync(
+  backends::gpu::GpuMemcpyAsync(
       gpu_output_values,
-      phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
+      backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
          const_cast(output_data.data()), output_data.size()),
       output_data.size() * sizeof(T *),
       cudaMemcpyHostToDevice,
       stream);
   T **gpu_seqpool_output_values =
       reinterpret_cast(&gpu_output_values[output_data.size()]);
-  phi::backends::gpu::GpuMemcpyAsync(
+  backends::gpu::GpuMemcpyAsync(
       gpu_seqpool_output_values,
-      phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
+      backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
          const_cast(seqpool_output_data.data()),
          seqpool_output_data.size()),
       seqpool_output_data.size() * sizeof(T *),
@@ -187,9 +187,9 @@ void FusedSeqpoolCVM(
       stream);
   size_t **lods_values = reinterpret_cast(
       &gpu_seqpool_output_values[seqpool_output_data.size()]);
-  phi::backends::gpu::GpuMemcpyAsync(
+  backends::gpu::GpuMemcpyAsync(
       lods_values,
-      phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
+      backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
          const_cast(lods.data()), lods.size()),
       lods.size() * sizeof(size_t *),
       cudaMemcpyHostToDevice,
 #endif
   size_t N = static_cast(batch_size * slot_num * embedding_size);
-  phi::backends::gpu::GpuLaunchConfig config =
-      phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, N);
+  backends::gpu::GpuLaunchConfig config =
+      backends::gpu::GetGpuLaunchConfig1D(dev_ctx, N);
   // first sum pool
   FusedSeqpoolKernelNormal<<(batch_size * slot_num * (embedding_size - cvm_offset));
-  phi::backends::gpu::GpuLaunchConfig config =
-      phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, N);
+  backends::gpu::GpuLaunchConfig config =
+      backends::gpu::GetGpuLaunchConfig1D(dev_ctx, N);
   FusedCVMKernelNoCVM<<
-using CudnnDataType = phi::backends::gpu::CudnnDataType;
+using CudnnDataType = backends::gpu::CudnnDataType;
 template
 void TransposeFlattenConcatFusionKernel(
diff --git a/paddle/phi/kernels/fusion/gpu/max_pool2d_v2_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/max_pool2d_v2_grad_kernel.cu
index 9c0eb5e1d81be9..93246925b39f13 100644
--- a/paddle/phi/kernels/fusion/gpu/max_pool2d_v2_grad_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/max_pool2d_v2_grad_kernel.cu
@@ -122,8 +122,8 @@ void MaxPoolV2GradCUDNNKernel(const Context& dev_ctx,
   auto workspace_handle = dev_ctx.cudnn_workspace_handle();
   auto layout = GetLayoutFromStr(data_format);
-  auto layout_format = phi::backends::gpu::GetCudnnTensorFormat(layout);
-  auto input_dtype = phi::backends::gpu::CudnnDataType::type;
+  auto layout_format = backends::gpu::GetCudnnTensorFormat(layout);
+  auto input_dtype = backends::gpu::CudnnDataType::type;
   auto saved_idx_dtype = CudnnIndexType::type;
   // Create plan and execute
diff --git a/paddle/phi/kernels/fusion/gpu/max_pool2d_v2_kernel.cu b/paddle/phi/kernels/fusion/gpu/max_pool2d_v2_kernel.cu
index efaca556b50cea..88b80f899cb721 100644
--- a/paddle/phi/kernels/fusion/gpu/max_pool2d_v2_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/max_pool2d_v2_kernel.cu
@@ -104,8 +104,8 @@ void MaxPoolV2CUDNNKernel(const Context& dev_ctx,
   auto workspace_handle = dev_ctx.cudnn_workspace_handle();
   auto layout = GetLayoutFromStr(data_format);
-  auto layout_format = phi::backends::gpu::GetCudnnTensorFormat(layout);
-  auto input_dtype = phi::backends::gpu::CudnnDataType::type;
+  auto layout_format = backends::gpu::GetCudnnTensorFormat(layout);
+  auto input_dtype = backends::gpu::CudnnDataType::type;
   auto saved_idx_dtype = CudnnIndexType::type;
   // Create plan and execute
diff --git a/paddle/phi/kernels/fusion/gpu/quant_dequant_kernel.h b/paddle/phi/kernels/fusion/gpu/quant_dequant_kernel.h
index b703192190992b..1e313e4f6f21ef 100644
--- a/paddle/phi/kernels/fusion/gpu/quant_dequant_kernel.h
+++ b/paddle/phi/kernels/fusion/gpu/quant_dequant_kernel.h
@@ -24,7 +24,7 @@ namespace phi {
 namespace fusion {
-using phi::backends::gpu::GpuLaunchConfig;
+using backends::gpu::GpuLaunchConfig;
 constexpr int DequantKernelVecSize = 4;
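The paddle/phi/kernels/gpu/ files below repeatedly guard host/device copies with IsCUDAGraphCapturing checks or route host pointers through RestoreHostMemIfCapturingCUDAGraph. As I read these hunks, the reason is that an async copy recorded into a CUDA graph only executes at replay time, when a transient host buffer may already be gone, and a D2H sync during capture is simply illegal. A standalone sketch of the underlying check using the plain CUDA runtime (not Paddle's wrappers) follows.

#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  std::vector<float> host(256, 1.0f);  // transient host buffer
  float* dev = nullptr;
  cudaMalloc(&dev, host.size() * sizeof(float));

  cudaStreamCaptureStatus status = cudaStreamCaptureStatusNone;
  cudaStreamIsCapturing(stream, &status);
  if (status == cudaStreamCaptureStatusNone) {
    // Not capturing: the async H2D copy runs now, while `host` is still alive.
    cudaMemcpyAsync(dev, host.data(), host.size() * sizeof(float),
                    cudaMemcpyHostToDevice, stream);
    cudaStreamSynchronize(stream);
  } else {
    // Capturing: the copy is only replayed later, so the host memory would
    // have to be kept alive (e.g. copied into a capture-owned buffer) first.
    std::printf("stream is capturing; keep the host buffer alive for replays\n");
  }

  cudaFree(dev);
  cudaStreamDestroy(stream);
  return 0;
}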
b/paddle/phi/kernels/fusion/gpu/resnet_unit_grad_kernel.cu index e1aaca0038b9f1..ba7ec1a45fdb60 100644 --- a/paddle/phi/kernels/fusion/gpu/resnet_unit_grad_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/resnet_unit_grad_kernel.cu @@ -64,7 +64,7 @@ void ResNetUnitGradKernel(const Context &dev_ctx, DenseTensor *filter_z_grad, DenseTensor *scale_z_grad, DenseTensor *bias_z_grad) { - PADDLE_ENFORCE_EQ(phi::backends::gpu::CudnnDataType::type, + PADDLE_ENFORCE_EQ(backends::gpu::CudnnDataType::type, CUDNN_DATA_HALF, common::errors::Unavailable( "ResNetUnitOp only supports float16 for now.")); diff --git a/paddle/phi/kernels/fusion/gpu/resnet_unit_kernel.cu b/paddle/phi/kernels/fusion/gpu/resnet_unit_kernel.cu index 77ccbe117f96b4..5479e96c030852 100644 --- a/paddle/phi/kernels/fusion/gpu/resnet_unit_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/resnet_unit_kernel.cu @@ -63,7 +63,7 @@ void ResNetUnitKernel(const Context &dev_ctx, DenseTensor *saved_invstd_z, DenseTensor *running_mean_z, DenseTensor *running_var_z) { - PADDLE_ENFORCE_EQ(phi::backends::gpu::CudnnDataType::type, + PADDLE_ENFORCE_EQ(backends::gpu::CudnnDataType::type, CUDNN_DATA_HALF, common::errors::Unavailable( "ResNetUnitOp only supports float16 for now.")); diff --git a/paddle/phi/kernels/gpu/add_n_kernel.cu b/paddle/phi/kernels/gpu/add_n_kernel.cu index d2cc2b6e2b02a0..1c35fd225e1cd7 100644 --- a/paddle/phi/kernels/gpu/add_n_kernel.cu +++ b/paddle/phi/kernels/gpu/add_n_kernel.cu @@ -189,10 +189,9 @@ void AddNKernel(const Context &dev_ctx, auto tmp_in_array = phi::memory_utils::Alloc( dev_ctx.GetPlace(), in_data.size() * sizeof(void *)); size_t nbytes_in = in_data.size() * sizeof(void *); - const void *stable_in = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( - reinterpret_cast(const_cast(in_data.data())), - nbytes_in); + const void *stable_in = backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + reinterpret_cast(const_cast(in_data.data())), + nbytes_in); memory_utils::Copy(dev_ctx.GetPlace(), tmp_in_array->ptr(), CPUPlace(), @@ -285,9 +284,8 @@ void AddNKernel(const Context &dev_ctx, dev_ctx.GetPlace(), sr_in_out_data.size() * sizeof(T *)); size_t nbytes_sr = sr_in_out_data.size() * sizeof(T *); - const void *stable_sr = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( - reinterpret_cast(sr_in_out_data.data()), nbytes_sr); + const void *stable_sr = backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + reinterpret_cast(sr_in_out_data.data()), nbytes_sr); memory_utils::Copy(dev_ctx.GetPlace(), tmp_sr_in_out_array->ptr(), CPUPlace(), @@ -310,10 +308,9 @@ void AddNKernel(const Context &dev_ctx, memory_utils::Alloc(dev_ctx.GetPlace(), in_data.size() * sizeof(T *)); size_t nbytes_in2 = in_data.size() * sizeof(T *); - const void *stable_in2 = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( - reinterpret_cast(const_cast(in_data.data())), - nbytes_in2); + const void *stable_in2 = backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + reinterpret_cast(const_cast(in_data.data())), + nbytes_in2); memory_utils::Copy(dev_ctx.GetPlace(), tmp_in_array->ptr(), CPUPlace(), diff --git a/paddle/phi/kernels/gpu/amp_kernel.cu b/paddle/phi/kernels/gpu/amp_kernel.cu index 22875642d68075..f65ae8303c2b89 100644 --- a/paddle/phi/kernels/gpu/amp_kernel.cu +++ b/paddle/phi/kernels/gpu/amp_kernel.cu @@ -175,9 +175,8 @@ class LazyZeros { for (int i = 0; i < xs_size; i++) { h_starts[i + 1] = h_starts[i] + outs[i]->numel(); } - auto* stable_h_starts = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(h_starts, - xs_size + 1); + 
auto* stable_h_starts = backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + h_starts, xs_size + 1); memory_utils::Copy(dev_ctx.GetPlace(), d_starts, cpu_place, @@ -199,8 +198,7 @@ class LazyZeros { h_out_addrs[i] = dev_ctx.Alloc(outs[i]); } auto* stable_h_out_addrs = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(h_out_addrs, - xs_size); + backends::gpu::RestoreHostMemIfCapturingCUDAGraph(h_out_addrs, xs_size); memory_utils::Copy(dev_ctx.GetPlace(), d_out_addrs, cpu_place, @@ -311,8 +309,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx, } int64_t total_num = h_starts[xs_size]; auto* stable_h_starts = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(h_starts, - xs_size + 1); + backends::gpu::RestoreHostMemIfCapturingCUDAGraph(h_starts, xs_size + 1); memory_utils::Copy(dev_ctx.GetPlace(), d_starts, cpu_place, @@ -337,7 +334,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx, h_outs[i] = dev_ctx.template Alloc(outs[i]); } auto* stable_h_xs = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(h_xs, 2 * xs_size); + backends::gpu::RestoreHostMemIfCapturingCUDAGraph(h_xs, 2 * xs_size); memory_utils::Copy(dev_ctx.GetPlace(), d_xs, cpu_place, diff --git a/paddle/phi/kernels/gpu/average_accumulates_kernel.cu b/paddle/phi/kernels/gpu/average_accumulates_kernel.cu index aa57e24e041271..cae023a1642eb4 100644 --- a/paddle/phi/kernels/gpu/average_accumulates_kernel.cu +++ b/paddle/phi/kernels/gpu/average_accumulates_kernel.cu @@ -70,8 +70,7 @@ void SetAccumulators(const GPUContext& dev_ctx, auto cuda_place = out_old_num_accumulates->place(); const int64_t* stable_na = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(&num_accumulates, - 1); + backends::gpu::RestoreHostMemIfCapturingCUDAGraph(&num_accumulates, 1); memory_utils::Copy(dev_ctx.GetPlace(), out_num_accumulates_ptr, CPUPlace(), @@ -79,9 +78,8 @@ void SetAccumulators(const GPUContext& dev_ctx, sizeof(int64_t), stream); - const int64_t* stable_ona = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( - &old_num_accumulates, 1); + const int64_t* stable_ona = backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + &old_num_accumulates, 1); memory_utils::Copy(dev_ctx.GetPlace(), out_old_num_accumulates_ptr, CPUPlace(), @@ -90,7 +88,7 @@ void SetAccumulators(const GPUContext& dev_ctx, stream); const int64_t* stable_nu = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(&num_updates, 1); + backends::gpu::RestoreHostMemIfCapturingCUDAGraph(&num_updates, 1); memory_utils::Copy(cuda_place, out_num_updates_ptr, CPUPlace(), diff --git a/paddle/phi/kernels/gpu/box_coder_kernel.cu b/paddle/phi/kernels/gpu/box_coder_kernel.cu index bc2ed13f307649..9d7e9a221de6fd 100644 --- a/paddle/phi/kernels/gpu/box_coder_kernel.cu +++ b/paddle/phi/kernels/gpu/box_coder_kernel.cu @@ -217,7 +217,7 @@ void BoxCoderKernel(const Context &dev_ctx, auto cplace = CPUPlace(); const auto gplace = dev_ctx.GetPlace(); const float *stable_variance = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + backends::gpu::RestoreHostMemIfCapturingCUDAGraph( const_cast(variance.data()), variance.size()); memory_utils::Copy( gplace, dev_var_data, cplace, stable_variance, bytes, dev_ctx.stream()); diff --git a/paddle/phi/kernels/gpu/check_numerics_kernel.cu b/paddle/phi/kernels/gpu/check_numerics_kernel.cu index fab45447fb5c95..00282c339c3a33 100644 --- a/paddle/phi/kernels/gpu/check_numerics_kernel.cu +++ b/paddle/phi/kernels/gpu/check_numerics_kernel.cu @@ -323,7 +323,7 @@ static char* GetGpuHintStringPtr(const GPUContext& 
dev_ctx, dev_ctx.stream())); #else const char* stable_str = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + backends::gpu::RestoreHostMemIfCapturingCUDAGraph( const_cast(iter->first.c_str()), op_var.length() + 1); PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(gpu_str_ptr, stable_str, diff --git a/paddle/phi/kernels/gpu/cholesky_kernel.cu b/paddle/phi/kernels/gpu/cholesky_kernel.cu index a74998897135e4..231597aea9d483 100644 --- a/paddle/phi/kernels/gpu/cholesky_kernel.cu +++ b/paddle/phi/kernels/gpu/cholesky_kernel.cu @@ -254,7 +254,7 @@ void CholeskyKernel(const Context& dev_ctx, #endif // check the info PADDLE_ENFORCE_EQ( - phi::backends::gpu::IsCUDAGraphCapturing(), + backends::gpu::IsCUDAGraphCapturing(), false, common::errors::InvalidArgument( "CholeskyKernel does not support CUDA Graph capture: async D2H copy " diff --git a/paddle/phi/kernels/gpu/collect_fpn_proposals_kernel.cu b/paddle/phi/kernels/gpu/collect_fpn_proposals_kernel.cu index a5ef2390d0d911..0419156609847c 100644 --- a/paddle/phi/kernels/gpu/collect_fpn_proposals_kernel.cu +++ b/paddle/phi/kernels/gpu/collect_fpn_proposals_kernel.cu @@ -238,7 +238,7 @@ void GPUCollectFpnProposalsOpKernel( GetLengthLoD<<>>( real_post_num, out_id_data, length_lod_data); PADDLE_ENFORCE_EQ( - phi::backends::gpu::IsCUDAGraphCapturing(), + backends::gpu::IsCUDAGraphCapturing(), false, common::errors::InvalidArgument( "CollectFpnProposals does not support CUDA Graph capture: async D2H " diff --git a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu index a19d4c27cc7db7..e2984db1ae7fca 100644 --- a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu +++ b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu @@ -313,7 +313,7 @@ template __device__ __forceinline__ T WarpReduceSumDown(T val) { #pragma unroll for (int offset = warpSize / 2; offset > 0; offset >>= 1) { - val += phi::backends::gpu::CudaShuffleDownSync(0xFFFFFFFF, val, offset); + val += backends::gpu::CudaShuffleDownSync(0xFFFFFFFF, val, offset); } return val; } @@ -322,7 +322,7 @@ template __device__ __forceinline__ T WarpReduceMaxDown(T val) { #pragma unroll for (int offset = warpSize / 2; offset > 0; offset >>= 1) { - T other = phi::backends::gpu::CudaShuffleDownSync(0xFFFFFFFF, val, offset); + T other = backends::gpu::CudaShuffleDownSync(0xFFFFFFFF, val, offset); val = max(val, other); } return val; @@ -1035,10 +1035,10 @@ static void SoftmaxWithCrossEntropySoftLabel(const GPUContext& dev_ctx, : MIOPEN_SOFTMAX_MODE_CHANNEL; PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSoftmaxForward_V2( handle, - phi::backends::gpu::CudnnDataType::kOne(), + backends::gpu::CudnnDataType::kOne(), descp, logits_data, - phi::backends::gpu::CudnnDataType::kZero(), + backends::gpu::CudnnDataType::kZero(), descp, softmax_data, MIOPEN_SOFTMAX_LOG, @@ -1781,10 +1781,10 @@ static void SoftmaxWithCrossEntropyHardLabel(const GPUContext& dev_ctx, : MIOPEN_SOFTMAX_MODE_CHANNEL; PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSoftmaxForward_V2( handle, - phi::backends::gpu::CudnnDataType::kOne(), + backends::gpu::CudnnDataType::kOne(), descp, logits_data, - phi::backends::gpu::CudnnDataType::kZero(), + backends::gpu::CudnnDataType::kZero(), descp, softmax_data, MIOPEN_SOFTMAX_LOG, diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h index 9a280de8315a06..cbb17c02179b14 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv.h +++ b/paddle/phi/kernels/gpu/depthwise_conv.h @@ -177,7 +177,7 @@ class DepthwiseConvFilterGradFunctor { 
template __forceinline__ __device__ T WarpReduceSum(T val, unsigned lane_mask) { for (int mask = HALF_WARP; mask > 0; mask >>= 1) - val += phi::backends::gpu::CudaShuffleDownSync(lane_mask, val, mask); + val += backends::gpu::CudaShuffleDownSync(lane_mask, val, mask); return val; } diff --git a/paddle/phi/kernels/gpu/distribute_fpn_proposals_kernel.cu b/paddle/phi/kernels/gpu/distribute_fpn_proposals_kernel.cu index 3c6d6a74e299dc..2ab6c3abe383ed 100644 --- a/paddle/phi/kernels/gpu/distribute_fpn_proposals_kernel.cu +++ b/paddle/phi/kernels/gpu/distribute_fpn_proposals_kernel.cu @@ -228,7 +228,7 @@ void DistributeFpnProposalsKernel( size_t start = 0; PADDLE_ENFORCE_EQ( - phi::backends::gpu::IsCUDAGraphCapturing(), + backends::gpu::IsCUDAGraphCapturing(), false, common::errors::InvalidArgument( "DistributeFpnProposals does not support CUDA Graph capture: async " diff --git a/paddle/phi/kernels/gpu/edit_distance_kernel.cu b/paddle/phi/kernels/gpu/edit_distance_kernel.cu index af3c25ef1251ed..94ac3aaa626822 100644 --- a/paddle/phi/kernels/gpu/edit_distance_kernel.cu +++ b/paddle/phi/kernels/gpu/edit_distance_kernel.cu @@ -143,7 +143,7 @@ void EditDistanceKernel(const Context& dev_ctx, distance = distance / n; } const T* stable_dist = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(&distance, 1); + backends::gpu::RestoreHostMemIfCapturingCUDAGraph(&distance, 1); memory_utils::Copy(dev_ctx.GetPlace(), out_data + num, CPUPlace(), diff --git a/paddle/phi/kernels/gpu/eig_grad_kernel.cu b/paddle/phi/kernels/gpu/eig_grad_kernel.cu index 89ee3f6a347ff3..2c0ac79014a49e 100644 --- a/paddle/phi/kernels/gpu/eig_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/eig_grad_kernel.cu @@ -213,7 +213,7 @@ void SolveLinearSystemGPU>( std::vector h_info(batch_count, 0); PADDLE_ENFORCE_EQ( - phi::backends::gpu::IsCUDAGraphCapturing(), + backends::gpu::IsCUDAGraphCapturing(), false, common::errors::InvalidArgument( "EigGradKernel does not support CUDA Graph capture: async D2H copy " @@ -402,7 +402,7 @@ void SolveLinearSystemGPU>( std::vector h_info(batch_count, 0); PADDLE_ENFORCE_EQ( - phi::backends::gpu::IsCUDAGraphCapturing(), + backends::gpu::IsCUDAGraphCapturing(), false, common::errors::InvalidArgument( "EigGradKernel does not support CUDA Graph capture: async D2H copy " diff --git a/paddle/phi/kernels/gpu/fill_diagonal_tensor_grad_kernel.cu b/paddle/phi/kernels/gpu/fill_diagonal_tensor_grad_kernel.cu index f266a2bbb5596c..b59fe1c70bd30d 100644 --- a/paddle/phi/kernels/gpu/fill_diagonal_tensor_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/fill_diagonal_tensor_grad_kernel.cu @@ -77,7 +77,7 @@ void FillDiagonalTensorGradKernel(const Context &dev_ctx, tensor_tmp.Resize({2 + matrows}); int64_t *memory_block_cu = dev_ctx.template Alloc(&tensor_tmp); const auto gpu_place = dev_ctx.GetPlace(); - auto *stable_mb = phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + auto *stable_mb = backends::gpu::RestoreHostMemIfCapturingCUDAGraph( memory_block.data(), memory_block.size()); memory_utils::Copy(gpu_place, memory_block_cu, diff --git a/paddle/phi/kernels/gpu/fill_diagonal_tensor_kernel.cu b/paddle/phi/kernels/gpu/fill_diagonal_tensor_kernel.cu index 82d65405e9308a..716177eca199d4 100644 --- a/paddle/phi/kernels/gpu/fill_diagonal_tensor_kernel.cu +++ b/paddle/phi/kernels/gpu/fill_diagonal_tensor_kernel.cu @@ -93,7 +93,7 @@ void FillDiagonalTensorKernel(const Context &dev_ctx, tensor_tmp.Resize({2 + fill_dims[0]}); int64_t *memory_block_cu = dev_ctx.template Alloc(&tensor_tmp); const auto gpu_place = 
dev_ctx.GetPlace(); - auto *stable_mb = phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + auto *stable_mb = backends::gpu::RestoreHostMemIfCapturingCUDAGraph( memory_block.data(), memory_block.size()); memory_utils::Copy(gpu_place, memory_block_cu, diff --git a/paddle/phi/kernels/gpu/generate_proposals_kernel.cu b/paddle/phi/kernels/gpu/generate_proposals_kernel.cu index bed31526f6ccb7..c3713f8aa66cfa 100644 --- a/paddle/phi/kernels/gpu/generate_proposals_kernel.cu +++ b/paddle/phi/kernels/gpu/generate_proposals_kernel.cu @@ -306,7 +306,7 @@ static void NMS(const GPUContext &dev_ctx, memset(&remv[0], 0, sizeof(uint64_t) * col_blocks); PADDLE_ENFORCE_EQ( - phi::backends::gpu::IsCUDAGraphCapturing(), + backends::gpu::IsCUDAGraphCapturing(), false, common::errors::InvalidArgument( "GenerateProposals does not support CUDA Graph capture: async D2H " @@ -338,9 +338,8 @@ static void NMS(const GPUContext &dev_ctx, } keep_out->Resize({num_to_keep}); int *keep = dev_ctx.template Alloc(keep_out); - const int *stable_keep = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( - const_cast(keep_vec.data()), keep_vec.size()); + const int *stable_keep = backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + const_cast(keep_vec.data()), keep_vec.size()); memory_utils::Copy(place, keep, CPUPlace(), @@ -582,9 +581,8 @@ void GenerateProposalsKernel(const Context &dev_ctx, rpn_rois_num->Resize({num}); dev_ctx.template Alloc(rpn_rois_num); int *num_data = rpn_rois_num->data(); - const int *stable_num = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( - const_cast(tmp_num.data()), num); + const int *stable_num = backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + const_cast(tmp_num.data()), num); memory_utils::Copy(place, num_data, cpu_place, diff --git a/paddle/phi/kernels/gpu/index_fill_kernel.cu b/paddle/phi/kernels/gpu/index_fill_kernel.cu index e525c8af330e38..6c5e2053614623 100644 --- a/paddle/phi/kernels/gpu/index_fill_kernel.cu +++ b/paddle/phi/kernels/gpu/index_fill_kernel.cu @@ -99,7 +99,7 @@ void LaunchIndexFillCudaKernelImpl(const Context& dev_ctx, T fill_value, T* out_data) { IndexT numel = outer_size * index_size * inner_size; - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel); IndexFillCudaKernel <<>>( x_data, diff --git a/paddle/phi/kernels/gpu/masked_scatter_grad_kernel.cu b/paddle/phi/kernels/gpu/masked_scatter_grad_kernel.cu index ab855097866552..e5c1785834e079 100644 --- a/paddle/phi/kernels/gpu/masked_scatter_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/masked_scatter_grad_kernel.cu @@ -109,7 +109,7 @@ void MaskedScatterGradKernel(const Context& dev_ctx, // Compute x_grad if (x_grad) { auto x_grad_dims = x_grad->dims(); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, total); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, total); if (x_grad_dims == out_grad_dims) { // No broadcast happened, compute directly into x_grad. 
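
Reviewer note, not part of the patch: every hunk in this series applies the same mechanical rewrite, dropping the leading "phi::" from names such as phi::backends::gpu::GetGpuLaunchConfig1D in code that already lives inside namespace phi. The short, self-contained C++ sketch below illustrates why the shorter qualification resolves to the same symbol; the GetGpuLaunchConfig1D stand-in here is a dummy with an invented signature, used purely for illustration and unrelated to the real helper.

// The leading component of a qualified name is looked up outward through the
// enclosing scopes, so inside namespace phi (or phi::fusion) the name
// backends::gpu::Foo finds phi::backends::gpu::Foo.
#include <cstdio>

namespace phi {
namespace backends {
namespace gpu {
// Stand-in for the real helper; this signature is invented for the sketch only.
inline int GetGpuLaunchConfig1D(int n) { return n; }
}  // namespace gpu
}  // namespace backends

namespace fusion {
inline int Demo(int n) {
  int a = phi::backends::gpu::GetGpuLaunchConfig1D(n);  // spelling removed by the patch
  int b = backends::gpu::GetGpuLaunchConfig1D(n);       // spelling introduced by the patch
  return a + b;  // both calls bind to the same function
}
}  // namespace fusion
}  // namespace phi

int main() { std::printf("%d\n", phi::fusion::Demo(21)); }

Because lookup of the first name component proceeds through the enclosing namespaces, the rewrite is behavior-preserving: it only shortens the spelling.
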
@@ -180,7 +180,7 @@ void MaskedScatterGradKernel(const Context& dev_ctx, } } - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, total); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, total); MaskedScatterGradValueKernel <<>>( out_grad.data(), diff --git a/paddle/phi/kernels/gpu/masked_scatter_kernel.cu b/paddle/phi/kernels/gpu/masked_scatter_kernel.cu index 1fd78187d6d361..23b25213a0ac21 100644 --- a/paddle/phi/kernels/gpu/masked_scatter_kernel.cu +++ b/paddle/phi/kernels/gpu/masked_scatter_kernel.cu @@ -155,7 +155,7 @@ void MaskedScatterKernel(const Context& dev_ctx, &prefix_sum_data[total - 1], &mask_bool_data[total - 1], value.numel()); // Launch masked scatter kernel - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, total); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, total); MaskedScatterCUDAKernel <<>>( x_expand.data(), diff --git a/paddle/phi/kernels/gpu/moe_permute_kernel.cu b/paddle/phi/kernels/gpu/moe_permute_kernel.cu index 9c0d0dacb56fcc..dff267774ec22e 100644 --- a/paddle/phi/kernels/gpu/moe_permute_kernel.cu +++ b/paddle/phi/kernels/gpu/moe_permute_kernel.cu @@ -583,9 +583,8 @@ void dispatch_preprocess(const Context &dev_ctx, padding_tokens_tensor.Resize({static_cast(padding_rows.size())}); dev_ctx.template Alloc(&padding_tokens_tensor); - auto *stable_padding_rows = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( - const_cast(padding_rows.data()), padding_rows.size()); + auto *stable_padding_rows = backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + const_cast(padding_rows.data()), padding_rows.size()); PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(padding_tokens_tensor.data(), stable_padding_rows, sizeof(int) * padding_rows.size(), @@ -800,16 +799,16 @@ void MoePermuteKernel(const Context &dev_ctx, } } auto *stable_expert_offset = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(expert_offset, - kMaxNumExperts); + backends::gpu::RestoreHostMemIfCapturingCUDAGraph(expert_offset, + kMaxNumExperts); PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(expert_offset_tensor.data(), stable_expert_offset, sizeof(int) * kMaxNumExperts, cudaMemcpyHostToDevice, dev_ctx.stream())); auto *stable_expert_offset_end = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( - expert_offset_end, kMaxNumExperts); + backends::gpu::RestoreHostMemIfCapturingCUDAGraph(expert_offset_end, + kMaxNumExperts); PADDLE_ENFORCE_GPU_SUCCESS( cudaMemcpyAsync(expert_offset_end_tensor.data(), stable_expert_offset_end, diff --git a/paddle/phi/kernels/gpu/nms_kernel.cu b/paddle/phi/kernels/gpu/nms_kernel.cu index 2a09c0f3658c42..f68448980dfb07 100644 --- a/paddle/phi/kernels/gpu/nms_kernel.cu +++ b/paddle/phi/kernels/gpu/nms_kernel.cu @@ -83,7 +83,7 @@ void NMSKernel(const Context& dev_ctx, NMS<<>>( boxes.data(), threshold, num_boxes, mask_dev); PADDLE_ENFORCE_EQ( - phi::backends::gpu::IsCUDAGraphCapturing(), + backends::gpu::IsCUDAGraphCapturing(), false, common::errors::InvalidArgument( "NMSKernel does not support CUDA Graph capture: async D2H copy to " @@ -115,8 +115,8 @@ void NMSKernel(const Context& dev_ctx, output->Resize({last_box_num}); auto* output_data = dev_ctx.template Alloc(output); const int64_t* stable_output = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(output_host, - last_box_num); + backends::gpu::RestoreHostMemIfCapturingCUDAGraph(output_host, + last_box_num); memory_utils::Copy(dev_ctx.GetPlace(), output_data, CPUPlace(), diff --git a/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu 
b/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu index e9f04b0f812dac..7b2170f3152df6 100644 --- a/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu @@ -440,7 +440,7 @@ void PNormGradKernel(const Context& dev_ctx, GetPreAxisPost(xdim, axis, reduce_all, &pre, &axis_size, &post); int64_t total = in_x->numel(); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, total); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, total); PNormGradP1Kernel<<numel(); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, total); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, total); PNormGradP2Kernel<<numel(); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, total); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, total); PNormGradPLessThan1Kernel<<numel(); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, total); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, total); PNormGradPBetween1And2Kernel<<numel(); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, total); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, total); PNormGradPGreaterThan2Kernel<<(dev_ctx.stream()))); size_t nbytes_out = out_data.size() * sizeof(T *); - const void *stable_out = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( - reinterpret_cast(const_cast(out_data.data())), - nbytes_out); + const void *stable_out = backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + reinterpret_cast(const_cast(out_data.data())), + nbytes_out); phi::memory_utils::Copy(dev_ctx.GetPlace(), tmp_out_array->ptr(), CPUPlace(), diff --git a/paddle/phi/kernels/gpu/partial_concat_kernel.cu b/paddle/phi/kernels/gpu/partial_concat_kernel.cu index a7ee5838b8d82f..8bef20f4422556 100644 --- a/paddle/phi/kernels/gpu/partial_concat_kernel.cu +++ b/paddle/phi/kernels/gpu/partial_concat_kernel.cu @@ -109,10 +109,8 @@ void PartialConcatOpCUDAKernel(const Context &dev_ctx, in_data.size() * sizeof(T *), phi::Stream(reinterpret_cast(dev_ctx.stream()))); size_t nbytes_in = in_data.size() * sizeof(T *); - const void *stable_in = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( - reinterpret_cast(const_cast(in_data.data())), - nbytes_in); + const void *stable_in = backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + reinterpret_cast(const_cast(in_data.data())), nbytes_in); phi::memory_utils::Copy(dev_ctx.GetPlace(), tmp_in_array->ptr(), CPUPlace(), diff --git a/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu b/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu index ad0e9c2bcc6d69..744a4713074b1f 100644 --- a/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu @@ -254,7 +254,7 @@ void RoiAlignGradKernel(const Context& dev_ctx, int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); int64_t bytes = box_batch_id_list.numel() * sizeof(int); const int* stable_box_batch_size = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + backends::gpu::RestoreHostMemIfCapturingCUDAGraph( box_batch_size, static_cast(bytes / sizeof(int))); memory_utils::Copy(gplace, roi_id_data, diff --git a/paddle/phi/kernels/gpu/roi_align_kernel.cu b/paddle/phi/kernels/gpu/roi_align_kernel.cu index a6f2bb15759ad9..d34ab1bf471439 100644 --- a/paddle/phi/kernels/gpu/roi_align_kernel.cu +++ b/paddle/phi/kernels/gpu/roi_align_kernel.cu @@ -268,7 +268,7 @@ void RoiAlignKernel(const Context& dev_ctx, Stream(reinterpret_cast(dev_ctx.stream()))); int* roi_id_data = 
reinterpret_cast(roi_ptr->ptr()); const int* stable_roi_batch_id = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + backends::gpu::RestoreHostMemIfCapturingCUDAGraph( roi_batch_id_data, static_cast(bytes / sizeof(int))); memory_utils::Copy(gplace, roi_id_data, diff --git a/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu b/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu index f20398b971f06a..180ca66227e039 100644 --- a/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu @@ -139,7 +139,7 @@ void RoiPoolGradKernel(const Context& dev_ctx, Stream(reinterpret_cast(dev_ctx.stream()))); int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); const int* stable_box_batch_id = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + backends::gpu::RestoreHostMemIfCapturingCUDAGraph( box_batch_id_data, static_cast(bytes / sizeof(int))); memory_utils::Copy(gplace, roi_id_data, diff --git a/paddle/phi/kernels/gpu/roi_pool_kernel.cu b/paddle/phi/kernels/gpu/roi_pool_kernel.cu index e3635cf6fe1877..a347a25247f093 100644 --- a/paddle/phi/kernels/gpu/roi_pool_kernel.cu +++ b/paddle/phi/kernels/gpu/roi_pool_kernel.cu @@ -202,7 +202,7 @@ void RoiPoolKernel(const Context& dev_ctx, Stream(reinterpret_cast(dev_ctx.stream()))); int* box_id_data = reinterpret_cast(box_ptr->ptr()); const int* stable_box_batch_id = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + backends::gpu::RestoreHostMemIfCapturingCUDAGraph( box_batch_id_data, static_cast(bytes / sizeof(int))); memory_utils::Copy(gplace, box_id_data, diff --git a/paddle/phi/kernels/gpu/seed_kernel.cu b/paddle/phi/kernels/gpu/seed_kernel.cu index d7b8afc95df46e..b5e7caaf8c1d89 100644 --- a/paddle/phi/kernels/gpu/seed_kernel.cu +++ b/paddle/phi/kernels/gpu/seed_kernel.cu @@ -44,7 +44,7 @@ void GPUSeedKernel(const Context &dev_ctx, auto *out_data = dev_ctx.template Alloc(out); auto stream = dev_ctx.stream(); const int *stable_seed = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(&seed, 1); + backends::gpu::RestoreHostMemIfCapturingCUDAGraph(&seed, 1); phi::memory_utils::Copy(dev_ctx.GetPlace(), out_data, CPUPlace(), diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu index d07f3db4ac43cc..eb08365e9bb306 100644 --- a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu @@ -140,7 +140,7 @@ void SigmoidCrossEntropyWithLogitsGradKernel( dev_ctx, counts_tensor, &norm_tensor, NonzeroFunctor(), reduce_dim); T *norm = dev_ctx.template Alloc(&norm_tensor); PADDLE_ENFORCE_EQ( - phi::backends::gpu::IsCUDAGraphCapturing(), + backends::gpu::IsCUDAGraphCapturing(), false, common::errors::InvalidArgument( "SigmoidCrossEntropyWithLogitsGrad does not support CUDA Graph " diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu index 2fa7b2a5f6a69e..34f24d356337a7 100644 --- a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu +++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu @@ -135,7 +135,7 @@ void SigmoidCrossEntropyWithLogitsKernel( dev_ctx, counts_tensor, &norm_tensor, NonzeroFunctor(), reduce_dim); T *norm = dev_ctx.template Alloc(&norm_tensor); PADDLE_ENFORCE_EQ( - phi::backends::gpu::IsCUDAGraphCapturing(), + backends::gpu::IsCUDAGraphCapturing(), false, 
common::errors::InvalidArgument( "SigmoidCrossEntropyWithLogits does not support CUDA Graph " diff --git a/paddle/phi/kernels/gpu/slogdeterminant_kernel.cu b/paddle/phi/kernels/gpu/slogdeterminant_kernel.cu index cd769520d60336..13c1b5624107a8 100644 --- a/paddle/phi/kernels/gpu/slogdeterminant_kernel.cu +++ b/paddle/phi/kernels/gpu/slogdeterminant_kernel.cu @@ -149,7 +149,7 @@ struct SlogDeterminantFunctor, Context> { phi::Stream(reinterpret_cast(dev_ctx.stream()))); size_t nbytes_ptrs_c1 = cpu_ptrs.size() * sizeof(phi::dtype::complex*); const void* stable_ptrs_c1 = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + backends::gpu::RestoreHostMemIfCapturingCUDAGraph( reinterpret_cast( const_cast**>(cpu_ptrs.data())), nbytes_ptrs_c1); @@ -339,7 +339,7 @@ struct SlogDeterminantV2Functor { phi::Stream(reinterpret_cast(dev_ctx.stream()))); size_t nbytes_ptrs_v2 = cpu_ptrs.size() * sizeof(T*); const void* stable_ptrs_v2 = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + backends::gpu::RestoreHostMemIfCapturingCUDAGraph( reinterpret_cast(const_cast(cpu_ptrs.data())), nbytes_ptrs_v2); memory_utils::Copy(dev_ctx.GetPlace(), @@ -492,7 +492,7 @@ struct SlogDeterminantV2Functor, Context> { phi::Stream(reinterpret_cast(dev_ctx.stream()))); size_t nbytes_ptrs_v2c = cpu_ptrs.size() * sizeof(phi::dtype::complex*); const void* stable_ptrs_v2c = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + backends::gpu::RestoreHostMemIfCapturingCUDAGraph( reinterpret_cast( const_cast**>(cpu_ptrs.data())), nbytes_ptrs_v2c); diff --git a/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu b/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu index 80378c9f9af1ab..6993c6731ae9aa 100644 --- a/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu +++ b/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu @@ -240,9 +240,9 @@ template __forceinline__ __device__ Pair WarpReduce(Pair input) { #pragma unroll for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1) { - T tmp_val = phi::backends::gpu::CudaShuffleDownSync( + T tmp_val = backends::gpu::CudaShuffleDownSync( FINAL_MASK, input.v, offset, WARP_SIZE); - int tmp_id = phi::backends::gpu::CudaShuffleDownSync( + int tmp_id = backends::gpu::CudaShuffleDownSync( FINAL_MASK, input.id, offset, WARP_SIZE); if (static_cast(input.v) < static_cast(tmp_val)) { input.v = tmp_val; diff --git a/paddle/phi/kernels/gpu/triangular_solve_kernel.cu b/paddle/phi/kernels/gpu/triangular_solve_kernel.cu index 3dbf561c09cf46..e09f51421ec3e7 100644 --- a/paddle/phi/kernels/gpu/triangular_solve_kernel.cu +++ b/paddle/phi/kernels/gpu/triangular_solve_kernel.cu @@ -114,7 +114,7 @@ void TriangularSolveKernel(const Context& dev_ctx, phi::Stream(reinterpret_cast(dev_ctx.stream()))); size_t nbytes_a_ptrs = cpu_a_ptrs.size() * sizeof(T*); const void* stable_a_ptrs = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + backends::gpu::RestoreHostMemIfCapturingCUDAGraph( reinterpret_cast(const_cast(cpu_a_ptrs.data())), nbytes_a_ptrs); memory_utils::Copy(dev_ctx.GetPlace(), @@ -143,7 +143,7 @@ void TriangularSolveKernel(const Context& dev_ctx, } size_t nbytes_b_ptrs = cpu_b_ptrs_for_chunk.size() * sizeof(T*); const void* stable_b_ptrs = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + backends::gpu::RestoreHostMemIfCapturingCUDAGraph( reinterpret_cast(cpu_b_ptrs_for_chunk.data()), nbytes_b_ptrs); memory_utils::Copy(dev_ctx.GetPlace(), @@ -180,7 +180,7 @@ void TriangularSolveKernel(const Context& dev_ctx, size_t nbytes_ptrs = cpu_ptrs.size() * sizeof(T*); const void* 
stable_ptrs = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + backends::gpu::RestoreHostMemIfCapturingCUDAGraph( reinterpret_cast(const_cast(cpu_ptrs.data())), nbytes_ptrs); memory_utils::Copy(dev_ctx.GetPlace(), diff --git a/paddle/phi/kernels/gpu/yolo_box_kernel.cu b/paddle/phi/kernels/gpu/yolo_box_kernel.cu index f05863f2aa9ea1..18e2083b846c71 100644 --- a/paddle/phi/kernels/gpu/yolo_box_kernel.cu +++ b/paddle/phi/kernels/gpu/yolo_box_kernel.cu @@ -142,9 +142,8 @@ void YoloBoxKernel(const Context& dev_ctx, int* anchors_data = dev_ctx.template Alloc(&tmp_anchors); const auto gplace = dev_ctx.GetPlace(); const auto cplace = CPUPlace(); - const int* stable_anchors = - phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( - const_cast(anchors.data()), anchors.size()); + const int* stable_anchors = backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + const_cast(anchors.data()), anchors.size()); memory_utils::Copy( gplace, anchors_data, cplace, stable_anchors, bytes, dev_ctx.stream()); diff --git a/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu b/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu index 94a2e1e6884f03..8634e64ffe3867 100644 --- a/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu @@ -53,14 +53,14 @@ void CoalesceCooGPUKernel(const GPUContext& dev_ctx, dev_ctx, DenseTensorMeta(x_indices.dtype(), {nnz}, x_indices.layout())); IntT* indices_ptr = indices.data(); - phi::backends::gpu::GpuMemcpyAsync(d_sparse_offsets.data(), - sparse_offsets.data(), - sizeof(IntT) * sparse_dim, - gpuMemcpyHostToDevice, - dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(d_sparse_offsets.data(), + sparse_offsets.data(), + sizeof(IntT) * sparse_dim, + gpuMemcpyHostToDevice, + dev_ctx.stream()); // 1. flatten indices - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, nnz, 1); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, nnz, 1); funcs::sparse::FlattenIndicesKernel<<()); IntT out_nnz = 0; - phi::backends::gpu::GpuMemcpyAsync(&out_nnz, - out_indices.data(), - sizeof(IntT), - gpuMemcpyDeviceToHost, - dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(&out_nnz, + out_indices.data(), + sizeof(IntT), + gpuMemcpyDeviceToHost, + dev_ctx.stream()); dev_ctx.Wait(); out_indices.Resize({x_indices.dims()[0], out_nnz}); @@ -128,8 +128,8 @@ void CoalesceCooGPUKernel(const GPUContext& dev_ctx, // 5. 
scatter the values const int VecSize = VecBytes / sizeof(T); if (stride % VecSize == 0) { - config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, nnz * stride / VecSize, 1); + config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, nnz * stride / VecSize, 1); funcs::sparse::ScatterKernel <<()); } else { - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, nnz * stride, 1); + config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, nnz * stride, 1); funcs::sparse::ScatterKernel << <<>>(inputs, indices, output, indices_size, channels); } else { - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + auto config = backends::gpu::GetGpuLaunchConfig1D( dev_ctx, indices_size * channels, 1); GatherKernel <<<<<<numel(), - dev_ctx.stream()); - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); + backends::gpu::GpuMemsetAsync(counter_ptr, + 0, + sizeof(int) * counter_per_kernel->numel(), + dev_ctx.stream()); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); const int rulebook_rows = 2; const int rulebook_cols = kernel_size * non_zero_num; @@ -516,7 +515,7 @@ int ProductRuleBook(const Context& dev_ctx, int index_flags_size = (table_size + 31) / 32; DenseTensor index_flags = Empty(dev_ctx, {index_flags_size}); int* index_flags_ptr = index_flags.data(); - phi::backends::gpu::GpuMemsetAsync( + backends::gpu::GpuMemsetAsync( index_flags_ptr, 0, sizeof(int) * index_flags.numel(), dev_ctx.stream()); if (subm) { @@ -528,8 +527,7 @@ int ProductRuleBook(const Context& dev_ctx, phi::Copy(dev_ctx, x.indices(), dev_ctx.GetPlace(), false, &out_indices); - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); GetOutIndexTable1<<(dev_ctx, {rulebook_rows, rulebook_len}); IntT* out_rulebook_ptr = out_rulebook.data(); - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); + config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); cache_size = kernel_size * 2 * sizeof(int); CopyRuleBook<<data(); int* unique_key_ptr = unique_key.data(); - phi::backends::gpu::GpuMemsetAsync( + backends::gpu::GpuMemsetAsync( unique_key_ptr, 0, sizeof(int), dev_ctx.stream()); - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); + config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); size_t cache_size = sizeof(int) * config.thread_per_block.x; UniqueKernel<<(); - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_nnz, 1); + config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_nnz, 1); GetOutIndexTable<<ResizeAndAllocate({static_cast(out_nnz * kernel_size)}); int* unique_value_ptr = unique_value->data(); diff --git a/paddle/phi/kernels/sparse/gpu/conv_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/conv_grad_kernel.cu index 26dfd82779d65e..ba53f46058e595 100644 --- a/paddle/phi/kernels/sparse/gpu/conv_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/conv_grad_kernel.cu @@ -79,7 +79,7 @@ void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx, if (!is_params_freezing) { *kernel_grad = EmptyLike(dev_ctx, kernel); d_kernel_ptr = kernel_grad->data(); - phi::backends::gpu::GpuMemsetAsync( + backends::gpu::GpuMemsetAsync( d_kernel_ptr, 0, sizeof(T) * kernel_grad->numel(), dev_ctx.stream()); } @@ -88,11 +88,11 @@ void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx, DenseTensor x_grad_indices = EmptyLike(dev_ctx, x.indices()); DenseTensor x_grad_values = 
EmptyLike(dev_ctx, x.values()); T* x_grad_values_ptr = x_grad_values.data(); - phi::backends::gpu::GpuMemsetAsync(x_grad_values_ptr, - 0, - sizeof(T) * x_grad_values.numel(), - dev_ctx.stream()); - phi::backends::gpu::GpuMemsetAsync( + backends::gpu::GpuMemsetAsync(x_grad_values_ptr, + 0, + sizeof(T) * x_grad_values.numel(), + dev_ctx.stream()); + backends::gpu::GpuMemsetAsync( d_x_features_ptr, 0, sizeof(T) * d_x_features.numel(), dev_ctx.stream()); phi::Copy( dev_ctx, x.indices(), dev_ctx.GetPlace(), false, &x_grad_indices); @@ -125,14 +125,13 @@ void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx, } } - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); DenseTensor unique_value = Empty(dev_ctx, {static_cast(x_grad->nnz() * kernel_size * 2)}); DenseTensor out_index = Empty(dev_ctx, {static_cast(x.nnz() * 2)}); int* out_index_ptr = out_index.data(); int* unique_value_ptr = unique_value.data(); - phi::backends::gpu::GpuMemsetAsync( + backends::gpu::GpuMemsetAsync( out_index_ptr, 0, sizeof(int) * x.nnz() * 2, dev_ctx.stream()); GroupIndicesV2<<(out->nnz() * kernel_size)}); out_index.ResizeAndAllocate({static_cast(rulebook_len)}); int* out_index_ptr = out_index.data(); int* unique_value_ptr = unique_value.data(); - phi::backends::gpu::GpuMemsetAsync( + backends::gpu::GpuMemsetAsync( out_index_ptr, 0, sizeof(int) * rulebook_len, dev_ctx.stream()); GroupIndices<<(), - out.data(), - sizeof(T) * n, - gpuMemcpyDeviceToDevice, - dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(input.data(), + out.data(), + sizeof(T) * n, + gpuMemcpyDeviceToDevice, + dev_ctx.stream()); } template @@ -381,8 +381,7 @@ int ProductRuleBookWithBuffer(const Context& dev_ctx, int* h_buffer) { DenseTensor d_buffer = Empty(dev_ctx, {2 * kernel_size + 3}); const bool is2D = out_dims.size() == 4 ? 
true : false; - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); ProductRuleBookKernel<<data(); int* unique_key_ptr = unique_key.data(); - phi::backends::gpu::GpuMemsetAsync( + backends::gpu::GpuMemsetAsync( unique_key_ptr, 0, sizeof(int), dev_ctx.stream()); - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, max_nnz, 1); + config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, max_nnz, 1); size_t cache_size = sizeof(int) * config.thread_per_block.x; int* index_flags_ptr = index_flags->data(); UniqueKernel<<(), - counter_ptr, - kernel_size * sizeof(int), - gpuMemcpyDeviceToDevice, - dev_ctx.stream()); - phi::backends::gpu::GpuMemcpyAsync(d_buffer.data() + kernel_size, - offsets_ptr, - kernel_size * sizeof(int), - gpuMemcpyDeviceToDevice, - dev_ctx.stream()); - phi::backends::gpu::GpuMemcpyAsync(d_buffer.data() + 2 * kernel_size + 1, - rulebook_len_tensor.data(), - sizeof(int), - gpuMemcpyDeviceToDevice, - dev_ctx.stream()); - phi::backends::gpu::GpuMemcpyAsync(d_buffer.data() + 2 * kernel_size + 2, - unique_key_ptr, - sizeof(int), - gpuMemcpyDeviceToDevice, - dev_ctx.stream()); - phi::backends::gpu::GpuMemcpyAsync(h_buffer, - d_buffer.data(), - (2 * kernel_size + 3) * sizeof(int), - gpuMemcpyDeviceToHost, - dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(d_buffer.data(), + counter_ptr, + kernel_size * sizeof(int), + gpuMemcpyDeviceToDevice, + dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(d_buffer.data() + kernel_size, + offsets_ptr, + kernel_size * sizeof(int), + gpuMemcpyDeviceToDevice, + dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(d_buffer.data() + 2 * kernel_size + 1, + rulebook_len_tensor.data(), + sizeof(int), + gpuMemcpyDeviceToDevice, + dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(d_buffer.data() + 2 * kernel_size + 2, + unique_key_ptr, + sizeof(int), + gpuMemcpyDeviceToDevice, + dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(h_buffer, + d_buffer.data(), + (2 * kernel_size + 3) * sizeof(int), + gpuMemcpyDeviceToHost, + dev_ctx.stream()); dev_ctx.Wait(); int rulebook_len = h_buffer[2 * kernel_size + 1] / 2; @@ -512,7 +511,7 @@ int ProductRuleBookWithBuffer(const Context& dev_ctx, IntT* out_indices_ptr = out_indices.data(); - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_nnz, 1); + config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_nnz, 1); GetOutIndexTable <<>>( out_index_ptr, @@ -522,7 +521,7 @@ int ProductRuleBookWithBuffer(const Context& dev_ctx, out_index_table_ptr, out_indices_ptr); - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); + config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); unique_value->ResizeAndAllocate({static_cast(out_nnz * kernel_size)}); int* unique_value_ptr = unique_value->data(); diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index a63a75dd670047..07fc90eb946982 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -73,15 +73,15 @@ inline IntT* SortedAndUniqueIndex(const Context& dev_ctx, phi::IndexKernel>( dev_ctx, unique_value, kps::IdentityFunctor()); - phi::backends::gpu::GpuMemcpyAsync(unique_key->data(), - rulebook_ptr, - sizeof(IntT) * len, + backends::gpu::GpuMemcpyAsync(unique_key->data(), + rulebook_ptr, + sizeof(IntT) * len, #ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToDevice, + hipMemcpyDeviceToDevice, #else - 
cudaMemcpyDeviceToDevice, + cudaMemcpyDeviceToDevice, #endif - dev_ctx.stream()); + dev_ctx.stream()); // compared with thrust::sort_by_key, thrust::merge_by_key may achieved higher // performance, but thrust::merge_by_key limited by data size #ifdef PADDLE_WITH_HIP @@ -325,8 +325,7 @@ int ProductRuleBook(const Context& dev_ctx, // 1. product rule book funcs::SetConstant set_zero; set_zero(dev_ctx, counter_per_kernel, 0); - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); ProductRuleBookKernel<<<<<1, 1, 0, dev_ctx.stream()>>>( rulebook_ptr, last, rulebook_ptr + 3 * kernel_size * non_zero_num - 1); IntT rulebook_len = 0; - phi::backends::gpu::GpuMemcpyAsync( + backends::gpu::GpuMemcpyAsync( &rulebook_len, rulebook_ptr + 3 * kernel_size * non_zero_num - 1, sizeof(IntT), @@ -396,7 +395,7 @@ int ProductRuleBook(const Context& dev_ctx, out_indices_ptr + rulebook_len, bound_ptr); - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); + config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); UpdateOutIndexAndCounterAfterLowerBound<< <<<1, 1, 0, dev_ctx.stream()>>>(rulebook_ptr, last, bound_ptr); - phi::backends::gpu::GpuMemcpyAsync(&rulebook_len, - bound_ptr, - sizeof(IntT), + backends::gpu::GpuMemcpyAsync(&rulebook_len, + bound_ptr, + sizeof(IntT), #ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, + hipMemcpyDeviceToHost, #else - cudaMemcpyDeviceToHost, + cudaMemcpyDeviceToHost, #endif - dev_ctx.stream()); + dev_ctx.stream()); dev_ctx.Wait(); rulebook_len /= 3; } @@ -444,25 +443,25 @@ int ProductRuleBook(const Context& dev_ctx, counter_ptr + kernel_size, offsets_ptr); - phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], - counter_ptr, - kernel_size * sizeof(int), + backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], + counter_ptr, + kernel_size * sizeof(int), #ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, + hipMemcpyDeviceToHost, #else - cudaMemcpyDeviceToHost, + cudaMemcpyDeviceToHost, #endif - dev_ctx.stream()); + dev_ctx.stream()); - phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], - offsets_ptr, - kernel_size * sizeof(int), + backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], + offsets_ptr, + kernel_size * sizeof(int), #ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, + hipMemcpyDeviceToHost, #else - cudaMemcpyDeviceToHost, + cudaMemcpyDeviceToHost, #endif - dev_ctx.stream()); + dev_ctx.stream()); rulebook->Resize({rulebook_rows, static_cast(rulebook_len)}); @@ -494,14 +493,14 @@ int ProductRuleBook(const Context& dev_ctx, rulebook_ptr + rulebook_rows * rulebook_cols - 1); IntT out_non_zero_num = 0; #ifdef PADDLE_WITH_HIP - phi::backends::gpu::GpuMemcpyAsync( + backends::gpu::GpuMemcpyAsync( &out_non_zero_num, rulebook_ptr + rulebook_rows * rulebook_cols - 1, sizeof(IntT), hipMemcpyDeviceToHost, dev_ctx.stream()); #else - phi::backends::gpu::GpuMemcpyAsync( + backends::gpu::GpuMemcpyAsync( &out_non_zero_num, rulebook_ptr + rulebook_rows * rulebook_cols - 1, sizeof(IntT), @@ -521,8 +520,7 @@ int ProductRuleBook(const Context& dev_ctx, IntT* out_indices_ptr = out_indices.data(); - config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_non_zero_num, 1); + config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_non_zero_num, 1); UpdateIndexKernel << h_sparse_offsets(sparse_dim); funcs::sparse::CalcOffsetsPerDim(dims, sparse_dim, h_sparse_offsets.data()); - phi::backends::gpu::GpuMemcpyAsync(sparse_offsets.data(), - &h_sparse_offsets[0], 
- sizeof(int64_t) * sparse_dim, - gpuMemcpyHostToDevice, - dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(sparse_offsets.data(), + &h_sparse_offsets[0], + sizeof(int64_t) * sparse_dim, + gpuMemcpyHostToDevice, + dev_ctx.stream()); phi::Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &out_indices); @@ -94,7 +94,7 @@ void MaskCooGPUKernel(const GPUContext& dev_ctx, const int cols = dims_2d[1]; auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num * cols, 1); + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num * cols, 1); MaskKernel <<>>( x_ptr, @@ -163,11 +163,11 @@ void MaskCsr2DGPUKernel(const GPUContext& dev_ctx, std::vector h_sparse_offsets(sparse_dim); funcs::sparse::CalcOffsetsPerDim(dims, sparse_dim, h_sparse_offsets.data()); - phi::backends::gpu::GpuMemcpyAsync(sparse_offsets.data(), - &h_sparse_offsets[0], - sizeof(int64_t) * sparse_dim, - gpuMemcpyHostToDevice, - dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(sparse_offsets.data(), + &h_sparse_offsets[0], + sizeof(int64_t) * sparse_dim, + gpuMemcpyHostToDevice, + dev_ctx.stream()); const auto& csr_crows = mask.crows(); const auto& csr_cols = mask.cols(); @@ -186,23 +186,23 @@ void MaskCsr2DGPUKernel(const GPUContext& dev_ctx, IntT* coo_cols_data = coo_rows_data + non_zero_num; IntT* offsets_ptr = nullptr; - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rows, 1); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rows, 1); config.block_per_grid.y = batches; ConvertCsrCrowsToCooRows <<>>( csr_crows_data, offsets_ptr, coo_rows_data, batch_ptr, rows); - phi::backends::gpu::GpuMemcpyAsync(coo_cols_data, - csr_cols_data, - sizeof(IntT) * non_zero_num, - gpuMemcpyDeviceToDevice, - dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(coo_cols_data, + csr_cols_data, + sizeof(IntT) * non_zero_num, + gpuMemcpyDeviceToDevice, + dev_ctx.stream()); const T* x_ptr = x.data(); const IntT* indices_ptr = coo_indices; T* out_values_ptr = out_values.data(); auto config_mask = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num * cols, 1); + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num * cols, 1); MaskKernel<< h_sparse_offsets(sparse_dim); funcs::sparse::CalcOffsetsPerDim(dims, sparse_dim, h_sparse_offsets.data()); - phi::backends::gpu::GpuMemcpyAsync(sparse_offsets.data(), - &h_sparse_offsets[0], - sizeof(int64_t) * sparse_dim, - gpuMemcpyHostToDevice, - dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(sparse_offsets.data(), + &h_sparse_offsets[0], + sizeof(int64_t) * sparse_dim, + gpuMemcpyHostToDevice, + dev_ctx.stream()); const auto& csr_crows = mask.crows(); const auto& csr_cols = mask.cols(); @@ -264,8 +264,7 @@ void MaskCsr3DGPUKernel(const GPUContext& dev_ctx, IntT* coo_cols_data = coo_rows_data + non_zero_num; IntT* offsets_ptr = offsets.data(); - auto config_batch = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, batches, 1); + auto config_batch = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, batches, 1); GetBatchSizes <<>>( csr_crows_data, rows, batches, offsets_ptr); @@ -279,23 +278,23 @@ void MaskCsr3DGPUKernel(const GPUContext& dev_ctx, offsets_ptr + batches, offsets_ptr); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rows, 1); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rows, 1); config.block_per_grid.y = batches; ConvertCsrCrowsToCooRows <<>>( csr_crows_data, offsets_ptr, coo_rows_data, batch_ptr, rows); - phi::backends::gpu::GpuMemcpyAsync(coo_cols_data, - csr_cols_data, - sizeof(IntT) * non_zero_num, 
- gpuMemcpyDeviceToDevice, - dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(coo_cols_data, + csr_cols_data, + sizeof(IntT) * non_zero_num, + gpuMemcpyDeviceToDevice, + dev_ctx.stream()); const T* x_ptr = x.data(); const IntT* indices_ptr = coo_indices; T* out_values_ptr = out_values.data(); auto config_mask = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num * cols, 1); + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num * cols, 1); MaskKernel<<(), - sparse_offsets.data(), - sizeof(IntT) * sparse_dim, - gpuMemcpyHostToDevice, - dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(d_sparse_offsets.data(), + sparse_offsets.data(), + sizeof(IntT) * sparse_dim, + gpuMemcpyHostToDevice, + dev_ctx.stream()); // 3. flatten x indices and mask indices auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_indices.numel(), 1); + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_indices.numel(), 1); funcs::sparse::FlattenIndicesKernel<<(dev_ctx, {table_size}); DenseTensor index_flags = Empty(dev_ctx, {(table_size + 31) / 32}); - phi::backends::gpu::GpuMemsetAsync(index_flags.data(), - 0, - index_flags.numel() * sizeof(int), - dev_ctx.stream()); + backends::gpu::GpuMemsetAsync(index_flags.data(), + 0, + index_flags.numel() * sizeof(int), + dev_ctx.stream()); const int64_t stride = x.dims().size() == sparse_dim ? 1 : x.values().dims()[1]; *out = EmptyLike(dev_ctx, x.values()); funcs::SetConstant set_zero; set_zero(dev_ctx, out, static_cast(0)); T* out_ptr = out->data(); - config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_indices.numel(), 1); + config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_indices.numel(), 1); MaskTable<<(), table.data()); - config = phi::backends::gpu::GetGpuLaunchConfig1D( + config = backends::gpu::GetGpuLaunchConfig1D( dev_ctx, mask_meta_indices.numel(), 1); const int VecBytes = 16; diff --git a/paddle/phi/kernels/sparse/gpu/mv_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/mv_grad_kernel.cu index 210ae220abd5a7..6733cfe8131280 100644 --- a/paddle/phi/kernels/sparse/gpu/mv_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/mv_grad_kernel.cu @@ -69,7 +69,7 @@ void MvCooGradKernel(const Context &dev_ctx, if (dx) { // InferMeta of SparseCooTensor 'dx', CreateLikeInferMeta EmptyLikeCooKernel(dev_ctx, x, dx); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, dx->nnz()); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, dx->nnz()); PD_VISIT_BASE_INTEGRAL_TYPES( dx->indices().dtype(), "MvCooGradKernel", ([&] { MvCooGradGpuKernel @@ -111,8 +111,8 @@ void MvCsrGradKernel(const Context &dev_ctx, int64_t row_number = dx->dims()[0]; int64_t col_number = dx->dims()[1]; - auto config = phi::backends::gpu::GetGpuLaunchConfig2D( - dev_ctx, col_number, row_number); + auto config = + backends::gpu::GetGpuLaunchConfig2D(dev_ctx, col_number, row_number); PD_VISIT_BASE_INTEGRAL_TYPES(dx->crows().dtype(), "MvCsrGradKernel", ([&] { MvCsrGradGpuKernel << <<<<SetMember(dx_indices, dx_values, x.dims(), x.coalesced()); auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_grad_nnz + 1, 1); + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_grad_nnz + 1, 1); GetCooInputGradCudaKernel<<<<<<= ends[0]) { - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n_rows + 1, 1); + config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n_rows + 1, 1); GetCsrInputCrowsPart1CudaKernel<<(dev_ctx, {1}); int* d_out_nnz_ptr = d_out_nnz.data(); - phi::backends::gpu::GpuMemsetAsync( + backends::gpu::GpuMemsetAsync( 
d_out_nnz_ptr, 0, sizeof(int32_t), dev_ctx.stream()); // out_nnz_indices is the indices where the data is valid in out // the length of the out_nnz_indices must be less than x.nnz() DenseTensor d_out_nnz_indices = Empty(dev_ctx, {x.nnz()}); auto* d_out_nnz_indices_ptr = d_out_nnz_indices.data(); - phi::backends::gpu::GpuMemsetAsync( + backends::gpu::GpuMemsetAsync( d_out_nnz_indices_ptr, 0, sizeof(IntT), dev_ctx.stream()); // copy axes to device @@ -151,8 +151,7 @@ void SliceCooGPUCompute(const Context& dev_ctx, const auto* x_indices_data = x.indices().data(); - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x.nnz() + 1, 1); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x.nnz() + 1, 1); GetCooNonZeroNumberCudaKernel <<(); const auto* x_values_data = x.values().data(); - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_nnz + 1, 1); + config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_nnz + 1, 1); GetCooOutCudaKernel <<(dev_ctx, {out_n_rows + 1}); auto* out_crows_data = out_crows.data(); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, ends[0] - starts[0] + 1, 1); + auto config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, ends[0] - starts[0] + 1, 1); GetCsr2DNonZeroNumberCudaKernel<<(dev_ctx, {out_nnz}); DenseTensor out_values = Empty(dev_ctx, {out_nnz}); out->SetMember(out_crows, out_cols, out_values, out_dims); - config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, ends[0] - starts[0] + 1, 1); + config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, ends[0] - starts[0] + 1, 1); GetCsr2DCudaKernel<<(dev_ctx, {x_dim0 + 1}); auto* x_cols_offsets_data = x_cols_offsets.data(); - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_dim0 + 1, 1); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_dim0 + 1, 1); GetXColsOffsetsCudaKernel<<(dev_ctx, {out_dim0 * (out_n_rows + 1)}); auto* out_crows_data = out_crows.data(); - config = phi::backends::gpu::GetGpuLaunchConfig1D( + config = backends::gpu::GetGpuLaunchConfig1D( dev_ctx, x_dim0 * (x_n_rows + 1) + 1, 1); GetCsr3DNonZeroNumberCudaKernel<<(dev_ctx, {out_dim0 * (out_n_rows + 1)}); auto* out_cols_offsets_data = out_cols_offsets.data(); - phi::backends::gpu::GpuMemcpyAsync( - out_cols_offsets_data, - out_crows_data, - out_dim0 * (out_n_rows + 1) * sizeof(int64_t), - gpuMemcpyDeviceToDevice, - dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(out_cols_offsets_data, + out_crows_data, + out_dim0 * (out_n_rows + 1) * sizeof(int64_t), + gpuMemcpyDeviceToDevice, + dev_ctx.stream()); dev_ctx.Wait(); int64_t out_nnz = #ifdef PADDLE_WITH_HIP @@ -565,7 +562,7 @@ void SliceCsrTensor3D(const Context& dev_ctx, DenseTensor out_values = Empty(dev_ctx, {out_nnz}); auto* out_values_data = out_values.data(); out->SetMember(out_crows, out_cols, out_values, out_dims); - config = phi::backends::gpu::GetGpuLaunchConfig1D( + config = backends::gpu::GetGpuLaunchConfig1D( dev_ctx, x_dim0 * (x_n_rows + 1) + 1, 1); GetCsr3DCudaKernel<<(), \ - vector.data(), \ - vector.size() * sizeof(T), \ - gpuMemcpyHostToDevice, \ - dev_ctx.stream()); \ + backends::gpu::GpuMemcpyAsync(tensor.data(), \ + vector.data(), \ + vector.size() * sizeof(T), \ + gpuMemcpyHostToDevice, \ + dev_ctx.stream()); \ } namespace phi { @@ -220,9 +220,8 @@ void DenseToCooKernel(const Context& dev_ctx, // 1. 
get numbers of non zero elements, and get the index of non zero elements int* nums_ptr = nums.data(); - phi::backends::gpu::GpuMemsetAsync( - nums_ptr, 0, sizeof(int), dev_ctx.stream()); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rows, 1); + backends::gpu::GpuMemsetAsync(nums_ptr, 0, sizeof(int), dev_ctx.stream()); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rows, 1); DenseTensor temp_indices = Empty(dev_ctx, {rows}); int* temp_indices_ptr = temp_indices.data(); @@ -244,16 +243,16 @@ void DenseToCooKernel(const Context& dev_ctx, // 2. copy non_zero_num to host, copy x_dims to device int non_zero_num = 0; - phi::backends::gpu::GpuMemcpyAsync(&non_zero_num, - nums_ptr, - sizeof(int), - gpuMemcpyDeviceToHost, - dev_ctx.stream()); - phi::backends::gpu::GpuMemcpyAsync(d_x_dims.data(), - x_dims.Get(), - x_dims.size() * sizeof(x_dims[0]), - gpuMemcpyHostToDevice, - dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(&non_zero_num, + nums_ptr, + sizeof(int), + gpuMemcpyDeviceToHost, + dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(d_x_dims.data(), + x_dims.Get(), + x_dims.size() * sizeof(x_dims[0]), + gpuMemcpyHostToDevice, + dev_ctx.stream()); dev_ctx.Wait(); // wait the copy @@ -268,7 +267,7 @@ void DenseToCooKernel(const Context& dev_ctx, // 3. calc indices by indices and get values by indices if (non_zero_num > 0) { - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); + config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); GetNonZeroElementsAndIndices<<<<>>( csr_crows_data, rows, batches, offsets_ptr); @@ -403,26 +402,26 @@ void CsrToCooGPUKernel(const GPUContext& dev_ctx, rocsparse_index_base_zero); }); #else - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rows, 1); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rows, 1); config.block_per_grid.y = batches; ConvertCsrCrowsToCooRows <<>>( csr_crows_data, offsets_ptr, coo_rows_data, batch_ptr, rows); #endif - phi::backends::gpu::GpuMemcpyAsync(coo_cols_data, - csr_cols_data, + backends::gpu::GpuMemcpyAsync(coo_cols_data, + csr_cols_data, #ifdef PADDLE_WITH_HIP - sizeof(int) * non_zero_num, + sizeof(int) * non_zero_num, #else - sizeof(IntT) * non_zero_num, + sizeof(IntT) * non_zero_num, #endif - gpuMemcpyDeviceToDevice, - dev_ctx.stream()); - phi::backends::gpu::GpuMemcpyAsync(coo_values_data, - csr_values_data, - sizeof(T) * non_zero_num, - gpuMemcpyDeviceToDevice, - dev_ctx.stream()); + gpuMemcpyDeviceToDevice, + dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(coo_values_data, + csr_values_data, + sizeof(T) * non_zero_num, + gpuMemcpyDeviceToDevice, + dev_ctx.stream()); #ifdef PADDLE_WITH_HIP if (std::is_same::value) @@ -534,10 +533,9 @@ void CooToCsrGPUKernel(const GPUContext& dev_ctx, const IntT* coo_cols_data = coo_rows_data + non_zero_num; const T* coo_values_data = coo_values.data(); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, batches, 1); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, batches, 1); if (batches > 1) { - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); DenseTensor batches_offset = Empty(dev_ctx, {batches}); int* batches_offset_ptr = batches_offset.data(); funcs::SetConstant set_zero; @@ -563,16 +561,16 @@ void CooToCsrGPUKernel(const GPUContext& dev_ctx, nullptr, coo_rows_data, csr_crows_data, rows, non_zero_num); } - 
phi::backends::gpu::GpuMemcpyAsync(csr_cols_data, - coo_cols_data, - sizeof(IntT) * non_zero_num, - gpuMemcpyDeviceToDevice, - dev_ctx.stream()); - phi::backends::gpu::GpuMemcpyAsync(csr_values_data, - coo_values_data, - sizeof(T) * non_zero_num, - gpuMemcpyDeviceToDevice, - dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(csr_cols_data, + coo_cols_data, + sizeof(IntT) * non_zero_num, + gpuMemcpyDeviceToDevice, + dev_ctx.stream()); + backends::gpu::GpuMemcpyAsync(csr_values_data, + coo_values_data, + sizeof(T) * non_zero_num, + gpuMemcpyDeviceToDevice, + dev_ctx.stream()); out->SetMember(crows, cols, values, x_dims); } @@ -625,7 +623,7 @@ void CooToDenseGPUKernel(const GPUContext& dev_ctx, dev_ctx.template Alloc(out); T* out_data = out->data(); - phi::backends::gpu::GpuMemsetAsync( + backends::gpu::GpuMemsetAsync( out_data, 0, sizeof(T) * out->numel(), dev_ctx.stream()); if (x.nnz() <= 0) { @@ -648,8 +646,7 @@ void CooToDenseGPUKernel(const GPUContext& dev_ctx, BUILD_CUDA_TENSOR(int64_t, sparse_offsets, d_sparse_offsets); - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); KernelCooToDense << <<nnz(), 1); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, dx->nnz(), 1); SetValueCudaKernel <<<<>>( x_crows_data, dout_values_data, x.dims()[0], dx_values_data); } else { - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + auto config = backends::gpu::GetGpuLaunchConfig1D( dev_ctx, x.dims()[0] * (x.dims()[1] + 1), 1); SumCsr3DGradCudaKernel<<(); auto* out_values_data = out_values.data(); - auto config = - phi::backends::gpu::GetGpuLaunchConfig2D(dev_ctx, x.nnz(), x.nnz()); + auto config = backends::gpu::GetGpuLaunchConfig2D(dev_ctx, x.nnz(), x.nnz()); SumCooCudaKernel<<(); auto* out_cols_data = out_cols.data(); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, 2, 1); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, 2, 1); SumAllCsrCudaKernel<<(); auto* out_values_data = out_values.data(); out_dims = make_ddim({x_dim0, 1}); - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_dim0 + 1, 1); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_dim0 + 1, 1); SumCsr2DCudaKernel<<(); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + auto config = backends::gpu::GetGpuLaunchConfig1D( dev_ctx, x.dims()[0] * (x.dims()[1] + 1), 1); SumCsr3DCudaKernel<<<<