diff --git a/paddle/phi/kernels/gpu/abs_kernel.cu b/paddle/phi/kernels/gpu/abs_kernel.cu index ef69c22275b808..ed3c1882223390 100644 --- a/paddle/phi/kernels/gpu/abs_kernel.cu +++ b/paddle/phi/kernels/gpu/abs_kernel.cu @@ -28,25 +28,23 @@ template struct CudaAbsFunctor; template -struct CudaAbsFunctor>> { - __device__ __forceinline__ phi::dtype::Real operator()(const T x) const { +struct CudaAbsFunctor>> { + __device__ __forceinline__ dtype::Real operator()(const T x) const { return abs(x); } }; template -struct CudaAbsFunctor< - T, - std::enable_if_t>::value && - std::is_same::value>> { +struct CudaAbsFunctor>::value && + std::is_same::value>> { __device__ __forceinline__ T operator()(const T x) const { return abs(x); } }; template -struct CudaAbsFunctor< - T, - std::enable_if_t>::value && - !std::is_same::value>> { +struct CudaAbsFunctor>::value && + !std::is_same::value>> { __device__ __forceinline__ T operator()(const T x) const { return std::abs(x); } @@ -56,12 +54,12 @@ template PADDLE_API void AbsKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { - dev_ctx.template Alloc>(out); + dev_ctx.template Alloc>(out); std::vector ins = {&x}; std::vector outs = {out}; auto functor = CudaAbsFunctor(); - funcs::ElementwiseKernel>(dev_ctx, ins, &outs, functor); + funcs::ElementwiseKernel>(dev_ctx, ins, &outs, functor); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/accuracy_kernel.cu b/paddle/phi/kernels/gpu/accuracy_kernel.cu index b1478e842f4cfa..f97d49d717144b 100644 --- a/paddle/phi/kernels/gpu/accuracy_kernel.cu +++ b/paddle/phi/kernels/gpu/accuracy_kernel.cu @@ -24,7 +24,6 @@ #include "paddle/phi/core/kernel_registry.h" namespace phi { -using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void AccuracyCudaKernel(const int64_t N, @@ -34,7 +33,7 @@ __global__ void AccuracyCudaKernel(const int64_t N, int* correct_data, T* accuracy, int* total_data) { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; int count = 0; __shared__ int total[BlockSize]; @@ -98,7 +97,7 @@ void AccuracyKernel(const Context& dev_ctx, int64_t num_samples = inference.dims()[0]; size_t infer_width = inference.dims()[1]; auto stream = dev_ctx.stream(); - phi::backends::gpu::GpuMemsetAsync(accuracy_data, 0, sizeof(T), stream); + backends::gpu::GpuMemsetAsync(accuracy_data, 0, sizeof(T), stream); PADDLE_ENFORCE_GT(label.dims().size(), 0, diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu index 35ed74c551ec06..4e867f387e9082 100644 --- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu @@ -371,8 +371,7 @@ void PowGradKernel(const Context& dev_ctx, DenseTensor* dx) { if (factor.to() == 0) { std::vector vec_dims = vectorize(dx->dims()); - phi::Full( - dev_ctx, phi::IntArray(vec_dims), static_cast(0), dx); + Full(dev_ctx, IntArray(vec_dims), static_cast(0), dx); return; } if (factor.to() == 1) { diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu index 2fb25ec6151085..5da1b755ee8cf6 100644 --- a/paddle/phi/kernels/gpu/activation_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_kernel.cu @@ -283,8 +283,7 @@ void PowKernel(const Context& dev_ctx, } if (factor.to() == 0) { std::vector vec_dims = vectorize(out->dims()); - phi::Full( - dev_ctx, phi::IntArray(vec_dims), static_cast(1), out); + Full(dev_ctx, IntArray(vec_dims), static_cast(1), out); return; } if (factor.to() == 1) { 
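The hunks above and below all apply the same mechanical cleanup: code that already lives inside `namespace phi` drops the redundant `phi::` qualifier (for example, `typename phi::dtype::MPTypeTrait<T>::Type` becomes `typename dtype::MPTypeTrait<T>::Type`, and `phi::backends::gpu::...` becomes `backends::gpu::...`), relying on ordinary unqualified name lookup. Below is a minimal, self-contained sketch of why the two spellings are equivalent; it uses a stand-in trait rather than Paddle's real headers, so the names inside it are illustrative only.

```cpp
// Illustrative sketch only (not Paddle source). Inside `namespace phi`,
// unqualified lookup already finds nested members such as dtype::MPTypeTrait,
// so the explicit `phi::` prefix is redundant and can be dropped.
#include <type_traits>

namespace phi {
namespace dtype {
// Stand-in for a mixed-precision trait; the real trait lives in Paddle headers.
template <typename T>
struct MPTypeTrait {
  using Type = float;  // low-precision types map to a float compute type
};
template <>
struct MPTypeTrait<double> {
  using Type = double;  // double stays double
};
}  // namespace dtype

template <typename T>
void KernelBody() {
  // Before: using MT = typename phi::dtype::MPTypeTrait<T>::Type;
  // After (equivalent, shorter): the enclosing namespace is already `phi`.
  using MT = typename dtype::MPTypeTrait<T>::Type;
  static_assert(std::is_floating_point<MT>::value, "MT is a compute type");
}
}  // namespace phi

int main() {
  phi::KernelBody<double>();  // instantiates the trait via the unqualified spelling
  return 0;
}
```

Either spelling resolves to the same entity; the shorter form simply avoids re-qualifying the enclosing namespace, which is why the hunks in this patch can rewrite call sites without changing behavior.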
diff --git a/paddle/phi/kernels/gpu/adagrad_kernel.cu b/paddle/phi/kernels/gpu/adagrad_kernel.cu index 303b180244feb5..69431f94844837 100644 --- a/paddle/phi/kernels/gpu/adagrad_kernel.cu +++ b/paddle/phi/kernels/gpu/adagrad_kernel.cu @@ -70,7 +70,7 @@ struct DenseAdagradFunctor { DenseTensor* param_out_tensor, DenseTensor* moment_out_tensor, DenseTensor* master_param_outs) { - using MT = typename phi::dtype::template MPTypeTrait::Type; + using MT = typename dtype::template MPTypeTrait::Type; T* param_out_data = dev_ctx.template Alloc(param_out_tensor); MT* moment_out_data = dev_ctx.template Alloc(moment_out_tensor); const MT* master_in_data = @@ -82,7 +82,7 @@ struct DenseAdagradFunctor { MT epsilon = static_cast(epsilon_t); int64_t numel = param_t.numel(); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, 1); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, 1); int grid = config.block_per_grid.x; int block = config.thread_per_block.x; auto stream = dev_ctx.stream(); @@ -180,7 +180,7 @@ struct SparseAdagradFunctor { const int block_size = 256; dim3 threads(block_size, 1); dim3 grid2(1, merge_rows.size()); - phi::MixVector mixv_merge_rows(&merge_rows); + MixVector mixv_merge_rows(&merge_rows); SparseAdagradFunctorKernel <<; template struct SparseAdagradFunctor; template struct DenseAdagradFunctor; template struct DenseAdagradFunctor; -template struct DenseAdagradFunctor; +template struct DenseAdagradFunctor; } // namespace phi diff --git a/paddle/phi/kernels/gpu/adam_kernel.cu b/paddle/phi/kernels/gpu/adam_kernel.cu index 8cbde3b60d716d..c3c50207d09b96 100644 --- a/paddle/phi/kernels/gpu/adam_kernel.cu +++ b/paddle/phi/kernels/gpu/adam_kernel.cu @@ -187,7 +187,7 @@ PADDLE_API void AdamDenseKernel(const Context& dev_ctx, DenseTensor* beta1_pow_out, DenseTensor* beta2_pow_out, DenseTensor* master_param_outs) { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; const auto grad_type = grad.dtype(); VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; @@ -394,7 +394,7 @@ void MergedAdamKernel( std::vector beta1_pow_out, std::vector beta2_pow_out, std::vector master_param_out) { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; MT beta1_ = beta1.to(); MT beta2_ = beta2.to(); diff --git a/paddle/phi/kernels/gpu/adamax_kernel.cu b/paddle/phi/kernels/gpu/adamax_kernel.cu index e6c60d485528c1..dda8d4c45ba1d6 100644 --- a/paddle/phi/kernels/gpu/adamax_kernel.cu +++ b/paddle/phi/kernels/gpu/adamax_kernel.cu @@ -84,7 +84,7 @@ void AdamaxKernel(const Context& dev_ctx, DenseTensor* moment_out, DenseTensor* inf_norm_out, DenseTensor* master_param_outs) { - using MT = typename phi::dtype::template MPTypeTrait::Type; + using MT = typename dtype::template MPTypeTrait::Type; T* param_out_data = dev_ctx.template Alloc(param_out); MT* moment_out_data = dev_ctx.template Alloc(moment_out); MT* inf_norm_out_data = dev_ctx.template Alloc(inf_norm_out); @@ -104,7 +104,7 @@ void AdamaxKernel(const Context& dev_ctx, MT epsilon_ = static_cast(epsilon); int64_t numel = param.numel(); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, 1); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, 1); int grid = config.block_per_grid.x; int block = config.thread_per_block.x; auto stream = dev_ctx.stream(); diff --git a/paddle/phi/kernels/gpu/adamw_kernel.cu 
b/paddle/phi/kernels/gpu/adamw_kernel.cu index 0d11bc4ac8e932..7eee4fb0a4461f 100644 --- a/paddle/phi/kernels/gpu/adamw_kernel.cu +++ b/paddle/phi/kernels/gpu/adamw_kernel.cu @@ -168,7 +168,7 @@ PADDLE_API void AdamwDenseKernel(const Context& dev_ctx, DenseTensor* beta1_pow_out, DenseTensor* beta2_pow_out, DenseTensor* master_param_outs) { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; MT coeff_ = static_cast(coeff); MT lr_ratio_ = static_cast(lr_ratio); diff --git a/paddle/phi/kernels/gpu/add_n_kernel.cu b/paddle/phi/kernels/gpu/add_n_kernel.cu index 9aa073279319de..d2cc2b6e2b02a0 100644 --- a/paddle/phi/kernels/gpu/add_n_kernel.cu +++ b/paddle/phi/kernels/gpu/add_n_kernel.cu @@ -25,7 +25,7 @@ namespace phi { template __global__ void SumArrayCUDAKernel( T **in, T *out, int64_t N, size_t in_size, bool read_dst) { - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; CUDA_KERNEL_LOOP_TYPE(idx, N, int64_t) { MPType total(read_dst ? static_cast(out[idx]) : static_cast(0)); @@ -46,7 +46,7 @@ __global__ void SumArrayMixedTypeCUDAKernel(const T *in_0, int64_t N, size_t in_others_size, bool read_dst) { - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; CUDA_KERNEL_LOOP_TYPE(idx, N, int64_t) { MPType total(read_dst ? static_cast(out[idx]) : static_cast(0)); @@ -128,7 +128,7 @@ void AddNKernel(const Context &dev_ctx, int64_t length_0 = in_0.numel(); int64_t length_1 = in_1.numel(); if (length_0 && length_1 && in_0.IsInitialized() && in_1.IsInitialized()) { - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; auto result = EigenVector::Flatten(*out); auto &place = *dev_ctx.eigen_device(); auto in_0_e = EigenVector::Flatten(in_0).template cast(); @@ -281,7 +281,7 @@ void AddNKernel(const Context &dev_ctx, } } if (!sr_in_out_data.empty()) { - auto tmp_sr_in_out_array = phi::memory_utils::Alloc( + auto tmp_sr_in_out_array = memory_utils::Alloc( dev_ctx.GetPlace(), sr_in_out_data.size() * sizeof(T *)); size_t nbytes_sr = sr_in_out_data.size() * sizeof(T *); @@ -306,8 +306,8 @@ void AddNKernel(const Context &dev_ctx, } // if indata not null, merge into one kernel call. 
if (!in_data.empty()) { - auto tmp_in_array = phi::memory_utils::Alloc(dev_ctx.GetPlace(), - in_data.size() * sizeof(T *)); + auto tmp_in_array = + memory_utils::Alloc(dev_ctx.GetPlace(), in_data.size() * sizeof(T *)); size_t nbytes_in2 = in_data.size() * sizeof(T *); const void *stable_in2 = diff --git a/paddle/phi/kernels/gpu/all_to_all_kernel.cu b/paddle/phi/kernels/gpu/all_to_all_kernel.cu index 2ac99ee46b58a6..091662d90ea60c 100644 --- a/paddle/phi/kernels/gpu/all_to_all_kernel.cu +++ b/paddle/phi/kernels/gpu/all_to_all_kernel.cu @@ -63,10 +63,9 @@ void AllToAllKernel(const Context& dev_ctx, const auto* send_buf = x.data(); auto* recv_buf = out->data(); for (auto i = 0; i < nranks; ++i) { - auto send_buf = phi::distributed::GetPartialTensor(x, offset, send_numel); + auto send_buf = distributed::GetPartialTensor(x, offset, send_numel); comm_ctx->Send(send_buf, send_numel, i, stream); - auto recv_buf = - phi::distributed::GetPartialTensor(*out, offset, send_numel); + auto recv_buf = distributed::GetPartialTensor(*out, offset, send_numel); comm_ctx->Recv(&recv_buf, send_numel, i, stream); offset += send_numel; } diff --git a/paddle/phi/kernels/gpu/allclose_kernel.cu b/paddle/phi/kernels/gpu/allclose_kernel.cu index dba76d4b1eac86..409d09c52e7bb1 100644 --- a/paddle/phi/kernels/gpu/allclose_kernel.cu +++ b/paddle/phi/kernels/gpu/allclose_kernel.cu @@ -36,7 +36,7 @@ __global__ void AllcloseCUDAKernel(const T* in_data, bool* out_data) { unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; bool val; - using BaseMPType = typename phi::dtype::MPTypeTrait::Type; + using BaseMPType = typename dtype::MPTypeTrait::Type; using MPType = typename std::conditional::value || @@ -98,8 +98,7 @@ void AllCloseKernel(const Context& dev_ctx, int64_t num = x.numel(); const int vec_size = 4; - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, num, vec_size); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, num, vec_size); uint32_t grid = config.block_per_grid.x; uint32_t block = config.thread_per_block.x; diff --git a/paddle/phi/kernels/gpu/amp_kernel.cu b/paddle/phi/kernels/gpu/amp_kernel.cu index e5880c74a4f501..22875642d68075 100644 --- a/paddle/phi/kernels/gpu/amp_kernel.cu +++ b/paddle/phi/kernels/gpu/amp_kernel.cu @@ -159,13 +159,13 @@ class LazyZeros { const auto& cpu_place = CPUPlace(); // alloc each tensor's start index and copy to device auto h_in_starts_mem = - phi::memory_utils::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t)); + memory_utils::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t)); int64_t* h_starts = reinterpret_cast(h_in_starts_mem->ptr()); - auto d_in_starts_mem = phi::memory_utils::Alloc( + auto d_in_starts_mem = memory_utils::Alloc( dev_ctx.GetPlace(), (xs_size + 1) * sizeof(int64_t), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + Stream(reinterpret_cast(dev_ctx.stream()))); int64_t* d_starts = reinterpret_cast(d_in_starts_mem->ptr()); // the start index value of each tensor is @@ -186,14 +186,13 @@ class LazyZeros { dev_ctx.stream()); // copy each tensor of "outs" data address array to device - auto h_out_addrs_mem = - phi::memory_utils::Alloc(cpu_place, xs_size * sizeof(T*)); + auto h_out_addrs_mem = memory_utils::Alloc(cpu_place, xs_size * sizeof(T*)); T** h_out_addrs = reinterpret_cast(h_out_addrs_mem->ptr()); - auto d_out_addrs_mem = phi::memory_utils::Alloc( + auto d_out_addrs_mem = memory_utils::Alloc( dev_ctx.GetPlace(), xs_size * sizeof(T*), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + 
Stream(reinterpret_cast(dev_ctx.stream()))); T** d_out_addrs = reinterpret_cast(d_out_addrs_mem->ptr()); for (size_t i = 0; i < xs_size; ++i) { @@ -277,7 +276,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx, const DenseTensor& scale, std::vector outs, DenseTensor* found_infinite) { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; const MT* scale_data = scale.data(); bool* found_inf_data = dev_ctx.template Alloc(found_infinite); @@ -294,13 +293,13 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx, const auto& cpu_place = CPUPlace(); // calculate each tensor's start index and copy to device auto h_starts_tensor = - phi::memory_utils::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t)); + memory_utils::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t)); int64_t* h_starts = reinterpret_cast(h_starts_tensor->ptr()); - auto d_starts_tensor = phi::memory_utils::Alloc( - dev_ctx.GetPlace(), - (xs_size + 1) * sizeof(int64_t), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + auto d_starts_tensor = + memory_utils::Alloc(dev_ctx.GetPlace(), + (xs_size + 1) * sizeof(int64_t), + Stream(reinterpret_cast(dev_ctx.stream()))); int64_t* d_starts = reinterpret_cast(d_starts_tensor->ptr()); // the start index value of each tensor is @@ -322,14 +321,14 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx, dev_ctx.stream()); // copy each tensor's data address to device - auto h_mem = phi::memory_utils::Alloc(cpu_place, 2 * xs_size * sizeof(T*)); + auto h_mem = memory_utils::Alloc(cpu_place, 2 * xs_size * sizeof(T*)); const T** h_xs = reinterpret_cast(h_mem->ptr()); T** h_outs = reinterpret_cast(h_mem->ptr()) + xs_size; - auto d_mem = phi::memory_utils::Alloc( - dev_ctx.GetPlace(), - 2 * xs_size * sizeof(T*), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + auto d_mem = + memory_utils::Alloc(dev_ctx.GetPlace(), + 2 * xs_size * sizeof(T*), + Stream(reinterpret_cast(dev_ctx.stream()))); const T** d_xs = reinterpret_cast(d_mem->ptr()); T** d_outs = reinterpret_cast(d_mem->ptr()) + xs_size; diff --git a/paddle/phi/kernels/gpu/arange_kernel.cu b/paddle/phi/kernels/gpu/arange_kernel.cu index 35e53055533460..ee7873e0110f4e 100644 --- a/paddle/phi/kernels/gpu/arange_kernel.cu +++ b/paddle/phi/kernels/gpu/arange_kernel.cu @@ -37,7 +37,7 @@ void ArangeTensorKernel(const Context& dev_ctx, const DenseTensor& end, const DenseTensor& step, DenseTensor* out) { - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; MPType start_value = static_cast(GetValue(dev_ctx, start)); MPType end_value = static_cast(GetValue(dev_ctx, end)); @@ -64,7 +64,7 @@ void ArangeNullaryKernel(const Context& dev_ctx, const T end_value, const T step_value, DenseTensor* out) { - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; MPType start_value_mpt = static_cast(start_value); MPType end_value_mpt = static_cast(end_value); MPType step_value_mpt = static_cast(step_value); diff --git a/paddle/phi/kernels/gpu/arg_min_max_kernel.cu b/paddle/phi/kernels/gpu/arg_min_max_kernel.cu index 868257e9edbc64..3f98b957d60162 100644 --- a/paddle/phi/kernels/gpu/arg_min_max_kernel.cu +++ b/paddle/phi/kernels/gpu/arg_min_max_kernel.cu @@ -232,7 +232,7 @@ void ArgMinMaxOpCUDAKernel(const Context& dev_ctx, dev_ctx, x, axis.to(), keepdims, flatten, out)); return; } - phi::VisitDataTypeTiny( + VisitDataTypeTiny( dtype, VisitDataCudaArgMinMaxFunctor( dev_ctx, x, axis.to(), 
keepdims, flatten, out)); diff --git a/paddle/phi/kernels/gpu/argsort_kernel.cu b/paddle/phi/kernels/gpu/argsort_kernel.cu index b351fe22a13104..7d73826d561c6e 100644 --- a/paddle/phi/kernels/gpu/argsort_kernel.cu +++ b/paddle/phi/kernels/gpu/argsort_kernel.cu @@ -375,7 +375,7 @@ void ArgsortKernel(const Context& dev_ctx, PerSort( dev_ctx, out_data, ids_data, start, end, stable, descending); if (start != 0) { - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, end); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, end); merge_kernel<<::Type; + using MT = typename dtype::MPTypeTrait::Type; const MT* master_in_data = multi_precision ? master_param->data() : nullptr; MT* master_out_data = diff --git a/paddle/phi/kernels/gpu/auc_kernel.cu b/paddle/phi/kernels/gpu/auc_kernel.cu index 0d3d3f2545866d..4da580f474912e 100644 --- a/paddle/phi/kernels/gpu/auc_kernel.cu +++ b/paddle/phi/kernels/gpu/auc_kernel.cu @@ -18,8 +18,6 @@ namespace phi { -using phi::PADDLE_CUDA_NUM_THREADS; - __global__ void ClearObsoleteDataKernel(int64_t *pos, int64_t *neg, const int bucket_length, diff --git a/paddle/phi/kernels/gpu/barrier_kernel.cu b/paddle/phi/kernels/gpu/barrier_kernel.cu index fd639434f8193e..fe17bb4eb7b39e 100644 --- a/paddle/phi/kernels/gpu/barrier_kernel.cu +++ b/paddle/phi/kernels/gpu/barrier_kernel.cu @@ -27,8 +27,8 @@ void BarrierKernel(const Context &dev_ctx, DenseTensor *out) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto in = &x; - auto comm_ctx = static_cast( - dev_ctx.GetCommContext()); + auto comm_ctx = + static_cast(dev_ctx.GetCommContext()); PADDLE_ENFORCE_NE(comm_ctx, nullptr, common::errors::Unavailable( @@ -37,7 +37,7 @@ void BarrierKernel(const Context &dev_ctx, auto stream = comm_ctx->GetStream(); ncclRedOp_t nccl_red_type = ncclSum; comm_ctx->AllReduce(out, *in, nccl_red_type, stream); - phi::backends::gpu::GpuStreamSync(stream); + backends::gpu::GpuStreamSync(stream); #else PADDLE_THROW( common::errors::Unavailable("PaddlePaddle should compile with NCCL.")); diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu index cc81ffcc682bcf..abb34236991aa2 100644 --- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -42,7 +42,7 @@ COMMON_DECLARE_bool(batch_norm_use_miopen); namespace phi { template -using CudnnDataType = phi::backends::gpu::CudnnDataType; +using CudnnDataType = backends::gpu::CudnnDataType; template using BatchNormParamType = typename CudnnDataType::BatchNormParamType; @@ -611,7 +611,7 @@ void BatchNormGradFunctor(const Context &dev_ctx, C, new_scale.dims()[0])); - auto dtype = phi::backends::gpu::CudnnDataType::type; + auto dtype = backends::gpu::CudnnDataType::type; #ifdef PADDLE_WITH_HIP auto compute_format = data_layout == DataLayout::NHWC diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 37b7cdfb1f0534..2500b2a553022d 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -42,7 +42,7 @@ COMMON_DECLARE_bool(batch_norm_use_miopen); namespace phi { template -using CudnnDataType = phi::backends::gpu::CudnnDataType; +using CudnnDataType = backends::gpu::CudnnDataType; template using BatchNormParamType = typename CudnnDataType::BatchNormParamType; @@ -588,7 +588,7 @@ void BatchNormKernel(const Context &dev_ctx, int N, C, H, W, D; funcs::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); - auto dtype 
= phi::backends::gpu::CudnnDataType::type; + auto dtype = backends::gpu::CudnnDataType::type; auto *Scale = scale.get_ptr(); auto *Bias = bias.get_ptr(); @@ -599,13 +599,13 @@ void BatchNormKernel(const Context &dev_ctx, if (Scale) { new_scale = scale.get(); } else { - new_scale = phi::Full(dev_ctx, {C}, static_cast(1)); + new_scale = Full(dev_ctx, {C}, static_cast(1)); } if (Bias) { new_bias = bias.get(); } else { - new_bias = phi::Full(dev_ctx, {C}, static_cast(0)); + new_bias = Full(dev_ctx, {C}, static_cast(0)); } #ifdef PADDLE_WITH_HIP diff --git a/paddle/phi/kernels/gpu/bce_loss_grad_kernel.cu b/paddle/phi/kernels/gpu/bce_loss_grad_kernel.cu index 3b05ea51a32151..3211a29d6cf365 100644 --- a/paddle/phi/kernels/gpu/bce_loss_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/bce_loss_grad_kernel.cu @@ -27,7 +27,7 @@ namespace phi { template struct BCELossGradFunctor { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; MT one = static_cast(1.0f); MT eps = static_cast(1e-12); diff --git a/paddle/phi/kernels/gpu/bce_loss_kernel.cu b/paddle/phi/kernels/gpu/bce_loss_kernel.cu index 05866af47c6130..4cbb343223ae77 100644 --- a/paddle/phi/kernels/gpu/bce_loss_kernel.cu +++ b/paddle/phi/kernels/gpu/bce_loss_kernel.cu @@ -28,7 +28,7 @@ namespace phi { template struct BCELossFunctor { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; MT zero = static_cast(0); MT one = static_cast(1.0f); MT neg_100 = static_cast(-100.); @@ -42,8 +42,8 @@ struct BCELossFunctor { "Input is expected to be within the interval [0, 1], but received %f.", x_mt); - MT term1 = max(phi::kps::details::Log(x_mt), neg_100); - MT term2 = max(phi::kps::details::Log(one - x_mt), neg_100); + MT term1 = max(kps::details::Log(x_mt), neg_100); + MT term2 = max(kps::details::Log(one - x_mt), neg_100); return static_cast((label_mt - one) * term2 - label_mt * term1); } }; diff --git a/paddle/phi/kernels/gpu/bernoulli_kernel.cu b/paddle/phi/kernels/gpu/bernoulli_kernel.cu index e83cc590a1b310..9adf7c7cb4fe1c 100644 --- a/paddle/phi/kernels/gpu/bernoulli_kernel.cu +++ b/paddle/phi/kernels/gpu/bernoulli_kernel.cu @@ -53,7 +53,7 @@ __global__ void bernoulli_cuda_kernel( for (size_t i = 4 * thread_idx; i < size; i += total_thread * 4) { funcs::uniform_distribution dist; float4 rand = dist(&state); - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; #pragma unroll for (size_t j = 0; j < 4; j++) { size_t idx = i + j; @@ -82,7 +82,7 @@ void BernoulliKernel(const Context& dev_ctx, uint64_t seed = seed_offset.first; uint64_t offset = seed_offset.second; - auto gpu_config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, 4); + auto gpu_config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, 4); size_t grid_size = gpu_config.GetGridSize(); size_t block_size = gpu_config.GetBlockSize(); diff --git a/paddle/phi/kernels/gpu/bincount_kernel.cu b/paddle/phi/kernels/gpu/bincount_kernel.cu index a770eaa4562079..7a1f5d7d8427e0 100644 --- a/paddle/phi/kernels/gpu/bincount_kernel.cu +++ b/paddle/phi/kernels/gpu/bincount_kernel.cu @@ -22,8 +22,6 @@ #include "paddle/phi/kernels/funcs/math_function.h" namespace phi { -using phi::PADDLE_CUDA_NUM_THREADS; - inline int64_t GET_BLOCKS(const int64_t N) { return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; } @@ -62,8 +60,8 @@ __global__ void KernelReduceMinMax(const T* input, } if (tid == 0) { - phi::CudaAtomicMin(min_out, smin[0]); - 
phi::CudaAtomicMax(max_out, smax[0]); + CudaAtomicMin(min_out, smin[0]); + CudaAtomicMax(max_out, smax[0]); } } diff --git a/paddle/phi/kernels/gpu/binomial_kernel.cu b/paddle/phi/kernels/gpu/binomial_kernel.cu index a5547c9f9aa4f9..67e5a012ea6e35 100644 --- a/paddle/phi/kernels/gpu/binomial_kernel.cu +++ b/paddle/phi/kernels/gpu/binomial_kernel.cu @@ -145,7 +145,7 @@ __global__ void BinomialSampling(const T* n, const int N, unsigned int seed, unsigned int offset) { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; CUDA_KERNEL_LOOP_TYPE(idx, N, int64_t) { MT nt = static_cast(n[idx]); MT pt = static_cast(p[idx]); @@ -189,7 +189,7 @@ void BinomialKernel(const Context& dev_ctx, int block_size = std::min(kMaxBlockDim, dev_ctx.GetMaxThreadsPerBlock()); dim3 dim_block(block_size); dim3 dim_grid((size + block_size - 1) / block_size); - phi::backends::gpu::LimitGridDim(dev_ctx, &dim_grid); + backends::gpu::LimitGridDim(dev_ctx, &dim_grid); auto gen_cuda = dev_ctx.GetGenerator(); auto seed_offset = gen_cuda->IncrementOffset(20); diff --git a/paddle/phi/kernels/gpu/box_clip_kernel.cu b/paddle/phi/kernels/gpu/box_clip_kernel.cu index 0668965403e7ce..315934df1c6842 100644 --- a/paddle/phi/kernels/gpu/box_clip_kernel.cu +++ b/paddle/phi/kernels/gpu/box_clip_kernel.cu @@ -63,7 +63,7 @@ void GPUBoxClipKernel(const Context &dev_ctx, auto stream = dev_ctx.stream(); const size_t batch_size = lod.back().size() - 1; T *output_data = dev_ctx.template Alloc(output); - phi::MixVector mix_vector(&abs_offset_lod[0]); + MixVector mix_vector(&abs_offset_lod[0]); GPUBoxClip<<>>( input_p->data(), mix_vector.CUDAMutableData(dev_ctx.GetPlace()), diff --git a/paddle/phi/kernels/gpu/box_coder_kernel.cu b/paddle/phi/kernels/gpu/box_coder_kernel.cu index 5ff86cac661ca1..bc2ed13f307649 100644 --- a/paddle/phi/kernels/gpu/box_coder_kernel.cu +++ b/paddle/phi/kernels/gpu/box_coder_kernel.cu @@ -209,10 +209,10 @@ void BoxCoderKernel(const Context &dev_ctx, int grid = (row * col + block - 1) / block; int64_t bytes = var_size * sizeof(float); - auto dev_var = phi::memory_utils::Alloc( - dev_ctx.GetPlace(), - bytes, - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + auto dev_var = + memory_utils::Alloc(dev_ctx.GetPlace(), + bytes, + Stream(reinterpret_cast(dev_ctx.stream()))); float *dev_var_data = reinterpret_cast(dev_var->ptr()); auto cplace = CPUPlace(); const auto gplace = dev_ctx.GetPlace(); diff --git a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu index 518a8567ecd9f5..cb6cb3eb72f9a9 100644 --- a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu @@ -89,12 +89,12 @@ void BroadcastTensorsGradKernel(const Context& dev_ctx, Copy(dev_ctx, *input_tensor, dev_ctx.GetPlace(), false, output_tensor); } else { // reduce_sum implementation on CUDA - phi::SumKernel(dev_ctx, - *input_tensor, - reduce_dims_vec, - output_tensor->dtype(), - false, - output_tensor); + SumKernel(dev_ctx, + *input_tensor, + reduce_dims_vec, + output_tensor->dtype(), + false, + output_tensor); } } } diff --git a/paddle/phi/kernels/gpu/c_concat_kernel.cu b/paddle/phi/kernels/gpu/c_concat_kernel.cu index 1cf9f41072708c..763915f70f1b49 100644 --- a/paddle/phi/kernels/gpu/c_concat_kernel.cu +++ b/paddle/phi/kernels/gpu/c_concat_kernel.cu @@ -69,13 +69,13 @@ void CConcatKernel(const Context& dev_ctx, gpuStream_t stream = nullptr; #if defined(PADDLE_WITH_FLAGCX) && 
defined(PADDLE_KERNEL_WITH_FLAGCX) - phi::distributed::FlagcxCommContext* comm_ctx = nullptr; - comm_ctx = static_cast( - dev_ctx.GetCommContext()); + distributed::FlagcxCommContext* comm_ctx = nullptr; + comm_ctx = + static_cast(dev_ctx.GetCommContext()); #else - phi::distributed::NCCLCommContext* comm_ctx = nullptr; + distributed::NCCLCommContext* comm_ctx = nullptr; comm_ctx = - static_cast(dev_ctx.GetCommContext()); + static_cast(dev_ctx.GetCommContext()); #endif PADDLE_ENFORCE_NE(comm_ctx, nullptr, diff --git a/paddle/phi/kernels/gpu/c_scatter_kernel.cu b/paddle/phi/kernels/gpu/c_scatter_kernel.cu index e8e1f37137fe3c..19aa361d57dadb 100644 --- a/paddle/phi/kernels/gpu/c_scatter_kernel.cu +++ b/paddle/phi/kernels/gpu/c_scatter_kernel.cu @@ -35,12 +35,12 @@ void CScatterOpCUDAKernel(const Context& dev_ctx, #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto x = &input; int64_t numel = x->numel(); - ncclDataType_t dtype = phi::ToNCCLDataType(x->dtype()); + ncclDataType_t dtype = ToNCCLDataType(x->dtype()); int root_id = root; auto place = dev_ctx.GetPlace(); gpuStream_t stream = nullptr; - phi::distributed::NCCLCommContext* comm_ctx = nullptr; + distributed::NCCLCommContext* comm_ctx = nullptr; PADDLE_ENFORCE_GE( root_id, 0, @@ -53,7 +53,7 @@ void CScatterOpCUDAKernel(const Context& dev_ctx, "The ring_id (%d) for c_scatter_op must be non-negative.", ring_id)); comm_ctx = - static_cast(dev_ctx.GetCommContext()); + static_cast(dev_ctx.GetCommContext()); PADDLE_ENFORCE_NE(comm_ctx, nullptr, common::errors::Unavailable( diff --git a/paddle/phi/kernels/gpu/c_softmax_with_cross_entropy_kernel.cu b/paddle/phi/kernels/gpu/c_softmax_with_cross_entropy_kernel.cu index 2512503a102e3e..3df577e5d5a2da 100644 --- a/paddle/phi/kernels/gpu/c_softmax_with_cross_entropy_kernel.cu +++ b/paddle/phi/kernels/gpu/c_softmax_with_cross_entropy_kernel.cu @@ -189,10 +189,10 @@ struct CSoftmaxWithCrossEntropyFunctor { const DenseTensor* labels = &label_in; gpuStream_t stream = nullptr; - phi::distributed::NCCLCommContext* comm_ctx = nullptr; + distributed::NCCLCommContext* comm_ctx = nullptr; - comm_ctx = static_cast( - dev_ctx.GetCommContext()); + comm_ctx = + static_cast(dev_ctx.GetCommContext()); PADDLE_ENFORCE_NE(comm_ctx, nullptr, common::errors::Unavailable( @@ -223,7 +223,7 @@ struct CSoftmaxWithCrossEntropyFunctor { logits_max.Resize({N, 1}); dev_ctx.template Alloc(&logits_max); - phi::MaxKernel(dev_ctx, logits_2d, {-1}, true, &logits_max); + MaxKernel(dev_ctx, logits_2d, {-1}, true, &logits_max); comm_ctx->AllReduce(&logits_max, logits_max, ncclMax, stream); @@ -305,14 +305,14 @@ struct CSoftmaxWithCrossEntropyFunctor { comm_ctx->AllReduce(&predicted_logits, predicted_logits, ncclSum, stream); // step 4, obtain exp(logit) - phi::ExpKernel(dev_ctx, softmax_2d, &softmax_2d); + ExpKernel(dev_ctx, softmax_2d, &softmax_2d); // step 5, obtain sum_exp_logits DenseTensor sum_exp_logits; sum_exp_logits.Resize({N, 1}); dev_ctx.template Alloc(&sum_exp_logits); - phi::SumKernel( + SumKernel( dev_ctx, softmax_2d, {-1}, softmax_2d.dtype(), true, &sum_exp_logits); comm_ctx->AllReduce(&sum_exp_logits, sum_exp_logits, ncclSum, stream); @@ -358,8 +358,7 @@ struct CSoftmaxWithCrossEntropyFunctor { } } - phi::ReciprocalKernel( - dev_ctx, sum_exp_logits, &sum_exp_logits); + ReciprocalKernel(dev_ctx, sum_exp_logits, &sum_exp_logits); inputs = std::vector{&softmax_2d, &sum_exp_logits}; outputs = std::vector{&softmax_2d}; diff --git a/paddle/phi/kernels/gpu/c_softmax_with_multi_label_cross_entropy_kernel.cu 
b/paddle/phi/kernels/gpu/c_softmax_with_multi_label_cross_entropy_kernel.cu index 4e34885514ae97..9de70f0a25491f 100644 --- a/paddle/phi/kernels/gpu/c_softmax_with_multi_label_cross_entropy_kernel.cu +++ b/paddle/phi/kernels/gpu/c_softmax_with_multi_label_cross_entropy_kernel.cu @@ -159,10 +159,10 @@ struct CSoftmaxWithMultiLabelCrossEntropyFunctor { const DenseTensor* smooth_weight = &smooth_weight_in; gpuStream_t stream = nullptr; - phi::distributed::NCCLCommContext* comm_ctx = nullptr; + distributed::NCCLCommContext* comm_ctx = nullptr; - comm_ctx = static_cast( - dev_ctx.GetCommContext()); + comm_ctx = + static_cast(dev_ctx.GetCommContext()); PADDLE_ENFORCE_NE(comm_ctx, nullptr, common::errors::Unavailable( @@ -266,7 +266,7 @@ struct CSoftmaxWithMultiLabelCrossEntropyFunctor { sum_exp_logits.Resize({N, 1}); dev_ctx.template Alloc(&sum_exp_logits); - phi::SumKernel( + SumKernel( dev_ctx, softmax_2d, {-1}, softmax_2d.dtype(), true, &sum_exp_logits); comm_ctx->AllReduce(&sum_exp_logits, sum_exp_logits, ncclSum, stream); diff --git a/paddle/phi/kernels/gpu/cast_kernel.cu b/paddle/phi/kernels/gpu/cast_kernel.cu index a300efb15e7306..b635faf6d6b9ef 100644 --- a/paddle/phi/kernels/gpu/cast_kernel.cu +++ b/paddle/phi/kernels/gpu/cast_kernel.cu @@ -25,7 +25,7 @@ void CastKernel(const Context& dev_ctx, DataType out_dtype, DenseTensor* out) { if (x.dtype() == out_dtype) { - if (x.dims() == phi::make_ddim({-1})) { + if (x.dims() == make_ddim({-1})) { *out = x; return; } @@ -53,8 +53,8 @@ INSTANTIATE_CAST_KERNEL(uint32_t, GPUContext) INSTANTIATE_CAST_KERNEL(uint64_t, GPUContext) INSTANTIATE_CAST_KERNEL(bool, GPUContext) INSTANTIATE_CAST_KERNEL(int16_t, GPUContext) -INSTANTIATE_CAST_KERNEL(phi::float16, GPUContext) -INSTANTIATE_CAST_KERNEL(phi::bfloat16, GPUContext) +INSTANTIATE_CAST_KERNEL(float16, GPUContext) +INSTANTIATE_CAST_KERNEL(bfloat16, GPUContext) #endif } // namespace phi diff --git a/paddle/phi/kernels/gpu/check_numerics_kernel.cu b/paddle/phi/kernels/gpu/check_numerics_kernel.cu index b7fa564d9c3dea..fab45447fb5c95 100644 --- a/paddle/phi/kernels/gpu/check_numerics_kernel.cu +++ b/paddle/phi/kernels/gpu/check_numerics_kernel.cu @@ -28,11 +28,9 @@ namespace phi { static std::once_flag init_multi_gpu_op_var_map_flag; // lazy init -static std::vector< - std::unordered_map>& +static std::vector>& multi_op_var2gpu_str() { - static std::vector< - std::unordered_map> + static std::vector> _multi_op_var2gpu_str; return _multi_op_var2gpu_str; } @@ -43,14 +41,14 @@ static std::vector& multi_op_var2gpu_str_mutex() { } static void InitMultiGPUOpVarMap() { - int dev_count = phi::backends::gpu::GetGPUDeviceCount(); + int dev_count = backends::gpu::GetGPUDeviceCount(); PADDLE_ENFORCE_GT(dev_count, 0, common::errors::NotFound( "cuda device must > 0, now dev_count=%d", dev_count)); // https://stackoverflow.com/questions/16465633/how-can-i-use-something-like-stdvectorstdmutex - std::vector> + std::vector> tmp_multi(dev_count); std::vector tmp_multi_mutex(dev_count); @@ -111,8 +109,8 @@ __device__ void BlockReduceNumNanInfAndWrite(const int64_t num_nan, } template ::value || - std::is_same::value, + std::enable_if_t::value || + std::is_same::value, bool> = true> __device__ void BlockReduceMaxMinAndWrite(const T max_value, const T min_value, @@ -125,8 +123,8 @@ __device__ void BlockReduceMaxMinAndWrite(const T max_value, } template ::value && - !std::is_same::value, + std::enable_if_t::value && + !std::is_same::value, bool> = true> __device__ void BlockReduceMaxMinAndWrite(const T max_value, const T 
min_value, @@ -272,7 +270,7 @@ __global__ void FindGlobalMaxMinAndPrint(const int64_t* block_num_nan_ptr, template inline std::string GetHintString(const std::string& op_type, const std::string& var_name, - const phi::Place& place, + const Place& place, int dev_id = -1) { std::string op_var = funcs::GetCpuHintString(op_type, var_name, place, dev_id); @@ -301,10 +299,10 @@ static char* GetGpuHintStringPtr(const GPUContext& dev_ctx, std::lock_guard guard(op_var2gpu_str_mutex); if (op_var2gpu_str.find(op_var) == op_var2gpu_str.end()) { // insert - auto gpu_str_tensor = phi::memory_utils::Alloc( + auto gpu_str_tensor = memory_utils::Alloc( dev_ctx.GetPlace(), op_var.length() + 1, - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + Stream(reinterpret_cast(dev_ctx.stream()))); gpu_str_ptr = reinterpret_cast(gpu_str_tensor->ptr()); op_var2gpu_str.emplace(op_var, std::move(gpu_str_tensor)); @@ -353,14 +351,14 @@ static void PrintStack(const GPUContext& dev_ctx, const std::string& op_type, const std::string& var_name, int dev_id) { - auto cpu_stats = phi::memory_utils::Alloc(CPUPlace(), sizeof(int64_t) * 3); + auto cpu_stats = memory_utils::Alloc(CPUPlace(), sizeof(int64_t) * 3); int64_t* cpu_stats_ptr = reinterpret_cast(cpu_stats->ptr()); - phi::memory_utils::Copy(CPUPlace(), - cpu_stats_ptr, - stats.place(), - stats.data(), - 3 * sizeof(int64_t), - dev_ctx.stream()); + memory_utils::Copy(CPUPlace(), + cpu_stats_ptr, + stats.place(), + stats.data(), + 3 * sizeof(int64_t), + dev_ctx.stream()); dev_ctx.Wait(); if (cpu_stats_ptr[0] > 0 || cpu_stats_ptr[1] > 0) { const std::string debug_info = @@ -437,7 +435,7 @@ void CheckNumericsKernel(const Context& dev_ctx, std::min(static_cast(128), static_cast((tensor.numel() + threads - 1) / threads)); - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; int64_t numel_max_min = blocks; @@ -505,12 +503,12 @@ void CheckNumericsKernel(const Context& dev_ctx, #ifdef _WIN32 INSTANTIATE_CHECKNUMBERICS_KERNEL(float, GPUContext) INSTANTIATE_CHECKNUMBERICS_KERNEL(double, GPUContext) -INSTANTIATE_CHECKNUMBERICS_KERNEL(phi::float16, GPUContext) -INSTANTIATE_CHECKNUMBERICS_KERNEL(phi::bfloat16, GPUContext) -INSTANTIATE_CHECKNUMBERICS_KERNEL(phi::complex64, GPUContext) -INSTANTIATE_CHECKNUMBERICS_KERNEL(phi::complex128, GPUContext) -INSTANTIATE_CHECKNUMBERICS_KERNEL(phi::float8_e4m3fn, GPUContext) -INSTANTIATE_CHECKNUMBERICS_KERNEL(phi::float8_e5m2, GPUContext) +INSTANTIATE_CHECKNUMBERICS_KERNEL(float16, GPUContext) +INSTANTIATE_CHECKNUMBERICS_KERNEL(bfloat16, GPUContext) +INSTANTIATE_CHECKNUMBERICS_KERNEL(complex64, GPUContext) +INSTANTIATE_CHECKNUMBERICS_KERNEL(complex128, GPUContext) +INSTANTIATE_CHECKNUMBERICS_KERNEL(float8_e4m3fn, GPUContext) +INSTANTIATE_CHECKNUMBERICS_KERNEL(float8_e5m2, GPUContext) #endif } // namespace phi diff --git a/paddle/phi/kernels/gpu/cholesky_kernel.cu b/paddle/phi/kernels/gpu/cholesky_kernel.cu index 95e009785ee999..a74998897135e4 100644 --- a/paddle/phi/kernels/gpu/cholesky_kernel.cu +++ b/paddle/phi/kernels/gpu/cholesky_kernel.cu @@ -82,10 +82,10 @@ struct MatrixBandPartFunctor { int workspace_size = 0; \ PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDn##C##potrf_bufferSize( \ handle, uplo, n, A, lda, &workspace_size)); \ - auto workspace = phi::memory_utils::Alloc( \ + auto workspace = memory_utils::Alloc( \ dev_ctx.GetPlace(), \ workspace_size * sizeof(T), \ - phi::Stream(reinterpret_cast(dev_ctx.stream()))); \ + Stream(reinterpret_cast(dev_ctx.stream()))); \ T* workspace_ptr = 
reinterpret_cast(workspace->ptr()); \ PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDn##C##potrf( \ handle, uplo, n, A, lda, workspace_ptr, workspace_size, info)); \ @@ -119,12 +119,12 @@ FUNC_WITH_TYPES(POTRF_INSTANCE); data_type, \ &workspace_device_size, \ &workspace_host_size)); \ - auto workspace_device = phi::memory_utils::Alloc( \ + auto workspace_device = memory_utils::Alloc( \ dev_ctx.GetPlace(), \ workspace_device_size, \ - phi::Stream(reinterpret_cast(dev_ctx.stream()))); \ + Stream(reinterpret_cast(dev_ctx.stream()))); \ auto workspace_host = \ - phi::memory_utils::Alloc(CPUPlace(), workspace_host_size); \ + memory_utils::Alloc(CPUPlace(), workspace_host_size); \ PADDLE_ENFORCE_GPU_SUCCESS( \ dynload::cusolverDnXpotrf(handle, \ params, \ @@ -209,10 +209,10 @@ void CholeskyKernel(const Context& dev_ctx, for_range(matrix_band_part_functor); } - auto info = phi::memory_utils::Alloc( - dev_ctx.GetPlace(), - sizeof(int) * batch_count, - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + auto info = + memory_utils::Alloc(dev_ctx.GetPlace(), + sizeof(int) * batch_count, + Stream(reinterpret_cast(dev_ctx.stream()))); auto* info_ptr = reinterpret_cast(info->ptr()); #if CUDA_VERSION >= 9020 && !defined(_WIN32) diff --git a/paddle/phi/kernels/gpu/cholesky_solve_kernel.cu b/paddle/phi/kernels/gpu/cholesky_solve_kernel.cu index 609378cc3b224f..bc97987f15347b 100644 --- a/paddle/phi/kernels/gpu/cholesky_solve_kernel.cu +++ b/paddle/phi/kernels/gpu/cholesky_solve_kernel.cu @@ -37,7 +37,7 @@ void rocsolver_potrs(const solverHandle_t &handle, T *Bdata, int ldb); -using phi::dtype::complex; +using dtype::complex; #define FUNC_WITH_TYPES(m) \ m(float, s, float) m(double, d, double) \ m(complex, c, rocblas_float_complex) \ @@ -107,15 +107,15 @@ void cusolver_potrs(const solverHandle_t &handle, } template <> -void cusolver_potrs(const solverHandle_t &handle, - cublasFillMode_t uplo, - int M, - int N, - phi::complex64 *Adata, - int lda, - phi::complex64 *Bdata, - int ldb, - int *devInfo) { +void cusolver_potrs(const solverHandle_t &handle, + cublasFillMode_t uplo, + int M, + int N, + complex64 *Adata, + int lda, + complex64 *Bdata, + int ldb, + int *devInfo) { PADDLE_ENFORCE_GPU_SUCCESS( dynload::cusolverDnCpotrs(handle, uplo, @@ -129,15 +129,15 @@ void cusolver_potrs(const solverHandle_t &handle, } template <> -void cusolver_potrs(const cusolverDnHandle_t &handle, - cublasFillMode_t uplo, - int M, - int N, - phi::complex128 *Adata, - int lda, - phi::complex128 *Bdata, - int ldb, - int *devInfo) { +void cusolver_potrs(const cusolverDnHandle_t &handle, + cublasFillMode_t uplo, + int M, + int N, + complex128 *Adata, + int lda, + complex128 *Bdata, + int ldb, + int *devInfo) { PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnZpotrs( handle, uplo, diff --git a/paddle/phi/kernels/gpu/class_center_sample_kernel.cu b/paddle/phi/kernels/gpu/class_center_sample_kernel.cu index fab500ff2991d7..bcf7986c1f8d1c 100644 --- a/paddle/phi/kernels/gpu/class_center_sample_kernel.cu +++ b/paddle/phi/kernels/gpu/class_center_sample_kernel.cu @@ -350,9 +350,9 @@ void ClassCenterSampleKernel(const Context& dev_ctx, #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (nranks > 1) { auto stream = dev_ctx.stream(); - phi::distributed::NCCLCommContext* comm_ctx = nullptr; - comm_ctx = static_cast( - dev_ctx.GetCommContext()); + distributed::NCCLCommContext* comm_ctx = nullptr; + comm_ctx = + static_cast(dev_ctx.GetCommContext()); PADDLE_ENFORCE_NE(comm_ctx, nullptr, common::errors::Unavailable( @@ -361,7 +361,7 @@ void 
ClassCenterSampleKernel(const Context& dev_ctx, comm_ctx->AllReduce( &num_classes_per_device, num_classes_per_device, ncclSum, stream); - phi::backends::gpu::GpuStreamSync(stream); + backends::gpu::GpuStreamSync(stream); } #endif @@ -446,7 +446,7 @@ void ClassCenterSampleKernel(const Context& dev_ctx, (NumBlocks(num_classes) * kNumCUDAThreads * vec_size) + 1) * vec_size; - // auto gen_cuda = phi::DefaultCUDAGenerator(device_id); + // auto gen_cuda = DefaultCUDAGenerator(device_id); auto gen_cuda = dev_ctx.GetGenerator(); if (!fix_seed) { auto seed_offset = gen_cuda->IncrementOffset(offset); diff --git a/paddle/phi/kernels/gpu/collect_fpn_proposals_kernel.cu b/paddle/phi/kernels/gpu/collect_fpn_proposals_kernel.cu index 47783fa3c5a093..a5ef2390d0d911 100644 --- a/paddle/phi/kernels/gpu/collect_fpn_proposals_kernel.cu +++ b/paddle/phi/kernels/gpu/collect_fpn_proposals_kernel.cu @@ -111,18 +111,18 @@ void GPUCollectFpnProposalsOpKernel( } } - phi::memory_utils::Copy(place, - concat_rois_data + roi_offset, - place, - roi_in->data(), - roi_in->numel() * sizeof(T), - dev_ctx.stream()); - phi::memory_utils::Copy(place, - concat_scores_data + score_offset, - place, - score_in->data(), - score_in->numel() * sizeof(T), - dev_ctx.stream()); + memory_utils::Copy(place, + concat_rois_data + roi_offset, + place, + roi_in->data(), + roi_in->numel() * sizeof(T), + dev_ctx.stream()); + memory_utils::Copy(place, + concat_scores_data + score_offset, + place, + score_in->data(), + score_in->numel() * sizeof(T), + dev_ctx.stream()); roi_offset += roi_in->numel(); score_offset += score_in->numel(); } @@ -161,7 +161,7 @@ void GPUCollectFpnProposalsOpKernel( sizeof(T) * 8, dev_ctx.stream()); // Allocate temporary storage - auto d_temp_storage = phi::memory_utils::Alloc(place, temp_storage_bytes); + auto d_temp_storage = memory_utils::Alloc(place, temp_storage_bytes); // Run sorting operation // sort score to get corresponding index @@ -208,7 +208,7 @@ void GPUCollectFpnProposalsOpKernel( sizeof(int) * 8, dev_ctx.stream()); // Allocate temporary storage - d_temp_storage = phi::memory_utils::Alloc(place, temp_storage_bytes); + d_temp_storage = memory_utils::Alloc(place, temp_storage_bytes); // Run sorting operation // sort batch_id to get corresponding index @@ -246,12 +246,12 @@ void GPUCollectFpnProposalsOpKernel( "address into the graph; on replay the vector is re-created at a " "different address, causing a dangling-pointer write.")); std::vector length_lod_cpu(lod_size); - phi::memory_utils::Copy(CPUPlace(), - length_lod_cpu.data(), - place, - length_lod_data, - sizeof(int) * lod_size, - dev_ctx.stream()); + memory_utils::Copy(CPUPlace(), + length_lod_cpu.data(), + place, + length_lod_data, + sizeof(int) * lod_size, + dev_ctx.stream()); dev_ctx.Wait(); std::vector offset(1, 0); @@ -263,12 +263,12 @@ void GPUCollectFpnProposalsOpKernel( auto* rois_num = rois_num_out; rois_num->Resize({lod_size}); int* rois_num_data = dev_ctx.template Alloc(rois_num); - phi::memory_utils::Copy(place, - rois_num_data, - place, - length_lod_data, - lod_size * sizeof(int), - dev_ctx.stream()); + memory_utils::Copy(place, + rois_num_data, + place, + length_lod_data, + lod_size * sizeof(int), + dev_ctx.stream()); } LegacyLoD lod; diff --git a/paddle/phi/kernels/gpu/comm_init_all_kernel.cu b/paddle/phi/kernels/gpu/comm_init_all_kernel.cu index 9f759fdc0f4ddf..485bead29d228c 100644 --- a/paddle/phi/kernels/gpu/comm_init_all_kernel.cu +++ b/paddle/phi/kernels/gpu/comm_init_all_kernel.cu @@ -29,7 +29,7 @@ void CommInitAllKernel(const 
Context& dev_ctx, #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) std::vector devices = devices_input; if (devices.empty()) { - devices = phi::backends::gpu::GetSelectedDevices(); + devices = backends::gpu::GetSelectedDevices(); } paddle::platform::NCCLCommContext::Instance().CreateAllNCCLComms(devices, diff --git a/paddle/phi/kernels/gpu/concat_kernel.cu b/paddle/phi/kernels/gpu/concat_kernel.cu index 9618a4683c424a..d921d9badc021b 100644 --- a/paddle/phi/kernels/gpu/concat_kernel.cu +++ b/paddle/phi/kernels/gpu/concat_kernel.cu @@ -72,8 +72,8 @@ void ConcatKernel(const Context& dev_ctx, if (lod_size) { auto* out_lod = out->mutable_lod(); for (size_t i = 1; i < x.size(); ++i) { - auto in_lod = phi::ConvertToLengthBasedLegacyLoD(x[i]->lod()); - phi::AppendLegacyLoD(out_lod, in_lod); + auto in_lod = ConvertToLengthBasedLegacyLoD(x[i]->lod()); + AppendLegacyLoD(out_lod, in_lod); } } } diff --git a/paddle/phi/kernels/gpu/contiguous_kernel.cu b/paddle/phi/kernels/gpu/contiguous_kernel.cu index 90a734eddf76ac..3feb65323286f5 100644 --- a/paddle/phi/kernels/gpu/contiguous_kernel.cu +++ b/paddle/phi/kernels/gpu/contiguous_kernel.cu @@ -167,8 +167,8 @@ __global__ void ContiguousCaseOneFunc( template __global__ void ContiguousDefaultFunc( const T* input_data, - phi::Array input_stride, - phi::Array dims, + Array input_stride, + Array dims, const int64_t numel, T* out_data) { CUDA_KERNEL_LOOP_TYPE(i, numel, int64_t) { @@ -233,8 +233,8 @@ template bool LaunchContiguousCaseZeroKernel( const Context& dev_ctx, const T* input_data, - const phi::Array& input_stride, - const phi::Array& input_dims, + const Array& input_stride, + const Array& input_dims, int rank, T* output_data) { if (rank > 6) { @@ -305,13 +305,13 @@ template bool LaunchContiguousCaseOneKernel( const Context& dev_ctx, const T* input_data, - const phi::Array& input_stride, - const phi::Array& input_dims, + const Array& input_stride, + const Array& input_dims, int rank, int64_t numel, T* output_data) { Dim3 grid(1, 1, 1), block(1, 1, 1); - phi::Array cur_input_dims; + Array cur_input_dims; block.x = 512; if (rank >= 1) { @@ -445,8 +445,8 @@ template void LaunchContiguousDefaultKernel( const Context& dev_ctx, const T* input_data, - const phi::Array& input_stride, - const phi::Array& input_dims, + const Array& input_stride, + const Array& input_dims, int rank, int64_t numel, T* output_data) { diff --git a/paddle/phi/kernels/gpu/conv_transpose_grad_kernel.cu b/paddle/phi/kernels/gpu/conv_transpose_grad_kernel.cu index b26fa327e01cc8..909fa7ef3a2a1b 100644 --- a/paddle/phi/kernels/gpu/conv_transpose_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/conv_transpose_grad_kernel.cu @@ -111,7 +111,7 @@ void DepthwiseConv2dTransposeGradKernel(const Context& dev_ctx, &paddings_, &dilations_, padding_algorithm, in_data_dims, strides, ksize); if (dx) { - phi::math::DepthwiseConvFunctor depthwiseConv; + math::DepthwiseConvFunctor depthwiseConv; depthwiseConv(dev_ctx, dout, filter_, @@ -128,8 +128,7 @@ void DepthwiseConv2dTransposeGradKernel(const Context& dev_ctx, dev_ctx.template Alloc(dfilter); set_zero(dev_ctx, dfilter, static_cast(0)); - phi::math::DepthwiseConvFilterGradFunctor - depthwiseConvFilterGrad; + math::DepthwiseConvFilterGradFunctor depthwiseConvFilterGrad; depthwiseConvFilterGrad( dev_ctx, dout, diff --git a/paddle/phi/kernels/gpu/conv_transpose_kernel.cu b/paddle/phi/kernels/gpu/conv_transpose_kernel.cu index bdca3cc64f667d..cb8f5d6eada59c 100644 --- a/paddle/phi/kernels/gpu/conv_transpose_kernel.cu +++ 
b/paddle/phi/kernels/gpu/conv_transpose_kernel.cu @@ -85,7 +85,7 @@ void DepthwiseConv2dTransposeKernel(const Context& dev_ctx, funcs::SetConstant set_zero; set_zero(dev_ctx, out, static_cast(0)); - phi::math::DepthwiseConvInputGradFunctor depthwiseConvInputGrad; + math::DepthwiseConvInputGradFunctor depthwiseConvInputGrad; depthwiseConvInputGrad( dev_ctx, *out, diff --git a/paddle/phi/kernels/gpu/cross_grad_kernel.cu b/paddle/phi/kernels/gpu/cross_grad_kernel.cu index 10f88d90317a3f..21f55aa59f702f 100644 --- a/paddle/phi/kernels/gpu/cross_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/cross_grad_kernel.cu @@ -40,7 +40,7 @@ __global__ void CrossGrad(const T* x, int64_t pos1 = offset + 1 * stride; int64_t pos2 = offset + 2 * stride; - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; MPType x_pos0_mp = static_cast(x[pos0]); MPType x_pos1_mp = static_cast(x[pos1]); diff --git a/paddle/phi/kernels/gpu/cross_kernel.cu b/paddle/phi/kernels/gpu/cross_kernel.cu index f93e92b343caff..3d491b5445d2bd 100644 --- a/paddle/phi/kernels/gpu/cross_kernel.cu +++ b/paddle/phi/kernels/gpu/cross_kernel.cu @@ -37,7 +37,7 @@ __global__ void Cross(const T* x, int64_t pos1 = offset + 1 * stride; int64_t pos2 = offset + 2 * stride; - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; MPType x_pos0_mp = static_cast(x[pos0]); MPType x_pos1_mp = static_cast(x[pos1]); diff --git a/paddle/phi/kernels/gpu/ctc_align_kernel.cu b/paddle/phi/kernels/gpu/ctc_align_kernel.cu index ca256a9f91b7a0..841965463ed124 100644 --- a/paddle/phi/kernels/gpu/ctc_align_kernel.cu +++ b/paddle/phi/kernels/gpu/ctc_align_kernel.cu @@ -125,7 +125,7 @@ void CTCAlignOpCUDAKernel(const Context& dev_ctx, output->Resize({num_tokens, 1}); T* output_data = dev_ctx.template Alloc(output); - phi::MixVector mixv_input_lod(&input_lod[level]); + MixVector mixv_input_lod(&input_lod[level]); MergeAndDelCudaKernel<<<1, 1, 0, stream>>>( num_tokens, tokens, diff --git a/paddle/phi/kernels/gpu/cudnn_lstm_cache.h b/paddle/phi/kernels/gpu/cudnn_lstm_cache.h index aeb3af5350441b..e3439ed4264127 100644 --- a/paddle/phi/kernels/gpu/cudnn_lstm_cache.h +++ b/paddle/phi/kernels/gpu/cudnn_lstm_cache.h @@ -49,13 +49,13 @@ class ScopedRNNBase { template void Create(const cudnnHandle_t& handle, - const phi::Place& place, + const Place& place, const std::vector& sequence_length, size_t* workspace_size, size_t* reserve_size, DenseTensor* dropout_state) { int numDirections = is_bidirec_ ? 
2 : 1; - cudnnDataType_t cudnn_type = phi::backends::gpu::CudnnDataType::type; + cudnnDataType_t cudnn_type = backends::gpu::CudnnDataType::type; // ------------------- cudnn x, y descriptors --------------------- std::vector dims_x = {batch_size_, input_size_, 1}; @@ -238,19 +238,19 @@ class ScopedRNNBase { std::vector x_descs_; std::vector y_descs_; - phi::backends::gpu::ScopedTensorDescriptor x_desc_; - phi::backends::gpu::ScopedTensorDescriptor y_desc_; + backends::gpu::ScopedTensorDescriptor x_desc_; + backends::gpu::ScopedTensorDescriptor y_desc_; #if CUDNN_VERSION >= 7201 - phi::backends::gpu::ScopedRNNTensorDescriptor x_seq_desc_; - phi::backends::gpu::ScopedRNNTensorDescriptor y_seq_desc_; + backends::gpu::ScopedRNNTensorDescriptor x_seq_desc_; + backends::gpu::ScopedRNNTensorDescriptor y_seq_desc_; #endif - phi::backends::gpu::ScopedTensorDescriptor init_h_desc_; - phi::backends::gpu::ScopedTensorDescriptor init_c_desc_; - phi::backends::gpu::ScopedTensorDescriptor last_h_desc_; - phi::backends::gpu::ScopedTensorDescriptor last_c_desc_; - phi::backends::gpu::ScopedDropoutDescriptor dropout_desc_; - phi::backends::gpu::ScopedFilterDescriptor weight_desc_; - phi::backends::gpu::ScopedRNNDescriptor rnn_desc_; + backends::gpu::ScopedTensorDescriptor init_h_desc_; + backends::gpu::ScopedTensorDescriptor init_c_desc_; + backends::gpu::ScopedTensorDescriptor last_h_desc_; + backends::gpu::ScopedTensorDescriptor last_c_desc_; + backends::gpu::ScopedDropoutDescriptor dropout_desc_; + backends::gpu::ScopedFilterDescriptor weight_desc_; + backends::gpu::ScopedRNNDescriptor rnn_desc_; }; } // namespace phi diff --git a/paddle/phi/kernels/gpu/cudnn_lstm_grad_kernel.cu b/paddle/phi/kernels/gpu/cudnn_lstm_grad_kernel.cu index 845f3d17784479..877ffad5c46620 100644 --- a/paddle/phi/kernels/gpu/cudnn_lstm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/cudnn_lstm_grad_kernel.cu @@ -112,7 +112,7 @@ void CudnnLSTMGradKernel( bool has_seq_length = running_seq_length != nullptr; std::vector SequenceLength; if (has_seq_length) { - SequenceLength = phi::GetVectorFromTensor(running_seq_length); + SequenceLength = GetVectorFromTensor(running_seq_length); } int seq_length = input_dims[0]; diff --git a/paddle/phi/kernels/gpu/cudnn_lstm_kernel.cu b/paddle/phi/kernels/gpu/cudnn_lstm_kernel.cu index bb3b9fc6dd5086..76dd3ad716dc7b 100644 --- a/paddle/phi/kernels/gpu/cudnn_lstm_kernel.cu +++ b/paddle/phi/kernels/gpu/cudnn_lstm_kernel.cu @@ -186,7 +186,7 @@ void CudnnLSTMKernel( if (seed == 0) { // If not specify seed, use global Generator to generate seed. 
int device_id = dev_ctx.GetPlace().GetDeviceId(); - auto gen_cuda = phi::DefaultCUDAGenerator(device_id); + auto gen_cuda = DefaultCUDAGenerator(device_id); seed = static_cast(gen_cuda->Random64()); } } @@ -195,7 +195,7 @@ void CudnnLSTMKernel( bool has_seq_length = running_sequence_length != nullptr; std::vector SequenceLength; if (has_seq_length) { - SequenceLength = phi::GetVectorFromTensor(running_sequence_length); + SequenceLength = GetVectorFromTensor(running_sequence_length); } auto handle = dev_ctx.cudnn_handle(); diff --git a/paddle/phi/kernels/gpu/cudnn_lstm_utils.h b/paddle/phi/kernels/gpu/cudnn_lstm_utils.h index 182bc21af18bde..32a4738831bc1d 100644 --- a/paddle/phi/kernels/gpu/cudnn_lstm_utils.h +++ b/paddle/phi/kernels/gpu/cudnn_lstm_utils.h @@ -53,7 +53,7 @@ inline int size_sum(const std::vector &weight_list) { template inline void weight_to_tensor( - const phi::Place &place, + const Place &place, gpuStream_t stream, const std::vector &weight_list, DenseTensor *weight) { diff --git a/paddle/phi/kernels/gpu/cum_kernel.cu b/paddle/phi/kernels/gpu/cum_kernel.cu index 0a7d31e5c22860..a120262e3f20c1 100644 --- a/paddle/phi/kernels/gpu/cum_kernel.cu +++ b/paddle/phi/kernels/gpu/cum_kernel.cu @@ -203,7 +203,7 @@ __global__ void BlockScanKernel(T* d_out, int64_t scan_size, bool exclusive, Op op) { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; using CallbackOp = BlockPrefixCallbackOp; // Specialize BlockLoad, BlockStore, and BlockRadixSort collective types @@ -263,13 +263,13 @@ void ThrustCumsumKernel(const Context& dev_ctx, int64_t size, bool reverse, bool exclusive) { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; #ifdef __HIPCC__ const auto& policy = thrust::hip::par.on(dev_ctx.stream()); #else - phi::memory_utils::ThrustAllocator allocator(dev_ctx.GetPlace(), - dev_ctx.stream()); + memory_utils::ThrustAllocator allocator(dev_ctx.GetPlace(), + dev_ctx.stream()); const auto& policy = thrust::cuda::par(allocator).on(dev_ctx.stream()); #endif @@ -459,11 +459,10 @@ void CumsumKernel(const Context& dev_ctx, bool exclusive, bool reverse, DenseTensor* out) { - using Op = - typename std::conditional::value || - std::is_same::value, - ComplexSum, - cub::Sum>::type; + using Op = typename std::conditional::value || + std::is_same::value, + ComplexSum, + cub::Sum>::type; if (FLAGS_use_accuracy_compatible_kernel && !exclusive) { if (out && out->numel() == 0) { dev_ctx.template Alloc(out); diff --git a/paddle/phi/kernels/gpu/cumprod_grad_kernel.cu b/paddle/phi/kernels/gpu/cumprod_grad_kernel.cu index 10a11e1f1e16ce..a33d7e34a66538 100644 --- a/paddle/phi/kernels/gpu/cumprod_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/cumprod_grad_kernel.cu @@ -169,15 +169,15 @@ void ReversedCumsum(const Context &dev_ctx, flipped_input.Resize(input.dims()); dev_ctx.template Alloc(&flipped_input); std::vector axis = {dim}; - phi::FlipKernel(dev_ctx, input, axis, &flipped_input); + FlipKernel(dev_ctx, input, axis, &flipped_input); DenseTensor cumsum_out; cumsum_out.Resize(input.dims()); dev_ctx.template Alloc(&cumsum_out); - phi::CumsumKernel( + CumsumKernel( dev_ctx, flipped_input, dim, false, false, false, &cumsum_out); - phi::FlipKernel(dev_ctx, cumsum_out, axis, output); + FlipKernel(dev_ctx, cumsum_out, axis, output); } template @@ -195,7 +195,7 @@ bool CumprodGradCompatible(const Context &dev_ctx, bool is_trivial = (x.numel() <= 1) || (x_dims[wrap_dim] == 1); if (is_trivial) { dev_ctx.template 
Alloc(dx); - phi::Copy(dev_ctx, dout, dev_ctx.GetPlace(), false, dx); + Copy(dev_ctx, dout, dev_ctx.GetPlace(), false, dx); return true; } @@ -203,42 +203,40 @@ bool CumprodGradCompatible(const Context &dev_ctx, DenseTensor x_conj_tensor; DenseTensor out_conj_tensor; - if (phi::IsComplexType(x.dtype())) { + if (IsComplexType(x.dtype())) { x_conj_tensor.Resize(x.dims()); out_conj_tensor.Resize(out.dims()); dev_ctx.template Alloc(&x_conj_tensor); dev_ctx.template Alloc(&out_conj_tensor); - phi::ConjKernel(dev_ctx, x, &x_conj_tensor); - phi::ConjKernel(dev_ctx, out, &out_conj_tensor); + ConjKernel(dev_ctx, x, &x_conj_tensor); + ConjKernel(dev_ctx, out, &out_conj_tensor); } - const DenseTensor &x_ref = phi::IsComplexType(x.dtype()) ? x_conj_tensor : x; - const DenseTensor &out_ref = - phi::IsComplexType(x.dtype()) ? out_conj_tensor : out; + const DenseTensor &x_ref = IsComplexType(x.dtype()) ? x_conj_tensor : x; + const DenseTensor &out_ref = IsComplexType(x.dtype()) ? out_conj_tensor : out; DenseTensor zero_val; zero_val.Resize({1}); dev_ctx.template Alloc(&zero_val); - phi::FullKernel( - dev_ctx, {1}, static_cast(0), x.dtype(), &zero_val); + FullKernel(dev_ctx, {1}, static_cast(0), x.dtype(), &zero_val); DenseTensor is_zero_mask; is_zero_mask.Resize(x.dims()); dev_ctx.template Alloc(&is_zero_mask); - phi::EqualKernel(dev_ctx, x, zero_val, &is_zero_mask); + EqualKernel(dev_ctx, x, zero_val, &is_zero_mask); DenseTensor any_zero; any_zero.Resize({1}); dev_ctx.template Alloc(&any_zero); - phi::AnyKernel( + AnyKernel( dev_ctx, is_zero_mask, std::vector(), false, &any_zero); bool has_zero = false; #ifdef PADDLE_WITH_CUDA DenseTensor any_zero_cpu; - phi::Copy(dev_ctx, any_zero, CPUPlace(), true, &any_zero_cpu); + Copy(dev_ctx, any_zero, CPUPlace(), true, &any_zero_cpu); has_zero = *any_zero_cpu.data(); #else has_zero = *any_zero.data(); @@ -253,7 +251,7 @@ bool CumprodGradCompatible(const Context &dev_ctx, DenseTensor w; w.Resize(out_ref.dims()); dev_ctx.template Alloc(&w); - phi::MultiplyKernel(dev_ctx, out_ref, dout, &w); + MultiplyKernel(dev_ctx, out_ref, dout, &w); DenseTensor w_flipped, w_cum, rc_w; w_flipped.Resize(w.dims()); @@ -265,14 +263,14 @@ bool CumprodGradCompatible(const Context &dev_ctx, dev_ctx.template Alloc(&rc_w); std::vector axis = {dim}; - phi::FlipKernel(dev_ctx, w, axis, &w_flipped); + FlipKernel(dev_ctx, w, axis, &w_flipped); - phi::CumsumKernel( + CumsumKernel( dev_ctx, w_flipped, dim, false, false, false, &w_cum); - phi::FlipKernel(dev_ctx, w_cum, axis, &rc_w); + FlipKernel(dev_ctx, w_cum, axis, &rc_w); - phi::DivideKernel(dev_ctx, rc_w, x_ref, dx); + DivideKernel(dev_ctx, rc_w, x_ref, dx); return true; } @@ -324,7 +322,7 @@ void CumprodGradKernel(const Context &dev_ctx, const T *y_data_deal; Allocator::AllocationPtr x_conj; Allocator::AllocationPtr y_conj; - if (phi::IsComplexType(x.dtype())) { + if (IsComplexType(x.dtype())) { x_conj = const_cast(dev_ctx.GetAllocator()) .Allocate(numel * sizeof(T)); auto *x_data_conj = reinterpret_cast(x_conj->ptr()); diff --git a/paddle/phi/kernels/gpu/cvm_grad_kernel.cu b/paddle/phi/kernels/gpu/cvm_grad_kernel.cu index 957349834e461c..1d14e131203f5a 100644 --- a/paddle/phi/kernels/gpu/cvm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/cvm_grad_kernel.cu @@ -22,8 +22,6 @@ namespace phi { -using phi::PADDLE_CUDA_NUM_THREADS; - template __global__ void CvmGradComputeKernel(const bool use_cvm, const int64_t item_width, @@ -105,7 +103,7 @@ void CVMGradCUDAKernel(const Context& dev_ctx, lod[lod.size() - 1], common::errors::PreconditionNotMet( 
"Output(X@GRAD)'s dim[0] must be equal to last element of lod")); - phi::MixVector mixv_lod(&lod); + MixVector mixv_lod(&lod); CvmGradComputeKernel<<<(dx_numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, PADDLE_CUDA_NUM_THREADS, diff --git a/paddle/phi/kernels/gpu/cvm_kernel.cu b/paddle/phi/kernels/gpu/cvm_kernel.cu index 597ecfb92b818b..204ed5cb2dd81b 100644 --- a/paddle/phi/kernels/gpu/cvm_kernel.cu +++ b/paddle/phi/kernels/gpu/cvm_kernel.cu @@ -21,8 +21,6 @@ namespace phi { -using phi::PADDLE_CUDA_NUM_THREADS; - template __global__ void CvmComputeKernel(const bool use_cvm, const int64_t item_width, diff --git a/paddle/phi/kernels/gpu/depthwise_conv2d_bias_grad_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv2d_bias_grad_kernel.cu index d8d3577491e1a0..1594b4030e2a07 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv2d_bias_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/depthwise_conv2d_bias_grad_kernel.cu @@ -54,27 +54,21 @@ __device__ __forceinline__ T WARP_SHFL_DOWN(T value, } template <> -__device__ __forceinline__ phi::dtype::float16 -WARP_SHFL_DOWN(phi::dtype::float16 value, - unsigned int delta, - int width, - unsigned int mask) { +__device__ __forceinline__ dtype::float16 WARP_SHFL_DOWN( + dtype::float16 value, unsigned int delta, int width, unsigned int mask) { uint16_t val_as_ushort = *reinterpret_cast(&value); uint16_t shuffled = WARP_SHFL_DOWN(val_as_ushort, delta, width, mask); - return *reinterpret_cast(&shuffled); + return *reinterpret_cast(&shuffled); } template <> -__device__ __forceinline__ phi::dtype::bfloat16 -WARP_SHFL_DOWN(phi::dtype::bfloat16 value, - unsigned int delta, - int width, - unsigned int mask) { +__device__ __forceinline__ dtype::bfloat16 WARP_SHFL_DOWN( + dtype::bfloat16 value, unsigned int delta, int width, unsigned int mask) { uint16_t val_as_ushort = *reinterpret_cast(&value); uint16_t shuffled = WARP_SHFL_DOWN(val_as_ushort, delta, width, mask); - return *reinterpret_cast(&shuffled); + return *reinterpret_cast(&shuffled); } template @@ -124,7 +118,7 @@ __global__ void DWConv2dBwdInputKernel(const T* __restrict__ grad_output, const int padHeight, const int dilationWidth, const int dilationHeight) { - using AccT = typename phi::dtype::MPTypeTrait::Type; + using AccT = typename dtype::MPTypeTrait::Type; const int KW_LIMIT = (kSize != 0) ? kSize : kernelWidth; const int KH_LIMIT = (kSize != 0) ? kSize : kernelHeight; const int strideW = (stride != 0) ? 
stride : strideWidth; @@ -194,7 +188,7 @@ __global__ void DWConv2dBwdWeightKernel(const T* __restrict__ grad_output, const int padHeight, const int dilationWidth, const int dilationHeight) { - using AccT = typename phi::dtype::MPTypeTrait::Type; + using AccT = typename dtype::MPTypeTrait::Type; const int channelStride = kernelWidth * kernelHeight; int bidx = blockIdx.x; @@ -316,7 +310,7 @@ void LaunchDepthwiseConv2dBackwardCompatible(const Context& dev_ctx, // Launch Filter Gradient Kernel (grad_weight) if (filter_grad_nchw_ptr) { - phi::funcs::SetConstant set_zero; + funcs::SetConstant set_zero; set_zero(dev_ctx, filter_grad_nchw_ptr, static_cast(0)); int blocks = outputChannels * kH * kW; @@ -324,7 +318,7 @@ void LaunchDepthwiseConv2dBackwardCompatible(const Context& dev_ctx, dim3 block(GetGradParamsNumThreads(batchSize)); size_t smem = (block.x / CUDA_WARP_SIZE) * - sizeof(typename phi::dtype::MPTypeTrait::Type); + sizeof(typename dtype::MPTypeTrait::Type); DWConv2dBwdWeightKernel <<>>(out_grad_nchw.data(), @@ -420,12 +414,12 @@ void LaunchDepthwiseConv2dBackwardCompatible(const Context& dev_ctx, // Reduce over N(0), H(2), W(3) to get [C] std::vector reduce_dims = {0, 2, 3}; - phi::SumKernel(dev_ctx, - out_grad_nchw, - phi::IntArray(reduce_dims), - CppTypeToDataType::Type(), - false, - bias_grad); + SumKernel(dev_ctx, + out_grad_nchw, + IntArray(reduce_dims), + CppTypeToDataType::Type(), + false, + bias_grad); } if (input_grad && channel_last) { diff --git a/paddle/phi/kernels/gpu/depthwise_conv2d_bias_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv2d_bias_kernel.cu index 557717c821b8be..a9d022e2ab01ce 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv2d_bias_kernel.cu +++ b/paddle/phi/kernels/gpu/depthwise_conv2d_bias_kernel.cu @@ -52,7 +52,7 @@ __global__ void DWConv2dFwdKernel(const T* __restrict__ input, const int padHeight, const int dilationWidth, const int dilationHeight) { - using AccT = typename phi::dtype::MPTypeTrait::Type; + using AccT = typename dtype::MPTypeTrait::Type; const int KW_LIMIT = (kSize != 0) ? kSize : kernelWidth; const int KH_LIMIT = (kSize != 0) ? 
kSize : kernelHeight; @@ -119,7 +119,7 @@ __global__ void DWConv2dFwdKernelGeneric(const T* __restrict__ input, const int padHeight, const int dilationWidth, const int dilationHeight) { - using AccT = typename phi::dtype::MPTypeTrait::Type; + using AccT = typename dtype::MPTypeTrait::Type; for (IndexT linearIndex = blockIdx.x * blockDim.x + threadIdx.x; linearIndex < totalElements; diff --git a/paddle/phi/kernels/gpu/depthwise_conv3d_bias_grad_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv3d_bias_grad_kernel.cu index 58515d8a415a03..2c7913a17f5083 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv3d_bias_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/depthwise_conv3d_bias_grad_kernel.cu @@ -353,7 +353,7 @@ void LaunchDepthwiseConv3dBackwardCompatible(const Context& dev_ctx, } auto stream = dev_ctx.stream(); - using AccT = typename phi::dtype::MPTypeTrait::Type; + using AccT = typename dtype::MPTypeTrait::Type; const T* input_ptr = input_ncdhw.data(); const T* grad_output_ptr = out_grad_ncdhw.data(); @@ -600,12 +600,12 @@ void LaunchDepthwiseConv3dBackwardCompatible(const Context& dev_ctx, dev_ctx.template Alloc(bias_grad); // Reduce N(0), D(2), H(3), W(4) -> C(1) for NCDHW std::vector reduce_dims = {0, 2, 3, 4}; - phi::SumKernel(dev_ctx, - out_grad_ncdhw, - phi::IntArray(reduce_dims), - CppTypeToDataType::Type(), - false, - bias_grad); + SumKernel(dev_ctx, + out_grad_ncdhw, + IntArray(reduce_dims), + CppTypeToDataType::Type(), + false, + bias_grad); } if (input_grad && channel_last) { diff --git a/paddle/phi/kernels/gpu/depthwise_conv3d_bias_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv3d_bias_kernel.cu index f3a0e1bf85f491..ef16c89fbc6888 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv3d_bias_kernel.cu +++ b/paddle/phi/kernels/gpu/depthwise_conv3d_bias_kernel.cu @@ -190,7 +190,7 @@ void LaunchDepthwiseConv3dCompatible(const Context& dev_ctx, int grid = std::min((num_outputs - 1) / block + 1, (int64_t)65536); auto stream = dev_ctx.stream(); - using AccT = typename phi::dtype::MPTypeTrait::Type; + using AccT = typename dtype::MPTypeTrait::Type; const T* input_ptr = input_ncdhw.data(); T* output_ptr = out_ncdhw.data(); diff --git a/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu index 058f2d34cdf577..4be5bea2db8c20 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu @@ -76,18 +76,18 @@ void DepthwiseConvGradKernel(const Context& dev_ctx, dev_ctx.template Alloc(filter_grad); set_zero(dev_ctx, filter_grad, static_cast(0)); } - phi::DepthwiseConvCudnnGradKernel(dev_ctx, - input, - filter, - *output_grad, - strides_t, - paddings_t, - padding_algorithm, - groups, - dilations_t, - data_format, - input_grad, - filter_grad); + DepthwiseConvCudnnGradKernel(dev_ctx, + input, + filter, + *output_grad, + strides_t, + paddings_t, + padding_algorithm, + groups, + dilations_t, + data_format, + input_grad, + filter_grad); return; } #endif @@ -121,7 +121,7 @@ void DepthwiseConvGradKernel(const Context& dev_ctx, set_zero(dev_ctx, input_grad, static_cast(0)); if (fuse_relu) { - phi::math::DepthwiseConvInputGradFunctor + math::DepthwiseConvInputGradFunctor depthwiseConvInputGrad; depthwiseConvInputGrad(dev_ctx, input, @@ -133,7 +133,7 @@ void DepthwiseConvGradKernel(const Context& dev_ctx, input_grad, data_layout); } else { - phi::math::DepthwiseConvInputGradFunctor + math::DepthwiseConvInputGradFunctor depthwiseConvInputGrad; depthwiseConvInputGrad(dev_ctx, input, @@ 
-151,7 +151,7 @@ void DepthwiseConvGradKernel(const Context& dev_ctx, dev_ctx.template Alloc(filter_grad); set_zero(dev_ctx, filter_grad, static_cast(0)); if (fuse_relu) { - phi::math::DepthwiseConvFilterGradFunctor + math::DepthwiseConvFilterGradFunctor depthwiseConvFilterGrad; depthwiseConvFilterGrad(dev_ctx, input, @@ -162,7 +162,7 @@ void DepthwiseConvGradKernel(const Context& dev_ctx, filter_grad, data_layout); } else { - phi::math::DepthwiseConvFilterGradFunctor + math::DepthwiseConvFilterGradFunctor depthwiseConvFilterGrad; depthwiseConvFilterGrad(dev_ctx, input, diff --git a/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu index 27529b8cd707bc..6c1bfa0596a4c8 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu +++ b/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu @@ -80,16 +80,16 @@ void DepthwiseConvKernel(const Context& dev_ctx, !defined(PADDLE_WITH_HIP) DWConvParams params(has_fuse_relu, data_format, strides, dilations); if (params.UseCudnnDepthwise(dev_ctx, input, filter)) { - phi::DepthwiseConvCudnnKernel(dev_ctx, - input, - filter, - strides_t, - paddings_t, - padding_algorithm, - groups, - dilations_t, - data_format, - out); + DepthwiseConvCudnnKernel(dev_ctx, + input, + filter, + strides_t, + paddings_t, + padding_algorithm, + groups, + dilations_t, + data_format, + out); return; } #endif @@ -119,7 +119,7 @@ void DepthwiseConvKernel(const Context& dev_ctx, } if (fuse_relu) { - phi::math::DepthwiseConvFunctor depthwiseConv; + math::DepthwiseConvFunctor depthwiseConv; depthwiseConv(dev_ctx, input, filter, @@ -129,7 +129,7 @@ void DepthwiseConvKernel(const Context& dev_ctx, output, data_layout); } else { - phi::math::DepthwiseConvFunctor depthwiseConv; + math::DepthwiseConvFunctor depthwiseConv; depthwiseConv(dev_ctx, input, filter, diff --git a/paddle/phi/kernels/gpu/determinant_kernel.cu b/paddle/phi/kernels/gpu/determinant_kernel.cu index b377f83f59a49e..60bb1b68f9038a 100644 --- a/paddle/phi/kernels/gpu/determinant_kernel.cu +++ b/paddle/phi/kernels/gpu/determinant_kernel.cu @@ -36,10 +36,9 @@ template class EigenMatrix {}; template <> -class EigenMatrix { +class EigenMatrix { public: - using MatrixType = - Eigen::Matrix; + using MatrixType = Eigen::Matrix; }; template <> @@ -85,7 +84,7 @@ struct DeterminantCudaFunctor { std::vector input_vec; std::vector output_vec; TensorToVector(input, dev_ctx, &input_vec); - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; for (int64_t i = 0; i < batch_count; ++i) { // maybe can be parallel auto begin_iter = input_vec.begin() + i * rank * rank; auto end_iter = input_vec.begin() + (i + 1) * rank * rank; @@ -129,31 +128,31 @@ __global__ void GetDetFromLUComplex(const T* lu_data, } template -struct DeterminantCudaFunctor, Context> { +struct DeterminantCudaFunctor, Context> { void operator()(const Context& dev_ctx, const DenseTensor& a, int64_t n, int64_t batch_size, DenseTensor* output) { #ifndef PADDLE_WITH_HIP - phi::Allocator::AllocationPtr tmp_gpu_mat_data; - const phi::dtype::complex* gpu_mat = a.data>(); + Allocator::AllocationPtr tmp_gpu_mat_data; + const dtype::complex* gpu_mat = a.data>(); // Copy all elements of input matrix A to a temporary memory space to // avoid being overridden by getrf. 
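    // getrfBatched factorizes in place, overwriting each matrix with its L/U
    // factors, so the factorization must run on this temporary copy rather
    // than on the caller's input tensor `a`.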
- tmp_gpu_mat_data = phi::memory_utils::Alloc( + tmp_gpu_mat_data = memory_utils::Alloc( dev_ctx.GetPlace(), - a.numel() * sizeof(phi::dtype::complex), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + a.numel() * sizeof(dtype::complex), + Stream(reinterpret_cast(dev_ctx.stream()))); memory_utils::Copy(dev_ctx.GetPlace(), tmp_gpu_mat_data->ptr(), dev_ctx.GetPlace(), a.data(), - a.numel() * sizeof(phi::dtype::complex), + a.numel() * sizeof(dtype::complex), dev_ctx.stream()); - gpu_mat = reinterpret_cast*>( - tmp_gpu_mat_data->ptr()); + gpu_mat = + reinterpret_cast*>(tmp_gpu_mat_data->ptr()); - std::vector*> cpu_ptrs(batch_size); + std::vector*> cpu_ptrs(batch_size); for (int i = 0; i < batch_size; ++i) { cpu_ptrs[i] = gpu_mat + i * n * n; } @@ -161,45 +160,45 @@ struct DeterminantCudaFunctor, Context> { int num_ints = batch_size * (n + 1); // num_ints is for pivot (n * batch_size) and info (batch_size) size_t total_bytes = - batch_size * sizeof(phi::dtype::complex*) + num_ints * sizeof(int); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc( + batch_size * sizeof(dtype::complex*) + num_ints * sizeof(int); + Allocator::AllocationPtr tmp_gpu_ptrs_data = memory_utils::Alloc( dev_ctx.GetPlace(), total_bytes, - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + Stream(reinterpret_cast(dev_ctx.stream()))); memory_utils::Copy(dev_ctx.GetPlace(), tmp_gpu_ptrs_data->ptr(), CPUPlace(), static_cast(cpu_ptrs.data()), - cpu_ptrs.size() * sizeof(phi::dtype::complex*), + cpu_ptrs.size() * sizeof(dtype::complex*), dev_ctx.stream()); - phi::dtype::complex** gpu_mat_ptr = - reinterpret_cast**>(tmp_gpu_ptrs_data->ptr()); + dtype::complex** gpu_mat_ptr = + reinterpret_cast**>(tmp_gpu_ptrs_data->ptr()); int* gpu_info_ptr = reinterpret_cast(gpu_mat_ptr + cpu_ptrs.size()); int* pivot_data = gpu_info_ptr + batch_size; - auto blas = funcs::GetBlas>(dev_ctx); + auto blas = funcs::GetBlas>(dev_ctx); // This function performs the LU factorization of each matrix A by the // equation P * A = L * U. L and U are written back to original matrix A, // and diagonal elements of L are discarded. 
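    // With P * A = L * U and unit-diagonal L, det(A) equals the product of
    // U's diagonal entries times (-1)^s, where s counts the row swaps
    // recorded in pivot_data; the GetDetFromLUComplex launch below recovers
    // the determinant this way for each matrix in the batch.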
blas.BatchedGETRF(n, gpu_mat_ptr, pivot_data, gpu_info_ptr, batch_size); - phi::dtype::complex* out_data = - dev_ctx.template Alloc>(output); + dtype::complex* out_data = + dev_ctx.template Alloc>(output); int block_size = std::min(256, dev_ctx.GetMaxThreadsPerBlock()); dim3 dim_block(block_size); dim3 num_blocks((batch_size + block_size - 1) / block_size); - GetDetFromLUComplex><<>>( + GetDetFromLUComplex><<>>( gpu_mat, pivot_data, n, batch_size, out_data); #else using MatrixType = Eigen::Matrix, Eigen::Dynamic, Eigen::Dynamic>; - std::vector> input_vec; - std::vector> output_vec; + std::vector> input_vec; + std::vector> output_vec; TensorToVector(a, dev_ctx, &input_vec); for (int64_t i = 0; i < batch_size; ++i) { // maybe can be parallel auto begin_iter = input_vec.begin() + i * n * n; auto end_iter = input_vec.begin() + (i + 1) * n * n; - std::vector> sub_vec( + std::vector> sub_vec( begin_iter, end_iter); // get every square matrix data MatrixType matrix(n, n); @@ -209,7 +208,7 @@ struct DeterminantCudaFunctor, Context> { } } output_vec.push_back( - static_cast>(matrix.determinant())); + static_cast>(matrix.determinant())); } TensorFromVector(output_vec, dev_ctx, output); #endif diff --git a/paddle/phi/kernels/gpu/dgc_kernel.cu b/paddle/phi/kernels/gpu/dgc_kernel.cu index f52882ef5a2327..5877cbdbe54089 100644 --- a/paddle/phi/kernels/gpu/dgc_kernel.cu +++ b/paddle/phi/kernels/gpu/dgc_kernel.cu @@ -186,18 +186,18 @@ void DGCKernel(const Context& dev_ctx, dev_ctx.template Alloc(gather_buff); int buf_size = paddle::communication::dgc::get_buffer_size(k); - phi::Allocator::AllocationPtr tmp_ious_data; + Allocator::AllocationPtr tmp_ious_data; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (dev_ctx.GetPlace().GetType() == AllocationType::GPU || dev_ctx.GetPlace().GetType() == AllocationType::CUSTOM) { - tmp_ious_data = phi::memory_utils::Alloc( + tmp_ious_data = memory_utils::Alloc( dev_ctx.GetPlace(), buf_size, - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + Stream(reinterpret_cast(dev_ctx.stream()))); } #endif if (dev_ctx.GetPlace().GetType() == AllocationType::CPU) { - tmp_ious_data = phi::memory_utils::Alloc(dev_ctx.GetPlace(), buf_size); + tmp_ious_data = memory_utils::Alloc(dev_ctx.GetPlace(), buf_size); } void* buf = reinterpret_cast(tmp_ious_data->ptr()); diff --git a/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu b/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu index 7657c6c432935d..581e0f2f00e784 100644 --- a/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu @@ -21,8 +21,6 @@ namespace phi { -using phi::PADDLE_CUDA_NUM_THREADS; - template void DiagonalGradKernel(const Context& dev_ctx, const DenseTensor& x, @@ -66,7 +64,7 @@ void DiagonalGradKernel(const Context& dev_ctx, int blocks = std::min((numel + threads - 1) / threads, blocks_max); int64_t dout_numel = out_grad.numel(); - phi::backends::gpu::GpuMemsetAsync( + backends::gpu::GpuMemsetAsync( dx_data, 0, numel * sizeof(T), dev_ctx.stream()); switch (dx_dim_size) { diff --git a/paddle/phi/kernels/gpu/diagonal_kernel.cu b/paddle/phi/kernels/gpu/diagonal_kernel.cu index 70770c90215b85..c2e10ecc809046 100644 --- a/paddle/phi/kernels/gpu/diagonal_kernel.cu +++ b/paddle/phi/kernels/gpu/diagonal_kernel.cu @@ -20,7 +20,6 @@ #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/diagonal.h" namespace phi { -using phi::PADDLE_CUDA_NUM_THREADS; template void DiagonalKernel(const Context& dev_ctx, const DenseTensor& x, diff --git 
a/paddle/phi/kernels/gpu/dist_kernel.cu b/paddle/phi/kernels/gpu/dist_kernel.cu index 414cec8b37d883..4df46426834338 100644 --- a/paddle/phi/kernels/gpu/dist_kernel.cu +++ b/paddle/phi/kernels/gpu/dist_kernel.cu @@ -77,7 +77,7 @@ struct PowFunctorHighPrecision { template __global__ void ReduceSumWithSubtract( const T* x, const T* y, T* out, int64_t N, Functor func) { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; MT sum_val(0.0); CUDA_KERNEL_LOOP_TYPE(i, N, int64_t) { sum_val += func(x[i], y[i]); } @@ -92,7 +92,7 @@ __global__ void ReduceMaxWithSubtract(const T* x, const T* y, T* out, int64_t N) { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; MT max_val = std::numeric_limits::min(); CUDA_KERNEL_LOOP_TYPE(i, N, int64_t) { max_val = max(max_val, abs(static_cast(x[i]) - static_cast(y[i]))); @@ -109,7 +109,7 @@ __global__ void ReduceMinWithSubtract(const T* x, const T* y, T* out, int64_t N) { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; MT min_val = std::numeric_limits::max(); CUDA_KERNEL_LOOP_TYPE(i, N, int64_t) { min_val = min(min_val, abs(static_cast(x[i]) - static_cast(y[i]))); @@ -132,7 +132,7 @@ void DistKernel(const Context& dev_ctx, return; } - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; DenseTensor intermediate; const T* x_ptr = x.data(); const T* y_ptr = y.data(); @@ -144,7 +144,7 @@ void DistKernel(const Context& dev_ctx, if (xdim == y.dims()) { // same shape int64_t n = x.numel(); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n); intermediate.Resize({config.block_per_grid.x}); T* i_ptr = dev_ctx.template Alloc(&intermediate); std::vector axis_dims = {static_cast(-1)}; @@ -161,7 +161,7 @@ void DistKernel(const Context& dev_ctx, ReduceMaxWithSubtract <<>>( x_ptr, y_ptr, i_ptr, n); - phi::MaxRawKernel( + MaxRawKernel( dev_ctx, intermediate, reduce_axis, true, true, out); } else if (p == -INFINITY) { @@ -169,7 +169,7 @@ void DistKernel(const Context& dev_ctx, <<>>( x_ptr, y_ptr, i_ptr, n); - phi::MinRawKernel( + MinRawKernel( dev_ctx, intermediate, reduce_axis, true, true, out); } else { diff --git a/paddle/phi/kernels/gpu/distribute_fpn_proposals_kernel.cu b/paddle/phi/kernels/gpu/distribute_fpn_proposals_kernel.cu index f082253897a7c0..3c6d6a74e299dc 100644 --- a/paddle/phi/kernels/gpu/distribute_fpn_proposals_kernel.cu +++ b/paddle/phi/kernels/gpu/distribute_fpn_proposals_kernel.cu @@ -196,7 +196,7 @@ void DistributeFpnProposalsKernel( sizeof(int) * 8, dev_ctx.stream()); // Allocate temporary storage - auto d_temp_storage = phi::memory_utils::Alloc(place, temp_storage_bytes); + auto d_temp_storage = memory_utils::Alloc(place, temp_storage_bytes); // Run sorting operation // sort target level to get corresponding index diff --git a/paddle/phi/kernels/gpu/edit_distance_kernel.cu b/paddle/phi/kernels/gpu/edit_distance_kernel.cu index c12789577642a1..af3c25ef1251ed 100644 --- a/paddle/phi/kernels/gpu/edit_distance_kernel.cu +++ b/paddle/phi/kernels/gpu/edit_distance_kernel.cu @@ -27,8 +27,6 @@ namespace phi { -using phi::PADDLE_CUDA_NUM_THREADS; - template __global__ void FillFirstRow(T* dist, const int N) { int64_t idx = diff --git a/paddle/phi/kernels/gpu/eig_grad_kernel.cu b/paddle/phi/kernels/gpu/eig_grad_kernel.cu index e79f4500e299b5..89ee3f6a347ff3 100644 --- 
a/paddle/phi/kernels/gpu/eig_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/eig_grad_kernel.cu @@ -55,13 +55,13 @@ void SolveLinearSystemGPU(const GPUContext& dev_ctx, #ifdef PADDLE_WITH_CUDA template <> -void SolveLinearSystemGPU>( +void SolveLinearSystemGPU>( const GPUContext& dev_ctx, - const phi::dtype::complex* + const dtype::complex* matrix_data, // device ptr, row-major, size batch*order*order - const phi::dtype::complex* + const dtype::complex* rhs_data, // device ptr, row-major, size batch*order*rhs_cols - phi::dtype::complex* + dtype::complex* out_data, // device ptr, row-major, size batch*order*rhs_cols int order, int rhs_cols, @@ -69,7 +69,7 @@ void SolveLinearSystemGPU>( // handles cublasHandle_t cublas_handle = dev_ctx.cublas_handle(); cusolverDnHandle_t cusolver_handle = dev_ctx.cusolver_dn_handle(); - auto stream = phi::Stream(reinterpret_cast(dev_ctx.stream())); + auto stream = Stream(reinterpret_cast(dev_ctx.stream())); // cuComplex constants const cuComplex kAlpha = make_cuFloatComplex(1.0f, 0.0f); @@ -88,22 +88,22 @@ void SolveLinearSystemGPU>( cuComplex* X_row_all = reinterpret_cast(out_data); auto dA_col_alloc = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), A_batch_bytes, stream); + memory_utils::Alloc(dev_ctx.GetPlace(), A_batch_bytes, stream); auto dB_col_alloc = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), B_batch_bytes, stream); + memory_utils::Alloc(dev_ctx.GetPlace(), B_batch_bytes, stream); cuComplex* dA_col = reinterpret_cast(dA_col_alloc->ptr()); cuComplex* dB_col = reinterpret_cast(dB_col_alloc->ptr()); - auto d_pivots_alloc = phi::memory_utils::Alloc( + auto d_pivots_alloc = memory_utils::Alloc( dev_ctx.GetPlace(), static_cast(batch_count) * order * sizeof(int), stream); int* d_pivots = reinterpret_cast(d_pivots_alloc->ptr()); auto d_info_alloc = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), - static_cast(batch_count) * sizeof(int), - stream); + memory_utils::Alloc(dev_ctx.GetPlace(), + static_cast(batch_count) * sizeof(int), + stream); int* d_info = reinterpret_cast(d_info_alloc->ptr()); // A_row layout: row-major (order x order), B_row layout: row-major (order @@ -157,7 +157,7 @@ void SolveLinearSystemGPU>( size_t work_bytes = static_cast(lwork) * sizeof(cuComplex); auto d_work_alloc = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), work_bytes, stream); + memory_utils::Alloc(dev_ctx.GetPlace(), work_bytes, stream); cuComplex* d_work = reinterpret_cast(d_work_alloc->ptr()); for (int i = 0; i < batch_count; ++i) { @@ -238,13 +238,13 @@ void SolveLinearSystemGPU>( } template <> -void SolveLinearSystemGPU>( +void SolveLinearSystemGPU>( const GPUContext& dev_ctx, - const phi::dtype::complex* + const dtype::complex* matrix_data, // device ptr, row-major, size batch*order*order - const phi::dtype::complex* + const dtype::complex* rhs_data, // device ptr, row-major, size batch*order*rhs_cols - phi::dtype::complex* + dtype::complex* out_data, // device ptr, row-major, size batch*order*rhs_cols int order, int rhs_cols, @@ -252,7 +252,7 @@ void SolveLinearSystemGPU>( // handles cublasHandle_t cublas_handle = dev_ctx.cublas_handle(); cusolverDnHandle_t cusolver_handle = dev_ctx.cusolver_dn_handle(); - auto stream = phi::Stream(reinterpret_cast(dev_ctx.stream())); + auto stream = Stream(reinterpret_cast(dev_ctx.stream())); // cuDoubleComplex constants const cuDoubleComplex kAlpha = make_cuDoubleComplex(1.0f, 0.0f); @@ -273,24 +273,24 @@ void SolveLinearSystemGPU>( cuDoubleComplex* X_row_all = reinterpret_cast(out_data); auto dA_col_alloc = - 
phi::memory_utils::Alloc(dev_ctx.GetPlace(), A_batch_bytes, stream); + memory_utils::Alloc(dev_ctx.GetPlace(), A_batch_bytes, stream); auto dB_col_alloc = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), B_batch_bytes, stream); + memory_utils::Alloc(dev_ctx.GetPlace(), B_batch_bytes, stream); cuDoubleComplex* dA_col = reinterpret_cast(dA_col_alloc->ptr()); cuDoubleComplex* dB_col = reinterpret_cast(dB_col_alloc->ptr()); - auto d_pivots_alloc = phi::memory_utils::Alloc( + auto d_pivots_alloc = memory_utils::Alloc( dev_ctx.GetPlace(), static_cast(batch_count) * order * sizeof(int), stream); int* d_pivots = reinterpret_cast(d_pivots_alloc->ptr()); auto d_info_alloc = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), - static_cast(batch_count) * sizeof(int), - stream); + memory_utils::Alloc(dev_ctx.GetPlace(), + static_cast(batch_count) * sizeof(int), + stream); int* d_info = reinterpret_cast(d_info_alloc->ptr()); // A_row layout: row-major (order x order), B_row layout: row-major (order @@ -345,7 +345,7 @@ void SolveLinearSystemGPU>( size_t work_bytes = static_cast(lwork) * sizeof(cuDoubleComplex); auto d_work_alloc = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), work_bytes, stream); + memory_utils::Alloc(dev_ctx.GetPlace(), work_bytes, stream); cuDoubleComplex* d_work = reinterpret_cast(d_work_alloc->ptr()); @@ -429,20 +429,20 @@ void SolveLinearSystemGPU>( #ifdef PADDLE_WITH_HIP template <> -void SolveLinearSystemGPU>( +void SolveLinearSystemGPU>( const GPUContext& dev_ctx, - const phi::dtype::complex* + const dtype::complex* matrix_data, // device ptr, row-major, size batch*order*order - const phi::dtype::complex* + const dtype::complex* rhs_data, // device ptr, row-major, size batch*order*rhs_cols - phi::dtype::complex* + dtype::complex* out_data, // device ptr, row-major, size batch*order*rhs_cols int order, int rhs_cols, int batch_count) { // handles rocblas_handle rocblas_handle = dev_ctx.cusolver_dn_handle(); - auto stream = phi::Stream(reinterpret_cast(dev_ctx.stream())); + auto stream = Stream(reinterpret_cast(dev_ctx.stream())); // rocblas_float_complex constants const rocblas_float_complex kAlpha = rocblas_float_complex{1.0f, 0.0f}; @@ -464,21 +464,21 @@ void SolveLinearSystemGPU>( reinterpret_cast(out_data); auto dA_col_alloc = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), A_batch_bytes, stream); + memory_utils::Alloc(dev_ctx.GetPlace(), A_batch_bytes, stream); auto dB_col_alloc = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), B_batch_bytes, stream); + memory_utils::Alloc(dev_ctx.GetPlace(), B_batch_bytes, stream); rocblas_float_complex* dA_col = reinterpret_cast(dA_col_alloc->ptr()); rocblas_float_complex* dB_col = reinterpret_cast(dB_col_alloc->ptr()); - auto d_pivots_alloc = phi::memory_utils::Alloc( + auto d_pivots_alloc = memory_utils::Alloc( dev_ctx.GetPlace(), static_cast(batch_count) * order * sizeof(rocblas_int), stream); rocblas_int* d_pivots = reinterpret_cast(d_pivots_alloc->ptr()); - auto d_info_alloc = phi::memory_utils::Alloc( + auto d_info_alloc = memory_utils::Alloc( dev_ctx.GetPlace(), static_cast(batch_count) * sizeof(rocblas_int), stream); @@ -587,13 +587,12 @@ void SolveLinearSystemGPU>( auto* cpu_ctx = static_cast(pool.Get(cpu_place)); std::vector h_info(batch_count, 0); - phi::memory_utils::Copy( - CPUPlace(), - h_info.data(), - dev_ctx.GetPlace(), - d_info, - static_cast(batch_count) * sizeof(rocblas_int), - reinterpret_cast(dev_ctx.stream())); + memory_utils::Copy(CPUPlace(), + h_info.data(), + dev_ctx.GetPlace(), + d_info, + static_cast(batch_count) * 
sizeof(rocblas_int), + reinterpret_cast(dev_ctx.stream())); dev_ctx.Wait(); for (int i = 0; i < batch_count; ++i) { @@ -607,20 +606,20 @@ void SolveLinearSystemGPU>( } template <> -void SolveLinearSystemGPU>( +void SolveLinearSystemGPU>( const GPUContext& dev_ctx, - const phi::dtype::complex* + const dtype::complex* matrix_data, // device ptr, row-major, size batch*order*order - const phi::dtype::complex* + const dtype::complex* rhs_data, // device ptr, row-major, size batch*order*rhs_cols - phi::dtype::complex* + dtype::complex* out_data, // device ptr, row-major, size batch*order*rhs_cols int order, int rhs_cols, int batch_count) { // handles rocblas_handle rocblas_handle = dev_ctx.cusolver_dn_handle(); - auto stream = phi::Stream(reinterpret_cast(dev_ctx.stream())); + auto stream = Stream(reinterpret_cast(dev_ctx.stream())); // rocblas_double_complex constants const rocblas_double_complex kAlpha = rocblas_double_complex{1.0, 0.0}; @@ -642,21 +641,21 @@ void SolveLinearSystemGPU>( reinterpret_cast(out_data); auto dA_col_alloc = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), A_batch_bytes, stream); + memory_utils::Alloc(dev_ctx.GetPlace(), A_batch_bytes, stream); auto dB_col_alloc = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), B_batch_bytes, stream); + memory_utils::Alloc(dev_ctx.GetPlace(), B_batch_bytes, stream); rocblas_double_complex* dA_col = reinterpret_cast(dA_col_alloc->ptr()); rocblas_double_complex* dB_col = reinterpret_cast(dB_col_alloc->ptr()); - auto d_pivots_alloc = phi::memory_utils::Alloc( + auto d_pivots_alloc = memory_utils::Alloc( dev_ctx.GetPlace(), static_cast(batch_count) * order * sizeof(rocblas_int), stream); rocblas_int* d_pivots = reinterpret_cast(d_pivots_alloc->ptr()); - auto d_info_alloc = phi::memory_utils::Alloc( + auto d_info_alloc = memory_utils::Alloc( dev_ctx.GetPlace(), static_cast(batch_count) * sizeof(rocblas_int), stream); @@ -763,13 +762,12 @@ void SolveLinearSystemGPU>( auto* cpu_ctx = static_cast(pool.Get(cpu_place)); std::vector h_info(batch_count, 0); - phi::memory_utils::Copy( - CPUPlace(), - h_info.data(), - dev_ctx.GetPlace(), - d_info, - static_cast(batch_count) * sizeof(rocblas_int), - reinterpret_cast(dev_ctx.stream())); + memory_utils::Copy(CPUPlace(), + h_info.data(), + dev_ctx.GetPlace(), + d_info, + static_cast(batch_count) * sizeof(rocblas_int), + reinterpret_cast(dev_ctx.stream())); dev_ctx.Wait(); for (int i = 0; i < batch_count; ++i) { @@ -806,13 +804,12 @@ void ComputeBackwardForComplexInputGPU(const DenseTensor& L, gV_safe = Fill(dev_ctx, vectorize(V.dims()), T(0)); } DenseTensor trans_v = TransposeLast2Dim(dev_ctx, V); - DenseTensor Vh = phi::Conj(dev_ctx, trans_v); - DenseTensor Lconj = phi::Conj(dev_ctx, L); - DenseTensor Econj = phi::Subtract(dev_ctx, - phi::funcs::Unsqueeze(Lconj, -2), - phi::funcs::Unsqueeze(Lconj, -1)); - DenseTensor VhgV = phi::Matmul(dev_ctx, Vh, gV_safe); - DenseTensor diag_real = phi::Real(dev_ctx, VhgV); + DenseTensor Vh = Conj(dev_ctx, trans_v); + DenseTensor Lconj = Conj(dev_ctx, L); + DenseTensor Econj = Subtract( + dev_ctx, funcs::Unsqueeze(Lconj, -2), funcs::Unsqueeze(Lconj, -1)); + DenseTensor VhgV = Matmul(dev_ctx, Vh, gV_safe); + DenseTensor diag_real = Real(dev_ctx, VhgV); auto cpu_place = CPUPlace(); DeviceContextPool& pool = DeviceContextPool::Instance(); @@ -823,13 +820,13 @@ void ComputeBackwardForComplexInputGPU(const DenseTensor& L, Copy(dev_ctx, diag_real, cpu_place, false, &diag_real_cpu); DenseTensor diag_res_cpu = - phi::funcs::BatchDiag((*cpu_ctx), diag_real_cpu, batch_count); + 
funcs::BatchDiag((*cpu_ctx), diag_real_cpu, batch_count); DenseTensor diag_res; dev_ctx.template Alloc(&diag_res); Copy(dev_ctx, diag_res_cpu, GPUPlace(), false, &diag_res); - DenseTensor diag_unsqueezed = phi::funcs::Unsqueeze(diag_res, -2); + DenseTensor diag_unsqueezed = funcs::Unsqueeze(diag_res, -2); auto numel = diag_unsqueezed.numel(); DenseTensor diag_unsqueezed_complex; @@ -838,21 +835,20 @@ void ComputeBackwardForComplexInputGPU(const DenseTensor& L, auto* data_diag_un_com = dev_ctx.template Alloc( &diag_unsqueezed_complex, static_cast(numel * sizeof(T))); - phi::funcs::ForRange for_range(dev_ctx, numel); - phi::funcs::RealToComplexFunctor functor( - data_diag_un, data_diag_un_com, numel); + funcs::ForRange for_range(dev_ctx, numel); + funcs::RealToComplexFunctor functor(data_diag_un, data_diag_un_com, numel); for_range(functor); // real tensor multiply complex tensor in broadcast manner - DenseTensor res1 = phi::Multiply(dev_ctx, V, diag_unsqueezed_complex); - DenseTensor res2 = phi::Matmul(dev_ctx, Vh, res1); - DenseTensor result = phi::Subtract(dev_ctx, VhgV, res2); + DenseTensor res1 = Multiply(dev_ctx, V, diag_unsqueezed_complex); + DenseTensor res2 = Matmul(dev_ctx, Vh, res1); + DenseTensor result = Subtract(dev_ctx, VhgV, res2); result.Resize(V.dims()); dev_ctx.template Alloc(&result); - result = phi::Divide(dev_ctx, result, Econj); - result = phi::funcs::DiagFill( - dev_ctx, order, order, order, 0, gL_safe, result); - DenseTensor rhs = phi::Matmul(dev_ctx, result, Vh); + result = Divide(dev_ctx, result, Econj); + result = + funcs::DiagFill(dev_ctx, order, order, order, 0, gL_safe, result); + DenseTensor rhs = Matmul(dev_ctx, result, Vh); // solve linear system // solve(Vh, rhs, out, m, k) @@ -877,7 +873,7 @@ void EigGradKernel(const Context& dev_ctx, const optional& dout_w, const optional& dout_v, DenseTensor* dx) { - auto* dx_data = dev_ctx.template Alloc>(dx); + auto* dx_data = dev_ctx.template Alloc>(dx); if (dx->numel() == 0) { return; } @@ -885,7 +881,7 @@ void EigGradKernel(const Context& dev_ctx, int batch_count = BatchCount(out_v); const int64_t order = out_v.dims(-1); - ComputeBackwardForComplexInputGPU, Context>( + ComputeBackwardForComplexInputGPU, Context>( out_w, out_v, dout_w, dout_v, dx_data, batch_count, order, dev_ctx); } #endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP diff --git a/paddle/phi/kernels/gpu/eig_kernel.cu b/paddle/phi/kernels/gpu/eig_kernel.cu index e1322d11d1ee90..6578a4c054c41c 100644 --- a/paddle/phi/kernels/gpu/eig_kernel.cu +++ b/paddle/phi/kernels/gpu/eig_kernel.cu @@ -25,8 +25,8 @@ void EigKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out_w, DenseTensor* out_v) { - dev_ctx.template Alloc>(out_w); - dev_ctx.template Alloc>(out_v); + dev_ctx.template Alloc>(out_w); + dev_ctx.template Alloc>(out_v); if (x.numel() == 0) { return; @@ -39,9 +39,9 @@ void EigKernel(const Context& dev_ctx, // prepare cpu Tensor here, since magma requires output on cpu DenseTensor out_w_cpu, out_v_cpu; out_w_cpu.Resize(out_w->dims()); - (*cpu_ctx).template Alloc>(&out_w_cpu); + (*cpu_ctx).template Alloc>(&out_w_cpu); out_v_cpu.Resize(x.dims()); - (*cpu_ctx).template Alloc>(&out_v_cpu); + (*cpu_ctx).template Alloc>(&out_v_cpu); if (!IsComplexType(x.dtype())) { // output still be complex though input is real @@ -55,55 +55,53 @@ void EigKernel(const Context& dev_ctx, real_w_cpu.Resize(real_w_dim); (*cpu_ctx).template Alloc>(&real_w_cpu); real_v_cpu.Resize(x.dims()); - (*cpu_ctx).template Alloc>(&real_v_cpu); + (*cpu_ctx).template 
Alloc>(&real_v_cpu); - phi::ApplyEigKernelMagma, Context>( + ApplyEigKernelMagma, Context>( dev_ctx, x, &real_w_cpu, &real_v_cpu); // 1. extract real part & imag part from real_w_cpu - DenseTensor real_part_cpu = phi::funcs::Slice>( + DenseTensor real_part_cpu = funcs::Slice>( (*cpu_ctx), real_w_cpu, {-1}, {0}, {order}); - DenseTensor imag_part_cpu = phi::funcs::Slice>( + DenseTensor imag_part_cpu = funcs::Slice>( (*cpu_ctx), real_w_cpu, {-1}, {order}, {order * 2}); // 2. construct complex values - auto* real_part_data = real_part_cpu.data>(); - auto* imag_part_data = imag_part_cpu.data>(); + auto* real_part_data = real_part_cpu.data>(); + auto* imag_part_data = imag_part_cpu.data>(); int64_t out_w_numel = static_cast(out_w->numel()); - phi::funcs::ForRange for_range((*cpu_ctx), out_w_numel); - phi::funcs::RealImagToComplexFunctor> functor( + funcs::ForRange for_range((*cpu_ctx), out_w_numel); + funcs::RealImagToComplexFunctor> functor( real_part_data, imag_part_data, - out_w_cpu.data>(), + out_w_cpu.data>(), out_w_numel); for_range(functor); // 3. construct complex vectors DenseTensor real_v_trans_cpu = - TransposeLast2Dim, CPUContext>((*cpu_ctx), - real_v_cpu); + TransposeLast2Dim, CPUContext>((*cpu_ctx), real_v_cpu); DenseTensor out_v_trans_cpu; out_v_trans_cpu.Resize(x.dims()); - (*cpu_ctx).template Alloc>(&out_v_trans_cpu); - - phi::ConstructComplexVectors, - phi::dtype::Complex, - CPUContext>(&out_v_trans_cpu, - out_w_cpu, - real_v_trans_cpu, - (*cpu_ctx), - batch_count, - order); - - TransposeTwoAxis, CPUContext>(out_v_trans_cpu, - &out_v_cpu, - x.dims().size() - 1, - x.dims().size() - 2, - (*cpu_ctx)); + (*cpu_ctx).template Alloc>(&out_v_trans_cpu); + + ConstructComplexVectors, dtype::Complex, CPUContext>( + &out_v_trans_cpu, + out_w_cpu, + real_v_trans_cpu, + (*cpu_ctx), + batch_count, + order); + + TransposeTwoAxis, CPUContext>(out_v_trans_cpu, + &out_v_cpu, + x.dims().size() - 1, + x.dims().size() - 2, + (*cpu_ctx)); } else { - phi::ApplyEigKernelMagma(dev_ctx, x, &out_w_cpu, &out_v_cpu); + ApplyEigKernelMagma(dev_ctx, x, &out_w_cpu, &out_v_cpu); } // copy result from cpu to gpu tensor diff --git a/paddle/phi/kernels/gpu/elementwise_grad.h b/paddle/phi/kernels/gpu/elementwise_grad.h index 29fd53f88d6f57..52fc578a478f51 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad.h +++ b/paddle/phi/kernels/gpu/elementwise_grad.h @@ -33,7 +33,7 @@ void ReduceWrapper(const GPUContext &dev_ctx, DenseTensor *dst) { std::vector reduce_dims = funcs::GetReduceDim(dst->dims(), src->dims(), axis); - phi::SumKernel( + SumKernel( dev_ctx, *src, reduce_dims, src->dtype(), false, dst); } @@ -228,7 +228,7 @@ void DefaultMixedPrecisionAddGrad(const GPUContext &dev_ctx, } std::vector reduce_dims = funcs::GetReduceDim(x.dims(), dout.dims(), axis); - phi::SumKernel( + SumKernel( dev_ctx, dout, reduce_dims, dout.dtype(), false, dx); } } @@ -244,7 +244,7 @@ void DefaultMixedPrecisionAddGrad(const GPUContext &dev_ctx, dev_ctx.template Alloc(&dy_fp32); std::vector reduce_dims = funcs::GetReduceDim(y.dims(), dout.dims(), axis); - phi::SumKernel( + SumKernel( dev_ctx, dout, reduce_dims, dout.dtype(), false, &dy_fp32); CastKernel(dev_ctx, dy_fp32, dy->dtype(), dy); } @@ -309,7 +309,7 @@ void DefaultElementwiseAddGrad(const GPUContext &dev_ctx, } std::vector reduce_dims = funcs::GetReduceDim(x.dims(), out.dims(), axis); - phi::SumKernel( + SumKernel( dev_ctx, dout, reduce_dims, dout.dtype(), false, dx); } } @@ -323,7 +323,7 @@ void DefaultElementwiseAddGrad(const GPUContext &dev_ctx, } else { std::vector 
reduce_dims = funcs::GetReduceDim(y.dims(), out.dims(), axis); - phi::SumKernel( + SumKernel( dev_ctx, dout, reduce_dims, dout.dtype(), false, dy); } } @@ -432,7 +432,7 @@ void default_elementwise_sub_grad(const GPUContext &dev_ctx, } std::vector reduce_dims = funcs::GetReduceDim(x.dims(), out.dims(), axis); - phi::SumKernel( + SumKernel( dev_ctx, dout, reduce_dims, dout.dtype(), false, dx); } } diff --git a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu index 2e09dca7a9648a..54a64e344acd34 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu @@ -67,7 +67,7 @@ void SubtractDoubleGradKernel(const Context& dev_ctx, const optional& ddy, int axis, DenseTensor* ddout) { - phi::SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); + SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); } template @@ -184,12 +184,12 @@ void AddGradKernel(const Context& dev_ctx, #ifdef PADDLE_WITH_CUDA if (x.dtype() == DataType::FLOAT32 && (y.dtype() == DataType::FLOAT16 || y.dtype() == DataType::BFLOAT16)) { - phi::MixedPrecisionAddGradImpl( + MixedPrecisionAddGradImpl( dev_ctx, x, y, dout, axis, dx, dy, MixedPrecisionAddGradFunc); return; } #endif - phi::AddGradImpl(dev_ctx, x, y, dout, axis, dx, dy, AddGradFunc); + AddGradImpl(dev_ctx, x, y, dout, axis, dx, dy, AddGradFunc); } template @@ -200,7 +200,7 @@ void AddDoubleGradKernel(const Context& dev_ctx, const optional& ddy, int axis, DenseTensor* ddout) { - phi::AddDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); + AddDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); } template @@ -211,7 +211,7 @@ void AddTripleGradKernel(const Context& dev_ctx, int axis, DenseTensor* d_ddx, DenseTensor* d_ddy) { - phi::AddGradImpl( + AddGradImpl( dev_ctx, ddx, ddy, d_ddout, axis, d_ddx, d_ddy, AddGradFunc); } diff --git a/paddle/phi/kernels/gpu/embedding_grad_add_to_kernel.cu b/paddle/phi/kernels/gpu/embedding_grad_add_to_kernel.cu index 1a23afd87566c0..e68b3acbddddd9 100644 --- a/paddle/phi/kernels/gpu/embedding_grad_add_to_kernel.cu +++ b/paddle/phi/kernels/gpu/embedding_grad_add_to_kernel.cu @@ -33,7 +33,7 @@ namespace phi { template __global__ void EmbeddingGradAddTo(T* main_grad_out, - const phi::bfloat16* out_grad, + const bfloat16* out_grad, const IndexT* token_indices, const int64_t num_tokens, const int64_t token_length) { @@ -44,7 +44,7 @@ __global__ void EmbeddingGradAddTo(T* main_grad_out, while (idy < num_tokens) { auto id = static_cast(token_indices[idy]); - const phi::bfloat16* token_out_grad = out_grad + idy * token_length; + const bfloat16* token_out_grad = out_grad + idy * token_length; T* token_main_grad = main_grad_out + id * token_length; for (int64_t i = idx; i < token_length; i += blockDim.x) { CudaAtomicAdd(&token_main_grad[i], static_cast(token_out_grad[i])); @@ -77,8 +77,8 @@ struct EmbeddingGradAddToCUDAFunctor { auto main_grad_out_t = main_grad_out_; const auto* token_indices = token_indices_.template data(); T* main_grad_out = dev_ctx_.template Alloc(main_grad_out_t); - const phi::bfloat16* out_grad = reinterpret_cast( - out_grad_.template data()); + const bfloat16* out_grad = reinterpret_cast( + out_grad_.template data()); const int gridx = 2 * dev_ctx_.GetSMCount(); dim3 threads(128, 8); diff --git a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu index 259f108adedf53..31617e4bd4d329 100644 --- 
a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu @@ -187,7 +187,7 @@ struct EmbeddingSparseGradCUDAFunctor { new_rows.resize(ids_num); auto gpu_place = dev_ctx_.GetPlace(); - phi::MixVector mixv_new_rows(&new_rows); + MixVector mixv_new_rows(&new_rows); if (!std::is_same::value) { InputTypeConvert<<>>( ids_data, ids_num, mixv_new_rows.MutableData(gpu_place)); diff --git a/paddle/phi/kernels/gpu/embedding_with_scaled_gradient_grad_kernel.cu b/paddle/phi/kernels/gpu/embedding_with_scaled_gradient_grad_kernel.cu index 8fb4098f395065..af278b5f8ac7e7 100644 --- a/paddle/phi/kernels/gpu/embedding_with_scaled_gradient_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/embedding_with_scaled_gradient_grad_kernel.cu @@ -102,7 +102,7 @@ __global__ void ScaleGradKernel(const int* count_data, int64_t num_weights, int64_t num_weight_dim, T* table) { - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < num_weights) { MPType freq = static_cast(count_data[idx]); diff --git a/paddle/phi/kernels/gpu/expand_as_grad_kernel.cu b/paddle/phi/kernels/gpu/expand_as_grad_kernel.cu index 7542d8be1bba24..f8112a8ef8963c 100644 --- a/paddle/phi/kernels/gpu/expand_as_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/expand_as_grad_kernel.cu @@ -51,7 +51,7 @@ void ExpandAsGradKernel(const Context& dev_ctx, } else { std::vector reduce_dims = funcs::GetReduceDim(in_dims, out_dims, -1); - phi::SumKernel( + SumKernel( dev_ctx, out_grad, reduce_dims, out_grad.dtype(), false, in_grad); } } diff --git a/paddle/phi/kernels/gpu/expand_grad_kernel.cu b/paddle/phi/kernels/gpu/expand_grad_kernel.cu index e4fa8f88c35724..465c0db628f9d4 100644 --- a/paddle/phi/kernels/gpu/expand_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/expand_grad_kernel.cu @@ -44,7 +44,7 @@ void ExpandGradKernel(const Context& dev_ctx, } else { std::vector reduce_dims = funcs::GetReduceDim(x_grad->dims(), out_grad.dims(), -1); - phi::SumKernel( + SumKernel( dev_ctx, out_grad, reduce_dims, out_grad.dtype(), false, x_grad); } } diff --git a/paddle/phi/kernels/gpu/exponential_kernel.cu b/paddle/phi/kernels/gpu/exponential_kernel.cu index 11747e7adefc1f..9e1694d5e1df2d 100644 --- a/paddle/phi/kernels/gpu/exponential_kernel.cu +++ b/paddle/phi/kernels/gpu/exponential_kernel.cu @@ -25,7 +25,7 @@ void ExponentialKernel(const Context &dev_ctx, const DenseTensor &x, float lambda, DenseTensor *out) { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; funcs::uniform_distribution dist; funcs::exponential_transform trans(lambda); funcs::distribution_and_transform(dev_ctx, out, dist, trans); diff --git a/paddle/phi/kernels/gpu/flash_attn_grad_kernel.cu b/paddle/phi/kernels/gpu/flash_attn_grad_kernel.cu index 059468345ead84..31f56972ff0ba8 100644 --- a/paddle/phi/kernels/gpu/flash_attn_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/flash_attn_grad_kernel.cu @@ -386,13 +386,13 @@ void FlashAttnUnpaddedGradBaseKernel(const Context& dev_ctx, if (!is_mha) { if (dk) { if (dk->meta().is_contiguous()) - phi::SumKernel(dev_ctx, dk_tmp, {2}, dk->type(), false, dk); + SumKernel(dev_ctx, dk_tmp, {2}, dk->type(), false, dk); else kvReduceForGQA(dev_ctx, dk_tmp, dk); } if (dv) { if (dv->meta().is_contiguous()) - phi::SumKernel(dev_ctx, dv_tmp, {2}, dv->type(), false, dv); + SumKernel(dev_ctx, dv_tmp, {2}, dv->type(), false, dv); else kvReduceForGQA(dev_ctx, dv_tmp, dv); } @@ -693,27 
+693,27 @@ void FlashAttnGradBaseKernel(const Context& dev_ctx, dev_ctx.template Alloc(&flashmask_maxmin); downstart_row_indices = - phi::Slice(dev_ctx, startend_row_indices.get(), {3}, {0}, {1}); + Slice(dev_ctx, startend_row_indices.get(), {3}, {0}, {1}); downstart_row_indices_data = downstart_row_indices.data(); if (startend_row_indices->dims()[3] == 2) { if (!causal) { - upend_row_indices = phi::Slice( - dev_ctx, startend_row_indices.get(), {3}, {1}, {2}); + upend_row_indices = + Slice(dev_ctx, startend_row_indices.get(), {3}, {1}, {2}); upend_row_indices_data = upend_row_indices.data(); } else { - downend_row_indices = phi::Slice( - dev_ctx, startend_row_indices.get(), {3}, {1}, {2}); + downend_row_indices = + Slice(dev_ctx, startend_row_indices.get(), {3}, {1}, {2}); downend_row_indices_data = downend_row_indices.data(); } } else if (startend_row_indices->dims()[3] == 4) { - upend_row_indices = phi::Slice( - dev_ctx, startend_row_indices.get(), {3}, {3}, {4}); + upend_row_indices = + Slice(dev_ctx, startend_row_indices.get(), {3}, {3}, {4}); upend_row_indices_data = upend_row_indices.data(); - downend_row_indices = phi::Slice( - dev_ctx, startend_row_indices.get(), {3}, {1}, {2}); + downend_row_indices = + Slice(dev_ctx, startend_row_indices.get(), {3}, {1}, {2}); downend_row_indices_data = downend_row_indices.data(); - upstart_row_indices = phi::Slice( - dev_ctx, startend_row_indices.get(), {3}, {2}, {3}); + upstart_row_indices = + Slice(dev_ctx, startend_row_indices.get(), {3}, {2}, {3}); upstart_row_indices_data = upstart_row_indices.data(); } } @@ -888,16 +888,14 @@ void FlashAttnGradBaseKernel(const Context& dev_ctx, if (!is_mha) { if (dk) { if (dk->meta().is_contiguous()) - phi::SumKernel( - dev_ctx, dk_tmp, {3}, dk->type(), false, dk); + SumKernel(dev_ctx, dk_tmp, {3}, dk->type(), false, dk); else kvReduceBatchedForGQA(dev_ctx, dk_tmp, dk); } if (dv) { if (dv->meta().is_contiguous()) - phi::SumKernel( - dev_ctx, dv_tmp, {3}, dv->type(), false, dv); + SumKernel(dev_ctx, dv_tmp, {3}, dv->type(), false, dv); else kvReduceBatchedForGQA(dev_ctx, dv_tmp, dv); } diff --git a/paddle/phi/kernels/gpu/flash_attn_kernel.cu b/paddle/phi/kernels/gpu/flash_attn_kernel.cu index 127ccb256c6285..07dc127414e624 100644 --- a/paddle/phi/kernels/gpu/flash_attn_kernel.cu +++ b/paddle/phi/kernels/gpu/flash_attn_kernel.cu @@ -451,27 +451,27 @@ void FlashAttnBaseKernel(const Context& dev_ctx, dev_ctx.template Alloc(&flashmask_maxmin); downstart_row_indices = - phi::Slice(dev_ctx, startend_row_indices.get(), {3}, {0}, {1}); + Slice(dev_ctx, startend_row_indices.get(), {3}, {0}, {1}); downstart_row_indices_data = downstart_row_indices.data(); if (startend_row_indices->dims()[3] == 2) { if (!causal) { - upend_row_indices = phi::Slice( - dev_ctx, startend_row_indices.get(), {3}, {1}, {2}); + upend_row_indices = + Slice(dev_ctx, startend_row_indices.get(), {3}, {1}, {2}); upend_row_indices_data = upend_row_indices.data(); } else { - downend_row_indices = phi::Slice( - dev_ctx, startend_row_indices.get(), {3}, {1}, {2}); + downend_row_indices = + Slice(dev_ctx, startend_row_indices.get(), {3}, {1}, {2}); downend_row_indices_data = downend_row_indices.data(); } } else if (startend_row_indices->dims()[3] == 4) { - upend_row_indices = phi::Slice( - dev_ctx, startend_row_indices.get(), {3}, {3}, {4}); + upend_row_indices = + Slice(dev_ctx, startend_row_indices.get(), {3}, {3}, {4}); upend_row_indices_data = upend_row_indices.data(); - downend_row_indices = phi::Slice( - dev_ctx, startend_row_indices.get(), {3}, 
{1}, {2}); + downend_row_indices = + Slice(dev_ctx, startend_row_indices.get(), {3}, {1}, {2}); downend_row_indices_data = downend_row_indices.data(); - upstart_row_indices = phi::Slice( - dev_ctx, startend_row_indices.get(), {3}, {2}, {3}); + upstart_row_indices = + Slice(dev_ctx, startend_row_indices.get(), {3}, {2}, {3}); upstart_row_indices_data = upstart_row_indices.data(); } } diff --git a/paddle/phi/kernels/gpu/flash_attn_v3_grad_kernel.cu b/paddle/phi/kernels/gpu/flash_attn_v3_grad_kernel.cu index ff752cba4373dc..6a2a8c8bf5e666 100644 --- a/paddle/phi/kernels/gpu/flash_attn_v3_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/flash_attn_v3_grad_kernel.cu @@ -1034,22 +1034,22 @@ void FlashMaskV2GradBaseKernel( dev_ctx.template Alloc(&flashmask_maxmin); lt_start_row_indices = - phi::Slice(dev_ctx, startend_row_indices, {3}, {0}, {1}); + Slice(dev_ctx, startend_row_indices, {3}, {0}, {1}); if (startend_row_indices.dims()[3] == 2) { if (!is_causal) { ut_end_row_indices = - phi::Slice(dev_ctx, startend_row_indices, {3}, {1}, {2}); + Slice(dev_ctx, startend_row_indices, {3}, {1}, {2}); } else { lt_end_row_indices = - phi::Slice(dev_ctx, startend_row_indices, {3}, {1}, {2}); + Slice(dev_ctx, startend_row_indices, {3}, {1}, {2}); } } else if (startend_row_indices.dims()[3] == 4) { ut_end_row_indices = - phi::Slice(dev_ctx, startend_row_indices, {3}, {3}, {4}); + Slice(dev_ctx, startend_row_indices, {3}, {3}, {4}); lt_end_row_indices = - phi::Slice(dev_ctx, startend_row_indices, {3}, {1}, {2}); + Slice(dev_ctx, startend_row_indices, {3}, {1}, {2}); ut_start_row_indices = - phi::Slice(dev_ctx, startend_row_indices, {3}, {2}, {3}); + Slice(dev_ctx, startend_row_indices, {3}, {2}, {3}); } } @@ -1447,7 +1447,7 @@ void FlashMaskV2GradBaseKernel( // different from hdim_qk for now DenseTensor tile_count_semaphore; if (arch >= 90) { - tile_count_semaphore = phi::Full(dev_ctx, {1}, 0); + tile_count_semaphore = Full(dev_ctx, {1}, 0); dynload::flashmaskv2_bwd_params_set_tile_count_semaphore( params_handle, tile_count_semaphore.data()); } else { diff --git a/paddle/phi/kernels/gpu/flip_kernel.cu b/paddle/phi/kernels/gpu/flip_kernel.cu index 5330c135233760..9298e0c965788c 100644 --- a/paddle/phi/kernels/gpu/flip_kernel.cu +++ b/paddle/phi/kernels/gpu/flip_kernel.cu @@ -24,9 +24,9 @@ namespace phi { template __global__ void FlipCudaKernel(const T* in_data, T* out_data, - phi::Array shape, - phi::Array stride, - phi::Array flip_dims, + Array shape, + Array stride, + Array flip_dims, const int rank, const int64_t numel, const int flip_dims_size) { @@ -79,9 +79,9 @@ void FlipKernel(const Context& dev_ctx, size_t flip_dims_size = axis.size(); auto x_stride = common::stride(x_dims); - phi::Array stride_array; - phi::Array shape_array; - phi::Array flip_dims_array; + Array stride_array; + Array shape_array; + Array flip_dims_array; for (int i = 0; i < rank; ++i) { stride_array[i] = x_stride[i]; @@ -93,7 +93,7 @@ void FlipKernel(const Context& dev_ctx, } } - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel); FlipCudaKernel <<>>( in_data, diff --git a/paddle/phi/kernels/gpu/full_kernel.cu b/paddle/phi/kernels/gpu/full_kernel.cu index 71f022b58b4377..828a0a7c298ad7 100644 --- a/paddle/phi/kernels/gpu/full_kernel.cu +++ b/paddle/phi/kernels/gpu/full_kernel.cu @@ -71,14 +71,13 @@ void FullLikeKernel(const Context& dev_ctx, // the operator is 0 int64_t numel = out->numel(); - if (!std::is_same::value && - !std::is_same::value && - 
!std::is_same::value) { + if (!std::is_same::value && + !std::is_same::value && !std::is_same::value) { auto value = val.to(); using CommonType = typename std::common_type< float, - typename std::conditional::value || - std::is_same::value, + typename std::conditional::value || + std::is_same::value, float, T>::type>::type; auto common_type_value = static_cast(value); diff --git a/paddle/phi/kernels/gpu/fused_adam_kernel.cu b/paddle/phi/kernels/gpu/fused_adam_kernel.cu index f4261ffcbfb9ac..371632204d6e91 100644 --- a/paddle/phi/kernels/gpu/fused_adam_kernel.cu +++ b/paddle/phi/kernels/gpu/fused_adam_kernel.cu @@ -31,7 +31,7 @@ namespace phi { template struct FusedAdamBetaPowInfo { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; FusedAdamBetaPowInfo(const MT* beta1pow, const MT* beta2pow) { beta1pow_ = *beta1pow; beta2pow_ = *beta2pow; @@ -48,7 +48,7 @@ struct FusedAdamBetaPowInfo { template struct FusedAdamBetaPowInfo { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; FusedAdamBetaPowInfo(const MT* beta1pow, const MT* beta2pow) { beta1pow_ = beta1pow; beta2pow_ = beta2pow; @@ -286,7 +286,7 @@ static void CopyTensorIfDifferent(const Context& dev_ctx, for (size_t i = 0; i < src.size(); ++i) { if (src[i] != dst[i]) { VLOG(10) << "Copy Tensor " << i; - phi::Place place = (use_src_place ? src[i]->place() : dev_ctx.GetPlace()); + Place place = (use_src_place ? src[i]->place() : dev_ctx.GetPlace()); Copy(dev_ctx, *(src[i]), place, false, dst[i]); } } @@ -330,7 +330,7 @@ PADDLE_API void FusedAdamKernel( std::vector beta1_pows_out, std::vector beta2_pows_out, std::vector master_params_out) { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; auto n = params.size(); auto beta1_pow_first = beta1_pows[0]; diff --git a/paddle/phi/kernels/gpu/fused_rms_norm_quant_kernel.cu b/paddle/phi/kernels/gpu/fused_rms_norm_quant_kernel.cu index f6493b89b46ecb..0014b33005f170 100644 --- a/paddle/phi/kernels/gpu/fused_rms_norm_quant_kernel.cu +++ b/paddle/phi/kernels/gpu/fused_rms_norm_quant_kernel.cu @@ -1056,7 +1056,7 @@ struct AffineQuantStore { float normalized_val = normalized_i * static_cast(gamma_pack.elem[i]) + static_cast(beta_pack.elem[i]); - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { y_pack.elem[i] = FP8QuantHelperFunc(normalized_val, quant_out_scale, quant_round_type, @@ -1138,7 +1138,7 @@ void RmsNormQuantKernel(const Context& dev_ctx, quant_scale)); } - using ComputeType = typename phi::dtype::MPTypeTrait::Type; + using ComputeType = typename dtype::MPTypeTrait::Type; const T* x_data = x.data(); const T* norm_weight_data = norm_weight.data(); @@ -1181,9 +1181,8 @@ void RmsNormQuantKernel(const Context& dev_ctx, dev_ctx.stream(), load, store, rows, cols, epsilon, inv_var_data); } else if (out->dtype() == DataType::FLOAT8_E4M3FN) { // Quantize and output float8_e4m3fn. - phi::float8_e4m3fn* out_data = - dev_ctx.template Alloc(out); - AffineQuantStore store( + float8_e4m3fn* out_data = dev_ctx.template Alloc(out); + AffineQuantStore store( out_data, cols, norm_weight_data, @@ -1220,9 +1219,8 @@ void RmsNormQuantKernel(const Context& dev_ctx, dev_ctx.stream(), load, store, rows, cols, epsilon, inv_var_data); } else if (out->dtype() == DataType::FLOAT8_E4M3FN) { // Quantize and output float8_e4m3fn. 
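    // float8_e4m3fn has 4 exponent / 3 mantissa bits and a finite maximum of
    // 448, so the store presumably scales the normalized value by
    // quant_out_scale and saturates it (via FP8QuantHelperFunc with the
    // configured quant_round_type) before the narrowing cast.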
- phi::float8_e4m3fn* out_data = - dev_ctx.template Alloc(out); - AffineQuantStore store( + float8_e4m3fn* out_data = dev_ctx.template Alloc(out); + AffineQuantStore store( out_data, cols, norm_weight_data, @@ -1256,7 +1254,7 @@ void ResidualAddRmsNormWrapper(const Context& dev_ctx, const int cols, T* residual_output, T* output) { - using ComputeType = typename phi::dtype::MPTypeTrait::Type; + using ComputeType = typename dtype::MPTypeTrait::Type; ResidualAddBiasLoad load( x, residual, bias, residual_output, cols); AffineStore store(output, cols, norm_weight, norm_bias); @@ -1265,28 +1263,28 @@ void ResidualAddRmsNormWrapper(const Context& dev_ctx, } template void ResidualAddRmsNormWrapper(const GPUContext& dev_ctx, - const phi::float16* x, - const phi::float16* residual, - const phi::float16* bias, - const phi::float16* norm_weight, - const phi::float16* norm_bias, + const float16* x, + const float16* residual, + const float16* bias, + const float16* norm_weight, + const float16* norm_bias, const float epsilon, const int rows, const int cols, - phi::float16* residual_output, - phi::float16* output); + float16* residual_output, + float16* output); template void ResidualAddRmsNormWrapper(const GPUContext& dev_ctx, - const phi::bfloat16* x, - const phi::bfloat16* residual, - const phi::bfloat16* bias, - const phi::bfloat16* norm_weight, - const phi::bfloat16* norm_bias, + const bfloat16* x, + const bfloat16* residual, + const bfloat16* bias, + const bfloat16* norm_weight, + const bfloat16* norm_bias, const float epsilon, const int rows, const int cols, - phi::bfloat16* residual_output, - phi::bfloat16* output); + bfloat16* residual_output, + bfloat16* output); template void ResidualAddRmsNormWrapper(const GPUContext& dev_ctx, const float* x, @@ -1309,7 +1307,7 @@ void RmsNormWrapper(const Context& dev_ctx, const int rows, const int cols, T* output) { - using ComputeType = typename phi::dtype::MPTypeTrait::Type; + using ComputeType = typename dtype::MPTypeTrait::Type; DirectLoad load(x, cols); AffineStore store(output, cols, weight, bias); @@ -1318,22 +1316,22 @@ void RmsNormWrapper(const Context& dev_ctx, } template void RmsNormWrapper(const GPUContext& dev_ctx, - const phi::float16* x, - const phi::float16* weight, - const phi::float16* bias, + const float16* x, + const float16* weight, + const float16* bias, const float epsilon, const int rows, const int cols, - phi::float16* output); + float16* output); template void RmsNormWrapper(const GPUContext& dev_ctx, - const phi::bfloat16* x, - const phi::bfloat16* weight, - const phi::bfloat16* bias, + const bfloat16* x, + const bfloat16* weight, + const bfloat16* bias, const float epsilon, const int rows, const int cols, - phi::bfloat16* output); + bfloat16* output); template void RmsNormWrapper(const GPUContext& dev_ctx, const float* x, diff --git a/paddle/phi/kernels/gpu/fused_token_prune_kernel.cu b/paddle/phi/kernels/gpu/fused_token_prune_kernel.cu index 2c6feb325ac789..10c3145f6f0309 100644 --- a/paddle/phi/kernels/gpu/fused_token_prune_kernel.cu +++ b/paddle/phi/kernels/gpu/fused_token_prune_kernel.cu @@ -148,8 +148,8 @@ void FusedTokenPruneOpCUDAKernel(const Context& dev_ctx, dev_ctx, attn_tmp, false, reduce_dims, attn_accu.dtype(), &attn_accu); // 3. 
Prepare token indices - phi::backends::gpu::GpuLaunchConfig config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, bsz * max_seq_len); + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, bsz * max_seq_len); FillIndex<<::max(); - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, bsz); + config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, bsz); MaximumFirst << <<>>( x.data(), diff --git a/paddle/phi/kernels/gpu/gather_nd_kernel.cu b/paddle/phi/kernels/gpu/gather_nd_kernel.cu index 8cb5c5028415f6..18f0b880a55793 100644 --- a/paddle/phi/kernels/gpu/gather_nd_kernel.cu +++ b/paddle/phi/kernels/gpu/gather_nd_kernel.cu @@ -40,7 +40,7 @@ void GatherNdKernel(const Context &dev_ctx, for (int i = 0; i < x.dims().size(); ++i) { out_dims.emplace_back(1); } - phi::TileKernel(dev_ctx, x, phi::IntArray(out_dims), out); + TileKernel(dev_ctx, x, IntArray(out_dims), out); return; } if (index.dims()[0] == 0 && index.numel() == 0) return; diff --git a/paddle/phi/kernels/gpu/gaussian_kernel.cu b/paddle/phi/kernels/gpu/gaussian_kernel.cu index d105035fe1f28a..7eb7857015c95e 100644 --- a/paddle/phi/kernels/gpu/gaussian_kernel.cu +++ b/paddle/phi/kernels/gpu/gaussian_kernel.cu @@ -29,7 +29,7 @@ namespace phi { template -using ComplexType = phi::dtype::complex; +using ComplexType = dtype::complex; template struct GaussianGenerator { @@ -46,7 +46,7 @@ struct GaussianGenerator { __host__ __device__ T operator()(const unsigned int n) const { thrust::minstd_rand rng; rng.seed(seed_); - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; thrust::normal_distribution dist(static_cast(mean_), static_cast(std_)); unsigned int new_n = n + offset_; @@ -86,8 +86,8 @@ struct GaussianGenerator> { // If T is not complex template ::value && - !std::is_same::value, + std::enable_if_t::value && + !std::is_same::value, bool> = true> void GaussianRandom(const Context& dev_ctx, const IntArray& shape, @@ -100,7 +100,7 @@ void GaussianRandom(const Context& dev_ctx, dev_ctx.template Alloc(out); if (seed == 0) { // use global Generator seed - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; funcs::normal_distribution dist; funcs::normal_transform trans(static_cast(mean), static_cast(std)); @@ -116,8 +116,8 @@ void GaussianRandom(const Context& dev_ctx, // If T is complex template ::value || - std::is_same::value, + std::enable_if_t::value || + std::is_same::value, bool> = true> void GaussianRandom(const Context& dev_ctx, const IntArray& shape, @@ -137,15 +137,14 @@ void GaussianRandom(const Context& dev_ctx, out_imag.Resize(shape.GetData()); dev_ctx.template Alloc(&out_real); dev_ctx.template Alloc(&out_imag); - funcs::normal_distribution> dist; - funcs::normal_distribution> dist_imag; - funcs::normal_transform> trans(mean, - std_of_real_or_imag); - funcs::distribution_and_transform>( + funcs::normal_distribution> dist; + funcs::normal_distribution> dist_imag; + funcs::normal_transform> trans(mean, std_of_real_or_imag); + funcs::distribution_and_transform>( dev_ctx, &out_real, dist, trans); - funcs::distribution_and_transform>( + funcs::distribution_and_transform>( dev_ctx, &out_imag, dist_imag, trans); - phi::ComplexKernel>(dev_ctx, out_real, out_imag, out); + ComplexKernel>(dev_ctx, out_real, out_imag, out); } else { // use OP seed auto func = GaussianGenerator(mean, std_of_real_or_imag, seed); @@ -156,8 +155,8 @@ void GaussianRandom(const Context& dev_ctx, // If T is not complex template 
::value && - !std::is_same::value, + std::enable_if_t::value && + !std::is_same::value, bool> = true> void GaussianRandomInplace(const Context& dev_ctx, const DenseTensor& x, @@ -168,7 +167,7 @@ void GaussianRandomInplace(const Context& dev_ctx, dev_ctx.template Alloc(out); if (seed == 0) { // use global Generator seed - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; funcs::normal_distribution dist; funcs::normal_transform trans(static_cast(mean), static_cast(std)); @@ -184,8 +183,8 @@ void GaussianRandomInplace(const Context& dev_ctx, // If T is complex template ::value || - std::is_same::value, + std::enable_if_t::value || + std::is_same::value, bool> = true> void GaussianRandomInplace(const Context& dev_ctx, const DenseTensor& x, @@ -203,15 +202,14 @@ void GaussianRandomInplace(const Context& dev_ctx, out_imag.Resize(x.dims()); dev_ctx.template Alloc(&out_real); dev_ctx.template Alloc(&out_imag); - funcs::normal_distribution> dist; - funcs::normal_distribution> dist_imag; - funcs::normal_transform> trans(mean, - std_of_real_or_imag); - funcs::distribution_and_transform>( + funcs::normal_distribution> dist; + funcs::normal_distribution> dist_imag; + funcs::normal_transform> trans(mean, std_of_real_or_imag); + funcs::distribution_and_transform>( dev_ctx, &out_real, dist, trans); - funcs::distribution_and_transform>( + funcs::distribution_and_transform>( dev_ctx, &out_imag, dist_imag, trans); - phi::ComplexKernel>(dev_ctx, out_real, out_imag, out); + ComplexKernel>(dev_ctx, out_real, out_imag, out); } else { // use OP seed auto func = GaussianGenerator(mean, std_of_real_or_imag, seed); diff --git a/paddle/phi/kernels/gpu/gelu_grad_kernel.cu b/paddle/phi/kernels/gpu/gelu_grad_kernel.cu index a2da0717bba5d7..bdc2265e636c27 100644 --- a/paddle/phi/kernels/gpu/gelu_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/gelu_grad_kernel.cu @@ -28,7 +28,7 @@ namespace phi { template struct GeluWithApproximateGradFunctor { - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; inline HOSTDEVICE T operator()(T arg_x, T arg_dout) { MPType x = static_cast(arg_x); MPType dout = static_cast(arg_dout); @@ -54,7 +54,7 @@ struct GeluWithApproximateGradFunctor { template struct GeluWithoutApproximateGradFunctor { - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; inline HOSTDEVICE T operator()(T arg_x, T arg_dout) { MPType x = static_cast(arg_x); MPType dout = static_cast(arg_dout); diff --git a/paddle/phi/kernels/gpu/gelu_kernel.cu b/paddle/phi/kernels/gpu/gelu_kernel.cu index b276ca277311df..a9058c157774cf 100644 --- a/paddle/phi/kernels/gpu/gelu_kernel.cu +++ b/paddle/phi/kernels/gpu/gelu_kernel.cu @@ -32,7 +32,7 @@ namespace phi { template struct GeluWithApproximateFunctor { - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; inline HOSTDEVICE T operator()(T arg_x) { // this function is tanh approximation of gelu MPType x = static_cast(arg_x); @@ -48,7 +48,7 @@ struct GeluWithApproximateFunctor { template struct GeluWithoutApproximateFunctor { - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; inline HOSTDEVICE T operator()(T arg_x) { // actual gelu with approximation = false MPType x = static_cast(arg_x); diff --git a/paddle/phi/kernels/gpu/generate_proposals_kernel.cu b/paddle/phi/kernels/gpu/generate_proposals_kernel.cu index 
7015d483d2188f..bed31526f6ccb7 100644 --- a/paddle/phi/kernels/gpu/generate_proposals_kernel.cu +++ b/paddle/phi/kernels/gpu/generate_proposals_kernel.cu @@ -68,7 +68,7 @@ static void SortDescending(const GPUContext &dev_ctx, dev_ctx.stream()); // Allocate temporary storage auto place = dev_ctx.GetPlace(); - auto d_temp_storage = phi::memory_utils::Alloc(place, temp_storage_bytes); + auto d_temp_storage = memory_utils::Alloc(place, temp_storage_bytes); // Run sorting operation cub::DeviceRadixSort::SortPairsDescending(d_temp_storage->ptr(), @@ -293,10 +293,10 @@ static void NMS(const GPUContext &dev_ctx, const T *boxes = proposals.data(); auto place = dev_ctx.GetPlace(); - auto mask_ptr = phi::memory_utils::Alloc( - place, - boxes_num * col_blocks * sizeof(uint64_t), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + auto mask_ptr = + memory_utils::Alloc(place, + boxes_num * col_blocks * sizeof(uint64_t), + Stream(reinterpret_cast(dev_ctx.stream()))); uint64_t *mask_dev = reinterpret_cast(mask_ptr->ptr()); NMSKernel<<>>( diff --git a/paddle/phi/kernels/gpu/global_gather_kernel.cu b/paddle/phi/kernels/gpu/global_gather_kernel.cu index a7e31903e24daf..0663075345a1d0 100644 --- a/paddle/phi/kernels/gpu/global_gather_kernel.cu +++ b/paddle/phi/kernels/gpu/global_gather_kernel.cu @@ -76,15 +76,15 @@ struct GlobalGatherFunctor { cpu_global_count_data = cpu_global_count.data(); } - ncclDataType_t dtype = phi::ToNCCLDataType(x->dtype()); + ncclDataType_t dtype = ToNCCLDataType(x->dtype()); gpuStream_t stream = nullptr; stream = dev_ctx.stream(); - phi::distributed::NCCLCommContext *comm_ctx = nullptr; + distributed::NCCLCommContext *comm_ctx = nullptr; int nranks = 0; - comm_ctx = static_cast( - dev_ctx.GetCommContext()); + comm_ctx = + static_cast(dev_ctx.GetCommContext()); PADDLE_ENFORCE_NE(comm_ctx, nullptr, common::errors::Unavailable( diff --git a/paddle/phi/kernels/gpu/global_scatter_kernel.cu b/paddle/phi/kernels/gpu/global_scatter_kernel.cu index f14b3eea87517d..f1d4a4add322cf 100644 --- a/paddle/phi/kernels/gpu/global_scatter_kernel.cu +++ b/paddle/phi/kernels/gpu/global_scatter_kernel.cu @@ -75,14 +75,14 @@ struct GlobalScatterFunctor { global_count_len = cpu_global_count.numel(); } - ncclDataType_t dtype = phi::ToNCCLDataType(x->dtype()); + ncclDataType_t dtype = ToNCCLDataType(x->dtype()); gpuStream_t stream = nullptr; stream = dev_ctx.stream(); - phi::distributed::NCCLCommContext* comm_ctx = nullptr; + distributed::NCCLCommContext* comm_ctx = nullptr; int nranks = 0; - comm_ctx = static_cast( - dev_ctx.GetCommContext()); + comm_ctx = + static_cast(dev_ctx.GetCommContext()); PADDLE_ENFORCE_NE(comm_ctx, nullptr, common::errors::Unavailable( diff --git a/paddle/phi/kernels/gpu/graph_reindex_kernel.cu b/paddle/phi/kernels/gpu/graph_reindex_kernel.cu index 6cbdb172e0a9f6..e08075b69facc8 100644 --- a/paddle/phi/kernels/gpu/graph_reindex_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_reindex_kernel.cu @@ -40,14 +40,14 @@ __global__ void InitializeHashTable(T* tensor, int len) { } template -std::shared_ptr FillHashTable(const Context& dev_ctx, - const T* input, - int num_input, - int64_t len_hashtable, - T* keys, - int* values, - int* key_index, - int* final_nodes_len) { +std::shared_ptr FillHashTable(const Context& dev_ctx, + const T* input, + int num_input, + int64_t len_hashtable, + T* keys, + int* values, + int* key_index, + int* final_nodes_len) { const auto place = dev_ctx.GetPlace(); int block = 1024; @@ -73,7 +73,7 @@ std::shared_ptr FillHashTable(const Context& dev_ctx, int 
total_unique_items = item_count[num_input]; auto unique_items = - phi::memory_utils::AllocShared(place, total_unique_items * sizeof(T)); + memory_utils::AllocShared(place, total_unique_items * sizeof(T)); T* unique_items_data = reinterpret_cast(unique_items->ptr()); *final_nodes_len = total_unique_items; @@ -176,12 +176,11 @@ void Reindex(const Context& dev_ctx, int64_t log_num = 1 << static_cast(1 + std::log2(num >> 1)); int64_t table_size = log_num << 1; - auto keys = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), table_size * sizeof(T)); + auto keys = memory_utils::Alloc(dev_ctx.GetPlace(), table_size * sizeof(T)); auto values = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), table_size * sizeof(int)); + memory_utils::Alloc(dev_ctx.GetPlace(), table_size * sizeof(int)); auto key_index = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), table_size * sizeof(int)); + memory_utils::Alloc(dev_ctx.GetPlace(), table_size * sizeof(int)); T* keys_ptr = reinterpret_cast(keys->ptr()); int* values_ptr = reinterpret_cast(values->ptr()); int* key_index_ptr = reinterpret_cast(key_index->ptr()); @@ -197,7 +196,7 @@ void Reindex(const Context& dev_ctx, key_index_ptr, table_size); int unique_len = 0; - std::shared_ptr unique_items = + std::shared_ptr unique_items = FillHashTable(dev_ctx, thrust::raw_pointer_cast(out_nodes->data()), out_nodes->size(), diff --git a/paddle/phi/kernels/gpu/graph_send_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_recv_funcs.h index 71c915ef916192..32d6ac6a5947c1 100644 --- a/paddle/phi/kernels/gpu/graph_send_recv_funcs.h +++ b/paddle/phi/kernels/gpu/graph_send_recv_funcs.h @@ -42,7 +42,7 @@ struct GraphSendRecvMaxCUDAFunctor { T* output, const IndexT& in_i, const IndexT& out_i) { - phi::CudaAtomicMax(output + out_i, *(params + in_i)); + CudaAtomicMax(output + out_i, *(params + in_i)); } }; @@ -52,7 +52,7 @@ struct GraphSendRecvMinCUDAFunctor { T* output, const IndexT& in_i, const IndexT& out_i) { - phi::CudaAtomicMin(output + out_i, *(params + in_i)); + CudaAtomicMin(output + out_i, *(params + in_i)); } }; diff --git a/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h index 3d6eb173d10f47..30afc8569778f7 100644 --- a/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h +++ b/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h @@ -109,14 +109,14 @@ struct GraphSendUERecvSumCUDAFunctor { template struct GraphSendUERecvMaxCUDAFunctor { DEVICE inline void operator()(T* output, T val) { - phi::CudaAtomicMax(output, val); + CudaAtomicMax(output, val); } }; template struct GraphSendUERecvMinCUDAFunctor { DEVICE inline void operator()(T* output, T val) { - phi::CudaAtomicMin(output, val); + CudaAtomicMin(output, val); } }; diff --git a/paddle/phi/kernels/gpu/grid_sample_utils.h b/paddle/phi/kernels/gpu/grid_sample_utils.h index 8097e6f7007837..66ee5d5fd50a18 100644 --- a/paddle/phi/kernels/gpu/grid_sample_utils.h +++ b/paddle/phi/kernels/gpu/grid_sample_utils.h @@ -62,14 +62,14 @@ inline bool cudnnIsAvailable() { return false; #elif defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // cuDNN/MIOpen version > 0 means DNN lib loaded; require v7+ for sampler - return phi::backends::gpu::DnnVersion() >= 7000; + return backends::gpu::DnnVersion() >= 7000; #else return false; #endif } inline bool isGpuTensor(const DenseTensor& x) { - return phi::is_gpu_place(x.place()); + return is_gpu_place(x.place()); } inline bool canUse32bitIndexMath(const DenseTensor& x) { diff --git a/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu 
b/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu index 5bb61893098ef9..1931e8042256f3 100644 --- a/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu @@ -160,7 +160,7 @@ template __device__ __forceinline__ AccT GradWarpReduceSum(AccT val) { #pragma unroll for (int offset = warpSize / 2; offset > 0; offset >>= 1) { - val += phi::backends::gpu::CudaShuffleDownSync(0xffffffff, val, offset); + val += backends::gpu::CudaShuffleDownSync(0xffffffff, val, offset); } return val; } @@ -752,7 +752,7 @@ void GroupNormGradKernel(const Context& dev_ctx, } return; } - using AccT = typename phi::dtype::MPTypeTrait::Type; + using AccT = typename dtype::MPTypeTrait::Type; const DataLayout data_layout = StringToDataLayout(data_layout_str); const auto scale_ptr = scale.get_ptr(); const auto bias_ptr = bias.get_ptr(); diff --git a/paddle/phi/kernels/gpu/group_norm_kernel.cu b/paddle/phi/kernels/gpu/group_norm_kernel.cu index 9d94f6d36c1d6c..ebbb4ad20199c0 100644 --- a/paddle/phi/kernels/gpu/group_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/group_norm_kernel.cu @@ -125,7 +125,7 @@ static int64_t findMaxDivisor(int64_t n, int64_t maxAllowedDivisor) { template inline __device__ void UpdateSum(const T* srcX, float* sum, float* sumSq) { - float src_data = phi::__2float(*srcX); + float src_data = __2float(*srcX); *sum += src_data; *sumSq += src_data * src_data; } @@ -135,8 +135,8 @@ inline __device__ void UpdateSum(const T* srcX, const T* srcR, float* sum, float* sumSq) { - float src_data = phi::__2float(*srcX); - float srcy_data = phi::__2float(*srcR); + float src_data = __2float(*srcX); + float srcy_data = __2float(*srcR); *sum += src_data + srcy_data; *sumSq += (src_data + srcy_data) * (src_data + srcy_data); } @@ -166,9 +166,9 @@ inline __device__ void UpdateSum<__half, 2>(const __half* srcX, } template <> -inline __device__ void UpdateSum(const phi::float16* srcX, - float* sum, - float* sumSq) { +inline __device__ void UpdateSum(const float16* srcX, + float* sum, + float* sumSq) { __half2 h2 = *reinterpret_cast<__half2 const*>(srcX); float2 f2 = __half22float2(h2); *sum += f2.x + f2.y; @@ -176,10 +176,10 @@ inline __device__ void UpdateSum(const phi::float16* srcX, } template <> -inline __device__ void UpdateSum(const phi::float16* srcX, - const phi::float16* srcR, - float* sum, - float* sumSq) { +inline __device__ void UpdateSum(const float16* srcX, + const float16* srcR, + float* sum, + float* sumSq) { __half2 h2 = *reinterpret_cast<__half2 const*>(srcX); __half2 h2_r = *reinterpret_cast<__half2 const*>(srcR); float2 f2 = __half22float2(h2); @@ -191,24 +191,24 @@ inline __device__ void UpdateSum(const phi::float16* srcX, #ifdef PADDLE_CUDA_BF16 template <> -inline __device__ void UpdateSum(const phi::bfloat16* srcX, - float* sum, - float* sumSq) { +inline __device__ void UpdateSum(const bfloat16* srcX, + float* sum, + float* sumSq) { __nv_bfloat162 h2 = *reinterpret_cast<__nv_bfloat162 const*>(srcX); - float2 f2 = phi::bfloat1622float2(h2); + float2 f2 = bfloat1622float2(h2); *sum += f2.x + f2.y; *sumSq += f2.x * f2.x + f2.y * f2.y; } template <> -inline __device__ void UpdateSum(const phi::bfloat16* srcX, - const phi::bfloat16* srcR, - float* sum, - float* sumSq) { +inline __device__ void UpdateSum(const bfloat16* srcX, + const bfloat16* srcR, + float* sum, + float* sumSq) { __nv_bfloat162 h2 = *reinterpret_cast<__nv_bfloat162 const*>(srcX); __nv_bfloat162 h2_r = *reinterpret_cast<__nv_bfloat162 const*>(srcR); - float2 f2 = phi::bfloat1622float2(h2); - 
float2 f2_r = phi::bfloat1622float2(h2_r); + float2 f2 = bfloat1622float2(h2); + float2 f2_r = bfloat1622float2(h2_r); *sum += f2.x + f2_r.x + f2.y + f2_r.y; *sumSq += (f2.x + f2_r.x) * (f2.x + f2_r.x) + (f2.y + f2_r.y) * (f2.y + f2_r.y); @@ -418,22 +418,20 @@ inline __device__ void GroupNormCompute(int64_t dhwBegin, const GroupNormNDHWCParams& params, float mean, float invStdDev) { - float gamma = - phi::__2float(*(reinterpret_cast(params.gamma) + ci)); - float beta = - phi::__2float(*(reinterpret_cast(params.beta) + ci)); + float gamma = __2float(*(reinterpret_cast(params.gamma) + ci)); + float beta = __2float(*(reinterpret_cast(params.beta) + ci)); for (int64_t dhwi = dhwBegin; dhwi < dhwEnd; ++dhwi) { // The src/dst offset. int64_t offset = static_cast(blockIdx.z) * params.dhwc + dhwi * params.c + ci; - float src_data = phi::__2float(params.srcX[offset]); + float src_data = __2float(params.srcX[offset]); if (params.srcR != nullptr) { auto gi = ci / params.cPerGroup; auto gj = ci % params.cPerGroup; int64_t g_offset = params.y_same_with_x ? offset : gi * params.cPerGroup + gj; - src_data += phi::__2float(params.srcR[g_offset]); - *reinterpret_cast(¶ms.eleOut[offset]) = phi::__2dst(src_data); + src_data += __2float(params.srcR[g_offset]); + *reinterpret_cast(¶ms.eleOut[offset]) = __2dst(src_data); } // Normalize the channels. float dst_data = (src_data - mean) * invStdDev; @@ -446,16 +444,16 @@ inline __device__ void GroupNormCompute(int64_t dhwBegin, } // Store the scaled values. - *reinterpret_cast(¶ms.dst[offset]) = phi::__2dst(dst_data); + *reinterpret_cast(¶ms.dst[offset]) = __2dst(dst_data); } } template <> -inline __device__ void GroupNormCompute( +inline __device__ void GroupNormCompute( int64_t dhwBegin, int64_t dhwEnd, int32_t ci, - const GroupNormNDHWCParams& params, + const GroupNormNDHWCParams& params, float mean, float invStdDev) { float2 gammaF2, betaF2; @@ -563,17 +561,17 @@ inline __device__ void GroupNormCompute<__half, 2>( #ifdef PADDLE_CUDA_BF16 template <> -inline __device__ void GroupNormCompute( +inline __device__ void GroupNormCompute( int64_t dhwBegin, int64_t dhwEnd, int32_t ci, - const GroupNormNDHWCParams& params, + const GroupNormNDHWCParams& params, float mean, float invStdDev) { float2 gammaF2, betaF2; - gammaF2 = phi::bfloat1622float2(*reinterpret_cast<__nv_bfloat162 const*>( + gammaF2 = bfloat1622float2(*reinterpret_cast<__nv_bfloat162 const*>( reinterpret_cast<__nv_bfloat16 const*>(params.gamma) + ci)); - betaF2 = phi::bfloat1622float2(*reinterpret_cast<__nv_bfloat162 const*>( + betaF2 = bfloat1622float2(*reinterpret_cast<__nv_bfloat162 const*>( reinterpret_cast<__nv_bfloat16 const*>(params.beta) + ci)); // Iterate over the activations to compute the sums. @@ -587,7 +585,7 @@ inline __device__ void GroupNormCompute( *reinterpret_cast<__nv_bfloat162 const*>(¶ms.srcX[offset]); // Extract the two half values. - float2 f2 = phi::bfloat1622float2(h2); + float2 f2 = bfloat1622float2(h2); if (params.srcR != nullptr) { auto gi = ci / params.cPerGroup; @@ -596,11 +594,11 @@ inline __device__ void GroupNormCompute( params.y_same_with_x ? offset : gi * params.cPerGroup + gj; __nv_bfloat162 r2 = *reinterpret_cast<__nv_bfloat162 const*>(¶ms.srcR[g_offset]); - float2 r_f2 = phi::bfloat1622float2(r2); + float2 r_f2 = bfloat1622float2(r2); f2.x += r_f2.x; f2.y += r_f2.y; *reinterpret_cast<__nv_bfloat162*>(¶ms.eleOut[offset]) = - phi::float22bfloat162_rn(f2); + float22bfloat162_rn(f2); } // Normalize the channels. 
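// For reference, the per-channel math below is the standard group-norm affine
// transform, written here in scalar form; the bf16 specialization simply
// applies it to two packed channels at once through gammaF2/betaF2:
//   y = (x - mean) * invStdDev;   // normalize with the group statistics
//   y = y * gamma + beta;         // per-channel scale and shift
// optionally followed by SiLU when params.withSilu is set.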
f2.x = (f2.x - mean) * invStdDev; @@ -617,7 +615,7 @@ inline __device__ void GroupNormCompute( } // Store the scaled values. *reinterpret_cast<__nv_bfloat162*>(¶ms.dst[offset]) = - phi::float22bfloat162_rn(f2); + float22bfloat162_rn(f2); } } #endif @@ -736,7 +734,7 @@ void GroupNormNDHWCKernel(const Context& dev_ctx, if (data_layout != DataLayout::NHWC) { PD_THROW("data_layout only supports NHWC and NDHWC"); } - using AccT = typename phi::dtype::MPTypeTrait::Type; + using AccT = typename dtype::MPTypeTrait::Type; GroupNormNDHWCParams params_; params_.withSilu = activation == "silu" ? true : false; @@ -886,17 +884,17 @@ void GroupNormNDHWCKernel(const Context& dev_ctx, groupNormNDHWCScale ndhwc_scale; ndhwc_scale(params_, stream); #ifdef PADDLE_WITH_HIP - phi::backends::gpu::GpuMemcpyAsync(mean_data, - params_.redBuffer, - params_.n * groups * sizeof(float), - hipMemcpyDeviceToHost, - stream); + backends::gpu::GpuMemcpyAsync(mean_data, + params_.redBuffer, + params_.n * groups * sizeof(float), + hipMemcpyDeviceToHost, + stream); #else - phi::backends::gpu::GpuMemcpyAsync(mean_data, - params_.redBuffer, - params_.n * groups * sizeof(float), - cudaMemcpyDeviceToHost, - stream); + backends::gpu::GpuMemcpyAsync(mean_data, + params_.redBuffer, + params_.n * groups * sizeof(float), + cudaMemcpyDeviceToHost, + stream); #endif } @@ -952,13 +950,10 @@ __device__ __forceinline__ WelfordData WelfordWarpReduce( for (int offset = warpSize / 2; offset > 0; offset >>= 1) { WelfordData other; other.mean = - phi::backends::gpu::CudaShuffleDownSync(0xffffffff, val.mean, offset); - other.m2 = - phi::backends::gpu::CudaShuffleDownSync(0xffffffff, val.m2, offset); - other.n = - phi::backends::gpu::CudaShuffleDownSync(0xffffffff, val.n, offset); - other.nf = - phi::backends::gpu::CudaShuffleDownSync(0xffffffff, val.nf, offset); + backends::gpu::CudaShuffleDownSync(0xffffffff, val.mean, offset); + other.m2 = backends::gpu::CudaShuffleDownSync(0xffffffff, val.m2, offset); + other.n = backends::gpu::CudaShuffleDownSync(0xffffffff, val.n, offset); + other.nf = backends::gpu::CudaShuffleDownSync(0xffffffff, val.nf, offset); val = WelfordCombine(val, other); } return val; @@ -1291,7 +1286,7 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x, CudaAtomicAddWithWarp(&var[bid * groups + gid], x_var); #endif #ifdef __HIPCC__ - if (blockDim.x < phi::kps::details::kWarpSize) { + if (blockDim.x < kps::details::kWarpSize) { CudaAtomicAdd(&mean[bid * groups + gid], x_mean); CudaAtomicAdd(&var[bid * groups + gid], x_var); } else { @@ -1477,12 +1472,12 @@ void GroupNormDirectCUDAFunctor::operator()( while (block_size_nchw < max_block_size) { block_size_nchw *= 2; } - block_size_nchw = std::max(block_size_nchw, phi::kps::details::kWarpSize); + block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize); int64_t n_groups = input_ddim[0] * static_cast(groups); dim3 grids(std::min(max_grid_x, n_groups)); dim3 blocks(block_size_nchw); if (size < vec_size * block_size_nchw) { - phi::ScalarGetMeanAndVarNCHW<<>>( + ScalarGetMeanAndVarNCHW<<>>( input, mean, temp_variance, size, n_groups); } else { VectorizedGetMeanAndVarNCHW @@ -1515,7 +1510,7 @@ void GroupNormDirectCUDAFunctor::operator()( cudaMemset(temp_variance, 0, sizeof(AccT) * input_ddim[0] * groups); #endif - phi::GroupNormForwardGetMeanAndVar + GroupNormForwardGetMeanAndVar <<>>(input, input_ddim[0], C, @@ -1559,7 +1554,7 @@ void GroupNormGeneralCaseKernel(const Context& dev_ctx, DenseTensor* y, DenseTensor* mean, DenseTensor* var) { - using AccT = typename 
phi::dtype::MPTypeTrait::Type; + using AccT = typename dtype::MPTypeTrait::Type; const DataLayout data_layout = StringToDataLayout(data_layout_str); const auto scale_ptr = scale.get_ptr(); const auto bias_ptr = bias.get_ptr(); @@ -1843,42 +1838,42 @@ void GroupNormKernel(const Context& dev_ctx, return; } using std::is_same; - if (is_same::value && data_layout_str == "NHWC") { + if (is_same::value && data_layout_str == "NHWC") { const optional& residual = optional(paddle::none); DenseTensor empty_tensor; - GroupNormNDHWCKernel(dev_ctx, - x, - residual, - scale, - bias, - epsilon, - groups, - data_layout_str, - "", - y, - &empty_tensor, - mean, - var); + GroupNormNDHWCKernel(dev_ctx, + x, + residual, + scale, + bias, + epsilon, + groups, + data_layout_str, + "", + y, + &empty_tensor, + mean, + var); return; } #ifdef PADDLE_CUDA_BF16 - if (is_same::value && data_layout_str == "NHWC") { + if (is_same::value && data_layout_str == "NHWC") { const optional& residual = optional(paddle::none); DenseTensor empty_tensor; - GroupNormNDHWCKernel(dev_ctx, - x, - residual, - scale, - bias, - epsilon, - groups, - data_layout_str, - "", - y, - &empty_tensor, - mean, - var); + GroupNormNDHWCKernel(dev_ctx, + x, + residual, + scale, + bias, + epsilon, + groups, + data_layout_str, + "", + y, + &empty_tensor, + mean, + var); return; } #endif diff --git a/paddle/phi/kernels/gpu/group_norm_utils.h b/paddle/phi/kernels/gpu/group_norm_utils.h index 6fb6d155398ead..c4929a4506f5f5 100644 --- a/paddle/phi/kernels/gpu/group_norm_utils.h +++ b/paddle/phi/kernels/gpu/group_norm_utils.h @@ -49,7 +49,7 @@ __device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) { } template -__device__ __forceinline__ void ThreadReduce(phi::Array arrs, +__device__ __forceinline__ void ThreadReduce(Array arrs, int64_t size, const int offset, AccT* out_mean, @@ -168,7 +168,7 @@ __global__ void VectorizedGetMeanAndVarNCHW( AccT x_var = static_cast(0); x += i * size; const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T); - phi::Array ins; + Array ins; ins[0] = x; ThreadReduce(ins, size, input_offset, &x_mean, &x_var); ReduceMeanAndVar(mean, var, x_mean, x_var, size, i); diff --git a/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu b/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu index 39d15f32cd10d1..eff113fa9e975c 100644 --- a/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu +++ b/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu @@ -137,7 +137,7 @@ struct GumbleNoiseGenerator { DenseTensor random_tensor; int64_t size = size_to_axis * size_from_axis; random_tensor.Resize({size}); - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; MPType* random_data = dev_ctx.template Alloc(&random_tensor); // generate gumbel noise diff --git a/paddle/phi/kernels/gpu/index_add_grad_kernel.cu b/paddle/phi/kernels/gpu/index_add_grad_kernel.cu index ebc51effe98de1..e6f6cc81095a41 100644 --- a/paddle/phi/kernels/gpu/index_add_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_add_grad_kernel.cu @@ -25,8 +25,6 @@ namespace phi { -using phi::PADDLE_CUDA_NUM_THREADS; - template void IndexAddGradKernel(const Context& dev_ctx, const DenseTensor& index, @@ -100,7 +98,7 @@ void IndexAddGradKernel(const Context& dev_ctx, auto* add_value_grad_data = dev_ctx.template Alloc(add_value_grad); unsigned int block_dim = PADDLE_CUDA_NUM_THREADS; dim3 grid_dim = dim3((numel + block_dim - 1) / block_dim); - phi::backends::gpu::LimitGridDim(dev_ctx, &grid_dim); + backends::gpu::LimitGridDim(dev_ctx, 
&grid_dim); if (index_type == DataType::INT64) { const int64_t* index_data = index.data(); diff --git a/paddle/phi/kernels/gpu/index_add_kernel.cu b/paddle/phi/kernels/gpu/index_add_kernel.cu index 466383c9ae7a80..31851f9f21c032 100644 --- a/paddle/phi/kernels/gpu/index_add_kernel.cu +++ b/paddle/phi/kernels/gpu/index_add_kernel.cu @@ -26,8 +26,6 @@ COMMON_DECLARE_bool(cudnn_deterministic); namespace phi { -using phi::PADDLE_CUDA_NUM_THREADS; - template __global__ void index_add_cuda_kernel(const T* input, const IndexT* index, @@ -125,7 +123,7 @@ void IndexAddKernel(const Context& dev_ctx, unsigned int block_dim = PADDLE_CUDA_NUM_THREADS; dim3 grid_dim = dim3((num_columns + block_dim - 1) / block_dim); - phi::backends::gpu::LimitGridDim(dev_ctx, &grid_dim); + backends::gpu::LimitGridDim(dev_ctx, &grid_dim); if (index_type == DataType::INT64) { const int64_t* index_data = index.data(); @@ -153,7 +151,7 @@ void IndexAddKernel(const Context& dev_ctx, } else { unsigned int block_dim = PADDLE_CUDA_NUM_THREADS; dim3 grid_dim = dim3((numel + block_dim - 1) / block_dim); - phi::backends::gpu::LimitGridDim(dev_ctx, &grid_dim); + backends::gpu::LimitGridDim(dev_ctx, &grid_dim); if (index_type == DataType::INT64) { const int64_t* index_data = index.data(); diff --git a/paddle/phi/kernels/gpu/index_elementwise_get_grad_kernel.cu b/paddle/phi/kernels/gpu/index_elementwise_get_grad_kernel.cu index 6f669d024e449e..568294f9ef1ab7 100644 --- a/paddle/phi/kernels/gpu/index_elementwise_get_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_elementwise_get_grad_kernel.cu @@ -97,13 +97,13 @@ void GPUIndexElementwiseGetGrad(const GPUContext& dev_ctx, funcs::IndexPutStride<3>(input_dims, input_strides, - phi::SizeOf(input.dtype()), + SizeOf(input.dtype()), vectorize(value.dims()), vectorize(value.strides()), - phi::SizeOf(value.dtype()), + SizeOf(value.dtype()), shape_tmp, stride_tmp, - phi::SizeOf(index[0]->dtype()), + SizeOf(index[0]->dtype()), &desired_shape, &strides_array, &numel, @@ -111,8 +111,8 @@ void GPUIndexElementwiseGetGrad(const GPUContext& dev_ctx, auto offset_calc = funcs::make_offset_calculator_put<3, false, OffsetT>( desired_shape, strides_array); - auto max_grid_size = phi::backends::gpu::GetGpuMaxGridDimSize( - dev_ctx.GetPlace().GetDeviceId()); + auto max_grid_size = + backends::gpu::GetGpuMaxGridDimSize(dev_ctx.GetPlace().GetDeviceId()); const int64_t N = numel; constexpr int nt = 128; @@ -180,7 +180,7 @@ __global__ void IndexingBackwardKernel(const int64_t* sorted_indices, int64_t stride_before, int64_t outer_dim, bool accumulate) { - using opmath_t = typename phi::dtype::MPTypeTrait::Type; + using opmath_t = typename dtype::MPTypeTrait::Type; for (int64_t z = blockIdx.z; z < outer_dim; z += gridDim.z) { for (int64_t idx = @@ -273,7 +273,7 @@ void IndexPutWithSortKernel(const GPUContext& dev_ctx, const bool unsafe = true; const bool self_contiguous = self.meta().is_contiguous(); auto self_ = - self_contiguous ? self : phi::Contiguous(dev_ctx, self); + self_contiguous ? 
self : Contiguous(dev_ctx, self); DenseTensor linearIndex, src, expandedValue = value; int64_t nElemBefore, strideBefore, sliceSize; std::vector inversePerm; @@ -298,19 +298,18 @@ void IndexPutWithSortKernel(const GPUContext& dev_ctx, } DenseTensor expanded_tensor; - phi::ExpandKernel( - dev_ctx, expandedValue, phi::IntArray(expanded_size), &expanded_tensor); + ExpandKernel( + dev_ctx, expandedValue, IntArray(expanded_size), &expanded_tensor); expandedValue = expanded_tensor; } if (!expandedValue.meta().is_contiguous()) { - expandedValue = phi::Contiguous(dev_ctx, expandedValue); + expandedValue = Contiguous(dev_ctx, expandedValue); } if (num_indices > 0 && sliceSize > 0) { const bool permuted = !src.meta().is_contiguous(); - DenseTensor src_ = - permuted ? phi::Contiguous(dev_ctx, src) : src; - linearIndex = phi::Reshape(dev_ctx, linearIndex, {-1}); + DenseTensor src_ = permuted ? Contiguous(dev_ctx, src) : src; + linearIndex = Reshape(dev_ctx, linearIndex, {-1}); DenseTensor sorted_indices; sorted_indices.Resize(linearIndex.dims()); @@ -321,21 +320,17 @@ void IndexPutWithSortKernel(const GPUContext& dev_ctx, auto stream = dev_ctx.stream(); - auto shape = phi::IntArray(vectorize(linearIndex.dims())); - auto divisor = - Full(dev_ctx, shape, phi::Scalar(sliceSize)); + auto shape = IntArray(vectorize(linearIndex.dims())); + auto divisor = Full(dev_ctx, shape, Scalar(sliceSize)); DenseTensor linearIndex_d = - phi::FloorDivide(dev_ctx, linearIndex, divisor); + FloorDivide(dev_ctx, linearIndex, divisor); DenseTensor range; range.Resize({num_indices}); dev_ctx.Alloc(&range); - phi::ArangeKernel(dev_ctx, - phi::Scalar(0), - phi::Scalar(num_indices), - phi::Scalar(1), - &range); + ArangeKernel( + dev_ctx, Scalar(0), Scalar(num_indices), Scalar(1), &range); int64_t nbits = funcs::GetNumBits(funcs::LargestIndex(self_) / sliceSize); funcs::RadixSortPairs(dev_ctx, @@ -350,8 +345,8 @@ void IndexPutWithSortKernel(const GPUContext& dev_ctx, const int UNROLL = 4; const int INDICES_PER_BLOCK = 4; - auto max_grid_size = phi::backends::gpu::GetGpuMaxGridDimSize( - dev_ctx.GetPlace().GetDeviceId()); + auto max_grid_size = + backends::gpu::GetGpuMaxGridDimSize(dev_ctx.GetPlace().GetDeviceId()); dim3 grid( std::min(static_cast(max_grid_size[0]), diff --git a/paddle/phi/kernels/gpu/index_elementwise_put_grad_kernel.cu b/paddle/phi/kernels/gpu/index_elementwise_put_grad_kernel.cu index 5674931fb1dfaa..54d9aff4b925ad 100644 --- a/paddle/phi/kernels/gpu/index_elementwise_put_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_elementwise_put_grad_kernel.cu @@ -63,18 +63,18 @@ void GPUIndexElementwisePutGradKernel( if (value_grad) { value_dims = vectorize(value_grad->dims()); value_strides = vectorize(value_grad->strides()); - value_ele_size = phi::SizeOf(value_grad->dtype()); + value_ele_size = SizeOf(value_grad->dtype()); } funcs::IndexPutStride<3>(input_dims, input_strides, - phi::SizeOf(out_grad.dtype()), + SizeOf(out_grad.dtype()), value_dims, value_strides, value_ele_size, shape_tmp, stride_tmp, - phi::SizeOf(index[0]->dtype()), + SizeOf(index[0]->dtype()), &desired_shape, &strides_array, &numel, diff --git a/paddle/phi/kernels/gpu/index_elementwise_put_kernel.cu b/paddle/phi/kernels/gpu/index_elementwise_put_kernel.cu index 6ae475c0214527..66da5ba0cd6c59 100644 --- a/paddle/phi/kernels/gpu/index_elementwise_put_kernel.cu +++ b/paddle/phi/kernels/gpu/index_elementwise_put_kernel.cu @@ -63,13 +63,13 @@ void GPUIndexElementwisePutKernel(const GPUContext& dev_ctx, funcs::IndexPutStride<3>(input_dims, input_strides, 
- phi::SizeOf(input.dtype()), + SizeOf(input.dtype()), {}, {}, 4, shape_tmp, stride_tmp, - phi::SizeOf(index[0]->dtype()), + SizeOf(index[0]->dtype()), &desired_shape, &strides_array, &numel, @@ -172,13 +172,13 @@ void GPUIndexElementwisePutWithTensorKernel( funcs::IndexPutStride<3>(input_dims, input_strides, - phi::SizeOf(input.dtype()), + SizeOf(input.dtype()), vectorize(value.dims()), vectorize(value.strides()), - phi::SizeOf(value.dtype()), + SizeOf(value.dtype()), shape_tmp, stride_tmp, - phi::SizeOf(index[0]->dtype()), + SizeOf(index[0]->dtype()), &desired_shape, &strides_array, &numel, diff --git a/paddle/phi/kernels/gpu/index_fill_grad_kernel.cu b/paddle/phi/kernels/gpu/index_fill_grad_kernel.cu index 3c385e21b6c8de..07475a038478ad 100644 --- a/paddle/phi/kernels/gpu/index_fill_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_fill_grad_kernel.cu @@ -97,13 +97,12 @@ void LaunchIndexFillGradCudaKernel(const Context& dev_ctx, dev_ctx.template Alloc(&index_int64); int64_t index_numel = index.numel(); - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, index_numel); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, index_numel); - phi::funcs::CastToInt64Kernel<<>>( + funcs::CastToInt64Kernel<<>>( index.data(), index_int64.data(), index_numel); ptr_index = &index_int64; @@ -136,7 +135,7 @@ void LaunchIndexFillGradCudaKernel(const Context& dev_ctx, // Step 2: launch kernel to zero out gradients at the filled positions. int64_t numel = outer_size * index_size * inner_size; - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel); T* x_grad_data = x_grad->data(); diff --git a/paddle/phi/kernels/gpu/index_put_grad_kernel.cu b/paddle/phi/kernels/gpu/index_put_grad_kernel.cu index 9a2ae17d64a36a..0fa9bece638b81 100644 --- a/paddle/phi/kernels/gpu/index_put_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_put_grad_kernel.cu @@ -96,7 +96,7 @@ void LaunchIndexPutGradCudaKernel( const bool accumulate, DenseTensor* value_grad, DenseTensor* x_grad) { - phi::Allocator::AllocationPtr indices_holder_1, indices_holder_2; + Allocator::AllocationPtr indices_holder_1, indices_holder_2; if (x_grad) { Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad); if (!accumulate) { @@ -115,7 +115,7 @@ void LaunchIndexPutGradCudaKernel( const int64_t numel = indices[0]->numel(); auto pd_indices = funcs::GetDevicePointerArray( dev_ctx, indices, &indices_holder_1); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel); SetZeroCudaKernel<<numel(); auto pd_indices = funcs::GetDevicePointerArray( dev_ctx, indices, &indices_holder_2); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel); if (value_grad) { if (value_grad->numel() == 1) { diff --git a/paddle/phi/kernels/gpu/index_put_kernel.cu b/paddle/phi/kernels/gpu/index_put_kernel.cu index a79acdb80cc641..4703c046f0c1b6 100644 --- a/paddle/phi/kernels/gpu/index_put_kernel.cu +++ b/paddle/phi/kernels/gpu/index_put_kernel.cu @@ -90,11 +90,11 @@ void LaunchIndexPutCudaKernel(const Context& dev_ctx, int64_t is_single_val_tensor = (value.numel() == 1) ? 
0 : INT64_MAX; const int64_t numel = indices[0]->numel(); - phi::Allocator::AllocationPtr holder; + Allocator::AllocationPtr holder; auto pd_indices = funcs::GetDevicePointerArray(dev_ctx, indices, &holder); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel); IndexPutCudaKernel <<>>( x_data, diff --git a/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu b/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu index cf2376a257f61d..87c3b4c89c3a0b 100644 --- a/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu @@ -96,16 +96,15 @@ void IndexSampleGradKernel(const Context& dev_ctx, } bool same_data_in_index_row = index_length == 1 ? false : true; - auto block_width = phi::backends::gpu::RoundToPowerOfTwo(index_length); + auto block_width = backends::gpu::RoundToPowerOfTwo(index_length); block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X); auto block_height = - phi::backends::gpu::RoundToPowerOfTwo(index_length * batch_size) / - block_width; + backends::gpu::RoundToPowerOfTwo(index_length * batch_size) / block_width; block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width); dim3 block_dim(block_width, block_height); dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x, (batch_size + block_dim.y - 1) / block_dim.y); - phi::backends::gpu::LimitGridDim(dev_ctx, &grid_dim); + backends::gpu::LimitGridDim(dev_ctx, &grid_dim); bool use_int32 = true; if (out_grad.numel() > UINT32_MAX || x_grad->numel() > UINT32_MAX) { diff --git a/paddle/phi/kernels/gpu/index_sample_kernel.cu b/paddle/phi/kernels/gpu/index_sample_kernel.cu index 906108d6a7ed2d..1acfb98510275e 100644 --- a/paddle/phi/kernels/gpu/index_sample_kernel.cu +++ b/paddle/phi/kernels/gpu/index_sample_kernel.cu @@ -83,16 +83,15 @@ void IndexSampleKernel(const Context& dev_ctx, if (batch_size == 0 || input_length == 0 || index_length == 0) { return; } - auto block_width = phi::backends::gpu::RoundToPowerOfTwo(index_length); + auto block_width = backends::gpu::RoundToPowerOfTwo(index_length); block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X); int block_height = - phi::backends::gpu::RoundToPowerOfTwo(index_length * batch_size) / - block_width; + backends::gpu::RoundToPowerOfTwo(index_length * batch_size) / block_width; block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width); dim3 block_dim(block_width, block_height); dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x, (batch_size + block_dim.y - 1) / block_dim.y); - phi::backends::gpu::LimitGridDim(dev_ctx, &grid_dim); + backends::gpu::LimitGridDim(dev_ctx, &grid_dim); // choose the element index type ; uint32 or int64 based on the tensor size bool use_uint32 = true; if (x.numel() > UINT32_MAX || out->numel() > UINT32_MAX) { diff --git a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu index c1a3968650b2a0..c34b218c474b8b 100644 --- a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu @@ -28,8 +28,6 @@ COMMON_DECLARE_bool(cudnn_deterministic); namespace phi { -using phi::PADDLE_CUDA_NUM_THREADS; - template __global__ void index_select_grad_cuda_kernel(const T* output_grad, T* input_grad, @@ -131,7 +129,7 @@ void IndexSelectGradKernel(const Context& dev_ctx, unsigned int block_dim = PADDLE_CUDA_NUM_THREADS; dim3 grid_dim = dim3((num_columns + block_dim - 1) / block_dim); - 
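// The launch shape used by these index kernels is the usual one-thread-per-
// element ceil-divide, with the grid then clamped to the device limit; a
// generic sketch of that sizing (variable names illustrative):
//   unsigned int block = PADDLE_CUDA_NUM_THREADS;   // fixed block size
//   dim3 grid((n + block - 1) / block);             // ceil(n / block) blocks
//   backends::gpu::LimitGridDim(dev_ctx, &grid);    // cap at the device max grid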
phi::backends::gpu::LimitGridDim(dev_ctx, &grid_dim); + backends::gpu::LimitGridDim(dev_ctx, &grid_dim); if (index_type == DataType::INT64) { const int64_t* index_data = index.data(); @@ -158,7 +156,7 @@ void IndexSelectGradKernel(const Context& dev_ctx, } else { unsigned int block_dim = PADDLE_CUDA_NUM_THREADS; dim3 grid_dim = dim3((out_nums + block_dim - 1) / block_dim); - phi::backends::gpu::LimitGridDim(dev_ctx, &grid_dim); + backends::gpu::LimitGridDim(dev_ctx, &grid_dim); if (index_type == DataType::INT64) { const int64_t* index_data = index.data(); diff --git a/paddle/phi/kernels/gpu/index_select_impl.h b/paddle/phi/kernels/gpu/index_select_impl.h index 894df50a30cadb..db7d4c652eff98 100644 --- a/paddle/phi/kernels/gpu/index_select_impl.h +++ b/paddle/phi/kernels/gpu/index_select_impl.h @@ -22,8 +22,6 @@ namespace phi { -using phi::PADDLE_CUDA_NUM_THREADS; - template __global__ void index_select_cuda_kernel(const T* input, T* output, diff --git a/paddle/phi/kernels/gpu/index_select_kernel.cu b/paddle/phi/kernels/gpu/index_select_kernel.cu index f6cd62be6b3bf5..0282e6b74329a3 100644 --- a/paddle/phi/kernels/gpu/index_select_kernel.cu +++ b/paddle/phi/kernels/gpu/index_select_kernel.cu @@ -23,8 +23,6 @@ namespace phi { -using phi::PADDLE_CUDA_NUM_THREADS; - template void IndexSelectKernel(const Context& dev_ctx, const DenseTensor& x, @@ -64,7 +62,7 @@ void IndexSelectKernel(const Context& dev_ctx, unsigned int block_dim = PADDLE_CUDA_NUM_THREADS; dim3 grid_dim = dim3((numel + block_dim - 1) / block_dim); - phi::backends::gpu::LimitGridDim(dev_ctx, &grid_dim); + backends::gpu::LimitGridDim(dev_ctx, &grid_dim); if (index_type == DataType::INT64) { const int64_t* index_data = index.data(); diff --git a/paddle/phi/kernels/gpu/instance_norm_kernel.cu b/paddle/phi/kernels/gpu/instance_norm_kernel.cu index e1f2b01f2db254..16e058410db9f5 100644 --- a/paddle/phi/kernels/gpu/instance_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/instance_norm_kernel.cu @@ -35,7 +35,7 @@ void InstanceNormKernel(const Context &dev_ctx, DenseTensor *y, DenseTensor *saved_mean, DenseTensor *saved_variance) { - using AccT = typename phi::dtype::MPTypeTrait::Type; + using AccT = typename dtype::MPTypeTrait::Type; double epsilon = static_cast(epsilon_f); auto &x_dims = x.dims(); PADDLE_ENFORCE_GE(x_dims.size(), @@ -165,14 +165,14 @@ void InstanceNormKernel(const Context &dev_ctx, dev_ctx.template Alloc>(saved_mean); functor(dev_ctx, saved_mean, static_cast>(0)); } else { - saved_mean_tmp = phi::Full>( + saved_mean_tmp = Full>( dev_ctx, {NxC}, static_cast>(0)); } if (saved_variance) { dev_ctx.template Alloc>(saved_variance); functor(dev_ctx, saved_variance, static_cast>(0)); } else { - saved_variance_tmp = phi::Full>( + saved_variance_tmp = Full>( dev_ctx, {NxC}, static_cast>(0)); } auto *saved_mean_data = saved_mean diff --git a/paddle/phi/kernels/gpu/instance_norm_utils.h b/paddle/phi/kernels/gpu/instance_norm_utils.h index 1b2e00475dad05..6075329e58cb8e 100644 --- a/paddle/phi/kernels/gpu/instance_norm_utils.h +++ b/paddle/phi/kernels/gpu/instance_norm_utils.h @@ -25,7 +25,7 @@ namespace phi { template -using CudnnDataType = phi::backends::gpu::CudnnDataType; +using CudnnDataType = backends::gpu::CudnnDataType; template using BatchNormParamType = typename CudnnDataType::BatchNormParamType; @@ -45,7 +45,7 @@ static __global__ void add_param(const T *input, T *output, const int repeat_num, const int C) { - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; typedef 
cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage ou_storage; for (int i = blockIdx.x; i < C; i += gridDim.x) { diff --git a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu index ceeeb7113bf5a1..6d60a42da10b60 100644 --- a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu @@ -233,7 +233,7 @@ __inline__ __device__ T PartialBlockMin(T val, } } else { shared_last_val = std::numeric_limits::max(); - phi::CudaAtomicMin(&shared_last_val, val); + CudaAtomicMin(&shared_last_val, val); shared[wid] = shared_last_val; shared_last_idx = wid; } @@ -1264,7 +1264,7 @@ static void Interpolate1DCUDABwd( return; } - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; MT ratio_w = funcs::AreaPixelComputeScale(in_w, out_w, align_corners, scale_w); int64_t in_cw = c * in_w; @@ -1397,10 +1397,9 @@ static void Interpolate2DCUDABwd( return; } - using MT = - typename std::conditional_t::value, - float, - typename phi::dtype::MPTypeTrait::Type>; + using MT = typename std::conditional_t::value, + float, + typename dtype::MPTypeTrait::Type>; MT ratio_h = funcs::AreaPixelComputeScale(in_h, out_h, align_corners, scale_h); MT ratio_w = @@ -1671,10 +1670,9 @@ static void InterpolateAA2DCUDABwd( return; } - using MT = - typename std::conditional_t::value, - float, - typename phi::dtype::MPTypeTrait::Type>; + using MT = typename std::conditional_t::value, + float, + typename dtype::MPTypeTrait::Type>; MT ratio_h = funcs::AreaPixelComputeScale(in_h, out_h, align_corners, scale_h); MT ratio_w = @@ -1685,7 +1683,7 @@ static void InterpolateAA2DCUDABwd( // Lambda to launch AA interpolation backward kernel auto launch_aa_bw_kernel = [&](auto filter) { int device_id = dev_ctx.GetPlace().GetDeviceId(); - auto& gpu_props = phi::backends::gpu::GetDeviceProperties(device_id); + auto& gpu_props = backends::gpu::GetDeviceProperties(device_id); // Use AAInterpLaunchConfig to compute block/grid dimensions with dynamic // adjustment for shared memory limits @@ -1912,10 +1910,9 @@ static void Interpolate3DCUDABwd( return; } - using MT = - typename std::conditional_t::value, - float, - typename phi::dtype::MPTypeTrait::Type>; + using MT = typename std::conditional_t::value, + float, + typename dtype::MPTypeTrait::Type>; MT ratio_d = funcs::AreaPixelComputeScale(in_d, out_d, align_corners, scale_d); MT ratio_h = diff --git a/paddle/phi/kernels/gpu/interpolate_kernel.cu b/paddle/phi/kernels/gpu/interpolate_kernel.cu index cae1022d3891ef..4dbed5f545d216 100644 --- a/paddle/phi/kernels/gpu/interpolate_kernel.cu +++ b/paddle/phi/kernels/gpu/interpolate_kernel.cu @@ -1074,7 +1074,7 @@ static void Interpolate1DCUDAFwd( using MT = std::conditional_t::value, float, - typename phi::dtype::MPTypeTrait::Type>; + typename dtype::MPTypeTrait::Type>; MT ratio_w = funcs::AreaPixelComputeScale(in_w, out_w, align_corners, scale_w); @@ -1219,7 +1219,7 @@ static void Interpolate2DCUDAFwd( using MT = std::conditional_t::value, float, - typename phi::dtype::MPTypeTrait::Type>; + typename dtype::MPTypeTrait::Type>; MT ratio_h = funcs::AreaPixelComputeScale(in_h, out_h, align_corners, scale_h); MT ratio_w = @@ -1463,7 +1463,7 @@ static void InterpolateAA2DCUDAFwd( return; } - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; MT ratio_h = funcs::AreaPixelComputeScale(in_h, out_h, align_corners, scale_h); MT ratio_w = @@ -1483,7 +1483,7 
@@ static void InterpolateAA2DCUDAFwd( auto launch_aa_kernel = [&](auto filter) { int64_t nc = static_cast(n) * c; int device_id = dev_ctx.GetPlace().GetDeviceId(); - auto& gpu_props = phi::backends::gpu::GetDeviceProperties(device_id); + auto& gpu_props = backends::gpu::GetDeviceProperties(device_id); // Use AAInterpLaunchConfig to compute block/grid dimensions with dynamic // adjustment for shared memory limits @@ -1724,7 +1724,7 @@ static void Interpolate3DCUDAFwd( using MT = std::conditional_t::value, float, - typename phi::dtype::MPTypeTrait::Type>; + typename dtype::MPTypeTrait::Type>; MT ratio_d = funcs::AreaPixelComputeScale(in_d, out_d, align_corners, scale_d); MT ratio_h = diff --git a/paddle/phi/kernels/gpu/isfinite_kernel.cu b/paddle/phi/kernels/gpu/isfinite_kernel.cu index 7aad617adb6189..6492410d6d5f31 100644 --- a/paddle/phi/kernels/gpu/isfinite_kernel.cu +++ b/paddle/phi/kernels/gpu/isfinite_kernel.cu @@ -72,21 +72,21 @@ INSTANTIATE_ISFINITE_KERNEL_Isnan(float, GPUContext); INSTANTIATE_ISFINITE_KERNEL_Isnan(double, GPUContext); INSTANTIATE_ISFINITE_KERNEL_Isnan(int, GPUContext); INSTANTIATE_ISFINITE_KERNEL_Isnan(int64_t, GPUContext); -INSTANTIATE_ISFINITE_KERNEL_Isnan(phi::float16, GPUContext); -INSTANTIATE_ISFINITE_KERNEL_Isnan(phi::bfloat16, GPUContext); +INSTANTIATE_ISFINITE_KERNEL_Isnan(float16, GPUContext); +INSTANTIATE_ISFINITE_KERNEL_Isnan(bfloat16, GPUContext); INSTANTIATE_ISFINITE_KERNEL_Isinf(float, GPUContext); INSTANTIATE_ISFINITE_KERNEL_Isinf(double, GPUContext); INSTANTIATE_ISFINITE_KERNEL_Isinf(int, GPUContext); INSTANTIATE_ISFINITE_KERNEL_Isinf(int64_t, GPUContext); -INSTANTIATE_ISFINITE_KERNEL_Isinf(phi::float16, GPUContext); -INSTANTIATE_ISFINITE_KERNEL_Isinf(phi::bfloat16, GPUContext); +INSTANTIATE_ISFINITE_KERNEL_Isinf(float16, GPUContext); +INSTANTIATE_ISFINITE_KERNEL_Isinf(bfloat16, GPUContext); INSTANTIATE_ISFINITE_KERNEL_Isfinite(float, GPUContext); INSTANTIATE_ISFINITE_KERNEL_Isfinite(double, GPUContext); INSTANTIATE_ISFINITE_KERNEL_Isfinite(int, GPUContext); INSTANTIATE_ISFINITE_KERNEL_Isfinite(int64_t, GPUContext); -INSTANTIATE_ISFINITE_KERNEL_Isfinite(phi::float16, GPUContext); -INSTANTIATE_ISFINITE_KERNEL_Isfinite(phi::bfloat16, GPUContext); +INSTANTIATE_ISFINITE_KERNEL_Isfinite(float16, GPUContext); +INSTANTIATE_ISFINITE_KERNEL_Isfinite(bfloat16, GPUContext); } // namespace phi #endif diff --git a/paddle/phi/kernels/gpu/label_smooth_grad_kernel.cu b/paddle/phi/kernels/gpu/label_smooth_grad_kernel.cu index 6ca70f7a82c044..3bc09ffb8fb47d 100644 --- a/paddle/phi/kernels/gpu/label_smooth_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/label_smooth_grad_kernel.cu @@ -21,7 +21,7 @@ namespace phi { template struct LabelSmoothGradFunctor { - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; MPType epsilon; __forceinline__ LabelSmoothGradFunctor(float epsilon_data) { diff --git a/paddle/phi/kernels/gpu/label_smooth_kernel.cu b/paddle/phi/kernels/gpu/label_smooth_kernel.cu index 6d5d6b2c770f57..31a7687eaa2408 100644 --- a/paddle/phi/kernels/gpu/label_smooth_kernel.cu +++ b/paddle/phi/kernels/gpu/label_smooth_kernel.cu @@ -24,7 +24,7 @@ namespace phi { template struct LabelSmoothFunctor { - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; MPType epsilon; MPType label_dim; @@ -48,7 +48,7 @@ __global__ void LabelSmoothRunDistKernel(const int64_t N, const T* src, const T* dist_data, T* dst) { - using MPType = typename phi::dtype::MPTypeTrait::Type; 
+ using MPType = typename dtype::MPTypeTrait::Type; CUDA_KERNEL_LOOP_TYPE(idx, N, int64_t) { int64_t dist_idx = idx % dist_numel; dst[idx] = diff --git a/paddle/phi/kernels/gpu/lars_momentum_kernel.cu b/paddle/phi/kernels/gpu/lars_momentum_kernel.cu index d1da08cc8ad71b..782c2120464cee 100644 --- a/paddle/phi/kernels/gpu/lars_momentum_kernel.cu +++ b/paddle/phi/kernels/gpu/lars_momentum_kernel.cu @@ -32,7 +32,7 @@ namespace phi { template -using MultiPrecisionType = typename phi::dtype::MPTypeTrait::Type; +using MultiPrecisionType = typename dtype::MPTypeTrait::Type; __device__ __forceinline__ float Sqrt(float x) { return sqrtf(x); } __device__ __forceinline__ double Sqrt(double x) { return sqrt(x); } diff --git a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu index 5f6cb84cb6377d..f3d4fefa011d0c 100644 --- a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu @@ -188,7 +188,7 @@ void LayerNormGradKernel(const Context& dev_ctx, epsilon); \ } while (0) - auto compute_dtype = phi::CppTypeToDataType::Type(); + auto compute_dtype = CppTypeToDataType::Type(); auto kernel_variant = LayerNormGradKernelDispatch(scale_bias_dtype, x_dtype, x_dtype, diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu index 1e0c2c92a3766c..f1285732c95ecb 100644 --- a/paddle/phi/kernels/gpu/layer_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/layer_norm_kernel.cu @@ -397,7 +397,7 @@ void LaunchLayerNormKernel(const Context& dev_ctx, addr = valid_bias ? (addr | reinterpret_cast(void_bias_data)) : addr; data_vec_size = - std::min(4, phi::GetVectorizedSize(reinterpret_cast(addr))); + std::min(4, GetVectorizedSize(reinterpret_cast(addr))); } else { uint64_t bias_addr = reinterpret_cast(void_bias_data); uint64_t attr_addr = valid_scale @@ -406,9 +406,9 @@ void LaunchLayerNormKernel(const Context& dev_ctx, attr_addr = valid_bias ? (valid_scale ? 
(attr_addr | bias_addr) : attr_addr) : attr_addr; - data_vec_size = std::min( - phi::GetVectorizedSize(reinterpret_cast(addr)), - phi::GetVectorizedSize(reinterpret_cast(attr_addr))); + data_vec_size = + std::min(GetVectorizedSize(reinterpret_cast(addr)), + GetVectorizedSize(reinterpret_cast(attr_addr))); data_vec_size = std::min(4, data_vec_size); } } @@ -582,7 +582,7 @@ void LayerNormKernel(const Context& dev_ctx, bool is_scale_bias_same_dtype_with_x = x_dtype == scale_bias_dtype; if (!is_scale_bias_same_dtype_with_x) { PADDLE_ENFORCE_EQ(scale_bias_dtype, - phi::CppTypeToDataType::Type(), + CppTypeToDataType::Type(), common::errors::InvalidArgument( "Unsupported data type of Scale and Bias")); } @@ -655,7 +655,7 @@ void LayerNormKernel(const Context& dev_ctx, PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD_BASE(ScaleT, 1792); \ PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD_BASE(ScaleT, 2048); \ PADDLE_LAUNCH_FAST_LAYERNORM_V1_FWD_BASE(ScaleT, 4096) - auto compute_dtype = phi::CppTypeToDataType::Type(); + auto compute_dtype = CppTypeToDataType::Type(); auto kernel_variant = LayerNormKernelDispatch(scale_bias_dtype, x_dtype, y_dtype, @@ -761,7 +761,7 @@ template PADDLE_API void LayerNormKernel( DenseTensor* y, DenseTensor* mean, DenseTensor* var); -template PADDLE_API void LayerNormKernel( +template PADDLE_API void LayerNormKernel( const GPUContext& dev_ctx, const DenseTensor& x, const optional& scale_opt, diff --git a/paddle/phi/kernels/gpu/lerp_grad_kernel.cu b/paddle/phi/kernels/gpu/lerp_grad_kernel.cu index d260127bb4e4d9..a02eec51f73552 100644 --- a/paddle/phi/kernels/gpu/lerp_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/lerp_grad_kernel.cu @@ -42,7 +42,7 @@ __global__ void LerpGradKernelImpl(const T* weight, const int64_t out_size, const int64_t x_size, const int64_t y_size) { - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; CUDA_KERNEL_LOOP_TYPE(idx, out_size, int64_t) { MPType temp_dx = static_cast(weight[idx]) * static_cast(dout[idx]); @@ -179,8 +179,7 @@ void SwitchKernel(const Context& dev_ctx, const int64_t out_size = out_grad.numel(); const int64_t weight_size = weight.numel(); - auto gpu_config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_size); + auto gpu_config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_size); if (weight.dtype() == DataType::FLOAT64) { const double* weight_data = weight.data(); @@ -242,14 +241,13 @@ void SwitchKernel(const Context& dev_ctx, DenseTensor b_out = EmptyLike(dev_ctx, out_grad); std::vector out_tensors = {&b_weight, &b_out}; - phi::BroadcastTensorsKernel(dev_ctx, in_tensors, out_tensors); + BroadcastTensorsKernel(dev_ctx, in_tensors, out_tensors); const T* weight_data = b_weight.data(); const T* out_grad_data = b_out.data(); const int64_t out_size = out_grad.numel(); const int64_t weight_size = weight.numel(); - auto gpu_config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_size); + auto gpu_config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_size); if (FLAGS_use_accuracy_compatible_kernel) { LerpGradKernelCompatibleImpl<<( + SumKernel( dev_ctx, b_xgrad, reduce_axis_x, b_xgrad.dtype(), false, x_grad); } else { x_grad->ShareDataWith(b_xgrad); @@ -383,7 +381,7 @@ void LerpGradKernel(const Context& dev_ctx, b_ygrad.dims(), -1); if (!reduce_axis_y.empty()) { - phi::SumKernel( + SumKernel( dev_ctx, b_ygrad, reduce_axis_y, b_ygrad.dtype(), false, y_grad); } else { y_grad->ShareDataWith(b_ygrad); diff --git a/paddle/phi/kernels/gpu/lgamma_kernel.cu 
b/paddle/phi/kernels/gpu/lgamma_kernel.cu index 2e22d71e113498..3e3d6c89896891 100644 --- a/paddle/phi/kernels/gpu/lgamma_kernel.cu +++ b/paddle/phi/kernels/gpu/lgamma_kernel.cu @@ -23,7 +23,7 @@ namespace phi { template struct CudaLgammaFunctor { __device__ __forceinline__ T operator()(const T x) const { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; const MT mp_x = static_cast(x); return static_cast(Eigen::numext::lgamma(mp_x)); } diff --git a/paddle/phi/kernels/gpu/linear_v2_kernel.cu b/paddle/phi/kernels/gpu/linear_v2_kernel.cu index a37e9f63e20051..24782f0caeeffd 100644 --- a/paddle/phi/kernels/gpu/linear_v2_kernel.cu +++ b/paddle/phi/kernels/gpu/linear_v2_kernel.cu @@ -65,7 +65,7 @@ namespace phi { // Direct cublasLt matmul+bias, bypassing MatmulPlanner/DescriptorSetter/ // CublasLtBase. Uses persistent workspace from GPUContext. template -static void CublasLtMatmulBias(const phi::GPUContext& ctx, +static void CublasLtMatmulBias(const GPUContext& ctx, const T* x, const T* w, const T* bias, @@ -74,11 +74,11 @@ static void CublasLtMatmulBias(const phi::GPUContext& ctx, int64_t N, int64_t K, bool trans_w) { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; constexpr auto compute = std::is_same::value ? CUBLAS_COMPUTE_64F : CUBLAS_COMPUTE_32F; - const auto dtype = phi::backends::gpu::ToCudaDataType(); - const auto stype = phi::backends::gpu::ToCudaDataType(); + const auto dtype = backends::gpu::ToCudaDataType(); + const auto stype = backends::gpu::ToCudaDataType(); MT alpha = static_cast(1), beta = static_cast(0); auto lt = ctx.cublaslt_handle(); @@ -194,7 +194,7 @@ void LinearV2Kernel(const Context& dev_ctx, if (N > 1 && K > 1) { DenseTensor bias_processed; if (bias.numel() != N) { - phi::TileKernel(dev_ctx, bias, {N}, &bias_processed); + TileKernel(dev_ctx, bias, {N}, &bias_processed); } else { bias_processed = bias; } @@ -214,16 +214,16 @@ void LinearV2Kernel(const Context& dev_ctx, DenseTensor bias_processed = bias; if (bias.numel() != (M * N)) { bias_processed.Resize({1, bias.numel()}); - phi::TileKernel( + TileKernel( dev_ctx, bias_processed, {M, 1}, &bias_processed); } - phi::AddmmKernel(dev_ctx, - bias_processed, - input_processed, - weight_processed, - 1.0f, - 1.0f, - out); + AddmmKernel(dev_ctx, + bias_processed, + input_processed, + weight_processed, + 1.0f, + 1.0f, + out); } out->Resize(out_dim_original); } else // NOLINT diff --git a/paddle/phi/kernels/gpu/linspace_kernel.cu b/paddle/phi/kernels/gpu/linspace_kernel.cu index fc5f46c4b71d29..463759968080aa 100644 --- a/paddle/phi/kernels/gpu/linspace_kernel.cu +++ b/paddle/phi/kernels/gpu/linspace_kernel.cu @@ -72,9 +72,9 @@ T GetValueOfExpectedType(const Context& dev_ctx, const DenseTensor& x) { case DataType::INT64: return static_cast(GetValue(dev_ctx, x)); case DataType::FLOAT16: - return static_cast(GetValue(dev_ctx, x)); + return static_cast(GetValue(dev_ctx, x)); case DataType::BFLOAT16: - return static_cast(GetValue(dev_ctx, x)); + return static_cast(GetValue(dev_ctx, x)); case DataType::BOOL: return static_cast(GetValue(dev_ctx, x)); case DataType::INT16: diff --git a/paddle/phi/kernels/gpu/llm_int8_linear_kernel.cu b/paddle/phi/kernels/gpu/llm_int8_linear_kernel.cu index 27e51c678274f7..27f8002eaee0b3 100644 --- a/paddle/phi/kernels/gpu/llm_int8_linear_kernel.cu +++ b/paddle/phi/kernels/gpu/llm_int8_linear_kernel.cu @@ -55,7 +55,7 @@ void llm_int8_compute(const Context& dev_ctx, k, n); if (bias) { - 
phi::AddKernel(dev_ctx, *out, bias.get(), out); + AddKernel(dev_ctx, *out, bias.get(), out); } #else PADDLE_THROW(common::errors::Unimplemented( diff --git a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu index 4e7ca17d9967be..6b67aef49e744d 100644 --- a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu @@ -35,7 +35,7 @@ void LogSoftmaxGradKernel(const Context &dev_ctx, return; } if (out.numel() == 0) return; - phi::SoftmaxBackwardCUDAKernelDriver( + SoftmaxBackwardCUDAKernelDriver( dev_ctx, out, out_grad, axis, x_grad); } diff --git a/paddle/phi/kernels/gpu/log_softmax_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_kernel.cu index 41125bf29ed966..219eae7a82f913 100644 --- a/paddle/phi/kernels/gpu/log_softmax_kernel.cu +++ b/paddle/phi/kernels/gpu/log_softmax_kernel.cu @@ -35,7 +35,7 @@ void LogSoftmaxKernel(const Context &dev_ctx, funcs::set_constant(dev_ctx, out, static_cast(0.0)); return; } - phi::SoftmaxForwardCUDAKernelDriver(dev_ctx, x, axis, out); + SoftmaxForwardCUDAKernelDriver(dev_ctx, x, axis, out); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/logspace_kernel.cu b/paddle/phi/kernels/gpu/logspace_kernel.cu index b0ba0eb1fb740f..f6d088d04cd190 100644 --- a/paddle/phi/kernels/gpu/logspace_kernel.cu +++ b/paddle/phi/kernels/gpu/logspace_kernel.cu @@ -26,7 +26,7 @@ namespace phi { template __global__ void LogspaceKernelInner( T start, T stop, double step, T base, int64_t size, T* out) { - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; MPType mt_start = static_cast(start); MPType mt_stop = static_cast(stop); MPType mt_base = static_cast(base); @@ -50,7 +50,7 @@ __global__ void LogspaceKernelInner( template __global__ void LogspaceSpecialKernel(T start, T base, T* out) { - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; MPType mt_start = static_cast(start); MPType mt_base = static_cast(base); @@ -66,7 +66,7 @@ void LogspaceKernel(const Context& dev_ctx, const DenseTensor& base, DataType dtype, DenseTensor* out) { - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; auto start_t = funcs::TransDataType(dev_ctx, start, dtype); auto stop_t = funcs::TransDataType(dev_ctx, stop, dtype); diff --git a/paddle/phi/kernels/gpu/logsumexp_kernel.cu b/paddle/phi/kernels/gpu/logsumexp_kernel.cu index e064381044558f..0e1b19d62365b2 100644 --- a/paddle/phi/kernels/gpu/logsumexp_kernel.cu +++ b/paddle/phi/kernels/gpu/logsumexp_kernel.cu @@ -35,12 +35,12 @@ struct ComputeType { }; template <> -struct ComputeType { +struct ComputeType { using type = float; }; template <> -struct ComputeType { +struct ComputeType { using type = float; }; @@ -65,7 +65,7 @@ void LogsumexpFallbackKernel(const Context& dev_ctx, max_x.Resize(outdim); dev_ctx.template Alloc(&max_x); - phi::MaxKernel(dev_ctx, *in_x, axis_vec, false, &max_x); + MaxKernel(dev_ctx, *in_x, axis_vec, false, &max_x); max_x.Resize(keep_outdim); DenseTensor temp_x = Subtract(dev_ctx, *in_x, max_x); @@ -75,10 +75,10 @@ void LogsumexpFallbackKernel(const Context& dev_ctx, DenseTensor log_out; log_out.Resize(outdim); dev_ctx.template Alloc(&log_out); - phi::LogKernel(dev_ctx, *out_y, &log_out); + LogKernel(dev_ctx, *out_y, &log_out); log_out.Resize(outdim); out->Resize(outdim); - phi::AddKernel(dev_ctx, log_out, max_x, out); + AddKernel(dev_ctx, log_out, max_x, 
out); } template diff --git a/paddle/phi/kernels/gpu/lookup_table_grad_kernel.cu b/paddle/phi/kernels/gpu/lookup_table_grad_kernel.cu index 87fca7699f9ca1..1f98ecd207aabe 100644 --- a/paddle/phi/kernels/gpu/lookup_table_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/lookup_table_grad_kernel.cu @@ -150,13 +150,13 @@ void LookupTableSparseGradCUDAKernel( auto gpu_place = dev_ctx.GetPlace(); // TODO(yuyang18): Strange code here. - phi::MixVector mixv_new_rows(&new_rows); - phi::memory_utils::Copy(gpu_place, - mixv_new_rows.CUDAMutableData(dev_ctx.GetPlace()), - gpu_place, - ids_data, - ids_num * sizeof(int64_t), - stream); + MixVector mixv_new_rows(&new_rows); + memory_utils::Copy(gpu_place, + mixv_new_rows.CUDAMutableData(dev_ctx.GetPlace()), + gpu_place, + ids_data, + ids_num * sizeof(int64_t), + stream); mixv_new_rows.CopyToCPU(); d_table->set_rows(new_rows); @@ -178,12 +178,12 @@ void LookupTableSparseGradCUDAKernel( "output@Grad's shape = [%s].", d_table_value->dims(), d_output_dims_2d)); - phi::memory_utils::Copy(gpu_place, - d_table_data, - gpu_place, - d_output_data, - d_output->numel() * sizeof(T), - stream); + memory_utils::Copy(gpu_place, + d_table_data, + gpu_place, + d_output_data, + d_output->numel() * sizeof(T), + stream); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/lstsq_kernel.cu b/paddle/phi/kernels/gpu/lstsq_kernel.cu index 9f03f17dbb1679..c22098d1698c14 100644 --- a/paddle/phi/kernels/gpu/lstsq_kernel.cu +++ b/paddle/phi/kernels/gpu/lstsq_kernel.cu @@ -66,10 +66,10 @@ void LstsqKernel(const Context& dev_ctx, int max_mn = std::max(m, n); int k = min_mn; - int x_stride = phi::GetMatrixStride(x_dims); - int y_stride = phi::GetMatrixStride(y_dims); + int x_stride = GetMatrixStride(x_dims); + int y_stride = GetMatrixStride(y_dims); int tau_stride = min_mn; - int batch_count = phi::GetBatchCount(x_dims); + int batch_count = GetBatchCount(x_dims); T rcond = rcond_scalar.to(); @@ -123,14 +123,14 @@ void LstsqKernel(const Context& dev_ctx, DenseTensor res_r; res_r.Resize({batch_count, min_mn, min_mn}); dev_ctx.template Alloc(&res_r); - phi::TrilTriuKernel(dev_ctx, slice_r, 0, false, &res_r); + TrilTriuKernel(dev_ctx, slice_r, 0, false, &res_r); DenseTensor trans_y = TransposeLast2Dim(dev_ctx, tmp_y); DenseTensor slice_y = funcs::Slice(dev_ctx, trans_y, {-2}, {0}, {min_mn}); // Step 3, solve R X = Y - phi::TriangularSolveKernel( + TriangularSolveKernel( dev_ctx, res_r, slice_y, true, false, false, solution); } else { @@ -148,9 +148,9 @@ void LstsqKernel(const Context& dev_ctx, DenseTensor res_r; res_r.Resize({batch_count, min_mn, min_mn}); dev_ctx.template Alloc(&res_r); - phi::TrilTriuKernel(dev_ctx, slice_r, 0, false, &res_r); + TrilTriuKernel(dev_ctx, slice_r, 0, false, &res_r); - phi::TriangularSolveKernel( + TriangularSolveKernel( dev_ctx, res_r, new_y, true, true, false, solution); // Step 3, X <- Q Z @@ -168,7 +168,7 @@ void LstsqKernel(const Context& dev_ctx, DenseTensor trans_q = TransposeLast2Dim(dev_ctx, new_x); DenseTensor slice_q = funcs::Slice(dev_ctx, trans_q, {-1}, {0}, {m}); DenseTensor solu_tensor = - phi::Matmul(dev_ctx, slice_q, *solution, false, false); + Matmul(dev_ctx, slice_q, *solution, false, false); Copy(dev_ctx, solu_tensor, dev_ctx.GetPlace(), true, solution); } if (batch_count == 1) solution->Resize({n, nrhs}); diff --git a/paddle/phi/kernels/gpu/lu_kernel.cu b/paddle/phi/kernels/gpu/lu_kernel.cu index 6518be10b4e002..5c632aaeb9f904 100644 --- a/paddle/phi/kernels/gpu/lu_kernel.cu +++ b/paddle/phi/kernels/gpu/lu_kernel.cu @@ -258,10 +258,10 @@ 
void lu_decomposed_kernel(const Context& dev_ctx, int lwork; cusolver_bufferSize(cusolverH, m, n, d_A, lda, &lwork); - auto work_buff = phi::memory_utils::Alloc( - dev_ctx.GetPlace(), - lwork * sizeof(T), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + auto work_buff = + memory_utils::Alloc(dev_ctx.GetPlace(), + lwork * sizeof(T), + Stream(reinterpret_cast(dev_ctx.stream()))); T* d_work = reinterpret_cast(work_buff->ptr()); /* step 3: LU factorization */ diff --git a/paddle/phi/kernels/gpu/lu_solve_kernle.cu b/paddle/phi/kernels/gpu/lu_solve_kernle.cu index ca49202d3b0e60..7ac8446124314c 100644 --- a/paddle/phi/kernels/gpu/lu_solve_kernle.cu +++ b/paddle/phi/kernels/gpu/lu_solve_kernle.cu @@ -212,7 +212,7 @@ void LuSolveKernel(const Context& dev_ctx, DenseTensor* out) { dev_ctx.template Alloc(out); // Copy x to out since cusolverDn*getrs overwrites the input - *out = phi::Transpose2DTo6D(dev_ctx, b); + *out = Transpose2DTo6D(dev_ctx, b); DenseTensor tem_lu = Transpose2DTo6D(dev_ctx, lu); // Validate input dimensions auto x_dims = b.dims(); @@ -288,7 +288,7 @@ void LuSolveKernel(const Context& dev_ctx, d_info); #endif } - *out = phi::Transpose2DTo6D(dev_ctx, *out); + *out = Transpose2DTo6D(dev_ctx, *out); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/margin_cross_entropy_grad_kernel.cu b/paddle/phi/kernels/gpu/margin_cross_entropy_grad_kernel.cu index b581e6cd291262..3893bd959a639c 100644 --- a/paddle/phi/kernels/gpu/margin_cross_entropy_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/margin_cross_entropy_grad_kernel.cu @@ -27,7 +27,7 @@ __global__ void CalculateGrad(T* logits_grad, const int64_t N, const int64_t D, const int* class_interval_ptr) { - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; int start_index = class_interval_ptr[rank]; CUDA_KERNEL_LOOP(i, N * D) { auto row = i / D; diff --git a/paddle/phi/kernels/gpu/margin_cross_entropy_kernel.cu b/paddle/phi/kernels/gpu/margin_cross_entropy_kernel.cu index 45a896b43df55d..9d7c840fde170f 100644 --- a/paddle/phi/kernels/gpu/margin_cross_entropy_kernel.cu +++ b/paddle/phi/kernels/gpu/margin_cross_entropy_kernel.cu @@ -81,9 +81,8 @@ __global__ void LogitsMinusLogSumKernel(T* logits, const int64_t D) { CUDA_KERNEL_LOOP_TYPE(i, N * D, int64_t) { auto row = i / D; - logits[i] = - static_cast(logits[i]) - - static_cast(phi::kps::details::Log(logits_sum_per_row[row])); + logits[i] = static_cast(logits[i]) - + static_cast(kps::details::Log(logits_sum_per_row[row])); } } @@ -103,9 +102,9 @@ __global__ void HardLabelSoftmaxWithCrossEntropyKernel( if ((col + start_index) == labels[row]) { auto softmax = log_softmax[i]; loss[row] = -softmax; - log_softmax[i] = phi::kps::details::Exp(softmax); + log_softmax[i] = kps::details::Exp(softmax); } else { - log_softmax[i] = phi::kps::details::Exp(log_softmax[i]); + log_softmax[i] = kps::details::Exp(log_softmax[i]); } } } @@ -125,14 +124,14 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, DenseTensor* softmax, DenseTensor* loss) { const auto& place = dev_ctx.GetPlace(); // old code - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - phi::distributed::NCCLCommContext* comm_ctx = nullptr; + distributed::NCCLCommContext* comm_ctx = nullptr; gpuStream_t stream; if (nranks > 1) { - comm_ctx = static_cast( - dev_ctx.GetCommContext()); + comm_ctx = + static_cast(dev_ctx.GetCommContext()); PADDLE_ENFORCE_NE(comm_ctx, 
nullptr, common::errors::Unavailable( @@ -229,11 +228,11 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, dev_ctx.template Alloc(&logits_max); T* logits_max_buff = dev_ctx.template Alloc(&logits_max); - funcs::ReduceKernel>( + funcs::ReduceKernel>( static_cast(dev_ctx), softmax_2d, &logits_max, - phi::kps::IdentityFunctor(), + kps::IdentityFunctor(), {1}); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) @@ -252,11 +251,11 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, sum_exp_logits.Resize({N, 1}); dev_ctx.template Alloc(&sum_exp_logits); T* sum_exp_logits_buff = dev_ctx.template Alloc(&sum_exp_logits); - funcs::ReduceKernel>( + funcs::ReduceKernel>( static_cast(dev_ctx), softmax_2d, &sum_exp_logits, - phi::kps::ExpFunctor(), + kps::ExpFunctor(), {1}); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) diff --git a/paddle/phi/kernels/gpu/masked_fill_grad_kernel.cu b/paddle/phi/kernels/gpu/masked_fill_grad_kernel.cu index e41630a139ba1d..923fc15462589c 100644 --- a/paddle/phi/kernels/gpu/masked_fill_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/masked_fill_grad_kernel.cu @@ -133,16 +133,15 @@ __global__ void GPUMaskedFillGradKernel(const T* out_grad, } template -void DispatchMaskFillGradKernel( - const GPUContext& dev_ctx, - const T* input, - const bool* mask, - const int64_t input_len, - const int64_t batch_size, - T* x_grad, - T* value_grad, - int vec_size, - const phi::backends::gpu::GpuLaunchConfig& config) { +void DispatchMaskFillGradKernel(const GPUContext& dev_ctx, + const T* input, + const bool* mask, + const int64_t input_len, + const int64_t batch_size, + T* x_grad, + T* value_grad, + int vec_size, + const backends::gpu::GpuLaunchConfig& config) { auto stream = dev_ctx.stream(); if (x_grad && value_grad) { switch (vec_size) { @@ -207,7 +206,7 @@ void DispatchMaskFillOneValueGradKernel( const int64_t batch_size, T* x_grad, int vec_size, - const phi::backends::gpu::GpuLaunchConfig& config) { + const backends::gpu::GpuLaunchConfig& config) { auto stream = dev_ctx.stream(); if (x_grad) { switch (vec_size) { @@ -246,15 +245,15 @@ void GPUMaskedFillGrad(const GPUContext& dev_ctx, int64_t batch_size = input_len / mask_len; int vec_size = 8; - vec_size = std::min(phi::GetVectorizedSize(out_grad_data), vec_size); + vec_size = std::min(GetVectorizedSize(out_grad_data), vec_size); if (x_grad && x_grad->initialized()) { x_grad_data = x_grad->data(); - vec_size = std::min(phi::GetVectorizedSize(x_grad_data), vec_size); + vec_size = std::min(GetVectorizedSize(x_grad_data), vec_size); } if (value_grad && value_grad->initialized()) { value_grad_data = value_grad->data(); - vec_size = std::min(phi::GetVectorizedSize(value_grad_data), vec_size); + vec_size = std::min(GetVectorizedSize(value_grad_data), vec_size); } while (vec_size > 1 && batch_size % vec_size != 0) { @@ -262,7 +261,7 @@ void GPUMaskedFillGrad(const GPUContext& dev_ctx, } auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, input_len, vec_size); + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, input_len, vec_size); if (value_grad && value_grad->numel() == 1) { DispatchMaskFillOneValueGradKernel(dev_ctx, diff --git a/paddle/phi/kernels/gpu/masked_fill_kernel.cu b/paddle/phi/kernels/gpu/masked_fill_kernel.cu index 01ba03cec83158..0d53a158717eea 100644 --- a/paddle/phi/kernels/gpu/masked_fill_kernel.cu +++ b/paddle/phi/kernels/gpu/masked_fill_kernel.cu @@ -98,7 +98,7 @@ void DispatchMaskFillKernel(const GPUContext& dev_ctx, const int64_t batch_size, T* output, int vec_size, - const 
phi::backends::gpu::GpuLaunchConfig& config) { + const backends::gpu::GpuLaunchConfig& config) { auto stream = dev_ctx.stream(); switch (vec_size) { #define CASE_VECSIZE(__Vs) \ @@ -128,7 +128,7 @@ void DispatchMaskFillOneValueKernel( const int64_t batch_size, T* output, int vec_size, - const phi::backends::gpu::GpuLaunchConfig& config) { + const backends::gpu::GpuLaunchConfig& config) { auto stream = dev_ctx.stream(); switch (vec_size) { #define CASE_VECSIZE(__Vs) \ @@ -164,14 +164,14 @@ void GPUMaskedFill(const GPUContext& dev_ctx, int64_t batch_size = input_len / mask_len; int vec_size = 8; - vec_size = std::min(phi::GetVectorizedSize(input_data), vec_size); - vec_size = std::min(phi::GetVectorizedSize(output_data), vec_size); + vec_size = std::min(GetVectorizedSize(input_data), vec_size); + vec_size = std::min(GetVectorizedSize(output_data), vec_size); while (vec_size > 1 && batch_size % vec_size != 0) { vec_size /= 2; } auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, input_len, vec_size); + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, input_len, vec_size); if (value.numel() == 1) { DispatchMaskFillOneValueKernel(dev_ctx, @@ -227,7 +227,7 @@ void MaskedFillKernel(const Context& dev_ctx, DenseTensor value_expand = value; if (value.numel() != 1 && value.dims() != expanded_dims) { - phi::ExpandKernel( + ExpandKernel( dev_ctx, value, IntArray(expanded_size), &value_expand); } @@ -240,15 +240,14 @@ void MaskedFillKernel(const Context& dev_ctx, DenseTensor x_expand; if (mask.dims() != expanded_dims) { - phi::ExpandKernel( + ExpandKernel( dev_ctx, mask, IntArray(expanded_size), &mask_expand); } else { mask_expand = mask; } if (x.dims() != expanded_dims) { - phi::ExpandKernel( - dev_ctx, x, IntArray(expanded_size), &x_expand); + ExpandKernel(dev_ctx, x, IntArray(expanded_size), &x_expand); } else { x_expand = x; } diff --git a/paddle/phi/kernels/gpu/masked_select_kernel.cu b/paddle/phi/kernels/gpu/masked_select_kernel.cu index 1095c7e7fbd77a..6f75c6d20b5a3d 100644 --- a/paddle/phi/kernels/gpu/masked_select_kernel.cu +++ b/paddle/phi/kernels/gpu/masked_select_kernel.cu @@ -65,15 +65,14 @@ void MaskedSelectKernel(const Context& dev_ctx, DDim expand_dims = make_ddim(expanded_size); if (mask.dims() != expand_dims) { - phi::ExpandKernel( + ExpandKernel( dev_ctx, mask, IntArray(expanded_size), &mask_expand); } else { mask_expand = mask; } if (x.dims() != expand_dims) { - phi::ExpandKernel( - dev_ctx, x, IntArray(expanded_size), &x_expand); + ExpandKernel(dev_ctx, x, IntArray(expanded_size), &x_expand); } else { x_expand = x; } diff --git a/paddle/phi/kernels/gpu/matrix_rank_kernel.cu b/paddle/phi/kernels/gpu/matrix_rank_kernel.cu index 1b1860cad84a09..86c1f002b81d6c 100644 --- a/paddle/phi/kernels/gpu/matrix_rank_kernel.cu +++ b/paddle/phi/kernels/gpu/matrix_rank_kernel.cu @@ -32,9 +32,9 @@ void MatrixRankKernel(const Context& dev_ctx, DenseTensor* out) { DenseTensor atol_tensor; if (use_default_tol) { - atol_tensor = phi::Full(dev_ctx, {1}, static_cast(0)); + atol_tensor = Full(dev_ctx, {1}, static_cast(0)); } else { - atol_tensor = phi::Full(dev_ctx, {1}, static_cast(tol)); + atol_tensor = Full(dev_ctx, {1}, static_cast(tol)); } MatrixRankTolKernel( dev_ctx, x, atol_tensor, use_default_tol, hermitian, out); diff --git a/paddle/phi/kernels/gpu/median_grad_kernel.cu b/paddle/phi/kernels/gpu/median_grad_kernel.cu index e73da173d546df..7b29d90098ac9d 100644 --- a/paddle/phi/kernels/gpu/median_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/median_grad_kernel.cu @@ -26,7 +26,6 @@ 
namespace phi { -using phi::PADDLE_CUDA_NUM_THREADS; inline int GET_BLOCKS(const int N) { return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; } diff --git a/paddle/phi/kernels/gpu/median_kernel.cu b/paddle/phi/kernels/gpu/median_kernel.cu index dc04c0633570cb..53731f3e1cc984 100644 --- a/paddle/phi/kernels/gpu/median_kernel.cu +++ b/paddle/phi/kernels/gpu/median_kernel.cu @@ -64,7 +64,7 @@ __global__ void KernelNanCounts(const T* input, } int len = stride > blockDim.x ? blockDim.x : stride; - num = phi::backends::gpu::reduceSum(num, tx, len); + num = backends::gpu::reduceSum(num, tx, len); if (tx == 0) { nan_counts[j] = num; } @@ -278,8 +278,7 @@ void ProcessMedianKernel(const Context& dev_ctx, grid_size = std::min(grid_size, max_grid_dim); KernelNanCounts<<>>( x_data, numel, pre_dim, stride, nan_counts_ptr, nan_indices_ptr); - auto nan_stat_mem_cpu = - phi::memory_utils::Alloc(CPUPlace(), sizeof(int64_t) * 2); + auto nan_stat_mem_cpu = memory_utils::Alloc(CPUPlace(), sizeof(int64_t) * 2); int64_t* nan_stat_cpu_ptr = reinterpret_cast(nan_stat_mem_cpu->ptr()); int64_t sum = @@ -324,7 +323,7 @@ void ProcessMedianKernel(const Context& dev_ctx, dev_ctx, x, Scalar(sort_k), -1, false, true, &sort_out, &sort_indices); T div_factor = static_cast(2.0); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, pre_dim); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, pre_dim); if (ignore_nan) { if (mode == "avg") { CalcNanmedianMeanKernel diff --git a/paddle/phi/kernels/gpu/miopen_lstm_cache.h b/paddle/phi/kernels/gpu/miopen_lstm_cache.h index 762f603f75989b..1c8f878f3a6b11 100644 --- a/paddle/phi/kernels/gpu/miopen_lstm_cache.h +++ b/paddle/phi/kernels/gpu/miopen_lstm_cache.h @@ -49,13 +49,13 @@ class ScopedRNNBase { template void Create(const miopenHandle_t& handle, - const phi::Place& place, + const Place& place, const std::vector& sequence_length, size_t* workspace_size, size_t* reserve_size, DenseTensor* dropout_state) { int numDirections = is_bidirec_ ? 
2 : 1; - miopenDataType_t miopen_type = phi::backends::gpu::CudnnDataType::type; + miopenDataType_t miopen_type = backends::gpu::CudnnDataType::type; // ------------------- miopen x, y descriptors --------------------- std::vector dims_x = {batch_size_, input_size_, 1}; @@ -155,15 +155,15 @@ class ScopedRNNBase { std::vector x_descs_; std::vector y_descs_; - phi::backends::gpu::ScopedTensorDescriptor x_desc_; - phi::backends::gpu::ScopedTensorDescriptor y_desc_; - phi::backends::gpu::ScopedTensorDescriptor init_h_desc_; - phi::backends::gpu::ScopedTensorDescriptor init_c_desc_; - phi::backends::gpu::ScopedTensorDescriptor last_h_desc_; - phi::backends::gpu::ScopedTensorDescriptor last_c_desc_; - phi::backends::gpu::ScopedDropoutDescriptor dropout_desc_; - phi::backends::gpu::ScopedFilterDescriptor weight_desc_; - phi::backends::gpu::ScopedRNNDescriptor rnn_desc_; + backends::gpu::ScopedTensorDescriptor x_desc_; + backends::gpu::ScopedTensorDescriptor y_desc_; + backends::gpu::ScopedTensorDescriptor init_h_desc_; + backends::gpu::ScopedTensorDescriptor init_c_desc_; + backends::gpu::ScopedTensorDescriptor last_h_desc_; + backends::gpu::ScopedTensorDescriptor last_c_desc_; + backends::gpu::ScopedDropoutDescriptor dropout_desc_; + backends::gpu::ScopedFilterDescriptor weight_desc_; + backends::gpu::ScopedRNNDescriptor rnn_desc_; }; } // namespace phi diff --git a/paddle/phi/kernels/gpu/moe_permute_kernel.cu b/paddle/phi/kernels/gpu/moe_permute_kernel.cu index 070ab1f984cb0b..9c0d0dacb56fcc 100644 --- a/paddle/phi/kernels/gpu/moe_permute_kernel.cu +++ b/paddle/phi/kernels/gpu/moe_permute_kernel.cu @@ -387,19 +387,19 @@ template -void launch_permute_kernel(const phi::GPUContext &dev_ctx, - const phi::DenseTensor &X, - const phi::DenseTensor &expert_routemap_topk, - const phi::DenseTensor &expert_prob_topk, - const paddle::optional &XScale, - const phi::DenseTensor &expert_offsets, - const phi::DenseTensor &expert_offset_end, - phi::DenseTensor *X_unzipped, - phi::DenseTensor *zipped_expertwise_rowmap, - phi::DenseTensor *token_prob_unzipped, - phi::DenseTensor *XScale_unzipped, - phi::DenseTensor *global_expertwise_block_cumsum, - phi::DenseTensor *expert_indices, +void launch_permute_kernel(const GPUContext &dev_ctx, + const DenseTensor &X, + const DenseTensor &expert_routemap_topk, + const DenseTensor &expert_prob_topk, + const paddle::optional &XScale, + const DenseTensor &expert_offsets, + const DenseTensor &expert_offset_end, + DenseTensor *X_unzipped, + DenseTensor *zipped_expertwise_rowmap, + DenseTensor *token_prob_unzipped, + DenseTensor *XScale_unzipped, + DenseTensor *global_expertwise_block_cumsum, + DenseTensor *expert_indices, int total_zipped_tokens_num, int token_length, int scale_length, @@ -489,18 +489,18 @@ void launch_permute_kernel(const phi::GPUContext &dev_ctx, // ============================================================================ template void dispatch_permute_kernel(const Context &dev_ctx, - const phi::DenseTensor &X, - const phi::DenseTensor &expert_routemap_topk, - const phi::DenseTensor &expert_prob_topk, - const paddle::optional &XScale, - const phi::DenseTensor &expert_offsets, - const phi::DenseTensor &expert_offset_end, - phi::DenseTensor *X_unzipped, - phi::DenseTensor *zipped_expertwise_rowmap, - phi::DenseTensor *token_prob_unzipped, - phi::DenseTensor *XScale_unzipped, - phi::DenseTensor *global_expertwise_block_cumsum, - phi::DenseTensor *expert_indices, + const DenseTensor &X, + const DenseTensor &expert_routemap_topk, + const DenseTensor 
&expert_prob_topk, + const paddle::optional &XScale, + const DenseTensor &expert_offsets, + const DenseTensor &expert_offset_end, + DenseTensor *X_unzipped, + DenseTensor *zipped_expertwise_rowmap, + DenseTensor *token_prob_unzipped, + DenseTensor *XScale_unzipped, + DenseTensor *global_expertwise_block_cumsum, + DenseTensor *expert_indices, int total_zipped_tokens_num, int token_length, int topk, diff --git a/paddle/phi/kernels/gpu/moe_unpermute_kernel.cu b/paddle/phi/kernels/gpu/moe_unpermute_kernel.cu index 4c6daf37213a0a..e8ea5f4700f71e 100644 --- a/paddle/phi/kernels/gpu/moe_unpermute_kernel.cu +++ b/paddle/phi/kernels/gpu/moe_unpermute_kernel.cu @@ -24,11 +24,11 @@ using moe::kMaxNumExperts; template __global__ __launch_bounds__(256) void tokens_zip_kernel( - const phi::bfloat16 *__restrict__ unzipped_tokens_in, + const bfloat16 *__restrict__ unzipped_tokens_in, const int *__restrict__ zipped_expertwise_rowmap, const int *__restrict__ expert_routemap_topk, const float *__restrict__ unzipped_token_probs, - phi::bfloat16 *__restrict__ zipped_tokens_out, + bfloat16 *__restrict__ zipped_tokens_out, float *__restrict__ zipped_probs_topk, const int total_zipped_tokens_num, const int token_length, @@ -205,11 +205,11 @@ void dispatch_tokens_zip(const Context &dev_ctx, tokens_zip_kernel <<>>( - unzipped_tokens.data(), + unzipped_tokens.data(), zipped_expertwise_rowmap.data(), expert_routemap_topk.data(), unzipped_token_probs.data(), - zipped_tokens->data(), + zipped_tokens->data(), zipped_probs_topk->data(), total_zipped_tokens_num, token_length, diff --git a/paddle/phi/kernels/gpu/multinomial_kernel.cu b/paddle/phi/kernels/gpu/multinomial_kernel.cu index 050301d6a9a1d2..8667e734cf0192 100644 --- a/paddle/phi/kernels/gpu/multinomial_kernel.cu +++ b/paddle/phi/kernels/gpu/multinomial_kernel.cu @@ -137,7 +137,7 @@ void MultinomialKernel(const Context& dev_ctx, const Scalar& num_samples, bool replacement, DenseTensor* out) { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; auto int_num_samples = num_samples.to(); auto* in_data = x.data(); @@ -236,7 +236,7 @@ void MultinomialKernel(const Context& dev_ctx, // Sample the multinomial distributions. 
dim3 block(128); int64_t device_id = dev_ctx.GetPlace().GetDeviceId(); - const auto& prop = phi::backends::gpu::GetDeviceProperties(device_id); + const auto& prop = backends::gpu::GetDeviceProperties(device_id); int grid_y = std::min(num_distributions, prop.maxGridSize[1]); dim3 grid((int_num_samples - 1) / block.x + 1, grid_y); diff --git a/paddle/phi/kernels/gpu/mv_grad_kernel.cu b/paddle/phi/kernels/gpu/mv_grad_kernel.cu index f729b58b1b2c02..b0cfb063e96a9a 100644 --- a/paddle/phi/kernels/gpu/mv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/mv_grad_kernel.cu @@ -63,7 +63,7 @@ void MvGradKernel(const Context &dev_ctx, auto blas = funcs::GetBlas(dev_ctx); auto stream = dev_ctx.stream(); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, m * n); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, m * n); if (dx) { T *dx_data = dev_ctx.template Alloc(dx); diff --git a/paddle/phi/kernels/gpu/nadam_kernel.cu b/paddle/phi/kernels/gpu/nadam_kernel.cu index 59917044e7a40e..bf6ccc76eefd97 100644 --- a/paddle/phi/kernels/gpu/nadam_kernel.cu +++ b/paddle/phi/kernels/gpu/nadam_kernel.cu @@ -127,7 +127,7 @@ void NAdamKernel(const Context& dev_ctx, DenseTensor* moment1_out, DenseTensor* moment2_out, DenseTensor* master_param_out) { - using MT = typename phi::dtype::template MPTypeTrait::Type; + using MT = typename dtype::template MPTypeTrait::Type; T* param_out_data = dev_ctx.template Alloc(param_out); MT* momentum_decay_pow_out_data = diff --git a/paddle/phi/kernels/gpu/nanmedian_grad_kernel.cu b/paddle/phi/kernels/gpu/nanmedian_grad_kernel.cu index 2b6f2a665a2ccf..82da19cb688758 100644 --- a/paddle/phi/kernels/gpu/nanmedian_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/nanmedian_grad_kernel.cu @@ -25,7 +25,6 @@ namespace phi { -using phi::PADDLE_CUDA_NUM_THREADS; inline int GET_BLOCKS(const int N) { return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; } diff --git a/paddle/phi/kernels/gpu/nanmedian_kernel.cu b/paddle/phi/kernels/gpu/nanmedian_kernel.cu index 4932f4735e1e83..2fee4aa382f7f9 100644 --- a/paddle/phi/kernels/gpu/nanmedian_kernel.cu +++ b/paddle/phi/kernels/gpu/nanmedian_kernel.cu @@ -64,7 +64,7 @@ __global__ void KernelNanCounts(const T* input, } int len = stride > blockDim.x ? 
blockDim.x : stride; - num = phi::backends::gpu::reduceSum(num, tx, len); + num = backends::gpu::reduceSum(num, tx, len); if (tx == 0) { nan_counts[j] = num; } @@ -278,8 +278,7 @@ void ProcessMedianKernel(const Context& dev_ctx, grid_size = std::min(grid_size, max_grid_dim); KernelNanCounts<<>>( x_data, numel, pre_dim, stride, nan_counts_ptr, nan_indices_ptr); - auto nan_stat_mem_cpu = - phi::memory_utils::Alloc(CPUPlace(), sizeof(int64_t) * 2); + auto nan_stat_mem_cpu = memory_utils::Alloc(CPUPlace(), sizeof(int64_t) * 2); int64_t* nan_stat_cpu_ptr = reinterpret_cast(nan_stat_mem_cpu->ptr()); int64_t sum = @@ -324,7 +323,7 @@ void ProcessMedianKernel(const Context& dev_ctx, dev_ctx, x, Scalar(sort_k), -1, false, true, &sort_out, &sort_indices); T div_factor = static_cast(2.0); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, pre_dim); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, pre_dim); if (ignore_nan) { if (mode == "avg") { CalcNanmedianMeanKernel diff --git a/paddle/phi/kernels/gpu/nll_loss_kernel.cu b/paddle/phi/kernels/gpu/nll_loss_kernel.cu index 391a4bbebe7af7..f33d593dfb37d5 100644 --- a/paddle/phi/kernels/gpu/nll_loss_kernel.cu +++ b/paddle/phi/kernels/gpu/nll_loss_kernel.cu @@ -48,7 +48,7 @@ void NllLossRawKernel(const Context& dev_ctx, auto batch_size = x_dims[0]; auto n_classes = x_dims[1]; int size_average = static_cast(reduction == "mean"); - using AccT = typename phi::dtype::MPTypeTrait::Type; + using AccT = typename dtype::MPTypeTrait::Type; if (x_dims.size() == 2) { int64_t blocks = NumBlocks(batch_size); int threads = kNumCUDAThreads; diff --git a/paddle/phi/kernels/gpu/nms_kernel.cu b/paddle/phi/kernels/gpu/nms_kernel.cu index a23e0b8dd4709f..2a09c0f3658c42 100644 --- a/paddle/phi/kernels/gpu/nms_kernel.cu +++ b/paddle/phi/kernels/gpu/nms_kernel.cu @@ -75,10 +75,10 @@ void NMSKernel(const Context& dev_ctx, const auto blocks_per_line = CeilDivide(num_boxes, threadsPerBlock); dim3 block(threadsPerBlock); dim3 grid(blocks_per_line, blocks_per_line); - auto mask_data = phi::memory_utils::Alloc( - dev_ctx.GetPlace(), - num_boxes * blocks_per_line * sizeof(uint64_t), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + auto mask_data = + memory_utils::Alloc(dev_ctx.GetPlace(), + num_boxes * blocks_per_line * sizeof(uint64_t), + Stream(reinterpret_cast(dev_ctx.stream()))); uint64_t* mask_dev = reinterpret_cast(mask_data->ptr()); NMS<<>>( boxes.data(), threshold, num_boxes, mask_dev); diff --git a/paddle/phi/kernels/gpu/norm_grad_kernel.cu b/paddle/phi/kernels/gpu/norm_grad_kernel.cu index 079e1d48c1583f..12b7581a719ef5 100644 --- a/paddle/phi/kernels/gpu/norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/norm_grad_kernel.cu @@ -31,7 +31,7 @@ __global__ void NormalizeGradient(const T* x, const int axis_n, const int64_t post, T* x_grad) { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage_sum; int64_t num = pre * post; diff --git a/paddle/phi/kernels/gpu/norm_kernel.cu b/paddle/phi/kernels/gpu/norm_kernel.cu index b00a05ffd19997..de88c87e693aff 100644 --- a/paddle/phi/kernels/gpu/norm_kernel.cu +++ b/paddle/phi/kernels/gpu/norm_kernel.cu @@ -39,7 +39,7 @@ __global__ void Normalize(const T* x, const float eps, T* y, T* out_norm) { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; typedef cub::BlockReduce BlockReduce; __shared__ typename 
BlockReduce::TempStorage temp_storage; int64_t num = pre * post; diff --git a/paddle/phi/kernels/gpu/one_hot_kernel.cu b/paddle/phi/kernels/gpu/one_hot_kernel.cu index 5fb16480915be6..e88ecbdd2d82fc 100644 --- a/paddle/phi/kernels/gpu/one_hot_kernel.cu +++ b/paddle/phi/kernels/gpu/one_hot_kernel.cu @@ -21,8 +21,6 @@ namespace phi { -using phi::PADDLE_CUDA_NUM_THREADS; - template __global__ void FillOutputKernel(const InT* p_in_data, OutT* p_out_data, @@ -60,7 +58,7 @@ void OneHotKernel(const Context& dev_ctx, auto stream = dev_ctx.stream(); funcs::set_constant(dev_ctx, out, static_cast(0.0)); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel); FillOutputKernel<< struct PNormGradFunctor { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; HOSTDEVICE explicit inline PNormGradFunctor(float porder, float eps) { this->porder = static_cast(porder - 1.0f); this->eps = static_cast(eps); @@ -416,19 +416,19 @@ void PNormGradKernel(const Context& dev_ctx, DenseTensor x_abs; x_abs.Resize(in_x->dims()); dev_ctx.template Alloc(&x_abs); - phi::AbsKernel(dev_ctx, *in_x, &x_abs); + AbsKernel(dev_ctx, *in_x, &x_abs); DenseTensor amax_grad_out; amax_grad_out.Resize(in_x->dims()); dev_ctx.template Alloc(&amax_grad_out); - phi::ReduceAMaxGradKernel(dev_ctx, - x_abs, - *in_norm, - *in_norm_dy, - dims_for_amax, - keepdim, - reduce_all, - &amax_grad_out); + ReduceAMaxGradKernel(dev_ctx, + x_abs, + *in_norm, + *in_norm_dy, + dims_for_amax, + keepdim, + reduce_all, + &amax_grad_out); DenseTensor x_sign; x_sign.Resize(in_x->dims()); dev_ctx.template Alloc(&x_sign); diff --git a/paddle/phi/kernels/gpu/p_norm_kernel.cu b/paddle/phi/kernels/gpu/p_norm_kernel.cu index fef22e5335bc44..342fbff8828e4b 100644 --- a/paddle/phi/kernels/gpu/p_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/p_norm_kernel.cu @@ -99,8 +99,7 @@ void PNormKernel(const Context& dev_ctx, if (x.numel() == 0) { if (out->numel() > 0) { std::vector vec_dims = vectorize(out->dims()); - phi::Full( - dev_ctx, phi::IntArray(vec_dims), static_cast(0), out); + Full(dev_ctx, IntArray(vec_dims), static_cast(0), out); } return; } @@ -135,7 +134,7 @@ void PNormKernel(const Context& dev_ctx, dev_ctx, *in_x, out_norm, reduce_axis); } else { // vanilla norm - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; funcs::ReduceGpuKernel( dev_ctx, *in_x, out_norm, reduce_axis, porder); } diff --git a/paddle/phi/kernels/gpu/p_recv_kernel.cu b/paddle/phi/kernels/gpu/p_recv_kernel.cu index 7eff93f447eeb8..2ffa47160abe6f 100644 --- a/paddle/phi/kernels/gpu/p_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/p_recv_kernel.cu @@ -42,7 +42,7 @@ void PRecvKernel(const Context& dev_ctx, GetCommContext(dev_ctx, peer); gpuStream_t stream = dev_ctx.stream(); - // auto data_type = phi::TransToPhiDataType(dtype); + // auto data_type = TransToPhiDataType(dtype); if (dynamic_shape) { DDim new_dim = recv_shape_info( diff --git a/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu b/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu index 030d0e21dfd039..66363c5d7214ad 100644 --- a/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu @@ -21,8 +21,6 @@ namespace phi { -using phi::PADDLE_CUDA_NUM_THREADS; - template __global__ void Pad3DGradConstNCDHW(const IndexType in_size, T* d_in_data, diff --git a/paddle/phi/kernels/gpu/pad3d_kernel.cu b/paddle/phi/kernels/gpu/pad3d_kernel.cu 
index 61127add56e62a..6c0edc4e07affb 100644 --- a/paddle/phi/kernels/gpu/pad3d_kernel.cu +++ b/paddle/phi/kernels/gpu/pad3d_kernel.cu @@ -22,8 +22,6 @@ #include "paddle/phi/kernels/full_kernel.h" namespace phi { -using phi::PADDLE_CUDA_NUM_THREADS; - template __global__ void Pad3DConstNCDHW(const IndexType nthreads, const T* in_data, diff --git a/paddle/phi/kernels/gpu/partial_allgather_kernel.cu b/paddle/phi/kernels/gpu/partial_allgather_kernel.cu index 58c628d0485190..8382a9fc38bbf9 100644 --- a/paddle/phi/kernels/gpu/partial_allgather_kernel.cu +++ b/paddle/phi/kernels/gpu/partial_allgather_kernel.cu @@ -31,16 +31,16 @@ void PartialAllGatherOpCUDAKernel(const Context& dev_ctx, #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto in = &x_in; int64_t numel = in->numel(); - ncclDataType_t dtype = phi::ToNCCLDataType(in->dtype()); + ncclDataType_t dtype = ToNCCLDataType(in->dtype()); gpuStream_t stream = nullptr; - phi::distributed::NCCLCommContext* comm_ctx = nullptr; + distributed::NCCLCommContext* comm_ctx = nullptr; int real_nranks = 0; int real_rank = 0; comm_ctx = - static_cast(dev_ctx.GetCommContext()); + static_cast(dev_ctx.GetCommContext()); PADDLE_ENFORCE_NE(comm_ctx, nullptr, common::errors::Unavailable( diff --git a/paddle/phi/kernels/gpu/partial_recv_kernel.cu b/paddle/phi/kernels/gpu/partial_recv_kernel.cu index cedef236d0a812..bbac8a959760d7 100644 --- a/paddle/phi/kernels/gpu/partial_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/partial_recv_kernel.cu @@ -60,7 +60,7 @@ void PartialRecvKernel(const Context& dev_ctx, int64_t offset = recv_numel * id; gpuStream_t stream = nullptr; - phi::distributed::NCCLCommContext* comm_ctx = nullptr; + distributed::NCCLCommContext* comm_ctx = nullptr; int nranks = 0; int rank = 0; diff --git a/paddle/phi/kernels/gpu/partial_send_kernel.cu b/paddle/phi/kernels/gpu/partial_send_kernel.cu index d9b92679d8ed66..e2d8591bdae5e0 100644 --- a/paddle/phi/kernels/gpu/partial_send_kernel.cu +++ b/paddle/phi/kernels/gpu/partial_send_kernel.cu @@ -62,12 +62,12 @@ void PartialSendKernel(const Context& dev_ctx, gpuStream_t stream = nullptr; - phi::distributed::NCCLCommContext* comm_ctx = nullptr; + distributed::NCCLCommContext* comm_ctx = nullptr; int nranks = 0; int rank = 0; comm_ctx = - static_cast(dev_ctx.GetCommContext()); + static_cast(dev_ctx.GetCommContext()); PADDLE_ENFORCE_NE(comm_ctx, nullptr, common::errors::Unavailable( diff --git a/paddle/phi/kernels/gpu/partial_sum_kernel.cu b/paddle/phi/kernels/gpu/partial_sum_kernel.cu index a8c7e18b7a3680..98570150f66625 100644 --- a/paddle/phi/kernels/gpu/partial_sum_kernel.cu +++ b/paddle/phi/kernels/gpu/partial_sum_kernel.cu @@ -119,16 +119,16 @@ void PartialSumOpCUDAKernel(const Context &dev_ctx, } if (!in_data.empty()) { - auto tmp_in_array = phi::memory_utils::Alloc( + auto tmp_in_array = memory_utils::Alloc( dev_ctx.GetPlace(), in_data.size() * sizeof(T *), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + Stream(reinterpret_cast(dev_ctx.stream()))); - phi::memory_utils::Copy(dev_ctx.GetPlace(), - tmp_in_array->ptr(), - CPUPlace(), - reinterpret_cast(in_data.data()), - in_data.size() * sizeof(T *)); + memory_utils::Copy(dev_ctx.GetPlace(), + tmp_in_array->ptr(), + CPUPlace(), + reinterpret_cast(in_data.data()), + in_data.size() * sizeof(T *)); T **in_array_data = reinterpret_cast(tmp_in_array->ptr()); ComputeKernelParameter(lod_length); @@ -200,16 +200,16 @@ void PartialSumGradOpCUDAKernel(const Context &dev_ctx, } if (!out_data.empty()) { - auto tmp_out_array = 
phi::memory_utils::Alloc( + auto tmp_out_array = memory_utils::Alloc( dev_ctx.GetPlace(), out_data.size() * sizeof(T *), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + Stream(reinterpret_cast(dev_ctx.stream()))); - phi::memory_utils::Copy(dev_ctx.GetPlace(), - tmp_out_array->ptr(), - CPUPlace(), - reinterpret_cast(out_data.data()), - out_data.size() * sizeof(T *)); + memory_utils::Copy(dev_ctx.GetPlace(), + tmp_out_array->ptr(), + CPUPlace(), + reinterpret_cast(out_data.data()), + out_data.size() * sizeof(T *)); T **out_grad_data = reinterpret_cast(tmp_out_array->ptr()); ComputeKernelParameter(lod_length); diff --git a/paddle/phi/kernels/gpu/prelu_kernel.cu b/paddle/phi/kernels/gpu/prelu_kernel.cu index 519b56b05a330a..53d55a9f38d9be 100644 --- a/paddle/phi/kernels/gpu/prelu_kernel.cu +++ b/paddle/phi/kernels/gpu/prelu_kernel.cu @@ -51,21 +51,18 @@ void PReluKernel(const Context& dev_ctx, size_t channel = channel_last ? dim[x_rank - 1] : dim[1]; if (channel_last) { auto func = PReluChannelLastWiseCUDAFunctor(x_ptr, alpha_ptr, channel); - phi::IndexKernel>( - dev_ctx, out, func); + IndexKernel>(dev_ctx, out, func); } else { size_t plane_size = numel / dim[0] / channel; auto func = PReluChannelFirstWiseCUDAFunctor( x_ptr, alpha_ptr, numel, channel, plane_size); - phi::IndexKernel>( - dev_ctx, out, func); + IndexKernel>(dev_ctx, out, func); } } else if (mode == "element") { size_t spatial_size = numel / dim[0]; auto func = PreluElementWiseDirectCUDAFunctor(x_ptr, alpha_ptr, spatial_size); - phi::IndexKernel>( - dev_ctx, out, func); + IndexKernel>(dev_ctx, out, func); } else { std::vector ins = {&x}; std::vector outs = {out}; diff --git a/paddle/phi/kernels/gpu/qr_kernel.cu b/paddle/phi/kernels/gpu/qr_kernel.cu index bde80e51fe8616..f2fab3332cfb91 100644 --- a/paddle/phi/kernels/gpu/qr_kernel.cu +++ b/paddle/phi/kernels/gpu/qr_kernel.cu @@ -85,17 +85,17 @@ struct QrFunctor { int tau_stride = min_mn; if (compute_q) { - dev_ctx.template Alloc>( - q, batch_size * m * k * sizeof(phi::dtype::Real)); + dev_ctx.template Alloc>( + q, batch_size * m * k * sizeof(dtype::Real)); } - dev_ctx.template Alloc>( - r, batch_size * k * n * sizeof(phi::dtype::Real)); + dev_ctx.template Alloc>( + r, batch_size * k * n * sizeof(dtype::Real)); // Note: allocate temporary tensors because of lacking in-place operations. 
// Prepare qr DenseTensor qr; - dev_ctx.template Alloc>( - &qr, size_t(batch_size * m * n * sizeof(phi::dtype::Real))); + dev_ctx.template Alloc>( + &qr, size_t(batch_size * m * n * sizeof(dtype::Real))); // BatchedGeqrf performs computation in-place and 'qr' must be a copy of // input Copy(dev_ctx, x, dev_ctx.GetPlace(), false, &qr); @@ -109,8 +109,8 @@ struct QrFunctor { // Transpose 'qr' to conform the column-major order auto tmp_qr = TransposeLast2Dim(dev_ctx, qr); Copy(dev_ctx, tmp_qr, qr.place(), false, &qr); - auto qr_data = dev_ctx.template Alloc>(&qr); - auto tau_data = dev_ctx.template Alloc>(&tau); + auto qr_data = dev_ctx.template Alloc>(&qr); + auto tau_data = dev_ctx.template Alloc>(&tau); BatchedGeqrf( dev_ctx, batch_size, m, n, qr_data, m, tau_data, qr_stride, tau_stride); @@ -152,15 +152,14 @@ struct QrFunctor { auto new_qr_dims_vec = vectorize(x_dims); new_qr_dims_vec[new_qr_dims_vec.size() - 1] = m; DenseTensor new_qr = Fill(dev_ctx, new_qr_dims_vec, T(0)); - auto new_qr_data = - dev_ctx.template Alloc>(&new_qr); + auto new_qr_data = dev_ctx.template Alloc>(&new_qr); auto new_qr_stride = m * m; for (int i = 0; i < batch_size; ++i) { memory_utils::Copy(dev_ctx.GetPlace(), (new_qr_data + i * new_qr_stride), dev_ctx.GetPlace(), (qr_data + i * qr_stride), - qr_stride * sizeof(phi::dtype::Real), + qr_stride * sizeof(dtype::Real), dev_ctx.stream()); } BatchedOrgqr(dev_ctx, @@ -197,7 +196,7 @@ struct QrFunctor { }; template -struct QrFunctor, Context> { +struct QrFunctor, Context> { void operator()(const Context& dev_ctx, const DenseTensor& x, bool compute_q, @@ -214,16 +213,16 @@ struct QrFunctor, Context> { int qr_stride = m * n; int tau_stride = min_mn; if (compute_q) { - dev_ctx.template Alloc>( - q, batch_size * m * k * sizeof(phi::dtype::complex)); + dev_ctx.template Alloc>( + q, batch_size * m * k * sizeof(dtype::complex)); } - dev_ctx.template Alloc>( - r, batch_size * k * n * sizeof(phi::dtype::complex)); + dev_ctx.template Alloc>( + r, batch_size * k * n * sizeof(dtype::complex)); // Note: allocate temporary tensors because of lacking in-place operations. 
// Prepare qr DenseTensor qr; - dev_ctx.template Alloc>( - &qr, size_t(batch_size * m * n * sizeof(phi::dtype::complex))); + dev_ctx.template Alloc>( + &qr, size_t(batch_size * m * n * sizeof(dtype::complex))); // BatchedGeqrf performs computation in-place and 'qr' must be a copy of // input Copy(dev_ctx, x, dev_ctx.GetPlace(), false, &qr); @@ -232,29 +231,28 @@ struct QrFunctor, Context> { tau_dims_vec.pop_back(); tau_dims_vec[tau_dims_vec.size() - 1] = min_mn; DenseTensor tau = - Fill, Context>(dev_ctx, tau_dims_vec, T(0)); + Fill, Context>(dev_ctx, tau_dims_vec, T(0)); // Transpose 'qr' to conform the column-major order - auto tmp_qr = - TransposeLast2Dim, Context>(dev_ctx, qr); + auto tmp_qr = TransposeLast2Dim, Context>(dev_ctx, qr); Copy(dev_ctx, tmp_qr, qr.place(), false, &qr); - auto qr_data = dev_ctx.template Alloc>(&qr); - auto tau_data = dev_ctx.template Alloc>(&tau); - BatchedGeqrf>( + auto qr_data = dev_ctx.template Alloc>(&qr); + auto tau_data = dev_ctx.template Alloc>(&tau); + BatchedGeqrf>( dev_ctx, batch_size, m, n, qr_data, m, tau_data, qr_stride, tau_stride); if (reduced_mode) { auto trans_qr = - TransposeLast2Dim, Context>(dev_ctx, qr); - auto sliced_qr = Slice, Context>( + TransposeLast2Dim, Context>(dev_ctx, qr); + auto sliced_qr = Slice, Context>( dev_ctx, trans_qr, {trans_qr.dims().size() - 2}, {0}, {min_mn}); - auto tmp_r = TrilTriu, Context>( - dev_ctx, sliced_qr, 0, false); + auto tmp_r = + TrilTriu, Context>(dev_ctx, sliced_qr, 0, false); // Transpose 'tmp_r' to restore the original row-major order Copy(dev_ctx, tmp_r, r->place(), false, r); } else { auto trans_qr = - TransposeLast2Dim, Context>(dev_ctx, qr); - auto tmp_r = TrilTriu, Context>( - dev_ctx, trans_qr, 0, false); + TransposeLast2Dim, Context>(dev_ctx, qr); + auto tmp_r = + TrilTriu, Context>(dev_ctx, trans_qr, 0, false); // Transpose 'tmp_r' to restore the original row-major order Copy(dev_ctx, tmp_r, r->place(), false, r); } @@ -262,65 +260,64 @@ struct QrFunctor, Context> { // Perform QRGQR for Q using the result from GEQRF // Transpose 'q' to restore the original row-major order if (reduced_mode) { - BatchedOrgqr>(dev_ctx, - batch_size, - m, - min_mn, - min_mn, - qr_data, - m, - tau_data, - qr_stride, - tau_stride); + BatchedOrgqr>(dev_ctx, + batch_size, + m, + min_mn, + min_mn, + qr_data, + m, + tau_data, + qr_stride, + tau_stride); auto trans_q = - TransposeLast2Dim, Context>(dev_ctx, qr); - auto sliced_q = Slice, Context>( + TransposeLast2Dim, Context>(dev_ctx, qr); + auto sliced_q = Slice, Context>( dev_ctx, trans_q, {trans_q.dims().size() - 1}, {0}, {min_mn}); Copy(dev_ctx, sliced_q, q->place(), false, q); } else { if (m > n) { auto new_qr_dims_vec = vectorize(x_dims); new_qr_dims_vec[new_qr_dims_vec.size() - 1] = m; - DenseTensor new_qr = Fill, Context>( - dev_ctx, new_qr_dims_vec, T(0)); - auto new_qr_data = - dev_ctx.template Alloc>(&new_qr); + DenseTensor new_qr = + Fill, Context>(dev_ctx, new_qr_dims_vec, T(0)); + auto new_qr_data = dev_ctx.template Alloc>(&new_qr); auto new_qr_stride = m * m; for (int i = 0; i < batch_size; ++i) { memory_utils::Copy(dev_ctx.GetPlace(), (new_qr_data + i * new_qr_stride), dev_ctx.GetPlace(), (qr_data + i * qr_stride), - qr_stride * sizeof(phi::dtype::complex), + qr_stride * sizeof(dtype::complex), dev_ctx.stream()); } - BatchedOrgqr>(dev_ctx, - batch_size, - m, - m, - min_mn, - new_qr_data, - m, - tau_data, - new_qr_stride, - tau_stride); - auto trans_q = TransposeLast2Dim, Context>( - dev_ctx, new_qr); + BatchedOrgqr>(dev_ctx, + batch_size, + m, + m, + 
min_mn, + new_qr_data, + m, + tau_data, + new_qr_stride, + tau_stride); + auto trans_q = + TransposeLast2Dim, Context>(dev_ctx, new_qr); Copy(dev_ctx, trans_q, q->place(), false, q); } else { - BatchedOrgqr>(dev_ctx, - batch_size, - m, - m, - min_mn, - qr_data, - m, - tau_data, - qr_stride, - tau_stride); + BatchedOrgqr>(dev_ctx, + batch_size, + m, + m, + min_mn, + qr_data, + m, + tau_data, + qr_stride, + tau_stride); auto trans_q = - TransposeLast2Dim, Context>(dev_ctx, qr); - auto sliced_q = Slice, Context>( + TransposeLast2Dim, Context>(dev_ctx, qr); + auto sliced_q = Slice, Context>( dev_ctx, trans_q, {trans_q.dims().size() - 1}, {0}, {m}); Copy(dev_ctx, sliced_q, q->place(), false, q); } @@ -592,15 +589,15 @@ void BatchedGeqrf(const GPUContext& dev_ctx, } template <> -void BatchedGeqrf(const GPUContext& dev_ctx, - int batch_size, - int m, - int n, - phi::complex64* a, - int lda, - phi::complex64* tau, - int a_stride, - int tau_stride) { +void BatchedGeqrf(const GPUContext& dev_ctx, + int batch_size, + int m, + int n, + complex64* a, + int lda, + complex64* tau, + int a_stride, + int tau_stride) { int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); @@ -609,16 +606,15 @@ void BatchedGeqrf(const GPUContext& dev_ctx, DenseTensor workspace = DenseTensor(); workspace.Resize({lwork}); - phi::complex64* workspace_ptr = - dev_ctx.template Alloc(&workspace); + complex64* workspace_ptr = dev_ctx.template Alloc(&workspace); DenseTensor info = DenseTensor(); info.Resize({1}); int* info_d = dev_ctx.template Alloc(&info); for (int i = 0; i < batch_size; ++i) { - phi::complex64* a_working_ptr = &a[i * a_stride]; - phi::complex64* tau_working_ptr = &tau[i * tau_stride]; + complex64* a_working_ptr = &a[i * a_stride]; + complex64* tau_working_ptr = &tau[i * tau_stride]; // compute geqrf PADDLE_ENFORCE_GPU_SUCCESS( dynload::cusolverDnCgeqrf(handle, @@ -648,15 +644,15 @@ void BatchedGeqrf(const GPUContext& dev_ctx, } template <> -void BatchedGeqrf(const GPUContext& dev_ctx, - int batch_size, - int m, - int n, - phi::complex128* a, - int lda, - phi::complex128* tau, - int a_stride, - int tau_stride) { +void BatchedGeqrf(const GPUContext& dev_ctx, + int batch_size, + int m, + int n, + complex128* a, + int lda, + complex128* tau, + int a_stride, + int tau_stride) { int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); @@ -665,16 +661,15 @@ void BatchedGeqrf(const GPUContext& dev_ctx, DenseTensor workspace = DenseTensor(); workspace.Resize({lwork}); - phi::complex128* workspace_ptr = - dev_ctx.template Alloc(&workspace); + complex128* workspace_ptr = dev_ctx.template Alloc(&workspace); DenseTensor info = DenseTensor(); info.Resize({1}); int* info_d = dev_ctx.template Alloc(&info); for (int i = 0; i < batch_size; ++i) { - phi::complex128* a_working_ptr = &a[i * a_stride]; - phi::complex128* tau_working_ptr = &tau[i * tau_stride]; + complex128* a_working_ptr = &a[i * a_stride]; + complex128* tau_working_ptr = &tau[i * tau_stride]; // compute geqrf PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnZgeqrf( handle, @@ -816,16 +811,16 @@ void BatchedOrgqr(const GPUContext& dev_ctx, } template <> -void BatchedOrgqr(const GPUContext& dev_ctx, - int batch_size, - int m, - int n, - int k, - phi::complex64* a, - int lda, - phi::complex64* tau, - int a_stride, - int tau_stride) { +void BatchedOrgqr(const GPUContext& dev_ctx, + int batch_size, + int m, + int n, + int k, + complex64* a, + int lda, + complex64* tau, + int a_stride, + int tau_stride) { int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); @@ 
-841,16 +836,15 @@ void BatchedOrgqr(const GPUContext& dev_ctx, DenseTensor workspace = DenseTensor(); workspace.Resize({lwork}); - phi::complex64* workspace_ptr = - dev_ctx.template Alloc(&workspace); + complex64* workspace_ptr = dev_ctx.template Alloc(&workspace); DenseTensor info = DenseTensor(); info.Resize({1}); int* info_d = dev_ctx.template Alloc(&info); for (int i = 0; i < batch_size; ++i) { - phi::complex64* a_working_ptr = &a[i * a_stride]; - phi::complex64* tau_working_ptr = &tau[i * tau_stride]; + complex64* a_working_ptr = &a[i * a_stride]; + complex64* tau_working_ptr = &tau[i * tau_stride]; // compute orggr PADDLE_ENFORCE_GPU_SUCCESS( dynload::cusolverDnCungqr(handle, @@ -881,16 +875,16 @@ void BatchedOrgqr(const GPUContext& dev_ctx, } template <> -void BatchedOrgqr(const GPUContext& dev_ctx, - int batch_size, - int m, - int n, - int k, - phi::complex128* a, - int lda, - phi::complex128* tau, - int a_stride, - int tau_stride) { +void BatchedOrgqr(const GPUContext& dev_ctx, + int batch_size, + int m, + int n, + int k, + complex128* a, + int lda, + complex128* tau, + int a_stride, + int tau_stride) { int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); @@ -906,16 +900,15 @@ void BatchedOrgqr(const GPUContext& dev_ctx, DenseTensor workspace = DenseTensor(); workspace.Resize({lwork}); - phi::complex128* workspace_ptr = - dev_ctx.template Alloc(&workspace); + complex128* workspace_ptr = dev_ctx.template Alloc(&workspace); DenseTensor info = DenseTensor(); info.Resize({1}); int* info_d = dev_ctx.template Alloc(&info); for (int i = 0; i < batch_size; ++i) { - phi::complex128* a_working_ptr = &a[i * a_stride]; - phi::complex128* tau_working_ptr = &tau[i * tau_stride]; + complex128* a_working_ptr = &a[i * a_stride]; + complex128* tau_working_ptr = &tau[i * tau_stride]; // compute orggr PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnZungqr( handle, diff --git a/paddle/phi/kernels/gpu/quantize_linear_kernel.cu b/paddle/phi/kernels/gpu/quantize_linear_kernel.cu index ecb4d7b626e0cd..e4166efa4a77a6 100644 --- a/paddle/phi/kernels/gpu/quantize_linear_kernel.cu +++ b/paddle/phi/kernels/gpu/quantize_linear_kernel.cu @@ -114,7 +114,7 @@ struct DequantizeFunctor { } }; -template struct DequantizeFunctor; +template struct DequantizeFunctor; template struct DequantizeFunctor; template struct DequantizeFunctor; template struct ChannelDequantizeFunctorV2; diff --git a/paddle/phi/kernels/gpu/radam_kernel.cu b/paddle/phi/kernels/gpu/radam_kernel.cu index e9a8fa56fb8c11..149d59dc11b98b 100644 --- a/paddle/phi/kernels/gpu/radam_kernel.cu +++ b/paddle/phi/kernels/gpu/radam_kernel.cu @@ -125,7 +125,7 @@ void RAdamKernel(const Context& dev_ctx, DenseTensor* moment1_out, DenseTensor* moment2_out, DenseTensor* master_param_out) { - using MT = typename phi::dtype::template MPTypeTrait::Type; + using MT = typename dtype::template MPTypeTrait::Type; T* param_out_data = dev_ctx.template Alloc(param_out); MT* beta1_pow_out_data = dev_ctx.template Alloc(beta1_pow_out); diff --git a/paddle/phi/kernels/gpu/random_kernel.cu b/paddle/phi/kernels/gpu/random_kernel.cu index de3f78075b31a7..349eb506d59696 100644 --- a/paddle/phi/kernels/gpu/random_kernel.cu +++ b/paddle/phi/kernels/gpu/random_kernel.cu @@ -31,9 +31,8 @@ void RandomKernel(const Context& dev_ctx, out->Resize(x.dims()); T* data = dev_ctx.template Alloc(out); - if constexpr (std::is_floating_point_v || - std::is_same_v || - std::is_same_v) { + if constexpr (std::is_floating_point_v || std::is_same_v || + std::is_same_v) { from = update_from(from); to 
= update_to(to); diff --git a/paddle/phi/kernels/gpu/randperm_kernel.cu b/paddle/phi/kernels/gpu/randperm_kernel.cu index dbfcf2755a621e..771ab5f9b50e77 100644 --- a/paddle/phi/kernels/gpu/randperm_kernel.cu +++ b/paddle/phi/kernels/gpu/randperm_kernel.cu @@ -125,10 +125,10 @@ void RandpermKernel(const Context& dev_ctx, end_bit < 32 ? end_bit : 32, dev_ctx.stream()); - auto d_temp_storage = phi::memory_utils::Alloc( - dev_ctx.GetPlace(), - temp_storage_bytes, - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + auto d_temp_storage = + memory_utils::Alloc(dev_ctx.GetPlace(), + temp_storage_bytes, + Stream(reinterpret_cast(dev_ctx.stream()))); cub::DeviceRadixSort::SortPairs(d_temp_storage->ptr(), temp_storage_bytes, key.data(), @@ -143,7 +143,7 @@ void RandpermKernel(const Context& dev_ctx, auto gen_cuda = dev_ctx.GetGenerator(); auto seed_offset = gen_cuda->IncrementOffset(n); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n); SwapRepeatKernel<<::Type; + using MPType = typename dtype::MPTypeTrait::Type; MPType start_value = static_cast(GetValue(dev_ctx, start)); MPType end_value = static_cast(GetValue(dev_ctx, end)); @@ -66,7 +66,7 @@ void RangeNullaryKernel(const Context& dev_ctx, const T end_value, const T step_value, DenseTensor* out) { - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; MPType start_value_mpt = static_cast(start_value); MPType end_value_mpt = static_cast(end_value); MPType step_value_mpt = static_cast(step_value); diff --git a/paddle/phi/kernels/gpu/reduce_amin_amax_common.h b/paddle/phi/kernels/gpu/reduce_amin_amax_common.h index 4f5507815dfbb7..c6ce04b1e41c79 100644 --- a/paddle/phi/kernels/gpu/reduce_amin_amax_common.h +++ b/paddle/phi/kernels/gpu/reduce_amin_amax_common.h @@ -85,16 +85,16 @@ void ReduceCudaAMaxAMinGrad(const Context& dev_ctx, funcs::BroadcastKernel( dev_ctx, equal_inputs, &equal_outputs, funcs::EqualFunctor(), 0); // 2. equal_count = reduceSum(equal_out) - phi::SumKernel(dev_ctx, - equal_out, - reduce_dims, - equal_out.dtype(), - keep_dim, - &equal_count); + SumKernel(dev_ctx, + equal_out, + reduce_dims, + equal_out.dtype(), + keep_dim, + &equal_count); // 3. dx = dout * 1 - phi::MultiplyKernel(dev_ctx, new_dout, equal_out, &equal_out); + MultiplyKernel(dev_ctx, new_dout, equal_out, &equal_out); // 4. 
dx = Div(dx, equal_out) - phi::DivideKernel(dev_ctx, equal_out, equal_count, &new_dx); + DivideKernel(dev_ctx, equal_out, equal_count, &new_dx); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/reduce_as_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_as_grad_kernel.cu index d34ec05140d2c6..af7e8604c60798 100644 --- a/paddle/phi/kernels/gpu/reduce_as_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/reduce_as_grad_kernel.cu @@ -44,13 +44,13 @@ void ReduceAsGradKernel(const Context& dev_ctx, new_out_grad.ShareDataWith(out_grad); new_out_grad.Resize(update_dims); - using MPType = typename phi::dtype::MPTypeTrait::Type; - phi::ReduceGrad>( + using MPType = typename dtype::MPTypeTrait::Type; + ReduceGrad>( dev_ctx, &new_out_grad, x_grad, out_grad.dtype(), - phi::kps::IdentityFunctor()); + kps::IdentityFunctor()); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/reduce_as_kernel.cu b/paddle/phi/kernels/gpu/reduce_as_kernel.cu index f65c10614add37..8d22367c7c7142 100644 --- a/paddle/phi/kernels/gpu/reduce_as_kernel.cu +++ b/paddle/phi/kernels/gpu/reduce_as_kernel.cu @@ -29,7 +29,7 @@ void ReduceAsKernel(const Context& dev_ctx, auto reduce_dim = funcs::GetReduceDims(x, target); dev_ctx.template Alloc(out); if (reduce_dim.size() != 0) { - phi::SumKernel(dev_ctx, x, reduce_dim, out->type(), false, out); + SumKernel(dev_ctx, x, reduce_dim, out->type(), false, out); } else { Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); } diff --git a/paddle/phi/kernels/gpu/reduce_kernel.cu b/paddle/phi/kernels/gpu/reduce_kernel.cu index aa2a6543afdfb2..54b18754b09ac8 100644 --- a/paddle/phi/kernels/gpu/reduce_kernel.cu +++ b/paddle/phi/kernels/gpu/reduce_kernel.cu @@ -71,8 +71,8 @@ void ReduceSumGradKernel(const Context& dev_ctx, // call ReduceGrad dev_ctx.Alloc(x_grad, x.dtype()); - using MPType = typename phi::dtype::MPTypeTrait::Type; - phi::ReduceGrad>( + using MPType = typename dtype::MPTypeTrait::Type; + ReduceGrad>( dev_ctx, &new_out_grad, x_grad, @@ -116,7 +116,7 @@ void ReduceMeanGradKernel(const Context& dev_ctx, std::vector inputs = {&new_out_grad}; std::vector outputs = {x_grad}; - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; funcs::BroadcastKernel(dev_ctx, inputs, &outputs, @@ -275,8 +275,8 @@ void NansumGradKernel(const Context& dev_ctx, new_out_grad.Resize(update_dims); dev_ctx.Alloc(x_grad, x.dtype()); - using MPType = typename phi::dtype::MPTypeTrait::Type; - phi::ReduceGrad>( + using MPType = typename dtype::MPTypeTrait::Type; + ReduceGrad>( dev_ctx, &new_out_grad, x_grad, @@ -287,7 +287,7 @@ void NansumGradKernel(const Context& dev_ctx, const T* x_data = x.data(); T* x_grad_data = x_grad->data(); int64_t numel = x.numel(); - phi::funcs::ForRange for_range(dev_ctx, numel); + funcs::ForRange for_range(dev_ctx, numel); for_range(NanMaskFunctor(x_data, x_grad_data)); } diff --git a/paddle/phi/kernels/gpu/repeat_interleave_grad_kernel.cu b/paddle/phi/kernels/gpu/repeat_interleave_grad_kernel.cu index ab2483bab7a000..28936e015a0b1e 100644 --- a/paddle/phi/kernels/gpu/repeat_interleave_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/repeat_interleave_grad_kernel.cu @@ -27,7 +27,6 @@ #include "paddle/phi/kernels/primitive/kernel_primitives.h" #include "paddle/phi/kernels/reduce_sum_kernel.h" namespace phi { -using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void index_select_grad_cuda_kernel(const T* output_grad, @@ -143,9 +142,8 @@ void RepeatInterleaveWithTensorIndexGradKernel( auto* in_grad_data = x_grad->data(); auto stream = 
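// Minimal CPU sketch of the amax/amin gradient rule implemented by steps 1-4
// in the reduce_amin_amax hunk above: the upstream gradient is spread evenly
// over every input element that equals the reduced max/min,
// dx_i = dout * (x_i == out) / #{j : x_j == out}. Names are illustrative only.
#include <vector>

std::vector<float> AMaxGradRef(const std::vector<float>& x,
                               float out,     // reduced max (or min)
                               float dout) {  // upstream gradient
  int count = 0;
  for (float v : x) count += (v == out);
  std::vector<float> dx(x.size(), 0.0f);
  if (count == 0) return dx;
  for (size_t i = 0; i < x.size(); ++i) {
    if (x[i] == out) dx[i] = dout / count;
  }
  return dx;
}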
dev_ctx.stream(); int vec_size = 8; - vec_size = std::min(phi::GetVectorizedSize(in_grad_data), vec_size); - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, vec_size); + vec_size = std::min(GetVectorizedSize(in_grad_data), vec_size); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, vec_size); switch (vec_size) { #define CASE_VEC_SIZE(__Sz) \ @@ -230,7 +228,7 @@ void RepeatInterleaveGradKernel(const Context& dev_ctx, SumKernel(dev_ctx, out_grad_copy, - phi::IntArray({dim + 1}), + IntArray({dim + 1}), x_grad->dtype(), false, x_grad); diff --git a/paddle/phi/kernels/gpu/repeat_interleave_kernel.cu b/paddle/phi/kernels/gpu/repeat_interleave_kernel.cu index 58c011b028fc92..194b65d9de5eb3 100644 --- a/paddle/phi/kernels/gpu/repeat_interleave_kernel.cu +++ b/paddle/phi/kernels/gpu/repeat_interleave_kernel.cu @@ -28,7 +28,6 @@ namespace phi { -using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void index_select_cuda_kernel(const T* input, T* output, @@ -257,14 +256,14 @@ void RepeatInterleaveKernel(const Context& dev_ctx, outer_size * repeat_size * repeats * inner_size; int vec_size = 8; - vec_size = std::min(phi::GetVectorizedSize(x.data()), vec_size); - vec_size = std::min(phi::GetVectorizedSize(out->data()), vec_size); + vec_size = std::min(GetVectorizedSize(x.data()), vec_size); + vec_size = std::min(GetVectorizedSize(out->data()), vec_size); while (vec_size > 1 && inner_size % vec_size != 0) { vec_size /= 2; } constexpr int loop_count = 1; - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + auto config = backends::gpu::GetGpuLaunchConfig1D( dev_ctx, total_elements, vec_size * loop_count); switch (vec_size) { diff --git a/paddle/phi/kernels/gpu/rms_norm_cuda_kernel.h b/paddle/phi/kernels/gpu/rms_norm_cuda_kernel.h index 2f8a4b4a91e021..15d6ed84d01f2c 100644 --- a/paddle/phi/kernels/gpu/rms_norm_cuda_kernel.h +++ b/paddle/phi/kernels/gpu/rms_norm_cuda_kernel.h @@ -1232,7 +1232,7 @@ void RMSNormFwdKernel(const Context& dev_ctx, double epsilon, DenseTensor* y, DenseTensor* invvar) { - using T_ACC = typename phi::dtype::MPTypeTrait::Type; + using T_ACC = typename dtype::MPTypeTrait::Type; if (x.numel() == 0) { dev_ctx.template Alloc(y); @@ -1265,8 +1265,8 @@ void RMSNormFwdKernel(const Context& dev_ctx, bool can_vec_X2 = can_vectorize(x_data, alignment2); bool can_vec_Y2 = can_vectorize(y_data, alignment2); bool can_vec_scale2 = can_vectorize(scale_data, alignment2); - bool is_supported_type2 = (std::is_same::value || - std::is_same::value); + bool is_supported_type2 = (std::is_same::value || + std::is_same::value); if (is_supported_type2 && cols <= static_cast(1ULL << std::numeric_limits::digits) && @@ -1292,8 +1292,8 @@ void RMSNormFwdKernel(const Context& dev_ctx, bool can_vec_Y = can_vectorize(y_data, alignment); bool can_vec_scale = can_vectorize(scale_data, alignment); bool is_supported_type = (std::is_same::value || - std::is_same::value || - std::is_same::value); + std::is_same::value || + std::is_same::value); if (is_supported_type && cols <= @@ -1329,7 +1329,7 @@ void RMSNormBwdKernel(const Context& dev_ctx, double epsilon, DenseTensor* dX, DenseTensor* dscale) { - using T_ACC = typename phi::dtype::MPTypeTrait::Type; + using T_ACC = typename dtype::MPTypeTrait::Type; if (X.numel() == 0) { if (dX) { @@ -1374,16 +1374,16 @@ void RMSNormBwdKernel(const Context& dev_ctx, can_vectorize(scale_data, alignment) && can_vectorize(dX_data, alignment); bool is_supported_type = (std::is_same::value || - std::is_same::value || - 
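// Sketch of the vector-width selection pattern used in the repeat_interleave
// hunk above (and again in the strided_copy hunks later in this patch): start
// from the widest candidate (8) and halve until the width both matches the
// pointer alignment and divides the contiguous extent. The plain alignment
// probe below stands in for phi's GetVectorizedSize; the float element type
// is an assumption for the sketch.
#include <cstdint>

int PickVecSize(const void* ptr, int64_t inner_size, int max_vec = 8) {
  auto addr = reinterpret_cast<uintptr_t>(ptr);
  int vec = max_vec;
  while (vec > 1 && (addr % (vec * sizeof(float)) != 0)) vec /= 2;
  while (vec > 1 && inner_size % vec != 0) vec /= 2;
  return vec;
}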
std::is_same::value); + std::is_same::value || + std::is_same::value); const unsigned int alignment2 = sizeof(T) * 8; bool bAlignedBuffers2 = can_vectorize(dY_data, alignment2) && can_vectorize(X_data, alignment2) && can_vectorize(scale_data, alignment2) && can_vectorize(dX_data, alignment2); - bool is_supported_type2 = (std::is_same::value || - std::is_same::value); + bool is_supported_type2 = (std::is_same::value || + std::is_same::value); dim3 blocks(M); constexpr int num_threads = 128; @@ -1449,7 +1449,7 @@ void RMSNormBwdKernel(const Context& dev_ctx, } // Sum reduction along blocks.y dimension to get final dscale - phi::SumKernel( + SumKernel( dev_ctx, dscale_blocks, {0}, dscale->dtype(), false, dscale); } else { diff --git a/paddle/phi/kernels/gpu/rms_norm_funcs.h b/paddle/phi/kernels/gpu/rms_norm_funcs.h index affa7d3ed8a321..f7dc86fecba6cb 100644 --- a/paddle/phi/kernels/gpu/rms_norm_funcs.h +++ b/paddle/phi/kernels/gpu/rms_norm_funcs.h @@ -51,36 +51,36 @@ namespace { // NOLINT } while (0); \ break -#define DISPATCH_SCALE_TYPE(INPUT_TYPE, SCALE_DTYPE, NAME, ...) \ - do { \ - auto input_dtype = phi::CppTypeToDataType::Type(); \ - bool is_scale_same_dtype_with_x = input_dtype == SCALE_DTYPE; \ - using U = typename phi::backends::gpu::CudnnDataType< \ - INPUT_TYPE>::BatchNormParamType; \ - if (!is_scale_same_dtype_with_x) { \ - PADDLE_ENFORCE_EQ( \ - SCALE_DTYPE, \ - phi::CppTypeToDataType::Type(), \ - common::errors::InvalidArgument("Unsupported data type of Scale")); \ - } \ - switch (SCALE_DTYPE) { \ - case paddle::DataType::FLOAT32: { \ - using SCALE_TYPE = float; \ - __VA_ARGS__; \ - break; \ - } \ - case paddle::DataType::FLOAT16: { \ - using SCALE_TYPE = phi::float16; \ - __VA_ARGS__; \ - break; \ - } \ - case paddle::DataType::BFLOAT16: { \ - using SCALE_TYPE = phi::bfloat16; \ - __VA_ARGS__; \ - break; \ - } \ - DEFAULT_THROW(NAME, SCALE_DTYPE); \ - } \ +#define DISPATCH_SCALE_TYPE(INPUT_TYPE, SCALE_DTYPE, NAME, ...) 
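// Scalar reference for one row of the RMS norm forward pass handled by
// RMSNormFwdKernel above: invvar = 1 / sqrt(mean(x^2) + eps) and
// y_j = x_j * invvar * scale_j. This is only a readability sketch of the
// math, not the vectorized kernel; it assumes a non-empty row and a
// per-column scale of the same length as x.
#include <cmath>
#include <vector>

float RmsNormRowRef(const std::vector<float>& x,
                    const std::vector<float>& scale,
                    float eps,
                    std::vector<float>* y) {
  double sum_sq = 0.0;
  for (float v : x) sum_sq += static_cast<double>(v) * v;
  float invvar =
      1.0f / std::sqrt(static_cast<float>(sum_sq / x.size()) + eps);
  y->resize(x.size());
  for (size_t j = 0; j < x.size(); ++j) (*y)[j] = x[j] * invvar * scale[j];
  return invvar;  // kept around for the backward pass
}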
\ + do { \ + auto input_dtype = CppTypeToDataType::Type(); \ + bool is_scale_same_dtype_with_x = input_dtype == SCALE_DTYPE; \ + using U = \ + typename backends::gpu::CudnnDataType::BatchNormParamType; \ + if (!is_scale_same_dtype_with_x) { \ + PADDLE_ENFORCE_EQ( \ + SCALE_DTYPE, \ + CppTypeToDataType::Type(), \ + common::errors::InvalidArgument("Unsupported data type of Scale")); \ + } \ + switch (SCALE_DTYPE) { \ + case paddle::DataType::FLOAT32: { \ + using SCALE_TYPE = float; \ + __VA_ARGS__; \ + break; \ + } \ + case paddle::DataType::FLOAT16: { \ + using SCALE_TYPE = float16; \ + __VA_ARGS__; \ + break; \ + } \ + case paddle::DataType::BFLOAT16: { \ + using SCALE_TYPE = bfloat16; \ + __VA_ARGS__; \ + break; \ + } \ + DEFAULT_THROW(NAME, SCALE_DTYPE); \ + } \ } while (0) #ifdef PADDLE_WITH_HIP @@ -270,7 +270,7 @@ __device__ void cuWelfordMuSigma2(const T* __restrict__ vals, } template <> -__device__ void cuWelfordMuSigma2(const phi::float16* __restrict__ vals, +__device__ void cuWelfordMuSigma2(const float16* __restrict__ vals, const int n1, const int n2, const int i1, diff --git a/paddle/phi/kernels/gpu/rmsprop_kernel.cu b/paddle/phi/kernels/gpu/rmsprop_kernel.cu index 4a2c391e6293cf..8da9e233e1832c 100644 --- a/paddle/phi/kernels/gpu/rmsprop_kernel.cu +++ b/paddle/phi/kernels/gpu/rmsprop_kernel.cu @@ -47,7 +47,7 @@ struct RmsFunctor { size_t limit = static_cast(ms_tensor.numel()); DenseRmspropGradFunctor grad_func(grad_tensor.data()); funcs::ForRange for_range(dev_ctx, limit); - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; MT *master_out_data = multi_precision ? dev_ctx.template Alloc(master_param_outs) : nullptr; @@ -95,7 +95,7 @@ struct RmsFunctor { }; template struct RmsFunctor; template struct RmsFunctor; -template struct RmsFunctor; +template struct RmsFunctor; } // namespace phi PD_REGISTER_KERNEL(rmsprop, diff --git a/paddle/phi/kernels/gpu/rnn_functor.h b/paddle/phi/kernels/gpu/rnn_functor.h index 1cbdf16541ad52..ccc2ca6c909c94 100644 --- a/paddle/phi/kernels/gpu/rnn_functor.h +++ b/paddle/phi/kernels/gpu/rnn_functor.h @@ -64,7 +64,7 @@ class RNNDescriptors { size_t *reserve_size, DenseTensor *dropout_state) { int numDirections = is_bidirec_ ? 
2 : 1; - gpuDnnDataType_t cudnn_type = phi::backends::gpu::CudnnDataType::type; + gpuDnnDataType_t cudnn_type = backends::gpu::CudnnDataType::type; // ------------------- cudnn x, y descriptors --------------------- std::vector dims_x = {batch_size_, input_size_, 1}; std::vector strides_x = {input_size_, 1, 1}; @@ -319,19 +319,19 @@ class RNNDescriptors { std::vector y_descs_; #endif - phi::backends::gpu::ScopedTensorDescriptor x_desc_; - phi::backends::gpu::ScopedTensorDescriptor y_desc_; + backends::gpu::ScopedTensorDescriptor x_desc_; + backends::gpu::ScopedTensorDescriptor y_desc_; #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION >= 7201 - phi::backends::gpu::ScopedRNNTensorDescriptor x_seq_desc_; - phi::backends::gpu::ScopedRNNTensorDescriptor y_seq_desc_; + backends::gpu::ScopedRNNTensorDescriptor x_seq_desc_; + backends::gpu::ScopedRNNTensorDescriptor y_seq_desc_; #endif - phi::backends::gpu::ScopedTensorDescriptor init_h_desc_; - phi::backends::gpu::ScopedTensorDescriptor init_c_desc_; - phi::backends::gpu::ScopedTensorDescriptor last_h_desc_; - phi::backends::gpu::ScopedTensorDescriptor last_c_desc_; - phi::backends::gpu::ScopedDropoutDescriptor dropout_desc_; - phi::backends::gpu::ScopedFilterDescriptor weight_desc_; - phi::backends::gpu::ScopedRNNDescriptor rnn_desc_; + backends::gpu::ScopedTensorDescriptor init_h_desc_; + backends::gpu::ScopedTensorDescriptor init_c_desc_; + backends::gpu::ScopedTensorDescriptor last_h_desc_; + backends::gpu::ScopedTensorDescriptor last_c_desc_; + backends::gpu::ScopedDropoutDescriptor dropout_desc_; + backends::gpu::ScopedFilterDescriptor weight_desc_; + backends::gpu::ScopedRNNDescriptor rnn_desc_; }; template diff --git a/paddle/phi/kernels/gpu/rnn_grad_kernel.cu b/paddle/phi/kernels/gpu/rnn_grad_kernel.cu index 6ec2727d4aca3a..f6619dcb4ef2d5 100644 --- a/paddle/phi/kernels/gpu/rnn_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/rnn_grad_kernel.cu @@ -222,7 +222,7 @@ void RnnGradKernel(const Context &dev_ctx, #endif std::vector SequenceLength; if (has_seq_length) { - SequenceLength = phi::GetVectorFromTensor(sequence_length.get_ptr()); + SequenceLength = GetVectorFromTensor(sequence_length.get_ptr()); } auto input_dims = x.dims(); diff --git a/paddle/phi/kernels/gpu/rnn_kernel.cu b/paddle/phi/kernels/gpu/rnn_kernel.cu index 8d88e77e51be18..762d856297dd20 100644 --- a/paddle/phi/kernels/gpu/rnn_kernel.cu +++ b/paddle/phi/kernels/gpu/rnn_kernel.cu @@ -231,7 +231,7 @@ void RnnKernel(const Context &dev_ctx, #endif std::vector SequenceLength; if (has_seq_length) { - SequenceLength = phi::GetVectorFromTensor(sequence_length.get_ptr()); + SequenceLength = GetVectorFromTensor(sequence_length.get_ptr()); } auto handle = dev_ctx.cudnn_handle(); diff --git a/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu b/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu index 43b16b9541af62..ad0e9c2bcc6d69 100644 --- a/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu @@ -247,10 +247,10 @@ void RoiAlignGradKernel(const Context& dev_ctx, } } } - auto roi_ptr = phi::memory_utils::Alloc( - dev_ctx.GetPlace(), - box_batch_id_list.numel() * sizeof(int), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + auto roi_ptr = + memory_utils::Alloc(dev_ctx.GetPlace(), + box_batch_id_list.numel() * sizeof(int), + Stream(reinterpret_cast(dev_ctx.stream()))); int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); int64_t bytes = box_batch_id_list.numel() * sizeof(int); const int* stable_box_batch_size = diff --git 
a/paddle/phi/kernels/gpu/roi_align_kernel.cu b/paddle/phi/kernels/gpu/roi_align_kernel.cu index e75ac419ce274c..a6f2bb15759ad9 100644 --- a/paddle/phi/kernels/gpu/roi_align_kernel.cu +++ b/paddle/phi/kernels/gpu/roi_align_kernel.cu @@ -262,10 +262,10 @@ void RoiAlignKernel(const Context& dev_ctx, } } int64_t bytes = roi_batch_id_list.numel() * sizeof(int); - auto roi_ptr = phi::memory_utils::Alloc( - dev_ctx.GetPlace(), - bytes, - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + auto roi_ptr = + memory_utils::Alloc(dev_ctx.GetPlace(), + bytes, + Stream(reinterpret_cast(dev_ctx.stream()))); int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); const int* stable_roi_batch_id = phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( diff --git a/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu b/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu index edae8204d88267..f20398b971f06a 100644 --- a/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu @@ -133,10 +133,10 @@ void RoiPoolGradKernel(const Context& dev_ctx, } } int bytes = box_batch_id_list.numel() * sizeof(int); - auto roi_ptr = phi::memory_utils::Alloc( + auto roi_ptr = memory_utils::Alloc( dev_ctx.GetPlace(), bytes, - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + Stream(reinterpret_cast(dev_ctx.stream()))); int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); const int* stable_box_batch_id = phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( diff --git a/paddle/phi/kernels/gpu/roi_pool_kernel.cu b/paddle/phi/kernels/gpu/roi_pool_kernel.cu index d161b985ae226f..e3635cf6fe1877 100644 --- a/paddle/phi/kernels/gpu/roi_pool_kernel.cu +++ b/paddle/phi/kernels/gpu/roi_pool_kernel.cu @@ -196,10 +196,10 @@ void RoiPoolKernel(const Context& dev_ctx, } int bytes = box_batch_id_list.numel() * sizeof(int); - auto box_ptr = phi::memory_utils::Alloc( - dev_ctx.GetPlace(), - bytes, - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + auto box_ptr = + memory_utils::Alloc(dev_ctx.GetPlace(), + bytes, + Stream(reinterpret_cast(dev_ctx.stream()))); int* box_id_data = reinterpret_cast(box_ptr->ptr()); const int* stable_box_batch_id = phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( diff --git a/paddle/phi/kernels/gpu/roll_kernel_impl.h b/paddle/phi/kernels/gpu/roll_kernel_impl.h index 7689f5242a1223..72c897e0ec8210 100644 --- a/paddle/phi/kernels/gpu/roll_kernel_impl.h +++ b/paddle/phi/kernels/gpu/roll_kernel_impl.h @@ -20,16 +20,14 @@ namespace phi { -using phi::PADDLE_CUDA_NUM_THREADS; - template __global__ void RollCudaKernel(const T* input, T* output, const int rank, const int64_t numel, - phi::Array shifts, - phi::Array strides, - phi::Array sizes) { + Array shifts, + Array strides, + Array sizes) { int64_t idx = static_cast(blockIdx.x) * static_cast(blockDim.x) + static_cast(threadIdx.x); @@ -64,11 +62,9 @@ void LaunchRollKernel(const Context& dev_ctx, const std::vector shifts, const std::vector strides, const std::vector sizes) { - using phi::PADDLE_CUDA_NUM_THREADS; - - phi::Array strides_array; - phi::Array shifts_array; - phi::Array sizes_array; + Array strides_array; + Array shifts_array; + Array sizes_array; for (int i = 0; i < rank; ++i) { strides_array[i] = strides[i]; shifts_array[i] = shifts[i]; diff --git a/paddle/phi/kernels/gpu/row_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/row_conv_grad_kernel.cu index bbcac5fbcb1123..b38fc6c4868fc4 100644 --- a/paddle/phi/kernels/gpu/row_conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/row_conv_grad_kernel.cu @@ -173,7 +173,7 @@ 
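// Sketch of the index remapping RollCudaKernel (roll_kernel_impl.h above)
// performs per element: decode the flat index with the dim strides, shift the
// coordinate along each rolled dimension, and re-encode. Shifts are assumed
// already normalized into [0, size); names are illustrative.
#include <cstdint>
#include <vector>

int64_t RolledIndex(int64_t idx,
                    const std::vector<int64_t>& sizes,
                    const std::vector<int64_t>& strides,
                    const std::vector<int64_t>& shifts) {
  int64_t out = idx;
  for (size_t d = 0; d < sizes.size(); ++d) {
    int64_t coord = (idx / strides[d]) % sizes[d];
    int64_t shifted = (coord + shifts[d]) % sizes[d];
    out += (shifted - coord) * strides[d];
  }
  return out;
}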
__global__ void RowConvGradFilterImproved(const T *in, for (int offset = 16; offset > 0; offset = offset / 2) { // blockDim.x is 32. - val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); + val += backends::gpu::CudaShuffleDownSync(mask, val, offset); } __syncthreads(); @@ -239,7 +239,7 @@ __global__ void RowConvGradFilter(const T *in, for (int offset = 16; offset > 0; offset = offset / 2) { // blockDim.x is 32. - val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); + val += backends::gpu::CudaShuffleDownSync(mask, val, offset); } __syncthreads(); @@ -297,7 +297,7 @@ void RowConvGradKernel(const Context &dev_ctx, PADDLE_ENFORCE_LE_INT_MAX(future_context, "future_context"); int future_context_int = static_cast(future_context); - phi::MixVector mixv_batch_indices(&batch_indices); + MixVector mixv_batch_indices(&batch_indices); size_t *idx = mixv_batch_indices.CUDAMutableData(dev_ctx.GetPlace()); funcs::SetConstant zero; diff --git a/paddle/phi/kernels/gpu/row_conv_kernel.cu b/paddle/phi/kernels/gpu/row_conv_kernel.cu index 1e9f231af21fea..7baacdfbca4da1 100644 --- a/paddle/phi/kernels/gpu/row_conv_kernel.cu +++ b/paddle/phi/kernels/gpu/row_conv_kernel.cu @@ -138,7 +138,7 @@ void RowConvKernel(const Context &dev_ctx, PADDLE_ENFORCE_LE_INT_MAX(future_context, "future_context"); int future_context_int = static_cast(future_context); - phi::MixVector mix_vector(&batch_indices); + MixVector mix_vector(&batch_indices); size_t *idx = mix_vector.CUDAMutableData(dev_ctx.GetPlace()); auto stream = dev_ctx.stream(); diff --git a/paddle/phi/kernels/gpu/rprop_kernel.cu b/paddle/phi/kernels/gpu/rprop_kernel.cu index 83775e527add88..1e79dcecd4b0c9 100644 --- a/paddle/phi/kernels/gpu/rprop_kernel.cu +++ b/paddle/phi/kernels/gpu/rprop_kernel.cu @@ -99,7 +99,7 @@ void RpropKernel(const Context& dev_ctx, DenseTensor* prev_out, DenseTensor* learning_rate_out, DenseTensor* master_param_out) { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; const MT* master_in_data = multi_precision ? 
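// Standalone CUDA sketch of the warp-level reduction used by the row_conv
// gradient kernels above; CudaShuffleDownSync is a thin wrapper over
// __shfl_down_sync, so the butterfly below is the same pattern. After the
// loop, lane 0 holds the sum of all 32 lanes of the warp.
__device__ float WarpReduceSum(float val) {
  const unsigned full_mask = 0xffffffffu;
  for (int offset = 16; offset > 0; offset /= 2) {
    val += __shfl_down_sync(full_mask, val, offset);
  }
  return val;
}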
master_param->data() : nullptr; MT* master_out_data = diff --git a/paddle/phi/kernels/gpu/rrelu_kernel.cu b/paddle/phi/kernels/gpu/rrelu_kernel.cu index 4d02548850f1ec..272890293f11f5 100644 --- a/paddle/phi/kernels/gpu/rrelu_kernel.cu +++ b/paddle/phi/kernels/gpu/rrelu_kernel.cu @@ -93,7 +93,7 @@ void RReluKernel(const Context& dev_ctx, RReluTestCudaFunctor functor(x_data, out_data, noise_data, mid_val); for_range(functor); } else { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; funcs::uniform_distribution dist; funcs::uniform_real_transform trans(lower, upper); funcs::distribution_and_transform(dev_ctx, noise, dist, trans); diff --git a/paddle/phi/kernels/gpu/scale_kernel.cu b/paddle/phi/kernels/gpu/scale_kernel.cu index a5d318143a78bc..9dc653dba9a2af 100644 --- a/paddle/phi/kernels/gpu/scale_kernel.cu +++ b/paddle/phi/kernels/gpu/scale_kernel.cu @@ -47,7 +47,7 @@ void ScaleKernel(const Context& dev_ctx, const Scalar& bias, bool bias_after_scale, DenseTensor* out) { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; std::vector inputs; std::vector outputs; inputs.emplace_back(&x); @@ -68,7 +68,7 @@ void DivScaleKernel(const Context& dev_ctx, const DenseTensor& x, const Scalar& scale, DenseTensor* out) { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; std::vector inputs; std::vector outputs; inputs.emplace_back(&x); @@ -90,7 +90,7 @@ INSTANCE_SCALAR_KERNEL(int, GPUContext) INSTANCE_SCALAR_KERNEL(int64_t, GPUContext) INSTANCE_SCALAR_KERNEL(float, GPUContext) INSTANCE_SCALAR_KERNEL(double, GPUContext) -INSTANCE_SCALAR_KERNEL(phi::float16, GPUContext) +INSTANCE_SCALAR_KERNEL(float16, GPUContext) INSTANCE_SCALAR_KERNEL(int16_t, GPUContext) INSTANCE_SCALAR_KERNEL(uint8_t, GPUContext) INSTANCE_SCALAR_KERNEL(int8_t, GPUContext) diff --git a/paddle/phi/kernels/gpu/send_ue_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/send_ue_recv_grad_kernel.cu index fb78738ad65516..0b56a509b605d8 100644 --- a/paddle/phi/kernels/gpu/send_ue_recv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/send_ue_recv_grad_kernel.cu @@ -151,12 +151,11 @@ void CalculateXGrad(const Context& dev_ctx, bcast_info.out_len, functor); // Run reduce_sum - DenseTensor x_grad_out = - phi::Sum(dev_ctx, - x_grad_v2, - phi::IntArray(reduce_idx), - CppTypeToDataType::Type(), - true); + DenseTensor x_grad_out = Sum(dev_ctx, + x_grad_v2, + IntArray(reduce_idx), + CppTypeToDataType::Type(), + true); #ifdef PADDLE_WITH_HIP hipMemcpy(x_grad, x_grad_out.data(), @@ -228,12 +227,11 @@ void CalculateXGrad(const Context& dev_ctx, bcast_info.use_bcast, mul_functor, sum_functor); - DenseTensor x_grad_out = - phi::Sum(dev_ctx, - x_grad_v2, - phi::IntArray(reduce_idx), - CppTypeToDataType::Type(), - true); + DenseTensor x_grad_out = Sum(dev_ctx, + x_grad_v2, + IntArray(reduce_idx), + CppTypeToDataType::Type(), + true); #ifdef PADDLE_WITH_HIP hipMemcpy(x_grad, x_grad_out.data(), @@ -274,12 +272,11 @@ void CalculateXGrad(const Context& dev_ctx, bcast_info.out_len, s_count); // Run reduce_sum - DenseTensor x_grad_out = - phi::Sum(dev_ctx, - x_grad_v2, - phi::IntArray(reduce_idx), - CppTypeToDataType::Type(), - true); + DenseTensor x_grad_out = Sum(dev_ctx, + x_grad_v2, + IntArray(reduce_idx), + CppTypeToDataType::Type(), + true); #ifdef PADDLE_WITH_HIP hipMemcpy(x_grad, x_grad_out.data(), @@ -342,12 +339,11 @@ void CalculateXGrad(const Context& dev_ctx, out_len, bcast_info.use_bcast); // Run reduce_sum - 
DenseTensor x_grad_out = - phi::Sum(dev_ctx, - x_grad_v2, - phi::IntArray(reduce_idx), - CppTypeToDataType::Type(), - true); + DenseTensor x_grad_out = Sum(dev_ctx, + x_grad_v2, + IntArray(reduce_idx), + CppTypeToDataType::Type(), + true); // TODO(daisiming): Whether use x_grad instead. #ifdef PADDLE_WITH_HIP hipMemcpy(x_grad, diff --git a/paddle/phi/kernels/gpu/send_uv_grad_kernel.cu b/paddle/phi/kernels/gpu/send_uv_grad_kernel.cu index 0cbe6406d76d74..ce53876f136dab 100644 --- a/paddle/phi/kernels/gpu/send_uv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/send_uv_grad_kernel.cu @@ -110,12 +110,11 @@ void CalculateGrad(const Context& dev_ctx, x_grad_v2_data); // Run reduce sum - DenseTensor x_grad_out = - phi::Sum(dev_ctx, - x_grad_v2, - phi::IntArray(reduce_idx), - CppTypeToDataType::Type(), - true); + DenseTensor x_grad_out = Sum(dev_ctx, + x_grad_v2, + IntArray(reduce_idx), + CppTypeToDataType::Type(), + true); #ifdef PADDLE_WITH_HIP hipMemcpy(x_grad, x_grad_out.data(), @@ -193,12 +192,11 @@ void CalculateGrad(const Context& dev_ctx, mul_functor, sum_functor); // Run reduce_sum - DenseTensor x_grad_out = - phi::Sum(dev_ctx, - x_grad_v2, - phi::IntArray(reduce_idx), - CppTypeToDataType::Type(), - true); + DenseTensor x_grad_out = Sum(dev_ctx, + x_grad_v2, + IntArray(reduce_idx), + CppTypeToDataType::Type(), + true); #ifdef PADDLE_WITH_HIP hipMemcpy(x_grad, x_grad_out.data(), diff --git a/paddle/phi/kernels/gpu/sequence_expand_grad_kernel.cu b/paddle/phi/kernels/gpu/sequence_expand_grad_kernel.cu index 778df972b81549..75e5bd4a5670b5 100644 --- a/paddle/phi/kernels/gpu/sequence_expand_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/sequence_expand_grad_kernel.cu @@ -71,9 +71,9 @@ struct SequenceExpandGradFunctor { int block_x = static_cast(ref_lod.size()); dim3 block_size(thread_x, thread_y, thread_z); dim3 grid_size(block_x, 1); - phi::MixVector mixv_ref_lod(&ref_lod); - phi::MixVector mixv_x_lod(&x_lod); - phi::MixVector mixv_out_offset(&out_offset); + MixVector mixv_ref_lod(&ref_lod); + MixVector mixv_x_lod(&x_lod); + MixVector mixv_out_offset(&out_offset); sequence_expand_grad_kernel<<>>( dout.data(), mixv_ref_lod.CUDAData(dev_ctx.GetPlace()), diff --git a/paddle/phi/kernels/gpu/sequence_expand_kernel.cu b/paddle/phi/kernels/gpu/sequence_expand_kernel.cu index 6414374f001223..0f0a3e7a8997cf 100644 --- a/paddle/phi/kernels/gpu/sequence_expand_kernel.cu +++ b/paddle/phi/kernels/gpu/sequence_expand_kernel.cu @@ -45,7 +45,7 @@ static inline int ExpandByMemoryCopy(const GPUContext& dev_ctx, } for (size_t j = 0; j < repeat_num; j++) { for (size_t k = 0; k < x_seq_len; k++) { - phi::memory_utils::Copy( + memory_utils::Copy( gpu_place, out_data + (out_start + j * x_seq_len + k) * x_item_length, gpu_place, @@ -117,7 +117,7 @@ struct SequenceExpandFunctor { out_offset[2 * x_lod_size + i] = ref_lod[i]; } - phi::MixVector mixv_out_offset(&out_offset); + MixVector mixv_out_offset(&out_offset); const size_t* out_offset_data = mixv_out_offset.CUDAData(dev_ctx.GetPlace()); const size_t* x_lod_data = out_offset_data + x_lod_size; diff --git a/paddle/phi/kernels/gpu/sequence_softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/sequence_softmax_grad_kernel.cu index aca08cbf0bfec0..d0934a1c6d3f37 100644 --- a/paddle/phi/kernels/gpu/sequence_softmax_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/sequence_softmax_grad_kernel.cu @@ -78,7 +78,7 @@ struct SequenceSoftmaxGradFunctor { dim3 block_size(thread_x); dim3 grid_size(max_blocks); - phi::MixVector mixv_ref_lod(&ref_lod); + MixVector mixv_ref_lod(&ref_lod); 
sequence_softmax_grad_kernel <<>>( dout.data(), diff --git a/paddle/phi/kernels/gpu/sequence_softmax_kernel.cu b/paddle/phi/kernels/gpu/sequence_softmax_kernel.cu index d24e865cf5076b..61234544c673db 100644 --- a/paddle/phi/kernels/gpu/sequence_softmax_kernel.cu +++ b/paddle/phi/kernels/gpu/sequence_softmax_kernel.cu @@ -89,7 +89,7 @@ struct SequenceSoftmaxFunctor { dim3 block_size(thread_x); dim3 grid_size(max_blocks); - phi::MixVector mixv_ref_lod(&ref_lod); + MixVector mixv_ref_lod(&ref_lod); sequence_softmax_kernel <<>>( x.data(), diff --git a/paddle/phi/kernels/gpu/set_value_grad_kernel.cu b/paddle/phi/kernels/gpu/set_value_grad_kernel.cu index eab39da86df51d..a9b567ec8b3782 100644 --- a/paddle/phi/kernels/gpu/set_value_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/set_value_grad_kernel.cu @@ -97,15 +97,15 @@ void SetValueGradKernel(const Context& dev_ctx, std::vector axes_int32(axes.begin(), axes.end()); std::vector decrease_axes_int32(decrease_axes.begin(), decrease_axes.end()); - phi::StridedSliceRawInferMeta(meta_in, - axes_int32, - starts, - ends, - steps, - infer_flags, - decrease_axes_int32, - &meta_out, - MetaConfig(true, false)); + StridedSliceRawInferMeta(meta_in, + axes_int32, + starts, + ends, + steps, + infer_flags, + decrease_axes_int32, + &meta_out, + MetaConfig(true, false)); if (value_grad_orig.dims() != value_grad->dims()) { StridedSliceRawKernel(dev_ctx, out_grad, diff --git a/paddle/phi/kernels/gpu/sgd_kernel.cu b/paddle/phi/kernels/gpu/sgd_kernel.cu index c3c72f4b33facd..dcc4198b668381 100644 --- a/paddle/phi/kernels/gpu/sgd_kernel.cu +++ b/paddle/phi/kernels/gpu/sgd_kernel.cu @@ -72,7 +72,7 @@ void SGDDenseKernel(const Context& dev_ctx, bool multi_precision, DenseTensor* param_out, DenseTensor* master_param_out) { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; // do check here // if (multi_precision) { // bool has_master = @@ -119,7 +119,7 @@ void SGDDenseParamSparseGradKernel(const Context& dev_ctx, bool multi_precision, DenseTensor* param_out, DenseTensor* master_param_out) { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; // do some check here // if (multi_precision) { // bool has_master = @@ -167,7 +167,7 @@ void SGDDenseParamSparseGradKernel(const Context& dev_ctx, int thread_x = kThreadsPerBlock; int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); - phi::MixVector mixv_in_rows(&in_rows); + MixVector mixv_in_rows(&in_rows); SparseSGDFunctorKernel<<>>( in_data, mixv_in_rows.CUDAData(dev_ctx.GetPlace()), diff --git a/paddle/phi/kernels/gpu/shard_index_kernel.cu b/paddle/phi/kernels/gpu/shard_index_kernel.cu index ed0f526e0dfd5e..30ca6910d7e7e7 100644 --- a/paddle/phi/kernels/gpu/shard_index_kernel.cu +++ b/paddle/phi/kernels/gpu/shard_index_kernel.cu @@ -20,8 +20,6 @@ namespace phi { -using phi::PADDLE_CUDA_NUM_THREADS; - template __global__ void ShardIndexInner(const T* in_data, T* out_data, diff --git a/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu b/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu index 2319b54924903d..e51dfe7c41fb03 100644 --- a/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu +++ b/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu @@ -78,8 +78,8 @@ void ShuffleBatchKernel(const Context& dev_ctx, #ifdef PADDLE_WITH_CUDA // CacheAllocator allocator(dev_ctx.GetPlace()); - phi::memory_utils::ThrustAllocator allocator(dev_ctx.GetPlace(), - dev_ctx.stream()); + 
memory_utils::ThrustAllocator allocator(dev_ctx.GetPlace(), + dev_ctx.stream()); const auto& exec_policy = thrust::cuda::par(allocator).on(dev_ctx.stream()); #else const auto& exec_policy = thrust::hip::par.on(dev_ctx.stream()); diff --git a/paddle/phi/kernels/gpu/shuffle_batch_utils.h b/paddle/phi/kernels/gpu/shuffle_batch_utils.h index 807fdedd5511da..474857ead1121f 100644 --- a/paddle/phi/kernels/gpu/shuffle_batch_utils.h +++ b/paddle/phi/kernels/gpu/shuffle_batch_utils.h @@ -22,7 +22,7 @@ namespace phi { struct CacheAllocator { typedef char value_type; - explicit CacheAllocator(phi::Place place) { + explicit CacheAllocator(Place place) { VLOG(2) << "construct allocator"; place_ = place; } @@ -49,10 +49,10 @@ struct CacheAllocator { } private: - typedef std::unordered_map> + typedef std::unordered_map> allocation_map_type; allocation_map_type busy_allocation_; - phi::Place place_; + Place place_; }; template diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu index c0098b8e5a45fa..d07f3db4ac43cc 100644 --- a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu @@ -29,9 +29,9 @@ struct SigmoidBwdFunctor { HOSTDEVICE inline SigmoidBwdFunctor(const T ignore_index) : ignore_index_(ignore_index) {} - HOSTDEVICE inline phi::Array operator()(const T x, - const T label, - const T dout) { + HOSTDEVICE inline Array operator()(const T x, + const T label, + const T dout) { T counts; T dx_data; @@ -46,7 +46,7 @@ struct SigmoidBwdFunctor { dx_data = dout * diff; counts = 1; } - phi::Array outs; + Array outs; outs[0] = dx_data; outs[1] = counts; @@ -62,10 +62,10 @@ struct SigmoidBwdPosWeightFunctor { HOSTDEVICE inline SigmoidBwdPosWeightFunctor(const T ignore_index) : ignore_index_(ignore_index) {} - HOSTDEVICE inline phi::Array operator()(const T x, - const T label, - const T pos_weight, - const T dout) { + HOSTDEVICE inline Array operator()(const T x, + const T label, + const T pos_weight, + const T dout) { T counts; T dx_data; @@ -85,7 +85,7 @@ struct SigmoidBwdPosWeightFunctor { counts = 1; } - phi::Array outs; + Array outs; outs[0] = dx_data; outs[1] = counts; @@ -160,7 +160,7 @@ void SigmoidCrossEntropyWithLogitsGradKernel( auto eps = static_cast(1e-5); *norm_cpu_ptr = *norm_cpu_ptr > eps ? 
*norm_cpu_ptr : eps; - phi::ScaleKernel( + ScaleKernel( dev_ctx, *in_grad, (1.0 / *norm_cpu_ptr), 0.0f, false, in_grad); } } diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu index 271193d0287d5f..2fa7b2a5f6a69e 100644 --- a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu +++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu @@ -29,7 +29,7 @@ struct SigmoidFwdFunctor { HOSTDEVICE inline SigmoidFwdFunctor(const T ignore_index) : ignore_index_(ignore_index) {} - HOSTDEVICE inline phi::Array operator()(const T x, const T label) { + HOSTDEVICE inline Array operator()(const T x, const T label) { T counts; T out_data; @@ -46,7 +46,7 @@ struct SigmoidFwdFunctor { out_data = term1 - term2 + term3; counts = 1; } - phi::Array outs; + Array outs; outs[0] = out_data; outs[1] = counts; @@ -62,9 +62,9 @@ struct SigmoidFwdPosWeightFunctor { HOSTDEVICE inline SigmoidFwdPosWeightFunctor(const T ignore_index) : ignore_index_(ignore_index) {} - HOSTDEVICE inline phi::Array operator()(const T x, - const T label, - T pos_weight) { + HOSTDEVICE inline Array operator()(const T x, + const T label, + T pos_weight) { T counts; T out_data; @@ -81,7 +81,7 @@ struct SigmoidFwdPosWeightFunctor { counts = 1; } - phi::Array outs; + Array outs; outs[0] = out_data; outs[1] = counts; @@ -155,7 +155,7 @@ void SigmoidCrossEntropyWithLogitsKernel( auto eps = static_cast(1e-5); *norm_cpu_ptr = *norm_cpu_ptr > eps ? *norm_cpu_ptr : eps; - phi::ScaleKernel(dev_ctx, *out, 1.0 / (*norm_cpu_ptr), 0.0f, false, out); + ScaleKernel(dev_ctx, *out, 1.0 / (*norm_cpu_ptr), 0.0f, false, out); } } diff --git a/paddle/phi/kernels/gpu/slogdeterminant_kernel.cu b/paddle/phi/kernels/gpu/slogdeterminant_kernel.cu index 1b44673ce9ccff..cd769520d60336 100644 --- a/paddle/phi/kernels/gpu/slogdeterminant_kernel.cu +++ b/paddle/phi/kernels/gpu/slogdeterminant_kernel.cu @@ -110,32 +110,31 @@ __global__ void GetSlogDetFromLUComplex(const T* lu_data, } template -struct SlogDeterminantFunctor, Context> { +struct SlogDeterminantFunctor, Context> { void operator()(const Context& dev_ctx, const DenseTensor& input, int64_t rank, int64_t batch_count, DenseTensor* output) { #ifndef PADDLE_WITH_HIP - phi::Allocator::AllocationPtr tmp_gpu_mat_data; - const phi::dtype::complex* gpu_mat = - input.data>(); + Allocator::AllocationPtr tmp_gpu_mat_data; + const dtype::complex* gpu_mat = input.data>(); // Copy all elements of input matrix A to a temporary memory space to // avoid being overridden by getrf. 
- tmp_gpu_mat_data = phi::memory_utils::Alloc( + tmp_gpu_mat_data = memory_utils::Alloc( dev_ctx.GetPlace(), - input.numel() * sizeof(phi::dtype::complex), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + input.numel() * sizeof(dtype::complex), + Stream(reinterpret_cast(dev_ctx.stream()))); memory_utils::Copy(dev_ctx.GetPlace(), tmp_gpu_mat_data->ptr(), dev_ctx.GetPlace(), input.data(), - input.numel() * sizeof(phi::dtype::complex), + input.numel() * sizeof(dtype::complex), dev_ctx.stream()); - gpu_mat = reinterpret_cast*>( - tmp_gpu_mat_data->ptr()); + gpu_mat = + reinterpret_cast*>(tmp_gpu_mat_data->ptr()); - std::vector*> cpu_ptrs(batch_count); + std::vector*> cpu_ptrs(batch_count); for (int64_t i = 0; i < batch_count; ++i) { cpu_ptrs[i] = gpu_mat + i * rank * rank; } @@ -143,8 +142,8 @@ struct SlogDeterminantFunctor, Context> { // num_ints is for pivot (rank * batch_count) and info (batch_count) int64_t num_ints = batch_count * (rank + 1); size_t total_bytes = - batch_count * sizeof(phi::dtype::complex*) + num_ints * sizeof(int); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc( + batch_count * sizeof(dtype::complex*) + num_ints * sizeof(int); + Allocator::AllocationPtr tmp_gpu_ptrs_data = memory_utils::Alloc( dev_ctx.GetPlace(), total_bytes, phi::Stream(reinterpret_cast(dev_ctx.stream()))); @@ -161,35 +160,35 @@ struct SlogDeterminantFunctor, Context> { nbytes_ptrs_c1, dev_ctx.stream()); - phi::dtype::complex** gpu_mat_ptr = - reinterpret_cast**>(tmp_gpu_ptrs_data->ptr()); + dtype::complex** gpu_mat_ptr = + reinterpret_cast**>(tmp_gpu_ptrs_data->ptr()); int* gpu_info_ptr = reinterpret_cast(gpu_mat_ptr + cpu_ptrs.size()); int* pivot_data = gpu_info_ptr + batch_count; - auto blas = funcs::GetBlas>(dev_ctx); + auto blas = funcs::GetBlas>(dev_ctx); // This function performs the LU factorization of each matrix A by the // equation P * A = L * U. L and U are written back to original matrix A, // and diagonal elements of L are discarded. 
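// Real-valued scalar sketch of how sign and log|det| are recovered from a
// getrf result, the same idea GetSlogDetFromLUComplex applies to complex
// diagonals: det(A) = (-1)^{row swaps} * prod(diag(U)), so the sign is the
// pivot parity times the signs of U's diagonal entries and the log term is
// the sum of log|U_ii|. Assumes a nonsingular matrix (nonzero diagonal);
// pivots are 1-based, as cuSOLVER's getrf reports them.
#include <cmath>
#include <utility>
#include <vector>

std::pair<double, double> SlogDetFromLU(
    const std::vector<double>& lu,  // n x n factor, row-major
    const std::vector<int>& pivots,
    int n) {
  double sign = 1.0, logabsdet = 0.0;
  for (int i = 0; i < n; ++i) {
    if (pivots[i] != i + 1) sign = -sign;  // each row swap flips the sign
    double d = lu[static_cast<size_t>(i) * n + i];
    sign *= (d < 0 ? -1.0 : 1.0);
    logabsdet += std::log(std::abs(d));
  }
  return {sign, logabsdet};
}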
blas.BatchedGETRF(rank, gpu_mat_ptr, pivot_data, gpu_info_ptr, batch_count); - phi::dtype::complex* out_data = - dev_ctx.template Alloc>(output); + dtype::complex* out_data = + dev_ctx.template Alloc>(output); int block_size = std::min(256, dev_ctx.GetMaxThreadsPerBlock()); dim3 dim_block(block_size); dim3 num_blocks((batch_count + block_size - 1) / block_size); - GetSlogDetFromLUComplex><<>>( + GetSlogDetFromLUComplex><<>>( gpu_mat, pivot_data, rank, batch_count, out_data); #else using MatrixType = Eigen::Matrix, Eigen::Dynamic, Eigen::Dynamic>; - std::vector> input_vec; - std::vector> sign_vec; - std::vector> log_vec; - std::vector> output_vec; + std::vector> input_vec; + std::vector> sign_vec; + std::vector> log_vec; + std::vector> output_vec; TensorToVector(input, dev_ctx, &input_vec); for (int64_t i = 0; i < batch_count; ++i) { // maybe can be parallel auto begin_iter = input_vec.begin() + i * rank * rank; auto end_iter = input_vec.begin() + (i + 1) * rank * rank; - std::vector> sub_vec( + std::vector> sub_vec( begin_iter, end_iter); // get every square matrix data MatrixType matrix(rank, rank); @@ -202,10 +201,9 @@ struct SlogDeterminantFunctor, Context> { VLOG(2) << "matrix val: " << matrix; std::complex det_val = matrix.determinant(); T abs_det_val = std::abs(det_val); - sign_vec.push_back(static_cast>( + sign_vec.push_back(static_cast>( sign(det_val, static_cast>(abs_det_val)))); - log_vec.push_back( - static_cast>(std::log(abs_det_val))); + log_vec.push_back(static_cast>(std::log(abs_det_val))); } // merge sign_vec and log_vec as final output_vec output_vec.insert(output_vec.end(), sign_vec.begin(), sign_vec.end()); @@ -307,20 +305,18 @@ struct SlogDeterminantV2Functor { } dev_ctx.template Alloc(logdet); if (logdet->numel() > 0) { - Full(dev_ctx, - logdet->dims(), - static_cast>(0), - logdet); + Full( + dev_ctx, logdet->dims(), static_cast>(0), logdet); } return; } #ifndef PADDLE_WITH_HIP - phi::Allocator::AllocationPtr tmp_gpu_mat_data; + Allocator::AllocationPtr tmp_gpu_mat_data; const T* gpu_mat = input.data(); - tmp_gpu_mat_data = phi::memory_utils::Alloc( + tmp_gpu_mat_data = memory_utils::Alloc( dev_ctx.GetPlace(), input.numel() * sizeof(T), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + Stream(reinterpret_cast(dev_ctx.stream()))); memory_utils::Copy(dev_ctx.GetPlace(), tmp_gpu_mat_data->ptr(), dev_ctx.GetPlace(), @@ -337,7 +333,7 @@ struct SlogDeterminantV2Functor { // num_ints is for pivot (rank * batch_count) and info (batch_count) int num_ints = batch_count * (rank + 1); size_t total_bytes = batch_count * sizeof(T*) + num_ints * sizeof(int); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc( + Allocator::AllocationPtr tmp_gpu_ptrs_data = memory_utils::Alloc( dev_ctx.GetPlace(), total_bytes, phi::Stream(reinterpret_cast(dev_ctx.stream()))); @@ -443,7 +439,7 @@ __global__ void GetSlogDetV2FromLUComplex(const Complex_T* lu_data, } template -struct SlogDeterminantV2Functor, Context> { +struct SlogDeterminantV2Functor, Context> { void operator()(const Context& dev_ctx, const DenseTensor& input, int64_t rank, @@ -451,43 +447,37 @@ struct SlogDeterminantV2Functor, Context> { DenseTensor* sign, DenseTensor* logdet) { if (input.numel() == 0) { - dev_ctx.template Alloc>(sign); + dev_ctx.template Alloc>(sign); if (sign->numel() > 0) { - Full, Context>( - dev_ctx, - sign->dims(), - static_cast>(1), - sign); + Full, Context>( + dev_ctx, sign->dims(), static_cast>(1), sign); } dev_ctx.template Alloc(logdet); if (logdet->numel() > 0) { - Full(dev_ctx, 
- logdet->dims(), - static_cast>(0), - logdet); + Full( + dev_ctx, logdet->dims(), static_cast>(0), logdet); } return; } #ifndef PADDLE_WITH_HIP - phi::Allocator::AllocationPtr tmp_gpu_mat_data; - const phi::dtype::complex* gpu_mat = - input.data>(); + Allocator::AllocationPtr tmp_gpu_mat_data; + const dtype::complex* gpu_mat = input.data>(); // Copy all elements of input matrix A to a temporary memory space to // avoid being overridden by getrf. - tmp_gpu_mat_data = phi::memory_utils::Alloc( + tmp_gpu_mat_data = memory_utils::Alloc( dev_ctx.GetPlace(), - input.numel() * sizeof(phi::dtype::complex), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + input.numel() * sizeof(dtype::complex), + Stream(reinterpret_cast(dev_ctx.stream()))); memory_utils::Copy(dev_ctx.GetPlace(), tmp_gpu_mat_data->ptr(), dev_ctx.GetPlace(), input.data(), - input.numel() * sizeof(phi::dtype::complex), + input.numel() * sizeof(dtype::complex), dev_ctx.stream()); - gpu_mat = reinterpret_cast*>( - tmp_gpu_mat_data->ptr()); + gpu_mat = + reinterpret_cast*>(tmp_gpu_mat_data->ptr()); - std::vector*> cpu_ptrs(batch_count); + std::vector*> cpu_ptrs(batch_count); for (int64_t i = 0; i < batch_count; ++i) { cpu_ptrs[i] = gpu_mat + i * rank * rank; } @@ -495,8 +485,8 @@ struct SlogDeterminantV2Functor, Context> { // num_ints is for pivot (rank * batch_count) and info (batch_count) int64_t num_ints = batch_count * (rank + 1); size_t total_bytes = - batch_count * sizeof(phi::dtype::complex*) + num_ints * sizeof(int); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc( + batch_count * sizeof(dtype::complex*) + num_ints * sizeof(int); + Allocator::AllocationPtr tmp_gpu_ptrs_data = memory_utils::Alloc( dev_ctx.GetPlace(), total_bytes, phi::Stream(reinterpret_cast(dev_ctx.stream()))); @@ -513,37 +503,36 @@ struct SlogDeterminantV2Functor, Context> { nbytes_ptrs_v2c, dev_ctx.stream()); - phi::dtype::complex** gpu_mat_ptr = - reinterpret_cast**>(tmp_gpu_ptrs_data->ptr()); + dtype::complex** gpu_mat_ptr = + reinterpret_cast**>(tmp_gpu_ptrs_data->ptr()); int* gpu_info_ptr = reinterpret_cast(gpu_mat_ptr + cpu_ptrs.size()); int* pivot_data = gpu_info_ptr + batch_count; - auto blas = funcs::GetBlas>(dev_ctx); + auto blas = funcs::GetBlas>(dev_ctx); // This function performs the LU factorization of each matrix A by the // equation P * A = L * U. L and U are written back to original matrix A, // and diagonal elements of L are discarded. 
blas.BatchedGETRF(rank, gpu_mat_ptr, pivot_data, gpu_info_ptr, batch_count); - phi::dtype::complex* sign_data = - dev_ctx.template Alloc>(sign); + dtype::complex* sign_data = + dev_ctx.template Alloc>(sign); T* logdet_data = dev_ctx.template Alloc(logdet); int block_size = std::min(256, dev_ctx.GetMaxThreadsPerBlock()); dim3 dim_block(block_size); dim3 num_blocks((batch_count + block_size - 1) / block_size); - GetSlogDetV2FromLUComplex, T> - <<>>( - gpu_mat, pivot_data, rank, batch_count, sign_data, logdet_data); + GetSlogDetV2FromLUComplex, T><<>>( + gpu_mat, pivot_data, rank, batch_count, sign_data, logdet_data); #else using MatrixType = Eigen::Matrix, Eigen::Dynamic, Eigen::Dynamic>; - std::vector> input_vec; - std::vector> sign_vec; - std::vector> log_vec; + std::vector> input_vec; + std::vector> sign_vec; + std::vector> log_vec; DDim out_dims = sign->dims(); TensorToVector(input, dev_ctx, &input_vec); for (int64_t i = 0; i < batch_count; ++i) { // maybe can be parallel auto begin_iter = input_vec.begin() + i * rank * rank; auto end_iter = input_vec.begin() + (i + 1) * rank * rank; - std::vector> sub_vec( + std::vector> sub_vec( begin_iter, end_iter); // get every square matrix data MatrixType matrix(rank, rank); @@ -556,7 +545,7 @@ struct SlogDeterminantV2Functor, Context> { VLOG(2) << "matrix val: " << matrix; std::complex det_val = matrix.determinant(); T abs_det_val = std::abs(det_val); - sign_vec.push_back(static_cast>( + sign_vec.push_back(static_cast>( phi::sign(det_val, static_cast>(abs_det_val)))); log_vec.push_back(std::log(abs_det_val)); } diff --git a/paddle/phi/kernels/gpu/soft_relu_grad_kernel.cu b/paddle/phi/kernels/gpu/soft_relu_grad_kernel.cu index 8aa0db2c877b5d..fc0ffea65567bf 100644 --- a/paddle/phi/kernels/gpu/soft_relu_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/soft_relu_grad_kernel.cu @@ -22,7 +22,7 @@ namespace phi { template struct CudaSoftReluGradFunctor { - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; MPType one = static_cast(1.0f); float threshold; diff --git a/paddle/phi/kernels/gpu/soft_relu_kernel.cu b/paddle/phi/kernels/gpu/soft_relu_kernel.cu index 478c58065ceb57..a3e59c6bc75145 100644 --- a/paddle/phi/kernels/gpu/soft_relu_kernel.cu +++ b/paddle/phi/kernels/gpu/soft_relu_kernel.cu @@ -22,7 +22,7 @@ namespace phi { template struct CudaSoftReluFunctor { - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; MPType one = static_cast(1.0f); float threshold; diff --git a/paddle/phi/kernels/gpu/squared_l2_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/squared_l2_norm_grad_kernel.cu index 8de4b312069b9e..5684509847bd0c 100644 --- a/paddle/phi/kernels/gpu/squared_l2_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/squared_l2_norm_grad_kernel.cu @@ -45,7 +45,7 @@ void SquaredL2NormGradKernel(const Context& dev_ctx, std::vector ins{&x, &dout}; std::vector outs{dx}; - funcs::BroadcastKernel(dev_ctx, ins, &outs, phi::DoubleMulFunctor()); + funcs::BroadcastKernel(dev_ctx, ins, &outs, DoubleMulFunctor()); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/std_var_grad_kernel.cu b/paddle/phi/kernels/gpu/std_var_grad_kernel.cu index f99650e515ab7f..f86ff8aaa71b3d 100644 --- a/paddle/phi/kernels/gpu/std_var_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/std_var_grad_kernel.cu @@ -46,28 +46,27 @@ void VarGradKernel(const Context& dev_ctx, int rank = x.dims().size(); if (rank == 0 || axis.size() == 0) { const auto dof = static_cast(x.numel()) - 
correction; - DenseTensor x_mean = phi::Mean(dev_ctx, x, {}, true); + DenseTensor x_mean = Mean(dev_ctx, x, {}, true); if (dof <= 0) { // grad * at::where(x == // x.mean(),std::numeric_limits::quiet_NaN(),std::numeric_limits::infinity()); DenseTensor cond; cond.Resize(x.dims()); - phi::EqualKernel(dev_ctx, x, x_mean, &cond); - DenseTensor nan_tensor = phi::FullLike( + EqualKernel(dev_ctx, x, x_mean, &cond); + DenseTensor nan_tensor = FullLike( dev_ctx, x, static_cast(std::numeric_limits::quiet_NaN())); - DenseTensor inf_tensor = phi::FullLike( + DenseTensor inf_tensor = FullLike( dev_ctx, x, static_cast(std::numeric_limits::infinity())); dev_ctx.template Alloc(x_grad); - phi::WhereKernel( - dev_ctx, cond, nan_tensor, inf_tensor, x_grad); + WhereKernel(dev_ctx, cond, nan_tensor, inf_tensor, x_grad); } else { // (2.0 / dof) * grad * (x - x.mean()); - DenseTensor diff = phi::Subtract(dev_ctx, x, x_mean); + DenseTensor diff = Subtract(dev_ctx, x, x_mean); DenseTensor scale = - phi::FullLike(dev_ctx, x, static_cast(2.0 / dof)); - DenseTensor tmp = phi::Multiply(dev_ctx, scale, out_grad); + FullLike(dev_ctx, x, static_cast(2.0 / dof)); + DenseTensor tmp = Multiply(dev_ctx, scale, out_grad); dev_ctx.template Alloc(x_grad); - phi::MultiplyKernel(dev_ctx, tmp, diff, x_grad); + MultiplyKernel(dev_ctx, tmp, diff, x_grad); } return; } @@ -142,12 +141,12 @@ void StdGradKernel(const Context& dev_ctx, } // grad_var = (grad / (out * 2)).masked_fill_(out == 0, 0); DenseTensor two_tensor = - phi::FullLike(dev_ctx, out, static_cast(2.0)); + FullLike(dev_ctx, out, static_cast(2.0)); DenseTensor denom = Multiply(dev_ctx, out, two_tensor); DenseTensor div = Divide(dev_ctx, out_grad, denom); DenseTensor zero_tensor = - phi::FullLike(dev_ctx, out, static_cast(0.0)); + FullLike(dev_ctx, out, static_cast(0.0)); DenseTensor cond_zero; cond_zero.Resize(out.dims()); EqualKernel(dev_ctx, out, zero_tensor, &cond_zero); diff --git a/paddle/phi/kernels/gpu/std_var_kernel.cu b/paddle/phi/kernels/gpu/std_var_kernel.cu index d70f50c82736e3..0279f34121b2e7 100644 --- a/paddle/phi/kernels/gpu/std_var_kernel.cu +++ b/paddle/phi/kernels/gpu/std_var_kernel.cu @@ -174,7 +174,7 @@ void Std_VarKernel(const Context& dev_ctx, dense_iter_config.add_const_input(x); DenseTensorIterator iter = dense_iter_config.build(); - using AccT = typename phi::dtype::MPTypeTrait::Type; + using AccT = typename dtype::MPTypeTrait::Type; using ops_t = WelfordOps>; ops_t ops(static_cast(correction), take_sqrt); diff --git a/paddle/phi/kernels/gpu/strided_copy_kernel.cu b/paddle/phi/kernels/gpu/strided_copy_kernel.cu index 250c05b46bc9ac..edf29d905f30ca 100644 --- a/paddle/phi/kernels/gpu/strided_copy_kernel.cu +++ b/paddle/phi/kernels/gpu/strided_copy_kernel.cu @@ -23,9 +23,9 @@ namespace phi { template __global__ void StridedCopyCaseZeroFunc( const T* input_data, - phi::Array input_stride, + Array input_stride, T* output_data, - phi::Array output_stride) { + Array output_stride) { int64_t input_offset = 0; int64_t output_offset = 0; int64_t coordinate[6] = {threadIdx.x, @@ -48,10 +48,10 @@ template bool LaunchStridedCopyCaseZeroKernel( const Context& dev_ctx, const T* input_data, - const phi::Array& input_stride, + const Array& input_stride, T* output_data, - const phi::Array& output_stride, - const phi::Array& dims, + const Array& output_stride, + const Array& dims, int rank) { if (rank > 6) { return false; @@ -120,10 +120,10 @@ bool LaunchStridedCopyCaseZeroKernel( template __global__ void StridedCopyCaseOneFunc( const T* input_data, - phi::Array 
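// Scalar sketch of the non-degenerate (dof > 0) branch of the variance
// gradient rearranged in the std/var hunks above:
// dvar/dx_i = (2 / dof) * dout * (x_i - mean), with dof = numel - correction;
// the std gradient is then the var gradient scaled by 1 / (2 * std), masked
// where std == 0. Names are illustrative only.
#include <vector>

std::vector<float> VarGradRef(const std::vector<float>& x,
                              float dout,
                              int correction) {
  float dof = static_cast<float>(x.size()) - correction;
  float mean = 0.0f;
  for (float v : x) mean += v;
  mean /= static_cast<float>(x.size());
  std::vector<float> dx(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    dx[i] = (2.0f / dof) * dout * (x[i] - mean);
  }
  return dx;
}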
input_stride, + Array input_stride, T* out_data, - phi::Array output_stride, - phi::Array dims, + Array output_stride, + Array dims, const int64_t x_max) { int64_t x = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; if (x < x_max) { @@ -214,14 +214,14 @@ template bool LaunchStridedCopyCaseOneKernel( const Context& dev_ctx, const T* input_data, - const phi::Array& input_stride, + const Array& input_stride, T* output_data, - const phi::Array& output_stride, - const phi::Array& dims, + const Array& output_stride, + const Array& dims, int rank, int64_t numel) { dim3 grid(1, 1, 1), block(1, 1, 1); - phi::Array cur_dims; + Array cur_dims; block.x = 512; if (rank >= 1) { @@ -316,10 +316,10 @@ bool LaunchStridedCopyCaseOneKernel( template __global__ void StridedCopyDefaultFunc( const T* input_data, - phi::Array input_stride, + Array input_stride, T* output_data, - phi::Array output_stride, - phi::Array dims, + Array output_stride, + Array dims, const int64_t numel) { int64_t gid = static_cast(blockIdx.x) * static_cast(blockDim.x) + @@ -348,10 +348,10 @@ template void LaunchStridedCopyDefaultKernel( const Context& dev_ctx, const T* input_data, - const phi::Array& input_stride, + const Array& input_stride, T* output_data, - const phi::Array& output_stride, - const phi::Array& dims, + const Array& output_stride, + const Array& dims, int rank, int64_t numel) { int64_t block = 512; @@ -382,7 +382,7 @@ void LaunchStridedCopyDefaultKernel( template __global__ void Strided2ContiguousCaseZeroFunc( const T* input_data, - phi::Array input_stride, + Array input_stride, T* output_data) { int64_t input_offset = 0; int64_t output_offset = @@ -415,9 +415,9 @@ template bool LaunchStrided2ContiguousCaseZeroKernel( const Context& dev_ctx, const T* input_data, - const phi::Array& input_stride, + const Array& input_stride, T* output_data, - const phi::Array& dims, + const Array& dims, int rank) { if (rank > 6) { return false; @@ -475,9 +475,9 @@ bool LaunchStrided2ContiguousCaseZeroKernel( template __global__ void Strided2ContiguousCaseOneFunc( const T* input_data, - phi::Array input_stride, + Array input_stride, T* out_data, - phi::Array dims, + Array dims, const int64_t x_max) { int64_t x = static_cast(blockIdx.x) * static_cast(blockDim.x) + @@ -573,13 +573,13 @@ template bool LaunchStrided2ContiguousCaseOneKernel( const Context& dev_ctx, const T* input_data, - const phi::Array& input_stride, + const Array& input_stride, T* output_data, - const phi::Array& dims, + const Array& dims, int rank, int64_t numel) { dim3 grid(1, 1, 1), block(1, 1, 1); - phi::Array cur_dims; + Array cur_dims; block.x = 512; if (rank >= 1) { @@ -692,9 +692,9 @@ template void LaunchStrided2ContiguousDefaultKernel( const Context& dev_ctx, const T* input_data, - const phi::Array& input_stride, + const Array& input_stride, T* output_data, - const phi::Array& dims, + const Array& dims, int rank, int64_t numel) { int64_t block = 512; @@ -754,8 +754,8 @@ void StridedCopyKernel(const Context& dev_ctx, // count vecsize int VecSize = 8; - VecSize = std::min(phi::GetVectorizedSize(input_data), VecSize); - VecSize = std::min(phi::GetVectorizedSize(output_data), VecSize); + VecSize = std::min(GetVectorizedSize(input_data), VecSize); + VecSize = std::min(GetVectorizedSize(output_data), VecSize); while (VecSize > 1 && output_numel % VecSize != 0) { VecSize /= 2; } @@ -850,12 +850,12 @@ void StridedCopyKernel(const Context& dev_ctx, #ifdef PADDLE_WITH_HIP hipMemcpy(output_data, input_data, - phi::SizeOf(input.dtype()), + SizeOf(input.dtype()), 
hipMemcpyDeviceToDevice); #else cudaMemcpy(output_data, input_data, - phi::SizeOf(input.dtype()), + SizeOf(input.dtype()), cudaMemcpyDeviceToDevice); #endif diff --git a/paddle/phi/kernels/gpu/strided_elementwise_copy_kernel.cu b/paddle/phi/kernels/gpu/strided_elementwise_copy_kernel.cu index a8d5eeb48f0456..d9433d93c0f54d 100644 --- a/paddle/phi/kernels/gpu/strided_elementwise_copy_kernel.cu +++ b/paddle/phi/kernels/gpu/strided_elementwise_copy_kernel.cu @@ -43,12 +43,12 @@ void StridedElementwiseCopyKernel(const Context& dev_ctx, #ifdef PADDLE_WITH_HIP hipMemcpy(output_data, input_data, - phi::SizeOf(input.dtype()), + SizeOf(input.dtype()), hipMemcpyDeviceToDevice); #else cudaMemcpy(output_data, input_data, - phi::SizeOf(input.dtype()), + SizeOf(input.dtype()), cudaMemcpyDeviceToDevice); #endif @@ -69,10 +69,10 @@ void StridedElementwiseCopyKernel(const Context& dev_ctx, funcs::CopyStride<2>(out_dims, out_strides, - phi::SizeOf(out->dtype()), + SizeOf(out->dtype()), vectorize(input.dims()), vectorize(input.strides()), - phi::SizeOf(input.dtype()), + SizeOf(input.dtype()), &desired_shape, &strides_array, &numel, diff --git a/paddle/phi/kernels/gpu/svd_kernel.cu b/paddle/phi/kernels/gpu/svd_kernel.cu index 0e35df4903c818..a3da9d05f379ed 100644 --- a/paddle/phi/kernels/gpu/svd_kernel.cu +++ b/paddle/phi/kernels/gpu/svd_kernel.cu @@ -36,7 +36,7 @@ static void GesvdjBatched(const GPUContext& dev_ctx, T* A, T* U, T* V, - phi::dtype::Real* S, + dtype::Real* S, int* info, int thin_UV = 1); @@ -78,10 +78,10 @@ void GesvdjBatched(const GPUContext& dev_ctx, ldt, &lwork, gesvdj_params)); - auto workspace = phi::memory_utils::Alloc( - dev_ctx.GetPlace(), - lwork * sizeof(float), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + auto workspace = + memory_utils::Alloc(dev_ctx.GetPlace(), + lwork * sizeof(float), + Stream(reinterpret_cast(dev_ctx.stream()))); float* workspace_ptr = reinterpret_cast(workspace->ptr()); int stride_A = lda * n; int stride_U = ldu * (thin_UV ? k : m); @@ -159,10 +159,10 @@ void GesvdjBatched(const GPUContext& dev_ctx, ldt, &lwork, gesvdj_params)); - auto workspace = phi::memory_utils::Alloc( - dev_ctx.GetPlace(), - lwork * sizeof(double), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + auto workspace = + memory_utils::Alloc(dev_ctx.GetPlace(), + lwork * sizeof(double), + Stream(reinterpret_cast(dev_ctx.stream()))); double* workspace_ptr = reinterpret_cast(workspace->ptr()); int stride_A = lda * n; int stride_U = ldu * (thin_UV ? 
k : m); @@ -203,17 +203,17 @@ void GesvdjBatched(const GPUContext& dev_ctx, } template <> -void GesvdjBatched(const GPUContext& dev_ctx, - int batchSize, - int m, - int n, - int k, - phi::complex64* A, - phi::complex64* U, - phi::complex64* V, - float* S, - int* info, - int thin_UV) { +void GesvdjBatched(const GPUContext& dev_ctx, + int batchSize, + int m, + int n, + int k, + complex64* A, + complex64* U, + complex64* V, + float* S, + int* info, + int thin_UV) { /* compute singular vectors */ const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR; /* compute singular vectors */ @@ -240,12 +240,11 @@ void GesvdjBatched(const GPUContext& dev_ctx, ldt, &lwork, gesvdj_params)); - auto workspace = phi::memory_utils::Alloc( - dev_ctx.GetPlace(), - lwork * sizeof(phi::complex64), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); - phi::complex64* workspace_ptr = - reinterpret_cast(workspace->ptr()); + auto workspace = + memory_utils::Alloc(dev_ctx.GetPlace(), + lwork * sizeof(complex64), + Stream(reinterpret_cast(dev_ctx.stream()))); + complex64* workspace_ptr = reinterpret_cast(workspace->ptr()); int stride_A = lda * n; int stride_U = ldu * (thin_UV ? k : m); int stride_V = ldt * (thin_UV ? k : n); @@ -286,17 +285,17 @@ void GesvdjBatched(const GPUContext& dev_ctx, } template <> -void GesvdjBatched(const GPUContext& dev_ctx, - int batchSize, - int m, - int n, - int k, - phi::complex128* A, - phi::complex128* U, - phi::complex128* V, - double* S, - int* info, - int thin_UV) { +void GesvdjBatched(const GPUContext& dev_ctx, + int batchSize, + int m, + int n, + int k, + complex128* A, + complex128* U, + complex128* V, + double* S, + int* info, + int thin_UV) { /* compute singular vectors */ const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR; /* compute singular vectors */ @@ -323,12 +322,11 @@ void GesvdjBatched(const GPUContext& dev_ctx, ldt, &lwork, gesvdj_params)); - auto workspace = phi::memory_utils::Alloc( - dev_ctx.GetPlace(), - lwork * sizeof(phi::complex128), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); - phi::complex128* workspace_ptr = - reinterpret_cast(workspace->ptr()); + auto workspace = + memory_utils::Alloc(dev_ctx.GetPlace(), + lwork * sizeof(complex128), + Stream(reinterpret_cast(dev_ctx.stream()))); + complex128* workspace_ptr = reinterpret_cast(workspace->ptr()); int stride_A = lda * n; int stride_U = ldu * (thin_UV ? k : m); int stride_V = ldt * (thin_UV ? 
k : n); @@ -377,7 +375,7 @@ void SvdKernel(const Context& dev_ctx, DenseTensor* VH) { if (X.numel() == 0) { dev_ctx.template Alloc(U); - dev_ctx.template Alloc>(S); + dev_ctx.template Alloc>(S); dev_ctx.template Alloc(VH); return; } @@ -401,7 +399,7 @@ void SvdKernel(const Context& dev_ctx, auto* u_data = dev_ctx.template Alloc(U); auto* vh_data = dev_ctx.template Alloc(VH); - auto* s_data = dev_ctx.template Alloc>(S); + auto* s_data = dev_ctx.template Alloc>(S); // NOTE:(@xiongkun03) // matrices are assumed to be stored in column-major order in cusolver // then view A as n x m and do A^T SVD, we can avoid transpose diff --git a/paddle/phi/kernels/gpu/swiglu_grad_kernel.cu b/paddle/phi/kernels/gpu/swiglu_grad_kernel.cu index 44cffb91ba36b8..870596ee7f28e1 100644 --- a/paddle/phi/kernels/gpu/swiglu_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/swiglu_grad_kernel.cu @@ -115,16 +115,15 @@ void SwiGLUGradKernelImpl(const Context &dev_ctx, T *dy, int64_t m, int64_t n) { - int vec_size = - std::min(phi::GetVectorizedSize(x), phi::GetVectorizedSize(dz)); + int vec_size = std::min(GetVectorizedSize(x), GetVectorizedSize(dz)); if (y) { - vec_size = std::min(vec_size, phi::GetVectorizedSize(y)); + vec_size = std::min(vec_size, GetVectorizedSize(y)); } if (dx) { - vec_size = std::min(vec_size, phi::GetVectorizedSize(dx)); + vec_size = std::min(vec_size, GetVectorizedSize(dx)); } if (dy) { - vec_size = std::min(vec_size, phi::GetVectorizedSize(dy)); + vec_size = std::min(vec_size, GetVectorizedSize(dy)); } #define PD_LAUNCH_SWIGLU_GRAD_CUDA_KERNEL_BASE( \ @@ -157,8 +156,7 @@ void SwiGLUGradKernelImpl(const Context &dev_ctx, } while (0) if (y) { - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, m * n, vec_size); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, m * n, vec_size); if (dx) { if (dy) { PD_LAUNCH_SWIGLU_GRAD_CUDA_KERNEL(false, true, true); @@ -183,7 +181,7 @@ void SwiGLUGradKernelImpl(const Context &dev_ctx, y = x + n; dy = dx + n; auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, m * n / vec_size, 1); + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, m * n / vec_size, 1); PD_LAUNCH_SWIGLU_GRAD_CUDA_KERNEL(true, true, true); } } diff --git a/paddle/phi/kernels/gpu/swiglu_kernel.cu b/paddle/phi/kernels/gpu/swiglu_kernel.cu index f004c663d271c8..6e423fb4930c64 100644 --- a/paddle/phi/kernels/gpu/swiglu_kernel.cu +++ b/paddle/phi/kernels/gpu/swiglu_kernel.cu @@ -41,8 +41,8 @@ __global__ void SwiGLUCUDAKernel(const T *__restrict__ x, int64_t x_offset = z_offset + row_offset; AlignedVector x_vec; AlignedVector y_vec; - phi::Load(x + x_offset, &x_vec); - phi::Load(y + x_offset, &y_vec); + Load(x + x_offset, &x_vec); + Load(y + x_offset, &y_vec); #pragma unroll for (int i = 0; i < VecSize; ++i) { y_vec[i] = functor(x_vec[i], y_vec[i]); @@ -60,8 +60,8 @@ __global__ void SwiGLUCUDAKernel(const T *__restrict__ x, while (idx <= limit) { AlignedVector x_vec; AlignedVector y_vec; - phi::Load(x + idx, &x_vec); - phi::Load(y + idx, &y_vec); + Load(x + idx, &x_vec); + Load(y + idx, &y_vec); #pragma unroll for (int i = 0; i < VecSize; ++i) { y_vec[i] = functor(x_vec[i], y_vec[i]); @@ -84,8 +84,7 @@ void SwiGLUKernelImpl(const Context &dev_ctx, T *z, int64_t m, int64_t n) { - int vec_size = - std::min(phi::GetVectorizedSize(x), phi::GetVectorizedSize(z)); + int vec_size = std::min(GetVectorizedSize(x), GetVectorizedSize(z)); #define PD_LAUNCH_SWIGLU_CUDA_KERNEL_BASE(__vec_size, __is_combine) \ case __vec_size: { \ @@ -112,9 +111,8 @@ void SwiGLUKernelImpl(const Context 
&dev_ctx, } while (0) if (y) { - vec_size = std::min(vec_size, phi::GetVectorizedSize(y)); - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, m * n, vec_size); + vec_size = std::min(vec_size, GetVectorizedSize(y)); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, m * n, vec_size); PD_LAUNCH_SWIGLU_CUDA_KERNEL(false); } else { while (n % vec_size != 0) { @@ -122,7 +120,7 @@ void SwiGLUKernelImpl(const Context &dev_ctx, } y = x + n; auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, m * n / vec_size, 1); + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, m * n / vec_size, 1); PD_LAUNCH_SWIGLU_CUDA_KERNEL(true); } } diff --git a/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu b/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu index 0041331d32bcb4..def9fbb8f992c1 100644 --- a/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu @@ -76,7 +76,7 @@ void SyncBatchNormKernel(const Context& dev_ctx, const int block = 512; int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - phi::Allocator::AllocationPtr alloc_ptr{nullptr}; + Allocator::AllocationPtr alloc_ptr{nullptr}; if (test_mode) { mean_data = mean.template data>(); diff --git a/paddle/phi/kernels/gpu/take_along_axis_kernel.cu b/paddle/phi/kernels/gpu/take_along_axis_kernel.cu index 7cd7c7250abf7b..e65ac91e65d6d1 100644 --- a/paddle/phi/kernels/gpu/take_along_axis_kernel.cu +++ b/paddle/phi/kernels/gpu/take_along_axis_kernel.cu @@ -34,8 +34,7 @@ void TakeAlongAxisKernel(const Context& dev_ctx, return; } if (x.numel() == 0) { - phi::Full( - dev_ctx, vectorize(out->dims()), static_cast(0), out); + Full(dev_ctx, vectorize(out->dims()), static_cast(0), out); return; } diff --git a/paddle/phi/kernels/gpu/top_k_kernel.cu b/paddle/phi/kernels/gpu/top_k_kernel.cu index ae77f6c3157242..6e675ebac3cb70 100644 --- a/paddle/phi/kernels/gpu/top_k_kernel.cu +++ b/paddle/phi/kernels/gpu/top_k_kernel.cu @@ -198,8 +198,7 @@ void TopkKernel(const Context& dev_ctx, // NOTE: old matrix implementation of stride is different to eigen. const int kMaxHeight = 2048; int gridx = input_height < kMaxHeight ? input_height : kMaxHeight; - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, input_width); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, input_width); switch (config.thread_per_block.x) { #ifdef PADDLE_WITH_HIP FIXED_BLOCK_DIM( @@ -307,8 +306,7 @@ void TopkKernel(const Context& dev_ctx, const int kMaxHeight = 2048; int gridx = input_height < kMaxHeight ? 
input_height : kMaxHeight; - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, input_width); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, input_width); switch (config.thread_per_block.x) { #ifdef PADDLE_WITH_HIP FIXED_BLOCK_DIM( diff --git a/paddle/phi/kernels/gpu/trace_kernel.cu b/paddle/phi/kernels/gpu/trace_kernel.cu index bb5575f14f4b31..091fc8449061c3 100644 --- a/paddle/phi/kernels/gpu/trace_kernel.cu +++ b/paddle/phi/kernels/gpu/trace_kernel.cu @@ -40,8 +40,7 @@ void TraceKernel(const Context& dev_ctx, auto out_dim_size = out->dims().size(); if (out_dim_size == 0) out_dim_size = 1; reduce_dims.push_back(out_dim_size); - phi::SumKernel( - dev_ctx, diag, reduce_dims, diag.dtype(), false, out); + SumKernel(dev_ctx, diag, reduce_dims, diag.dtype(), false, out); } else { funcs::SetConstant functor; functor(dev_ctx, out, static_cast(0)); diff --git a/paddle/phi/kernels/gpu/triangular_solve_kernel.cu b/paddle/phi/kernels/gpu/triangular_solve_kernel.cu index ab3c5a38e4e462..3dbf561c09cf46 100644 --- a/paddle/phi/kernels/gpu/triangular_solve_kernel.cu +++ b/paddle/phi/kernels/gpu/triangular_solve_kernel.cu @@ -108,7 +108,7 @@ void TriangularSolveKernel(const Context& dev_ctx, for (int64_t i = 0; i < batch_size; ++i) { cpu_a_ptrs[i] = x_bst_data + i * M * M; } - phi::Allocator::AllocationPtr gpu_a_ptrs_data = phi::memory_utils::Alloc( + Allocator::AllocationPtr gpu_a_ptrs_data = memory_utils::Alloc( dev_ctx.GetPlace(), cpu_a_ptrs.size() * sizeof(T*), phi::Stream(reinterpret_cast(dev_ctx.stream()))); @@ -126,10 +126,10 @@ void TriangularSolveKernel(const Context& dev_ctx, const T** gpu_a_ptrs = reinterpret_cast(gpu_a_ptrs_data->ptr()); - phi::Allocator::AllocationPtr gpu_b_ptrs_data = phi::memory_utils::Alloc( + Allocator::AllocationPtr gpu_b_ptrs_data = memory_utils::Alloc( dev_ctx.GetPlace(), batch_size * sizeof(T*), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + Stream(reinterpret_cast(dev_ctx.stream()))); T** gpu_b_ptrs = reinterpret_cast(gpu_b_ptrs_data->ptr()); for (int64_t i = 0; i < n_chunks; ++i) { @@ -173,11 +173,10 @@ void TriangularSolveKernel(const Context& dev_ctx, cpu_ptrs[i + batch_size] = out_data + i * M * N; } - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - dev_ctx.GetPlace(), - cpu_ptrs.size() * sizeof(T*), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + Allocator::AllocationPtr tmp_gpu_ptrs_data = memory_utils::Alloc( + dev_ctx.GetPlace(), + cpu_ptrs.size() * sizeof(T*), + Stream(reinterpret_cast(dev_ctx.stream()))); size_t nbytes_ptrs = cpu_ptrs.size() * sizeof(T*); const void* stable_ptrs = diff --git a/paddle/phi/kernels/gpu/triu_indices_kernel.cu b/paddle/phi/kernels/gpu/triu_indices_kernel.cu index f9b7ebde2584e4..11b21d4dcf0db7 100644 --- a/paddle/phi/kernels/gpu/triu_indices_kernel.cu +++ b/paddle/phi/kernels/gpu/triu_indices_kernel.cu @@ -116,7 +116,7 @@ void TriuIndicesKernel(const Context& dev_ctx, } // using gpu_launch_config to get grid_size and block_size - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, triu_size); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, triu_size); triu_indices_kernel<< __global__ void TruncGrad(T* dx, int64_t N) { CUDA_KERNEL_LOOP_TYPE(index, N, int64_t) { dx[index] = static_cast(0.0); } diff --git a/paddle/phi/kernels/gpu/trunc_kernel.cu b/paddle/phi/kernels/gpu/trunc_kernel.cu index 8c673ced195bca..416452eaa07ec6 100644 --- a/paddle/phi/kernels/gpu/trunc_kernel.cu +++ b/paddle/phi/kernels/gpu/trunc_kernel.cu @@ -22,14 +22,12 
@@ namespace phi { -using phi::PADDLE_CUDA_NUM_THREADS; - template class TruncFunctor { public: __device__ TruncFunctor(const T x) : x_(x) {} __device__ T operator()() { - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; return static_cast(trunc(static_cast(x_))); } @@ -76,7 +74,7 @@ void TruncKernel(const Context& dev_ctx, } int64_t numel = x.numel(); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel); Trunc<<>>( x_data, out_data, numel); diff --git a/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu b/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu index c1ed70d206072a..352f5a6c0b6188 100644 --- a/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu +++ b/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu @@ -106,7 +106,7 @@ void TruncatedGaussianRandomKernel(const Context& dev_ctx, DenseTensor* out) { T* data = dev_ctx.template Alloc(out); - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; thrust::counting_iterator index_sequence_begin(0); int64_t size = out->numel(); diff --git a/paddle/phi/kernels/gpu/uniform_inplace_kernel.cu b/paddle/phi/kernels/gpu/uniform_inplace_kernel.cu index c4fe15f788ac73..8d58fb0c2ffb5e 100644 --- a/paddle/phi/kernels/gpu/uniform_inplace_kernel.cu +++ b/paddle/phi/kernels/gpu/uniform_inplace_kernel.cu @@ -67,7 +67,7 @@ void UniformInplaceKernel(const Context& dev_ctx, dev_ctx.template Alloc(out); if (seed == 0) { // Use global Generator seed - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; funcs::uniform_distribution dist; funcs::uniform_real_transform trans(min, max); funcs::distribution_and_transform(dev_ctx, out, dist, trans); diff --git a/paddle/phi/kernels/gpu/uniform_kernel.cu b/paddle/phi/kernels/gpu/uniform_kernel.cu index 570130d0bed361..cda9faaca18a8f 100644 --- a/paddle/phi/kernels/gpu/uniform_kernel.cu +++ b/paddle/phi/kernels/gpu/uniform_kernel.cu @@ -66,7 +66,7 @@ struct UniformKernelImpl { const Scalar& max, int seed, DenseTensor* out) { - using RealType = phi::dtype::Real; + using RealType = dtype::Real; RealType min_val = min.to(); RealType max_val = max.to(); @@ -88,13 +88,13 @@ struct UniformKernelImpl { }; template -struct UniformKernelImpl, Context, true> { +struct UniformKernelImpl, Context, true> { static void Apply(const Context& dev_ctx, const Scalar& min, const Scalar& max, int seed, DenseTensor* out) { - using T = phi::dtype::complex; + using T = dtype::complex; using RealType = float; RealType min_val = min.to(); RealType max_val = max.to(); @@ -122,13 +122,13 @@ struct UniformKernelImpl, Context, true> { }; template -struct UniformKernelImpl, Context, true> { +struct UniformKernelImpl, Context, true> { static void Apply(const Context& dev_ctx, const Scalar& min, const Scalar& max, int seed, DenseTensor* out) { - using T = phi::dtype::complex; + using T = dtype::complex; using RealType = double; RealType min_val = min.to(); RealType max_val = max.to(); @@ -163,7 +163,7 @@ struct UniformKernelImpl { int seed, DenseTensor* out) { if (seed == 0) { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; funcs::uniform_distribution dist; funcs::uniform_real_transform trans(min.to(), max.to()); funcs::distribution_and_transform(dev_ctx, out, dist, trans); @@ -191,9 +191,8 @@ void UniformKernel(const 
Context& dev_ctx, out->Resize(shape.GetData()); dev_ctx.template Alloc(out); - constexpr bool is_complex = - std::is_same>::value || - std::is_same>::value; + constexpr bool is_complex = std::is_same>::value || + std::is_same>::value; UniformKernelImpl::Apply( dev_ctx, min, max, seed, out); diff --git a/paddle/phi/kernels/gpu/unique_consecutive_kernel.cu b/paddle/phi/kernels/gpu/unique_consecutive_kernel.cu index 286f4e1dbb36ef..06e2610e81ed0d 100644 --- a/paddle/phi/kernels/gpu/unique_consecutive_kernel.cu +++ b/paddle/phi/kernels/gpu/unique_consecutive_kernel.cu @@ -46,7 +46,7 @@ void UniqueConsecutiveKernel(const Context& dev_ctx, // if 'axis' is not required, flatten the Tensor. if (axis.empty()) { - phi::VisitDataTypeTiny( + VisitDataTypeTiny( dtype, UniqueConsecutiveFlattenedCUDAFunctor( dev_ctx, x, out, return_inverse, return_counts, index, counts)); @@ -54,7 +54,7 @@ void UniqueConsecutiveKernel(const Context& dev_ctx, // 'axis' is required. int valid_axis = axis[0]; if (valid_axis < 0) valid_axis += x.dims().size(); - phi::VisitDataTypeTiny( + VisitDataTypeTiny( dtype, UniqueConsecutiveDimsCUDAFunctor(dev_ctx, x, diff --git a/paddle/phi/kernels/gpu/unique_kernel.cu b/paddle/phi/kernels/gpu/unique_kernel.cu index b62eb522d9b603..9c502239f3e31c 100644 --- a/paddle/phi/kernels/gpu/unique_kernel.cu +++ b/paddle/phi/kernels/gpu/unique_kernel.cu @@ -102,8 +102,8 @@ struct BinaryNotEqual { // The core logic of computing Unique for a flattened DenseTensor template -static typename std::enable_if::value && - !std::is_same::value>::type +static typename std::enable_if::value && + !std::is_same::value>::type UniqueFlattenedCUDATensor(const Context& dev_ctx, const DenseTensor& in, DenseTensor* out, @@ -129,8 +129,8 @@ UniqueFlattenedCUDATensor(const Context& dev_ctx, auto* indices_data = dev_ctx.template Alloc(indices); #ifdef PADDLE_WITH_CUDA - phi::memory_utils::ThrustAllocator allocator(dev_ctx.GetPlace(), - dev_ctx.stream()); + memory_utils::ThrustAllocator allocator(dev_ctx.GetPlace(), + dev_ctx.stream()); const auto& exec_policy = thrust::cuda::par(allocator).on(dev_ctx.stream()); #else const auto& exec_policy = thrust::hip::par.on(dev_ctx.stream()); @@ -183,7 +183,7 @@ UniqueFlattenedCUDATensor(const Context& dev_ctx, num_input, dev_ctx.stream()); auto d_temp_storage = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), temp_storage_bytes); + memory_utils::Alloc(dev_ctx.GetPlace(), temp_storage_bytes); cub::DeviceScan::InclusiveSum(d_temp_storage->ptr(), temp_storage_bytes, inv_loc_data_ptr, @@ -237,8 +237,8 @@ UniqueFlattenedCUDATensor(const Context& dev_ctx, // The core logic of computing Unique for a flattened DenseTensor template -static typename std::enable_if::value || - std::is_same::value>::type +static typename std::enable_if::value || + std::is_same::value>::type UniqueFlattenedCUDATensor(const Context& dev_ctx, const DenseTensor& in, DenseTensor* out, @@ -266,8 +266,8 @@ UniqueFlattenedCUDATensor(const Context& dev_ctx, auto* indices_data = dev_ctx.template Alloc(indices); #ifdef PADDLE_WITH_CUDA - phi::memory_utils::ThrustAllocator allocator(dev_ctx.GetPlace(), - dev_ctx.stream()); + memory_utils::ThrustAllocator allocator(dev_ctx.GetPlace(), + dev_ctx.stream()); const auto& exec_policy = thrust::cuda::par(allocator).on(dev_ctx.stream()); #else const auto& exec_policy = thrust::hip::par.on(dev_ctx.stream()); @@ -319,7 +319,7 @@ UniqueFlattenedCUDATensor(const Context& dev_ctx, indices->Resize({num_out}); out->Resize({num_out}); dev_ctx.template Alloc(out); - 
phi::IndexSelectKernel(dev_ctx, in_resize, *indices, 0, out); + IndexSelectKernel(dev_ctx, in_resize, *indices, 0, out); // 4. Calculate 'counts' if (return_counts) { @@ -356,8 +356,8 @@ static void ComputeUniqueDims(const Context& dev_ctx, not_equal_T not_equal, int64_t row) { #ifdef PADDLE_WITH_CUDA - phi::memory_utils::ThrustAllocator allocator(dev_ctx.GetPlace(), - dev_ctx.stream()); + memory_utils::ThrustAllocator allocator(dev_ctx.GetPlace(), + dev_ctx.stream()); const auto& exec_policy = thrust::cuda::par(allocator).on(dev_ctx.stream()); #else const auto& exec_policy = thrust::hip::par.on(dev_ctx.stream()); @@ -468,8 +468,8 @@ static void UniqueDimsCUDATensor(const Context& dev_ctx, // 2. Calculate 'indices', 'inverse', 'counts' // Init index and sort #ifdef PADDLE_WITH_CUDA - phi::memory_utils::ThrustAllocator allocator(dev_ctx.GetPlace(), - dev_ctx.stream()); + memory_utils::ThrustAllocator allocator(dev_ctx.GetPlace(), + dev_ctx.stream()); const auto& exec_policy = thrust::cuda::par(allocator).on(dev_ctx.stream()); #else const auto& exec_policy = thrust::hip::par.on(dev_ctx.stream()); @@ -501,8 +501,7 @@ static void UniqueDimsCUDATensor(const Context& dev_ctx, out_trans.Resize(out_trans_dims_vec); dev_ctx.template Alloc(&out_trans); - phi::IndexSelectKernel( - dev_ctx, in_trans, *indices, 0, &out_trans); + IndexSelectKernel(dev_ctx, in_trans, *indices, 0, &out_trans); std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]); out->Resize(out_trans_dims_vec); @@ -513,7 +512,7 @@ static void UniqueDimsCUDATensor(const Context& dev_ctx, out->Resize(out_trans_dims_vec); dev_ctx.template Alloc(out); - phi::IndexSelectKernel(dev_ctx, in_trans, *indices, 0, out); + IndexSelectKernel(dev_ctx, in_trans, *indices, 0, out); } } @@ -639,32 +638,31 @@ void UniqueRawKernel(const Context& dev_ctx, } // if 'axis' is not required, flatten the DenseTensor. if (axis.empty()) { - phi::VisitDataTypeTiny( - dtype, - UniqueFlattenedCUDAFunctor(dev_ctx, - x, - out, - indices, - index, - counts, - return_index, - return_inverse, - return_counts)); - } else { - // 'axis' is required. - int axis_value = axis[0]; - axis_value = (axis_value == -1) ? (x.dims().size() - 1) : axis_value; - phi::VisitDataTypeTiny(dtype, - UniqueDimsCUDAFunctor(dev_ctx, + VisitDataTypeTiny(dtype, + UniqueFlattenedCUDAFunctor(dev_ctx, x, out, indices, index, counts, - axis_value, return_index, return_inverse, return_counts)); + } else { + // 'axis' is required. + int axis_value = axis[0]; + axis_value = (axis_value == -1) ? 
(x.dims().size() - 1) : axis_value; + VisitDataTypeTiny(dtype, + UniqueDimsCUDAFunctor(dev_ctx, + x, + out, + indices, + index, + counts, + axis_value, + return_index, + return_inverse, + return_counts)); } } diff --git a/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu b/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu index cd8fcb2354c32e..e83ed1a3d94680 100644 --- a/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu +++ b/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu @@ -297,45 +297,40 @@ void ViterbiDecodeKernel(const Context& dev_ctx, auto logit0 = input_exp.Slice(0, 1); logit0.Resize({batch_size, n_labels}); if (include_bos_eos_tag) { - phi::AddKernel(dev_ctx, logit0, start_trans, &alpha); + AddKernel(dev_ctx, logit0, start_trans, &alpha); GetMask()( dev_ctx, left_length, one, &float_mask); - phi::MultiplyKernel( - dev_ctx, stop_trans, float_mask, &alpha_nxt); - phi::AddKernel(dev_ctx, alpha, alpha_nxt, &alpha); + MultiplyKernel(dev_ctx, stop_trans, float_mask, &alpha_nxt); + AddKernel(dev_ctx, alpha, alpha_nxt, &alpha); } else { alpha = logit0; } - phi::SubtractKernel( - dev_ctx, left_length, one, &left_length); + SubtractKernel(dev_ctx, left_length, one, &left_length); Argmax argmax; for (int64_t i = 1; i < max_seq_len; ++i) { DenseTensor logit = input_exp.Slice(i, i + 1); logit.Resize({batch_size, n_labels}); DenseTensor& alpha_exp = alpha.Resize({batch_size, n_labels, 1}); - phi::AddKernel(dev_ctx, alpha_exp, trans_exp, &alpha_trn_sum); + AddKernel(dev_ctx, alpha_exp, trans_exp, &alpha_trn_sum); auto alpha_argmax_temp = alpha_argmax_unbind[i - 1]; alpha_argmax_temp.Resize({batch_size, n_labels}); argmax(dev_ctx, alpha_trn_sum, &alpha_argmax_temp, &alpha_max, 1); historys.emplace_back(alpha_argmax_temp); - phi::AddKernel(dev_ctx, alpha_max, logit, &alpha_nxt); + AddKernel(dev_ctx, alpha_max, logit, &alpha_nxt); alpha.Resize({batch_size, n_labels}); GetMask()( dev_ctx, left_length, zero, &float_mask); - phi::MultiplyKernel(dev_ctx, alpha_nxt, float_mask, &alpha_nxt); - phi::SubtractKernel( - dev_ctx, float_one, float_mask, &float_mask); - phi::MultiplyKernel(dev_ctx, alpha, float_mask, &alpha); - phi::AddKernel(dev_ctx, alpha, alpha_nxt, &alpha); + MultiplyKernel(dev_ctx, alpha_nxt, float_mask, &alpha_nxt); + SubtractKernel(dev_ctx, float_one, float_mask, &float_mask); + MultiplyKernel(dev_ctx, alpha, float_mask, &alpha); + AddKernel(dev_ctx, alpha, alpha_nxt, &alpha); if (include_bos_eos_tag) { GetMask()( dev_ctx, left_length, one, &float_mask); - phi::MultiplyKernel( - dev_ctx, stop_trans, float_mask, &alpha_nxt); - phi::AddKernel(dev_ctx, alpha, alpha_nxt, &alpha); + MultiplyKernel(dev_ctx, stop_trans, float_mask, &alpha_nxt); + AddKernel(dev_ctx, alpha, alpha_nxt, &alpha); } - phi::SubtractKernel( - dev_ctx, left_length, one, &left_length); + SubtractKernel(dev_ctx, left_length, one, &left_length); } argmax(dev_ctx, alpha, &last_ids, scores, 1); left_length.Resize({batch_size}); @@ -344,7 +339,7 @@ void ViterbiDecodeKernel(const Context& dev_ctx, // last_ids_update = last_ids * tag_mask int last_ids_index = 1; int actual_len = (std::min)(seq_len, static_cast(max_seq_len)); - phi::MultiplyKernel( + MultiplyKernel( dev_ctx, last_ids, int_mask, &batch_path[actual_len - last_ids_index]); // The algorithm below can refer to // https://github.com/PaddlePaddle/PaddleNLP/blob/develop/paddlenlp/layers/crf.py#L438 @@ -353,32 +348,29 @@ void ViterbiDecodeKernel(const Context& dev_ctx, Gather gather; for (auto hist = historys.rbegin(); hist != historys.rend(); ++hist) { ++last_ids_index; - 
phi::AddKernel(dev_ctx, left_length, one, &left_length); - phi::AddKernel( - dev_ctx, batch_offset, last_ids, &gather_idx); + AddKernel(dev_ctx, left_length, one, &left_length); + AddKernel(dev_ctx, batch_offset, last_ids, &gather_idx); DenseTensor& last_ids_update = batch_path[actual_len - last_ids_index]; hist->Resize({batch_size * n_labels}); gather(dev_ctx, *hist, gather_idx, &last_ids_update); GetMask()( dev_ctx, left_length, zero, &int_mask); - phi::MultiplyKernel( + MultiplyKernel( dev_ctx, last_ids_update, int_mask, &last_ids_update); GetMask()( dev_ctx, left_length, zero, &zero_len_mask); - phi::MultiplyKernel( + MultiplyKernel( dev_ctx, last_ids, zero_len_mask, &last_ids_tmp); - phi::SubtractKernel( + SubtractKernel( dev_ctx, one, zero_len_mask, &zero_len_mask); - phi::MultiplyKernel( + MultiplyKernel( dev_ctx, last_ids_update, zero_len_mask, &last_ids_update); - phi::AddKernel( + AddKernel( dev_ctx, last_ids_update, last_ids_tmp, &last_ids_update); GetMask()( dev_ctx, left_length, zero, &int_mask); - phi::MultiplyKernel( - dev_ctx, last_ids, int_mask, &last_ids); - phi::AddKernel( - dev_ctx, last_ids_update, last_ids, &last_ids); + MultiplyKernel(dev_ctx, last_ids, int_mask, &last_ids); + AddKernel(dev_ctx, last_ids_update, last_ids, &last_ids); } TransposeKernel(dev_ctx, tpath, {1, 0}, path); } diff --git a/paddle/phi/kernels/gpu/weighted_sample_neighbors_kernel.cu b/paddle/phi/kernels/gpu/weighted_sample_neighbors_kernel.cu index bfeda2d5249b01..59c9b027adf556 100644 --- a/paddle/phi/kernels/gpu/weighted_sample_neighbors_kernel.cu +++ b/paddle/phi/kernels/gpu/weighted_sample_neighbors_kernel.cu @@ -345,19 +345,19 @@ void WeightedSampleNeighborsKernel(const Context& dev_ctx, int* out_count_data = dev_ctx.template Alloc(out_count); // finally copy sample_count int* neighbor_count_ptr = nullptr; - std::shared_ptr neighbor_count; - auto sample_count = phi::memory_utils::Alloc( - dev_ctx.GetPlace(), - (bs + 1) * sizeof(int), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + std::shared_ptr neighbor_count; + auto sample_count = + memory_utils::Alloc(dev_ctx.GetPlace(), + (bs + 1) * sizeof(int), + Stream(reinterpret_cast(dev_ctx.stream()))); int* sample_count_ptr = reinterpret_cast(sample_count->ptr()); int grid_size = (bs + 127) / 128; if (need_neighbor_count) { - neighbor_count = phi::memory_utils::AllocShared( + neighbor_count = memory_utils::AllocShared( dev_ctx.GetPlace(), (bs + 1) * sizeof(int), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + Stream(reinterpret_cast(dev_ctx.stream()))); neighbor_count_ptr = reinterpret_cast(neighbor_count->ptr()); GetSampleCountAndNeighborCountKernel <<>>(col_ptr_data, @@ -372,10 +372,10 @@ void WeightedSampleNeighborsKernel(const Context& dev_ctx, col_ptr_data, x_data, sample_count_ptr, nullptr, sample_size, bs); } - auto sample_offset = phi::memory_utils::Alloc( - dev_ctx.GetPlace(), - (bs + 1) * sizeof(int), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + auto sample_offset = + memory_utils::Alloc(dev_ctx.GetPlace(), + (bs + 1) * sizeof(int), + Stream(reinterpret_cast(dev_ctx.stream()))); int* sample_offset_ptr = reinterpret_cast(sample_offset->ptr()); #ifdef PADDLE_WITH_CUDA @@ -438,10 +438,10 @@ void WeightedSampleNeighborsKernel(const Context& dev_ctx, dev_ctx.stream()); cudaStreamSynchronize(dev_ctx.stream()); - auto tmh_weights = phi::memory_utils::Alloc( + auto tmh_weights = memory_utils::Alloc( dev_ctx.GetPlace(), target_neighbor_counts * sizeof(float), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + 
Stream(reinterpret_cast(dev_ctx.stream()))); float* target_weights_keys_buf_ptr = reinterpret_cast(tmh_weights->ptr()); constexpr int BLOCK_SIZE = 256;
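
The common thread through these hunks is dropping the redundant `phi::` qualifier from names that are already looked up from inside `namespace phi`. The following minimal, self-contained sketch shows why the two spellings resolve to the same entity; the `MPTypeTrait` here is a stand-in declaration for illustration only, not the real Paddle trait.

namespace phi {
namespace dtype {
// Stand-in for phi::dtype::MPTypeTrait; the real trait lives in Paddle headers.
template <typename T>
struct MPTypeTrait {
  using Type = T;
};
}  // namespace dtype

template <typename T>
void Kernel() {
  // Before: using MT = typename phi::dtype::MPTypeTrait<T>::Type;
  // After (equivalent, since unqualified lookup starts in namespace phi):
  using MT = typename dtype::MPTypeTrait<T>::Type;
  static_cast<void>(sizeof(MT));
}
}  // namespace phi

int main() { phi::Kernel<float>(); }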
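
The slogdeterminant hunk's non-cuSOLVER fallback computes, per batch entry, the determinant of a complex matrix with Eigen and splits it into a unit-modulus sign and a real log-absolute-determinant. A single-matrix sketch of that math, without the batching or the Paddle tensor plumbing:

#include <Eigen/Dense>
#include <cmath>
#include <complex>
#include <iostream>

int main() {
  using Mat = Eigen::Matrix<std::complex<double>, Eigen::Dynamic, Eigen::Dynamic>;
  Mat m(2, 2);
  m << std::complex<double>(1, 2), std::complex<double>(0, 1),
       std::complex<double>(3, 0), std::complex<double>(1, -1);

  // sign = det / |det| (a complex number on the unit circle),
  // logabsdet = log|det|; undefined when det == 0.
  std::complex<double> det = m.determinant();
  double abs_det = std::abs(det);
  std::complex<double> sign = det / abs_det;
  double logabsdet = std::log(abs_det);

  std::cout << "sign=" << sign << " logabsdet=" << logabsdet << "\n";
}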
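
Several of the functors touched above (the soft_relu pair, TruncFunctor, the uniform and truncated-gaussian kernels) share one mixed-precision pattern: pick a wider accumulation type via `MPTypeTrait`, do the arithmetic in that type, and cast back to the storage type only on return. A hedged host-side sketch follows; `MPTypeTraitSketch` is a placeholder for the real trait (which widens float16/bfloat16 to float), and the formula assumes the usual soft_relu definition log(1 + exp(clip(x, -threshold, threshold))).

#include <cmath>
#include <cstdio>

// Placeholder trait: real code would map half-precision types to float here.
template <typename T> struct MPTypeTraitSketch { using Type = T; };

template <typename T>
struct SoftReluSketch {
  using MPType = typename MPTypeTraitSketch<T>::Type;
  MPType one = static_cast<MPType>(1.0f);
  float threshold = 40.0f;

  T operator()(T x) const {
    // Promote to the accumulation type, clip, evaluate log(1 + exp(.)) there,
    // and narrow back to the storage type only at the end.
    MPType v = static_cast<MPType>(x);
    MPType t = static_cast<MPType>(threshold);
    MPType clipped = v < -t ? -t : (v > t ? t : v);
    return static_cast<T>(std::log(one + std::exp(clipped)));
  }
};

int main() {
  SoftReluSketch<float> f;
  std::printf("%f\n", static_cast<double>(f(0.5f)));
}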
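
The strided-copy and SwiGLU hunks size their vectorized memory accesses the same way: start from a maximum width, clamp it by what every pointer involved can support, then halve it until it divides the element count so the kernel needs no scalar tail. The sketch below follows that selection logic; `GetVectorizedSizeApprox` approximates `phi::GetVectorizedSize` with a plain pointer-alignment check and may differ from the real helper.

#include <algorithm>
#include <cstdint>

// Widest power-of-two vector width (in elements) the pointer's alignment allows.
template <typename T>
int GetVectorizedSizeApprox(const T* ptr) {
  const auto addr = reinterpret_cast<std::uintptr_t>(ptr);
  for (int width = 8; width > 1; width /= 2) {
    if (addr % (width * sizeof(T)) == 0) return width;
  }
  return 1;
}

template <typename T>
int ChooseVecSize(const T* in, const T* out, std::int64_t numel) {
  int vec_size = 8;  // upper bound used by the kernels in this diff
  vec_size = std::min(vec_size, GetVectorizedSizeApprox(in));
  vec_size = std::min(vec_size, GetVectorizedSizeApprox(out));
  // Shrink until numel is an exact multiple, so no tail handling is needed.
  while (vec_size > 1 && numel % vec_size != 0) {
    vec_size /= 2;
  }
  return vec_size;
}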