6 changes: 3 additions & 3 deletions paddle/phi/kernels/fusion/gpu/attn_gemm_int8.h
@@ -26,7 +26,7 @@
 namespace phi {
 namespace fusion {
 
-using phi::backends::gpu::GpuLaunchConfig;
+using backends::gpu::GpuLaunchConfig;
 
 template <typename T>
 class AttnMatmulINT8 {
@@ -37,8 +37,8 @@ class AttnMatmulINT8 {
     auto helper = std::make_shared<phi::CublasLtHelper>(
         m, k, n, dev_ctx.cublaslt_handle());
     helpers_.emplace_back(helper);
-    gpu_config_ = std::make_unique<GpuLaunchConfig>(
-        phi::backends::gpu::GetGpuLaunchConfig1D(
+    gpu_config_ =
+        std::make_unique<GpuLaunchConfig>(backends::gpu::GetGpuLaunchConfig1D(
             dev_ctx, m * n, DequantKernelVecSize));
   }
   ~AttnMatmulINT8() {}
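The pattern in this and the following files relies on ordinary C++ name lookup: these headers already live inside namespace phi, so the leading phi:: qualifier is redundant and backends::gpu::X names exactly the same entity. A minimal, self-contained sketch of that rule (the helper below is a stand-in, not Paddle's real function):

#include <iostream>

namespace phi {
namespace backends {
namespace gpu {
inline int GetCurrentDeviceId() { return 0; }  // stand-in for the real helper
}  // namespace gpu
}  // namespace backends

namespace fusion {
inline void Demo() {
  // Inside namespace phi, both spellings resolve to the same function.
  int a = phi::backends::gpu::GetCurrentDeviceId();  // fully qualified
  int b = backends::gpu::GetCurrentDeviceId();       // spelling used after this change
  std::cout << (a == b) << "\n";                     // prints 1
}
}  // namespace fusion
}  // namespace phi

int main() { phi::fusion::Demo(); }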
6 changes: 3 additions & 3 deletions paddle/phi/kernels/fusion/gpu/block_attn.h
@@ -1812,10 +1812,10 @@ inline GPU(Error_t) GetNumBlocks(int64_t n, int *num_blocks) {
   constexpr int kBlockSize = 128;
   constexpr int kNumWaves = 16;
 
-  const int device_id = phi::backends::gpu::GetCurrentDeviceId();
-  const int sm_count = phi::backends::gpu::GetGPUMultiProcessors(device_id);
+  const int device_id = backends::gpu::GetCurrentDeviceId();
+  const int sm_count = backends::gpu::GetGPUMultiProcessors(device_id);
   const int max_thread_per_multiprocessor =
-      phi::backends::gpu::GetGPUMaxThreadsPerMultiProcessor(device_id);
+      backends::gpu::GetGPUMaxThreadsPerMultiProcessor(device_id);
 
   *num_blocks =
       std::max<int>(1,
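The hunk is cut off before the final expression, but kBlockSize and kNumWaves indicate the usual "a few waves of resident blocks" grid-size heuristic: launch enough 128-thread blocks to cover n elements, capped at what the SMs can keep resident for kNumWaves waves. A hedged sketch of that heuristic (an approximation for illustration, not the exact truncated Paddle expression):

#include <algorithm>
#include <cstdint>

// Sketch: pick a grid size that covers `n` elements with 128-thread blocks,
// capped at roughly kNumWaves "waves" of blocks resident across the SMs.
inline int NumBlocksSketch(int64_t n, int sm_count, int max_threads_per_sm) {
  constexpr int kBlockSize = 128;
  constexpr int kNumWaves = 16;
  const int64_t blocks_to_cover_n = (n + kBlockSize - 1) / kBlockSize;
  const int64_t resident_cap = static_cast<int64_t>(sm_count) *
                               max_threads_per_sm / kBlockSize * kNumWaves;
  return static_cast<int>(
      std::max<int64_t>(1, std::min(blocks_to_cover_n, resident_cap)));
}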
@@ -22,13 +22,6 @@
 #include "paddle/phi/kernels/gpu/flash_attn_utils.h"
 #include "paddle/utils/none.h"
 
-inline int getSMVersion() {
-  const int device = phi::backends::gpu::GetCurrentDeviceId();
-  const phi::gpuDeviceProp prop =
-      phi::backends::gpu::GetDeviceProperties(device);
-  return prop.major * 10 + prop.minor;
-}
-
 #if defined(__CUDACC__) && CUDA_VERSION >= 11000
 #define CUDA_BFLOAT16_AVAILABLE
 #include <cuda_bf16.h>
@@ -37,6 +30,12 @@ inline int getSMVersion() {
 namespace phi {
 namespace fusion {
 
+inline int getSMVersion() {
+  const int device = backends::gpu::GetCurrentDeviceId();
+  const phi::gpuDeviceProp prop = backends::gpu::GetDeviceProperties(device);
+  return prop.major * 10 + prop.minor;
+}
+
 int GetMaxLen(const GPUContext& dev_ctx,
               const DenseTensor& seq_lens_tensor,
               DenseTensor* max_len_tensor,
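The relocated getSMVersion() helper encodes the GPU's compute capability as major * 10 + minor (for example 80 on A100, 90 on H100). Outside Paddle, the same value can be read straight from the CUDA runtime; a minimal sketch assuming a CUDA toolchain (error handling omitted):

#include <cuda_runtime.h>
#include <cstdio>

// Returns compute capability as major*10 + minor, e.g. 70, 80, 90.
int GetSMVersionSketch() {
  int device = 0;
  cudaGetDevice(&device);
  cudaDeviceProp prop{};
  cudaGetDeviceProperties(&prop, device);
  return prop.major * 10 + prop.minor;
}

int main() {
  std::printf("SM version: %d\n", GetSMVersionSketch());
  return 0;
}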
2 changes: 1 addition & 1 deletion paddle/phi/kernels/fusion/gpu/cast_with_ptr.h
@@ -31,7 +31,7 @@ static void VecCastKernel(const GPUContext &dev_ctx,
                           const InT *x,
                           OutT *y,
                           size_t n) {
-  auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n, VecSize);
+  auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n, VecSize);
   auto block = config.GetGridSize();
   auto thread = config.GetBlockSize();
   auto main_offset = n / (VecSize * thread) * VecSize * thread;
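For context on the surrounding code: the main_offset line is the standard split of an n-element range into a vectorized main body (a multiple of VecSize * threads) and a scalar tail. A host-side sketch of the same arithmetic with illustrative numbers (the constants are not Paddle's):

#include <cstddef>
#include <cstdio>

int main() {
  constexpr std::size_t kVecSize = 4;  // elements handled per vectorized access
  const std::size_t threads = 256;     // threads per block (illustrative)
  const std::size_t n = 10000;         // total elements

  // Largest multiple of kVecSize * threads that fits in n: handled vectorized.
  const std::size_t main_offset = n / (kVecSize * threads) * kVecSize * threads;
  const std::size_t tail = n - main_offset;  // leftover elements, handled scalar

  std::printf("main_offset = %zu, tail = %zu\n", main_offset, tail);  // 9216, 784
  return 0;
}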
11 changes: 5 additions & 6 deletions paddle/phi/kernels/fusion/gpu/cudnn_bn_stats_finalize.cu.h
@@ -22,16 +22,15 @@ namespace fusion {
 
 template <typename T>
 using BatchNormParamType =
-    typename phi::backends::gpu::CudnnDataType<T>::BatchNormParamType;
+    typename backends::gpu::CudnnDataType<T>::BatchNormParamType;
 
 #if CUDNN_VERSION >= 8000
 
 template <typename T>
 struct BNStatsFinalizeArgs {
   BNStatsFinalizeArgs() {
-    dtype = phi::backends::gpu::CudnnDataType<T>::type;
-    param_dtype =
-        phi::backends::gpu::CudnnDataType<BatchNormParamType<T>>::type;
+    dtype = backends::gpu::CudnnDataType<T>::type;
+    param_dtype = backends::gpu::CudnnDataType<BatchNormParamType<T>>::type;
     format = CUDNN_TENSOR_NHWC;
   }
 
@@ -53,8 +52,8 @@ struct BNStatsFinalizeArgs {
   cudnnDataType_t param_dtype;
   cudnnTensorFormat_t format;
 
-  phi::backends::gpu::TensorDescriptor in_desc;
-  phi::backends::gpu::TensorDescriptor out_desc;
+  backends::gpu::TensorDescriptor in_desc;
+  backends::gpu::TensorDescriptor out_desc;
 };
 
 template <typename T>
16 changes: 8 additions & 8 deletions paddle/phi/kernels/fusion/gpu/cudnn_norm_conv.cu.h
@@ -23,7 +23,7 @@ namespace dynload = phi::dynload;
 
 template <typename T>
 using ScalingParamType =
-    typename phi::backends::gpu::CudnnDataType<T>::ScalingParamType;
+    typename backends::gpu::CudnnDataType<T>::ScalingParamType;
 
 #if CUDNN_VERSION >= 8000
 
@@ -32,9 +32,9 @@ static size_t RoundUp(int64_t a, int64_t b) { return (a + b - 1) / b * b; }
 template <typename T>
 struct NormConvolutionArgs {
   NormConvolutionArgs() {
-    dtype = phi::backends::gpu::CudnnDataType<T>::type;
+    dtype = backends::gpu::CudnnDataType<T>::type;
     format = CUDNN_TENSOR_NHWC;
-    compute_type = phi::backends::gpu::CudnnDataType<float>::type;
+    compute_type = backends::gpu::CudnnDataType<float>::type;
   }
 
   void Set(const GPUContext &dev_ctx,
@@ -163,11 +163,11 @@ struct NormConvolutionArgs {
   std::vector<int> paddings;
   std::vector<int> dilations;
 
-  phi::backends::gpu::TensorDescriptor in_desc;
-  phi::backends::gpu::FilterDescriptor filter_desc;
-  phi::backends::gpu::TensorDescriptor out_desc;
-  phi::backends::gpu::TensorDescriptor out_stats_desc;
-  phi::backends::gpu::ConvolutionDescriptor conv_desc;
+  backends::gpu::TensorDescriptor in_desc;
+  backends::gpu::FilterDescriptor filter_desc;
+  backends::gpu::TensorDescriptor out_desc;
+  backends::gpu::TensorDescriptor out_stats_desc;
+  backends::gpu::ConvolutionDescriptor conv_desc;
 
   bool is_support;
 };
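The RoundUp helper visible in the second hunk header ((a + b - 1) / b * b) is the usual round-up-to-a-multiple idiom for integer sizes; a tiny self-contained check of what it produces:

#include <cassert>
#include <cstddef>
#include <cstdint>

static size_t RoundUp(int64_t a, int64_t b) { return (a + b - 1) / b * b; }

int main() {
  assert(RoundUp(10, 8) == 16);    // 10 rounded up to a multiple of 8
  assert(RoundUp(16, 8) == 16);    // already aligned values are unchanged
  assert(RoundUp(1, 256) == 256);  // typical workspace-size alignment
  return 0;
}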
21 changes: 10 additions & 11 deletions paddle/phi/kernels/fusion/gpu/cudnn_scale_bias_add_relu.cu.h
@@ -20,20 +20,19 @@
 namespace phi {
 namespace fusion {
 template <typename T>
-using CudnnDataType = phi::backends::gpu::CudnnDataType<T>;
+using CudnnDataType = backends::gpu::CudnnDataType<T>;
 namespace dynload = phi::dynload;
 template <typename T>
 using BatchNormParamType =
-    typename phi::backends::gpu::CudnnDataType<T>::BatchNormParamType;
+    typename backends::gpu::CudnnDataType<T>::BatchNormParamType;
 
 #if CUDNN_VERSION >= 8000
 
 template <typename T>
 struct ScaleBiasAddReluArgs {
   ScaleBiasAddReluArgs() {
-    dtype = phi::backends::gpu::CudnnDataType<T>::type;
-    param_dtype =
-        phi::backends::gpu::CudnnDataType<BatchNormParamType<T>>::type;
+    dtype = backends::gpu::CudnnDataType<T>::type;
+    param_dtype = backends::gpu::CudnnDataType<BatchNormParamType<T>>::type;
     format = CUDNN_TENSOR_NHWC;
   }
 
@@ -89,12 +88,12 @@ struct ScaleBiasAddReluArgs {
   cudnnDataType_t param_dtype;
   cudnnTensorFormat_t format;
 
-  phi::backends::gpu::TensorDescriptor in_desc;
-  phi::backends::gpu::TensorDescriptor out_desc;
-  phi::backends::gpu::TensorDescriptor equiv_scale_bias_desc;
-  phi::backends::gpu::TensorDescriptor scale_bias_mean_var_desc;
-  phi::backends::gpu::TensorDescriptor bitmask_desc;
-  phi::backends::gpu::ActivationDescriptor activation_desc;
+  backends::gpu::TensorDescriptor in_desc;
+  backends::gpu::TensorDescriptor out_desc;
+  backends::gpu::TensorDescriptor equiv_scale_bias_desc;
+  backends::gpu::TensorDescriptor scale_bias_mean_var_desc;
+  backends::gpu::TensorDescriptor bitmask_desc;
+  backends::gpu::ActivationDescriptor activation_desc;
 };
 
 template <typename T>
@@ -494,7 +494,7 @@ void DistributedFusedLambInitOpKernel(
   VLOG(10) << "rank = " << rank << ", nranks = " << nranks
            << " , alignment = " << alignment;
   if (alignment <= 0) {
-    alignment = phi::backends::gpu::GpuMinChunkSize();
+    alignment = backends::gpu::GpuMinChunkSize();
   }
   PADDLE_ENFORCE_GE(
       alignment,
6 changes: 3 additions & 3 deletions paddle/phi/kernels/fusion/gpu/fused_bias_act_utils.h
@@ -116,10 +116,10 @@ inline gpuError_t GetNumBlocks(int64_t n, int *num_blocks) {
   constexpr int kBlockSize = 128;
   constexpr int kNumWaves = 16;
 
-  const int device_id = phi::backends::gpu::GetCurrentDeviceId();
-  const int sm_count = phi::backends::gpu::GetGPUMultiProcessors(device_id);
+  const int device_id = backends::gpu::GetCurrentDeviceId();
+  const int sm_count = backends::gpu::GetGPUMultiProcessors(device_id);
   const int max_thread_per_multiprocessor =
-      phi::backends::gpu::GetGPUMaxThreadsPerMultiProcessor(device_id);
+      backends::gpu::GetGPUMaxThreadsPerMultiProcessor(device_id);
 
   *num_blocks =
       std::max<int>(1,
@@ -55,7 +55,7 @@ void FusedBatchNormActGradKernel(const Context &dev_ctx,
                                  DenseTensor *bias_grad) {
 // Note(andsonder): Fused bn activation only used in the gpu place.
 #if defined(PADDLE_WITH_CUDA) and CUDNN_VERSION >= 7401
-  using CudnnDataType = phi::backends::gpu::CudnnDataType<T>;
+  using CudnnDataType = backends::gpu::CudnnDataType<T>;
   using BatchNormParamType = typename CudnnDataType::BatchNormParamType;
   double epsilon1 = static_cast<double>(epsilon);
 
@@ -147,7 +147,7 @@ void FusedBatchNormActGradKernel(const Context &dev_ctx,
   DenseTensor workspace_tensor;
   auto reserve_space_size = reserve_space.memory_size();
   cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ACTIVATION;
-  phi::backends::gpu::ScopedActivationDescriptor scope_act_desc;
+  backends::gpu::ScopedActivationDescriptor scope_act_desc;
   cudnnActivationDescriptor_t activation_desc_ =
       scope_act_desc.descriptor<T>(act_type);
   // --------------- cudnn batchnorm workspace ---------------
4 changes: 2 additions & 2 deletions paddle/phi/kernels/fusion/gpu/fused_bn_activation_kernel.cu
@@ -54,7 +54,7 @@ void FusedBatchNormActKernel(const Context &dev_ctx,
                             DenseTensor *reserve_space) {
 // Note(andsonder): Fused bn activation only used in the gpu place.
 #if defined(PADDLE_WITH_CUDA) and CUDNN_VERSION >= 7401
-  using CudnnDataType = phi::backends::gpu::CudnnDataType<T>;
+  using CudnnDataType = backends::gpu::CudnnDataType<T>;
   using BatchNormParamType = typename CudnnDataType::BatchNormParamType;
   double epsilon1 = static_cast<double>(epsilon);
 
@@ -129,7 +129,7 @@ void FusedBatchNormActKernel(const Context &dev_ctx,
 
   double this_factor = 1. - momentum;
   cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ACTIVATION;
-  phi::backends::gpu::ScopedActivationDescriptor scope_act_desc;
+  backends::gpu::ScopedActivationDescriptor scope_act_desc;
   cudnnActivationDescriptor_t activation_desc_ =
       scope_act_desc.descriptor<T>(act_type);
   size_t workspace_size = 0;
@@ -38,7 +38,7 @@ namespace phi {
 namespace fusion {
 
 template <typename T>
-using CudnnDataType = phi::backends::gpu::CudnnDataType<T>;
+using CudnnDataType = backends::gpu::CudnnDataType<T>;
 template <typename T>
 using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
 
@@ -141,7 +141,7 @@ void FusedBatchNormAddActGradKernel(const Context &dev_ctx,
   DenseTensor workspace_tensor;
   auto reserve_space_size = reserve_space_ptr->memory_size();
   cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION;
-  phi::backends::gpu::ScopedActivationDescriptor scope_act_desc;
+  backends::gpu::ScopedActivationDescriptor scope_act_desc;
   cudnnActivationDescriptor_t activation_desc_ =
       scope_act_desc.descriptor<T>(act_type);
   // --------------- cudnn batchnorm workspace ---------------
@@ -38,7 +38,7 @@ namespace phi {
 namespace fusion {
 
 template <typename T>
-using CudnnDataType = phi::backends::gpu::CudnnDataType<T>;
+using CudnnDataType = backends::gpu::CudnnDataType<T>;
 template <typename T>
 using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
 
@@ -112,7 +112,7 @@ void FusedBatchNormAddActKernel(const Context &dev_ctx,
 
   double this_factor = 1. - momentum;
   cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION;
-  phi::backends::gpu::ScopedActivationDescriptor scope_act_desc;
+  backends::gpu::ScopedActivationDescriptor scope_act_desc;
   cudnnActivationDescriptor_t activation_desc_ =
       scope_act_desc.descriptor<T>(act_type);
   size_t workspace_size = 0;
30 changes: 15 additions & 15 deletions paddle/phi/kernels/fusion/gpu/fused_conv2d_add_act_kernel.cu
@@ -47,12 +47,12 @@ class CudnnConvDescManager {
   }
 
   struct CudnnCacheInfo {
-    phi::backends::gpu::TensorDescriptor* x_desc{nullptr};
-    phi::backends::gpu::FilterDescriptor* w_desc{nullptr};
-    phi::backends::gpu::TensorDescriptor* b_desc{nullptr};
-    phi::backends::gpu::TensorDescriptor* o_desc{nullptr};
-    phi::backends::gpu::ConvolutionDescriptor* conv_desc{nullptr};
-    phi::backends::gpu::ActivationDescriptor* act_desc{nullptr};
+    backends::gpu::TensorDescriptor* x_desc{nullptr};
+    backends::gpu::FilterDescriptor* w_desc{nullptr};
+    backends::gpu::TensorDescriptor* b_desc{nullptr};
+    backends::gpu::TensorDescriptor* o_desc{nullptr};
+    backends::gpu::ConvolutionDescriptor* conv_desc{nullptr};
+    backends::gpu::ActivationDescriptor* act_desc{nullptr};
     size_t workspace_size;
     cudnnConvolutionFwdAlgo_t algo;
 
@@ -283,42 +283,42 @@ class CudnnConvDescManager {
   }
 
  private:
-  phi::backends::gpu::TensorDescriptor* GetTensorDescInfo(
+  backends::gpu::TensorDescriptor* GetTensorDescInfo(
       const std::vector<int>& input_dims,
      phi::DataType input_dtype,
       cudnnTensorFormat_t input_format) {
-    auto* desc = new phi::backends::gpu::TensorDescriptor();
+    auto* desc = new backends::gpu::TensorDescriptor();
     desc->set(
         input_dims, input_format, backends::gpu::ToCudnnDataType(input_dtype));
     return desc;
   }
 
-  phi::backends::gpu::FilterDescriptor* GetFilterDescInfo(
+  backends::gpu::FilterDescriptor* GetFilterDescInfo(
       const std::vector<int>& input_dims,
       phi::DataType input_dtype,
       cudnnTensorFormat_t input_format) {
-    auto* desc = new phi::backends::gpu::FilterDescriptor();
+    auto* desc = new backends::gpu::FilterDescriptor();
     desc->set(
         input_dims, input_format, backends::gpu::ToCudnnDataType(input_dtype));
     return desc;
   }
 
-  phi::backends::gpu::ConvolutionDescriptor* GetConvDescInfo(
+  backends::gpu::ConvolutionDescriptor* GetConvDescInfo(
       const std::vector<int>& paddings,
       const std::vector<int>& strides,
      const std::vector<int>& dilations,
       int groups,
       cudnnDataType_t dtype) {
-    auto* desc = new phi::backends::gpu::ConvolutionDescriptor();
+    auto* desc = new backends::gpu::ConvolutionDescriptor();
     desc->set(
         dtype, paddings, strides, dilations, phi::AllowTF32Cudnn(), groups);
     return desc;
   }
 
-  phi::backends::gpu::ActivationDescriptor* GetActivationDescInfo(
+  backends::gpu::ActivationDescriptor* GetActivationDescInfo(
      const std::string& act,
       double value_max = std::numeric_limits<double>::max()) {
-    auto* desc = new phi::backends::gpu::ActivationDescriptor();
+    auto* desc = new backends::gpu::ActivationDescriptor();
     cudnnActivationMode_t mode;
     double relu_ceiling = 0.0;
     if (act == "identity") {
@@ -545,7 +545,7 @@ void FusedConv2dAddActKernel(const Context& dev_ctx,
       conv_attr_cache->dilations,
       transformed_input.dtype(),
       groups,
-      phi::backends::gpu::CudnnDataType<T>::type,
+      backends::gpu::CudnnDataType<T>::type,
       compute_format,
      search_func,
       activation);
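The TensorDescriptor, FilterDescriptor, ConvolutionDescriptor, and ActivationDescriptor types touched throughout this file are thin RAII wrappers over cuDNN descriptor handles. For readers outside the Paddle tree, a rough sketch of what such a wrapper manages via the raw cuDNN API (an illustrative class, not Paddle's implementation; error checking omitted):

#include <cudnn.h>

// Minimal RAII wrapper around a cuDNN tensor descriptor, roughly the kind of
// handle that phi's backends::gpu::TensorDescriptor manages internally.
class TensorDescriptorSketch {
 public:
  TensorDescriptorSketch() { cudnnCreateTensorDescriptor(&desc_); }
  ~TensorDescriptorSketch() { cudnnDestroyTensorDescriptor(desc_); }

  // Describe an NHWC float tensor of shape [n, c, h, w].
  void SetNhwcFloat(int n, int c, int h, int w) {
    cudnnSetTensor4dDescriptor(
        desc_, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, n, c, h, w);
  }

  cudnnTensorDescriptor_t raw() const { return desc_; }

 private:
  cudnnTensorDescriptor_t desc_{nullptr};
};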