diff --git a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu index 15aebaa33d922d..c633015a68835c 100644 --- a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu @@ -39,6 +39,9 @@ static inline LayerNormGadKernelVariant LayerNormGradKernelDispatch( const DenseTensor* scale, const DenseTensor* bias) { #if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(_WIN32) + if (FLAGS_use_accuracy_compatible_kernel) { + return LayerNormGadKernelVariant::GENERIC; + } if (scale != nullptr && bias != nullptr && input_type != paddle::DataType::FLOAT32 && hidden_size != 4096 && hidden_size > 1024 && hidden_size <= 10240 && diff --git a/paddle/phi/kernels/gpu/rms_norm_cuda_kernel.h b/paddle/phi/kernels/gpu/rms_norm_cuda_kernel.h index 987273577ae802..2f8a4b4a91e021 100644 --- a/paddle/phi/kernels/gpu/rms_norm_cuda_kernel.h +++ b/paddle/phi/kernels/gpu/rms_norm_cuda_kernel.h @@ -731,37 +731,6 @@ void LayerNormFwdCompatKernel( auto stream = dev_ctx.stream(); - // if (!FLAGS_use_accuracy_compatible_kernel && rows <= 1024 && - // (cols / rows >= 32)) { - // constexpr int num_vec_elems2 = 8; - // constexpr int alignment2 = num_vec_elems2 * sizeof(T); - // bool can_vec_X2 = can_vectorize(x_data, alignment2); - // bool can_vec_Y2 = can_vectorize(y_data, alignment2); - // bool can_vec_gamma2 = can_vectorize(gamma_data, alignment2); - // bool can_vec_beta2 = can_vectorize(beta_data, alignment2); - // bool is_supported_type2 = (std::is_same::value || - // std::is_same::value); - // if (is_supported_type2 && - // cols <= - // static_cast(1ULL << std::numeric_limits::digits) - // && - // cols % num_vec_elems2 == 0 && can_vec_X2 && can_vec_Y2 && - // can_vec_gamma2 && can_vec_beta2) { - // launch_vectorized_layer_norm_kernel_driver( - // cols, - // rows, - // static_cast(epsilon), - // x_data, - // gamma_data, - // beta_data, - // y_data, - // mean_data, - // var_data, - // stream); - // return; - // } - // } - // Check vectorization conditions for vec_size=4 constexpr int num_vec_elems = 4; constexpr int alignment = num_vec_elems * sizeof(T); @@ -1555,6 +1524,7 @@ __device__ __inline__ void layer_norm_compute_gI(const T* __restrict__ dY, } stats_x1 = BlockReduceSum(stats_x1, buf); + __syncthreads(); stats_x2 = BlockReduceSum(stats_x2, buf); if (threadIdx.x == 0) { buf[0] = stats_x1; @@ -1658,6 +1628,7 @@ __global__ void layer_norm_grad_input_kernel_vectorized( // Reduction in Shared Memory stats_x1 = BlockReduceSum(stats_x1, reduce_buf); + __syncthreads(); stats_x2 = BlockReduceSum(stats_x2, reduce_buf); if (threadIdx.x == 0) { reduce_buf[0] = stats_x1; @@ -2084,12 +2055,7 @@ void LayerNormBwdCompatKernel( constexpr int num_threads = 128; constexpr int nshared = (num_threads / kWarpSize) * sizeof(T_ACC); - if (!FLAGS_use_accuracy_compatible_kernel && is_supported_type2 && - bAlignedBuffers2 && (N % 8 == 0 && M <= 1024 && (N / M >= 32))) { - layer_norm_grad_input_kernel_vectorized - <<>>( - dY_data, X_data, mean_data, rstd_data, gamma_data, dX_data, N); - } else if (is_supported_type && bAlignedBuffers && bVectorSizeMultiple) { + if (is_supported_type && bAlignedBuffers && bVectorSizeMultiple) { layer_norm_grad_input_kernel_vectorized <<>>( dY_data, X_data, mean_data, rstd_data, gamma_data, dX_data, N);