| 1 |
paddle/phi/kernels/fusion/gpu/distributed_fused_lamb_init_kernel.cu |
@Le-soleile #2004 #76694 @YqGe585
|
| 2 |
paddle/phi/kernels/fusion/gpu/fused_bias_act_kernel.cu |
@Le-soleile #75506 #2004
|
| 3 |
paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_grad_kernel.cu |
@wanglezz #75601 #2090
|
| 4 |
paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_kernel.cu |
@wanglezz #75625 #2090
|
| 5 |
paddle/phi/kernels/fusion/gpu/fused_embedding_eltwise_layernorm_kernel.cu |
@wanglezz #75626 #2090
|
| 6 |
paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu |
@WanRui37 #75532
|
| 7 |
paddle/phi/kernels/fusion/gpu/fused_seqpool_cvm_grad_kernel.cu |
@SpongeBob0318 #75531 #75536 #2007 #2008
|
| 8 |
paddle/phi/kernels/fusion/gpu/fused_seqpool_cvm_kernel.cu |
@SpongeBob0318 #75537 #2009
|
| 9 |
paddle/phi/kernels/fusion/gpu/fused_softmax_mask_grad_kernel.cu |
@SpongeBob0318 #75538 #2010
|
| 10 |
paddle/phi/kernels/fusion/gpu/fused_softmax_mask_kernel.cu |
@youge325 #75655 #2072
|
| 11 |
paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_kernel.cu |
@Le-soleile #2209 #2236
|
| 12 |
paddle/phi/kernels/fusion/gpu/fused_stack_transpose_quant_kernel.cu |
@youge325 #75658 #2045
|
| 13 |
paddle/phi/kernels/fusion/gpu/fused_transpose_split_quant_kernel.cu |
@SpongeBob0318 #75539 #2011
|
| 14 |
paddle/phi/kernels/fusion/gpu/fused_transpose_wlch_split_quant_kernel.cu |
@SpongeBob0318 #75540 #2012
|
| 15 |
paddle/phi/kernels/fusion/gpu/fusion_group_kernel.cu |
@SpongeBob0318 #75541 #2013
|
| 16 |
paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu |
@Le-soleile #75706 #2100
|
| 17 |
paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu |
@Le-soleile #75707 #2100
|
| 18 |
paddle/phi/kernels/fusion/gpu/skip_layernorm_kernel.cu |
@SpongeBob0318 #75542 #2014
|
| 19 |
paddle/phi/kernels/gpu/affine_channel_grad_kernel.cu |
@SpongeBob0318 #75543 #2015 #2025 #2029
|
| 20 |
paddle/phi/kernels/gpu/affine_channel_kernel.cu |
@SpongeBob0318 #75545 #2016
|
| 21 |
paddle/phi/kernels/gpu/ap_facade_kernel.cu |
@youge325 #75659 #2046 @Echo-Nie #75657 #2043
|
| 22 |
paddle/phi/kernels/gpu/ap_trivial_fusion_begin_kernel.cu |
@youge325 #75660 #2073
|
| 23 |
paddle/phi/kernels/gpu/ap_trivial_fusion_end_kernel.cu |
@youge325 #75661 #2092
|
| 24 |
paddle/phi/kernels/gpu/ap_variadic_kernel.cu |
@youge325 #75662 #2093
|
| 25 |
paddle/phi/kernels/gpu/argsort_grad_kernel.cu |
@Patrisam #2169
|
| 26 |
paddle/phi/kernels/gpu/barrier_kernel.cu |
@youge325 #75663 #2094
|
| 27 |
paddle/phi/kernels/gpu/bce_loss_grad_kernel.cu |
@Luxorion-12
|
| 28 |
paddle/phi/kernels/gpu/bce_loss_kernel.cu |
@tjujingzong #2123
|
| 29 |
paddle/phi/kernels/gpu/binomial_kernel.cu |
@tjujingzong #2123 #76487
|
| 30 |
paddle/phi/kernels/gpu/bmm_grad_kernel.cu |
@tjujingzong #2123
|
| 31 |
paddle/phi/kernels/gpu/bmm_kernel.cu |
@tjujingzong #2123
|
| 32 |
paddle/phi/kernels/gpu/box_clip_kernel.cu |
@algorithm1832 #75592 #2021
|
| 33 |
paddle/phi/kernels/gpu/c_concat_kernel.cu |
@algorithm1832 #75648 #2052
|
| 34 |
paddle/phi/kernels/gpu/c_embedding_grad_kernel.cu |
@algorithm1832 #2036
|
| 35 |
paddle/phi/kernels/gpu/c_scatter_kernel.cu |
@algorithm1832 #75653 #2059
|
| 36 |
paddle/phi/kernels/gpu/c_softmax_with_cross_entropy_grad_kernel.cu |
@youge325 #75664 #2095
|
| 37 |
paddle/phi/kernels/gpu/cast_kernel.cu |
@Patrisam #2153
|
| 38 |
paddle/phi/kernels/gpu/class_center_sample_kernel.cu |
@Patrisam #76374 #2168
|
| 39 |
paddle/phi/kernels/gpu/collect_fpn_proposals_kernel.cu |
@youge325 #75665 #2096
|
| 40 |
paddle/phi/kernels/gpu/comm_init_all_kernel.cu |
@youge325 #75666 #2097
|
| 41 |
paddle/phi/kernels/gpu/complex_kernel.cu |
@Le-soleile #2209 #2236 #76774
|
| 42 |
paddle/phi/kernels/gpu/correlation_grad_kernel.cu |
@tjujingzong #75633 #2047
|
| 43 |
paddle/phi/kernels/gpu/correlation_kernel.cu |
@youge325 #75667 #2098
|
| 44 |
paddle/phi/kernels/gpu/ctc_align_kernel.cu |
@Le-soleile #2237
|
| 45 |
paddle/phi/kernels/gpu/cvm_grad_kernel.cu |
@Le-soleile #75704 #2100
|
| 46 |
paddle/phi/kernels/gpu/cvm_kernel.cu |
@Le-soleile #75703 #2100
|
| 47 |
paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu |
@123wjr #2178
|
| 48 |
paddle/phi/kernels/gpu/deformable_conv_kernel.cu |
@123wjr #2178
|
| 49 |
paddle/phi/kernels/gpu/elementwise_grad_kernel.cu |
@LiaoYFBH #2129 #2155
|
| 50 |
paddle/phi/kernels/gpu/embedding_with_scaled_gradient_grad_kernel.cu |
@LiaoYFBH #2150 @metax666 #2174
|
| 51 |
paddle/phi/kernels/gpu/exponential_kernel.cu |
@LiaoYFBH #2222
|
| 52 |
paddle/phi/kernels/gpu/flip_kernel.cu |
@LiaoYFBH #2240
|
| 53 |
paddle/phi/kernels/gpu/fused_token_prune_kernel.cu |
@Le-soleile #75701 #2100
|
| 54 |
paddle/phi/kernels/gpu/gather_grad_kernel.cu |
|
| 55 |
paddle/phi/kernels/gpu/gelu_grad_kernel.cu |
@Patrisam #2170
|
| 56 |
paddle/phi/kernels/gpu/global_gather_kernel.cu |
@Le-soleile #75700 #2142 #2191
|
| 57 |
paddle/phi/kernels/gpu/global_scatter_kernel.cu |
@Le-soleile #75699 #2142 #2191
|
| 58 |
paddle/phi/kernels/gpu/group_norm_grad_kernel.cu |
@algorithm1832 #2198 @Le-soleile #2209
|
| 59 |
paddle/phi/kernels/gpu/group_norm_kernel.cu |
@algorithm1832 #76524 @Le-soleile #2209
|
| 60 |
paddle/phi/kernels/gpu/gru_kernel.cu |
@algorithm1832 #75845 #2126
|
| 61 |
paddle/phi/kernels/gpu/index_add_grad_kernel.cu |
@algorithm1832 #2068 #2071
|
| 62 |
paddle/phi/kernels/gpu/interpolate_grad_kernel.cu |
@algorithm1832 #75974 #2127
|
| 63 |
paddle/phi/kernels/gpu/interpolate_kernel.cu |
@algorithm1832 #76261 #2163
|
| 64 |
paddle/phi/kernels/gpu/kldiv_loss_grad_kernel.cu |
@algorithm1832 #2117
|
| 65 |
paddle/phi/kernels/gpu/kldiv_loss_kernel.cu |
@algorithm1832 #2197 @Le-soleile #2209
|
| 66 |
paddle/phi/kernels/gpu/l1_norm_grad_kernel.cu |
@Le-soleile #75647 #2142 #2191
|
| 67 |
paddle/phi/kernels/gpu/l1_norm_kernel.cu |
@Le-soleile #2210
|
| 68 |
paddle/phi/kernels/gpu/label_smooth_grad_kernel.cu |
@Le-soleile #2210
|
| 69 |
paddle/phi/kernels/gpu/label_smooth_kernel.cu |
@Le-soleile #2210
|
| 70 |
paddle/phi/kernels/gpu/lamb_kernel.cu |
@Le-soleile #2210
|
| 71 |
paddle/phi/kernels/gpu/lgamma_kernel.cu |
@Le-soleile #2210
|
| 72 |
paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu |
@Le-soleile #2210
|
| 73 |
paddle/phi/kernels/gpu/logsumexp_kernel.cu |
@Le-soleile #2210
|
| 74 |
paddle/phi/kernels/gpu/lookup_table_grad_kernel.cu |
@Le-soleile #75645 #2142 #2191
|
| 75 |
paddle/phi/kernels/gpu/lookup_table_kernel.cu |
@Le-soleile #75645 #2142 #2191
|
| 76 |
paddle/phi/kernels/gpu/lu_solve_kernel.cu |
@Forest-Lee
|
| 77 |
paddle/phi/kernels/gpu/margin_cross_entropy_kernel.cu |
@Forest-Lee
|
| 78 |
paddle/phi/kernels/gpu/matrix_power_grad_kernel.cu |
@Forest-Lee
|
| 79 |
paddle/phi/kernels/gpu/matrix_power_kernel.cu |
@SidusAntares #2270
|
| 80 |
paddle/phi/kernels/gpu/mean_all_grad_kernel.cu |
@Patrisam #2171
|
| 81 |
paddle/phi/kernels/gpu/moe_unpermute_kernel.cu |
@Le-soleile #75644 #2142 #2192
|
| 82 |
paddle/phi/kernels/gpu/momentum_kernel.cu |
@Le-soleile #2238
|
| 83 |
paddle/phi/kernels/gpu/mp_allreduce_sum_kernel.cu |
@fsylmxx #76503 #76561 #2218
|
| 84 |
paddle/phi/kernels/gpu/multiclass_nms3_kernel.cu |
@fsylmxx #76548 #2220
|
| 85 |
paddle/phi/kernels/gpu/multiplex_grad_kernel.cu |
@fsylmxx #2204
|
| 86 |
paddle/phi/kernels/gpu/nonzero_kernel.cu |
@dakelong #76547 #2232
|
| 87 |
paddle/phi/kernels/gpu/pad3d_kernel.cu |
@dakelong
|
| 88 |
paddle/phi/kernels/gpu/partial_allgather_kernel.cu |
@Le-soleile #75643 #2142 #2192
|
| 89 |
paddle/phi/kernels/gpu/partial_concat_grad_kernel.cu |
@Le-soleile #75642 #2142 #2192
|
| 90 |
paddle/phi/kernels/gpu/partial_concat_kernel.cu |
@Le-soleile #2239 #76797
|
| 91 |
paddle/phi/kernels/gpu/partial_recv_kernel.cu |
@Le-soleile #75641 #2142 #2192
|
| 92 |
paddle/phi/kernels/gpu/partial_send_kernel.cu |
@Le-soleile #75640 #2142 #2192
|
| 93 |
paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu |
@xxiu1 #75938 #2088
|
| 94 |
paddle/phi/kernels/gpu/quantize_linear_kernel.cu |
@LiaoYFBH #2250 #76792 @Le-soleile #2252
|
| 95 |
paddle/phi/kernels/gpu/reduce_kernel.cu |
@LiaoYFBH #76781 @Le-soleile #2252
|
| 96 |
paddle/phi/kernels/gpu/repeat_interleave_grad_kernel.cu |
@LiaoYFBH @Le-soleile #2252
|
| 97 |
paddle/phi/kernels/gpu/repeat_interleave_kernel.cu |
@LiaoYFBH @Le-soleile #2252
|
| 98 |
paddle/phi/kernels/gpu/rmsprop_kernel.cu |
@LiaoYFBH @Le-soleile #2252
|
| 99 |
paddle/phi/kernels/gpu/roi_align_grad_kernel.cu |
@LiaoYFBH @Le-soleile #2252
|
| 100 |
paddle/phi/kernels/gpu/roi_align_kernel.cu |
@Le-soleile #2005
|
| 101 |
paddle/phi/kernels/gpu/row_conv_grad_kernel.cu |
@Le-soleile #75554 #2142 #2193
|
| 102 |
paddle/phi/kernels/gpu/row_conv_kernel.cu |
@Le-soleile #75562 #2142 #2193
|
| 103 |
paddle/phi/kernels/gpu/seed_kernel.cu |
@Le-soleile #75577 #2142 #2193
|
| 104 |
paddle/phi/kernels/gpu/sequence_expand_kernel.cu |
@Le-soleile #75578 #2142 #2193
|
| 105 |
paddle/phi/kernels/gpu/set_value_kernel.cu |
@Le-soleile #2018
|
| 106 |
paddle/phi/kernels/gpu/shuffle_channel_grad_kernel.cu |
@Le-soleile #75580 #2142 #2193 #76798
|
| 107 |
paddle/phi/kernels/gpu/shuffle_channel_kernel.cu |
@Le-soleile #2020 #75608 #2142 #2206
|
| 108 |
paddle/phi/kernels/gpu/soft_relu_grad_kernel.cu |
@Le-soleile #75581 #2142 #2206
|
| 109 |
paddle/phi/kernels/gpu/spectral_norm_grad_kernel.cu |
@Le-soleile #2027
|
| 110 |
paddle/phi/kernels/gpu/spectral_norm_kernel.cu |
@Le-soleile #2028
|
| 111 |
paddle/phi/kernels/gpu/stack_grad_kernel.cu |
@Forest-Lee #2293
|
| 112 |
paddle/phi/kernels/gpu/stft_grad_kernel.cu |
@Le-soleile #75614 #2142 #2206
|
| 113 |
paddle/phi/kernels/gpu/sync_batch_norm_grad_kernel.cu |
|
| 114 |
paddle/phi/kernels/gpu/top_k_kernel.cu |
|
| 115 |
paddle/phi/kernels/gpu/uniform_random_batch_size_like_kernel.cu |
@Le-soleile #75615 #2142 #2206
|
| 116 |
paddle/phi/kernels/gpu/weighted_sample_neighbors_kernel.cu |
@WHoutstanding #2283 #77134
|
| 117 |
paddle/phi/kernels/gpu/yolo_box_head_kernel.cu |
@Le-soleile #75616 #2142 #2206
|
| 118 |
paddle/phi/kernels/gpu/yolo_box_post_kernel.cu |
@Le-soleile #75636 #2142 #2207
|
| 119 |
paddle/phi/kernels/kps/elementwise_kernel.cu |
|
| 120 |
paddle/phi/kernels/legacy/gpu/cal_aux_loss_grad_kernel.cu |
@Le-soleile #75637 #2142 #2207
|
| 121 |
paddle/phi/kernels/legacy/gpu/cal_aux_loss_kernel.cu |
@Le-soleile #75639 #2142 #2207
|
| 122 |
paddle/phi/kernels/legacy/gpu/expand_modality_expert_id_kernel.cu |
@Le-soleile #75708 #2142 #2207
|
| 123 |
paddle/phi/kernels/legacy/gpu/ext_build_src_rank_and_local_expert_id_kernel.cu |
@Le-soleile #75709 #2142 #2207
|
| 124 |
paddle/phi/kernels/legacy/gpu/fp8_quant_blockwise_kernel.cu |
@Le-soleile #75710 #2142 #2208
|
| 125 |
paddle/phi/kernels/legacy/gpu/int_bincount.cu |
@LingmaFuture
|
| 126 |
paddle/phi/kernels/legacy/gpu/layer_norm_cuda_kernel.cu |
@LingmaFuture
|
| 127 |
paddle/phi/kernels/legacy/gpu/moe_combine_grad_kernel.cu |
@ADchampion3 #76690 #2244
|
| 128 |
paddle/phi/kernels/legacy/gpu/moe_combine_kernel.cu |
@ADchampion3 #76690 #2244
|
| 129 |
paddle/phi/kernels/legacy/gpu/moe_combine_no_weight_kernel.cu |
@ADchampion3 #76690 #2244
|
| 130 |
paddle/phi/kernels/legacy/gpu/moe_gate_dispatch_grad_kernel.cu |
@ADchampion3 #76690 #2244
|
| 131 |
paddle/phi/kernels/legacy/gpu/moe_gate_dispatch_kernel.cu |
@ADchampion3 #76690 #2244
|
| 132 |
paddle/phi/kernels/legacy/gpu/moe_gate_dispatch_permute_grad_kernel.cu |
@Le-soleile #75711 #2142 #2208
|
| 133 |
paddle/phi/kernels/legacy/gpu/moe_gate_dispatch_permute_kernel.cu |
@Le-soleile #75713 #2142 #2208
|
| 134 |
paddle/phi/kernels/legacy/gpu/moe_ops_partial_nosoftmaxtopk_grad_kernel.cu |
@Le-soleile #75714 #2142 #2208
|
| 135 |
paddle/phi/kernels/legacy/gpu/moe_ops_partial_nosoftmaxtopk_kernel.cu |
@Le-soleile #75715 #2142 #2208
|
| 136 |
paddle/phi/kernels/legacy/kps/compare_kernel.cu |
@LiaoYFBH #2255
|
背景
飞桨在 3.1 版本推出了类 CUDA 硬件接入方案。该方案在 Custom Device 硬件接入方案的基础上进行了升级,最大的特点是可以复用飞桨 PHI 算子库中的大量 CUDA Kernel。当前此方案已经成功接入沐曦(metax_gpu)和天数智芯(iluvatar_gpu)。
然而,目前 PHI 算子库中的部分 CUDA Kernel 并未考虑被其他模块复用的情况,导致出现以下问题:部分 Kernel 缺少函数声明,类 CUDA 硬件在复用时不得不直接 `#include` `.cu` 源文件,这不符合代码规范。因此,本次活动旨在对 PHI 算子库的 CUDA Kernel 进行规范化修复:
为缺少函数声明的 Kernel 补充头文件(`.h`);修正 `#include` `.cu` 的用法,改为 `#include` 正确的头文件。
涉及范围
涉及仓库
影响文件
在 PaddleCustomDevice 仓中,所有被 `#include` 到注册文件中的算子 Kernel `.cu` 源文件,共 136 个。具体文件列表见下方表格:
任务
修复目标
将 `#include *.cu` 改为 `#include` 新增的头文件,同时把 Kernel 的实现代码正确地添加到 CMakeLists 编译列表中。需要修改的代码只出现在 `backends/metax_gpu` 和 `backends/iluvatar_gpu` 这两个目录下。
@YqGe585
@Echo-Nie
@metax666
@Le-soleile
@Le-soleile
@Le-soleile
@Le-soleile
@Le-soleile
@Le-soleile
@Le-soleile
@Le-soleile
@Le-soleile
示例修复&代码提交方式
请参考 #75226 (comment)
认领方式
请大家以 comment 的形式认领任务,如:
-part 结尾
看板信息
统计信息