Skip to content

Latest commit

 

History

History
72 lines (72 loc) · 8.02 KB

File metadata and controls

72 lines (72 loc) · 8.02 KB
File Original Latency Current Latency Speedup
example_dequant_gemv_fp16xint4 0.0052611 0.00556904 0.944705
example_mha_fwd_bhsd 0.009312 0.009664 0.963576
example_mha_fwd_bshd_wgmma_pipelined 0.014688 0.015104 0.972458
example_per_token_cast_to_fp8 0.00885861 0.0091083 0.972587
example_dequant_gemm_bf16_mxfp4_hopper 0.011872 0.012096 0.981481
example_gqa_sink_fwd_bhsd_wgmma_pipelined 0.013135 0.0133685 0.982534
example_tilelang_nsa_decode 0.00833827 0.00848195 0.98306
example_mha_bwd_bshd 0.0323905 0.032925 0.983766
example_dequant_gemm_fp4_hopper 0.010944 0.011104 0.985591
example_mha_sink_fwd_bhsd 0.0112312 0.0113914 0.985937
example_mha_sink_bwd_bhsd_sliding_window 0.0250769 0.0253728 0.988338
example_mha_fwd_bhsd_wgmma_pipelined 0.015296 0.015456 0.989648
example_warp_specialize_gemm_copy_0_gemm_1 0.03088 0.031168 0.99076
example_gemm_intrinsics 0.027712 0.027904 0.993119
sparse_mla_fwd_pipelined 0.0984533 0.0987996 0.996495
example_vertical_slash_sparse_attn 0.00148107 0.00148577 0.996837
example_tilelang_gemm_fp8_intrinsic 0.005984 0.006 0.997333
example_mha_sink_fwd_bhsd_wgmma_pipelined 0.0149988 0.0150285 0.998024
example_gemm 0.017281 0.017312 0.998209
example_fusedmoe_tilelang 0.217889 0.218217 0.998497
example_tilelang_gemm_fp8 0.01376 0.013776 0.998839
example_mha_sink_fwd_bhsd_sliding_window 0.0113744 0.0113844 0.999122
example_mha_sink_bwd_bhsd 0.0410323 0.0410627 0.99926
example_gqa_bwd 0.0362193 0.0362422 0.999368
example_group_per_split_token_cast_to_fp8 0.0100313 0.0100362 0.999512
example_gemv 0.0481227 0.048128 0.99989
example_linear_attn_bwd 0.114526 0.114538 0.999895
sparse_mla_bwd 0.247906 0.247916 0.99996
example_gemm_autotune 0.020544 0.020544 1
example_dequant_gemm_w4a8 0.00624 0.00624 1
block_sparse_attn_tilelang 0.0112801 0.0112754 1.00042
topk_selector 0.0449657 0.044939 1.00059
example_tilelang_gemm_splitk_vectorize_atomicadd 0.0413707 0.0413452 1.00062
tilelang_example_sparse_tensorcore 0.0133828 0.0133715 1.00085
example_convolution_autotune 0.69616 0.695552 1.00087
example_linear_attn_fwd 0.0277008 0.0276757 1.00091
example_mha_inference 0.0652448 0.0651648 1.00123
example_tilelang_gemm_splitk 0.0402086 0.0401445 1.0016
example_dequant_groupedgemm_bf16_mxfp4_hopper 0.0145726 0.0145491 1.00162
example_mha_fwd_varlen 1.43004 1.42759 1.00172
example_tilelang_nsa_fwd 0.00870593 0.00868398 1.00253
example_dequant_gemm_bf16_mxfp4_hopper_tma 0.01168 0.011648 1.00275
example_topk 0.010048 0.010016 1.00319
example_mha_sink_fwd_bhsd_wgmma_pipelined_sliding_window 0.0147675 0.0147171 1.00342
example_gqa_bwd_wgmma_pipelined 0.0461091 0.0459494 1.00348
example_mha_bwd_bhsd 0.0310782 0.03097 1.00349
example_gqa_sink_bwd_bhsd 0.0315594 0.0314257 1.00425
example_blocksparse_gemm 0.020372 0.0202743 1.00482
example_gqa_sink_fwd_bhsd_wgmma_pipelined_sliding_window 0.0132417 0.0131672 1.00566
example_mla_decode 0.327616 0.32576 1.0057
example_gqa_bwd_tma_reduce_varlen 0.0477759 0.0474919 1.00598
example_mha_fwd_bshd 0.023712 0.023552 1.00679
fp8_lighting_indexer 0.0293298 0.0291277 1.00694
example_gqa_decode 0.043808 0.043488 1.00736
example_tilelang_gemm_fp8_2xAcc 0.135392 0.134336 1.00786
example_gqa_fwd_bshd 0.056896 0.056448 1.00794
example_warp_specialize_gemm_barrierpipe_stage2 0.031456 0.031136 1.01028
example_warp_specialize_gemm_copy_1_gemm_0 0.030368 0.030048 1.01065
example_gemm_schedule 0.0276354 0.0273229 1.01144
example_convolution 0.94832 0.937536 1.0115
example_tilelang_sparse_gqa_decode_varlen_mask 0.0211596 0.0208851 1.01314
example_elementwise_add 0.0193343 0.0190617 1.0143
example_dynamic 0.023712 0.02336 1.01507
example_mha_bwd_bshd_wgmma_pipelined 0.0221314 0.0218026 1.01508
sparse_mla_fwd 0.358282 0.351806 1.01841
example_gqa_sink_bwd_bhsd_sliding_window 0.0203378 0.0199598 1.01894
example_dequant_gemm_bf16_fp4_hopper 0.010816 0.010592 1.02115
example_gqa_fwd_bshd_wgmma_pipelined 0.046112 0.04512 1.02199
example_tilelang_block_sparse_attn 0.00847182 0.0081466 1.03992
example_tilelang_sparse_gqa_decode_varlen_indice 0.0162253 0.013691 1.18511